In [1]:
import pandas as pd
import os

In [9]:
# The two export files that together make up the coded data set.
dump_files = ["20230622WG6_AllElectivesPart1-export-20230705-123616.csv", "20230622WG6_AllElectivesPart2-export-20230705-123639.csv"]
dump_contents = []
# Kept as a plain `for` loop (not a comprehension): the loop target stays bound
# at notebook scope once the loop has finished.
for dump_file in dump_files:
    dump_path = os.path.join("coded", dump_file)
    # Every column is read with the 'string' dtype; missing cells become "".
    raw = pd.read_csv(dump_path, dtype='string')
    raw = raw.fillna("")
    dump_contents.append(raw)
raw_coded = pd.concat(dump_contents)

# Keep only the rows whose 'Annotations' cell is non-empty.
has_annotation = raw_coded['Annotations'].str.len() > 0
annotated = raw_coded[has_annotation]
annotated.columns

Index(['Title', 'Text', '[M] elective: ', '[M] institution: ', '[M] title: ',
       '[C] Barker, Lecia', '[C] Bradley, Steven', '[C] Hooshangi, Sara',
       '[C] Kunkeler, Thom', '[C] Lennon, Ruth', '[C] Parkinson, Jack',
       '[C] Sibia, Naaz (Naaz Sibia)', 'Annotations',
       '[C] Altin, Rukiye (rukiye)', '[C] McNeill, Fiona',
       '[C] Minguillón, Julià (Julià Minguillón)',
       '[C] Peltsverger, Svetlana'],
      dtype='object')

In [10]:
# Build a column-rename map. Columns tagged "[M]" lose their 4-character
# "[M] " prefix and their 2-character ": " suffix and are also collected in
# meta_cols; every other column maps to itself.
meta_cols = []
renamer = {}
for column_name in annotated.columns:
    if column_name.startswith("[M]"):
        print("[M] ")
        short_name = column_name[4:-2]
        meta_cols.append(short_name)
    else:
        short_name = column_name
    renamer[column_name] = short_name

renamer

[M] 
[M] 
[M] 


{'Title': 'Title',
 'Text': 'Text',
 '[M] elective: ': 'elective',
 '[M] institution: ': 'institution',
 '[M] title: ': 'title',
 '[C] Barker, Lecia': '[C] Barker, Lecia',
 '[C] Bradley, Steven': '[C] Bradley, Steven',
 '[C] Hooshangi, Sara': '[C] Hooshangi, Sara',
 '[C] Kunkeler, Thom': '[C] Kunkeler, Thom',
 '[C] Lennon, Ruth': '[C] Lennon, Ruth',
 '[C] Parkinson, Jack': '[C] Parkinson, Jack',
 '[C] Sibia, Naaz (Naaz Sibia)': '[C] Sibia, Naaz (Naaz Sibia)',
 'Annotations': 'Annotations',
 '[C] Altin, Rukiye (rukiye)': '[C] Altin, Rukiye (rukiye)',
 '[C] McNeill, Fiona': '[C] McNeill, Fiona',
 '[C] Minguillón, Julià (Julià Minguillón)': '[C] Minguillón, Julià (Julià Minguillón)',
 '[C] Peltsverger, Svetlana': '[C] Peltsverger, Svetlana'}

In [19]:
# The first two "[M]" meta columns (elective code, institution) serve as the
# index columns; the third meta column (title) is not exported.
index_cols = meta_cols[:2]
keep_cols = [*index_cols, "Annotations"]

# Rows left out of the export: everything from Uppsala_Sweden, plus the
# elective codes listed below (the original note gives "no content" as the
# reason for skipping).
skipped_electives = ["cxinfr11245","cxinfr11246","cxinfr11248","cxinfr11083"]
export = (
    annotated
    .rename(columns=renamer)[keep_cols]
    .loc[lambda frame: frame["institution"] != "Uppsala_Sweden"]
    .loc[lambda frame: ~frame["elective"].isin(skipped_electives)]
)
export.to_csv("annotated_electives.csv")
export

Unnamed: 0,elective,institution,Annotations
1,COMP3491,Durham_England,Annotation: I think this is security because o...
2,COMP3507,Durham_England,Annotation: Low confidence on this. Complexity...
10,COMP3677,Durham_England,"Annotation: This is related to AI, but also to..."
12,COMP4117,Durham_England,Annotation: quantum computing; User: Sara Hoos...
13,COMP4127,Durham_England,Annotation: very low confidence; User: Lecia B...
16,COMP4187,Durham_England,Annotation: could be one of many sciences; Use...
18,cxinfr08033,Edinburgh_UK,Annotation: ; User: Jack Parkinson; Timestamp:...
19,cxinfr10052,Edinburgh_UK,Annotation: According to the ACM 2023 curricul...
23,cxinfr10084,Edinburgh_UK,Annotation: As with previous experiential lear...
27,cxinfr10083,Edinburgh_UK,Annotation: data science is not always about A...


In [12]:
# Reshape wide -> long: the index columns stay as identifiers and every other
# column of `base` becomes a (variable, value) pair.
# NOTE(review): `base` is not defined in any cell visible here -- confirm
# which cell creates it before relying on Restart & Run All.
long1 = base.melt(id_vars=index_cols)
long1

Unnamed: 0,elective,institution,variable,value
0,COMP3477,Durham_England,"Altin, Rukiye (rukiye)",GIT
1,COMP3487,Durham_England,"Altin, Rukiye (rukiye)",AI; MSF
2,COMP3517,Durham_England,"Altin, Rukiye (rukiye)",DM; 15
3,COMP3527,Durham_England,"Altin, Rukiye (rukiye)",GIT
4,COMP3547,Durham_England,"Altin, Rukiye (rukiye)",AI
...,...,...,...,...
705,CS4504,VirginiaTech_USA,"Peltsverger, Svetlana",AR
706,CS4624,VirginiaTech_USA,"Peltsverger, Svetlana",HCI
707,CS4644,VirginiaTech_USA,"Peltsverger, Svetlana",SPD; 21
708,CS4804,VirginiaTech_USA,"Peltsverger, Svetlana",AI


In [13]:
# A single 'value' cell may hold several codes separated by '; '. Spread them
# over positional columns (0, 1, 2, ...) and attach those columns to long1.
split_codes = long1['value'].str.split('; ', expand=True)
long2 = long1.join(split_codes)
long2

Unnamed: 0,elective,institution,variable,value,0,1,2,3,4
0,COMP3477,Durham_England,"Altin, Rukiye (rukiye)",GIT,GIT,,,,
1,COMP3487,Durham_England,"Altin, Rukiye (rukiye)",AI; MSF,AI,MSF,,,
2,COMP3517,Durham_England,"Altin, Rukiye (rukiye)",DM; 15,DM,15,,,
3,COMP3527,Durham_England,"Altin, Rukiye (rukiye)",GIT,GIT,,,,
4,COMP3547,Durham_England,"Altin, Rukiye (rukiye)",AI,AI,,,,
...,...,...,...,...,...,...,...,...,...
705,CS4504,VirginiaTech_USA,"Peltsverger, Svetlana",AR,AR,,,,
706,CS4624,VirginiaTech_USA,"Peltsverger, Svetlana",HCI,HCI,,,,
707,CS4644,VirginiaTech_USA,"Peltsverger, Svetlana",SPD; 21,SPD,21,,,
708,CS4804,VirginiaTech_USA,"Peltsverger, Svetlana",AI,AI,,,,


In [14]:
# The combined 'value' string is redundant once it has been split, and the
# melted 'variable' column is renamed to 'coder'.
long3 = (
    long2
    .drop(columns='value')
    .rename(columns={'variable': 'coder'})
)
long3

Unnamed: 0,elective,institution,coder,0,1,2,3,4
0,COMP3477,Durham_England,"Altin, Rukiye (rukiye)",GIT,,,,
1,COMP3487,Durham_England,"Altin, Rukiye (rukiye)",AI,MSF,,,
2,COMP3517,Durham_England,"Altin, Rukiye (rukiye)",DM,15,,,
3,COMP3527,Durham_England,"Altin, Rukiye (rukiye)",GIT,,,,
4,COMP3547,Durham_England,"Altin, Rukiye (rukiye)",AI,,,,
...,...,...,...,...,...,...,...,...
705,CS4504,VirginiaTech_USA,"Peltsverger, Svetlana",AR,,,,
706,CS4624,VirginiaTech_USA,"Peltsverger, Svetlana",HCI,,,,
707,CS4644,VirginiaTech_USA,"Peltsverger, Svetlana",SPD,21,,,
708,CS4804,VirginiaTech_USA,"Peltsverger, Svetlana",AI,,,,


In [15]:
# Stack the positional split columns into a single 'code' column (one row per
# elective / institution / coder / code), then discard rows with a blank code.
code_cols = index_cols + ['coder', 'code']
long4 = (
    long3
    .melt(id_vars=index_cols + ['coder'], value_name='code')[code_cols]
    .fillna("")
    .loc[lambda frame: frame["code"].str.len() > 0]
)
long4

Unnamed: 0,elective,institution,coder,code
0,COMP3477,Durham_England,"Altin, Rukiye (rukiye)",GIT
1,COMP3487,Durham_England,"Altin, Rukiye (rukiye)",AI
2,COMP3517,Durham_England,"Altin, Rukiye (rukiye)",DM
3,COMP3527,Durham_England,"Altin, Rukiye (rukiye)",GIT
4,COMP3547,Durham_England,"Altin, Rukiye (rukiye)",AI
...,...,...,...,...
2110,5.604,UOC_Spain,"Peltsverger, Svetlana",DM
2538,22.4,UOC_Spain,"McNeill, Fiona",SE
2556,COMP3477,Durham_England,"Minguillón, Julià (Julià Minguillón)",15
2779,CS4524,Kennesaw_USA,"Peltsverger, Svetlana",AI


In [16]:
# For each (elective, institution, code) combination, count the non-null
# 'coder' entries, then pivot the codes into columns. Combinations that never
# occur are filled with 0.
long5 = long4.groupby(index_cols + ['code']).count().unstack().fillna(0)
# unstack() leaves a two-level column index; keep only the level holding the
# code labels.
long5.columns = long5.columns.get_level_values(1)
# The output file is named after the last input dump. Take that name from
# `dump_files` explicitly instead of relying on the `dump_file` loop variable
# that happens to stay bound after the loading loop. The resulting path is the
# same on a clean top-to-bottom run.
long5.to_csv(os.path.join("coded", "codes_" + dump_files[-1]))
long5

Unnamed: 0_level_0,code,01,02,03,04,07,09,10,13,14,15,...,MSF,NC,OS,PDC,SDF,SE,SEC,SEP,SF,SPD
elective,institution,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1DL034,Uppsala_Sweden,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1DL231,Uppsala_Sweden,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1DL311,Uppsala_Sweden,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1DT075,Uppsala_Sweden,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
1TD062,Uppsala_Sweden,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
cxinfr11217,Edinburgh_UK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cxinfr11240,Edinburgh_UK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0
cxinfr11241,Edinburgh_UK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
inf-AuLearn,CAU_Germany,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
