In [147]:
import pandas as pd
import os
import numpy as np

In [148]:
# Identifiers of the participating institutions. Each one is used as a row
# label of data_checks and as the name of the sub-directory of "enrolment"
# from which that institution's CSV files are read.
institutions = [
    "CAU_Germany",
    "Durham_England",
    "Edinburgh_Scotland",
    "Glasgow_Scotland",
    "Kennesaw_USA",
    "UOC_Spain",
    "UofT_Canada",
    "Uppsala_Sweden",
    "VirginiaTech_USA",
]

In [149]:
# data_checks: one row per institution (the index); the validation cells
# further down each add one or more columns holding a test result.
data_checks = pd.DataFrame({"institution": institutions}).set_index("institution")

# Names of the data_checks columns that are expected to be False for every
# institution; filled in by the validation cells.
check_should_be_false = []

data_checks

CAU_Germany
Durham_England
Edinburgh_Scotland
Glasgow_Scotland
Kennesaw_USA
UOC_Spain
UofT_Canada
Uppsala_Sweden
VirginiaTech_USA


In [150]:
# Load the elective (curriculum) table; every column is read with the
# pandas 'string' dtype. The distinct elective codes are displayed.
all_electives = pd.read_csv(
    os.path.join("curriculum_content", "all_electives.csv"),
    dtype="string",
)
all_electives.elective.unique()

<StringArray>
['COMP3421', 'COMP3467', 'COMP3477', 'COMP3487', 'COMP3491', 'COMP3507',
 'COMP3517', 'COMP3527', 'COMP3547', 'COMP3557',
 ...
   'CS4504',   'CS4604',   'CS4234',   'CS4624',   'CS4644',   'CS4704',
   'CS4784',   'CS4804',   'CS4824',   'CS4884']
Length: 287, dtype: string

In [168]:
# How often each elective code occurs in all_electives (first 20 entries).
all_electives.elective.value_counts().head(20)

COMP3421       1
COMP3647       1
COMP3477       1
COMP3487       1
COMP3491       1
COMP3507       1
COMP3517       1
COMP3527       1
COMP3547       1
COMP3657       1
COMP3567       1
COMP3577       1
COMP3587       1
COMP3607       1
COMP3617       1
COMP3621       1
COMP3637       1
cxinfr08033    1
cxinfr11161    1
COMP3667       1
Name: elective, dtype: Int64

In [169]:
# Number of non-missing elective codes recorded for each institution.
all_electives.groupby("institution").elective.count()

institution
CAU_Germany           13
Durham_England        32
Edinburgh_Scotland    67
Glasgow_Scotland      27
Kennesaw_USA          65
UOC_Spain             29
UofT_Canada           12
Uppsala_Sweden        18
VirginiaTech_USA      24
Name: elective, dtype: int64

In [170]:
# Check that curriculum data exists: count the elective codes recorded for
# each institution and flag institutions that have none.

data_checks['elective_curriculum_count'] = all_electives.groupby('institution')['elective'].count()
# Institutions that do not occur in all_electives get NaN from the index
# alignment of the assignment above; count those as zero electives.
data_checks['elective_curriculum_count'] = data_checks['elective_curriculum_count'].fillna(0)
data_checks['missing_curriculum_data'] = data_checks.elective_curriculum_count <= 0
# Register the check only once, so that re-running this cell does not put a
# duplicate column name into check_should_be_false.
if 'missing_curriculum_data' not in check_should_be_false:
    check_should_be_false.append('missing_curriculum_data')
data_checks

Unnamed: 0_level_0,elective_curriculum_count,missing_curriculum_data,has_enrolment_data,has_cohort_enrolment_data,missing_enrolment_data,missing_cohort_data,count_missing_elective_descriptions,missing_elective_descriptions,missing_elective_detail,incomplete_elective_descriptions,exceeded_cap,enrolment_exceeded_cap,enrolment_exceeded_cohort,enrolment_exceeds_cohort
institution,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
CAU_Germany,13,False,True,True,False,False,0.0,False,0.0,False,0.0,False,0.0,False
Durham_England,32,False,True,True,False,False,0.0,False,0.0,False,0.0,False,0.0,False
Edinburgh_Scotland,67,False,True,True,False,False,0.0,False,0.0,False,0.0,False,0.0,False
Glasgow_Scotland,27,False,True,True,False,False,0.0,False,2.0,True,0.0,False,0.0,False
Kennesaw_USA,65,False,True,True,False,False,0.0,False,0.0,False,0.0,False,0.0,False
UOC_Spain,29,False,True,True,False,False,0.0,False,0.0,False,0.0,False,0.0,False
UofT_Canada,12,False,True,True,False,False,0.0,False,0.0,False,0.0,False,0.0,False
Uppsala_Sweden,18,False,True,True,False,False,0.0,False,2.0,True,0.0,False,0.0,False
VirginiaTech_USA,24,False,True,True,False,False,0.0,False,0.0,False,0.0,False,0.0,False


In [171]:
# Check if there is enrolment data
#
# For each institution two CSV files are read from enrolment/<institution>/:
# elective_enrolment_rounded.csv and cohort_enrolment_rounded.csv. A file
# counts as present only if it exists and, once its 'institution' column has
# been set, its columns are exactly the keys of the matching dtype mapping,
# in the same order.

def _load_enrolment_csv(institution, filename, dtypes, not_found_label, columns_label):
    """Read one enrolment CSV of an institution and check its column layout.

    Parameters
    ----------
    institution : str
        Directory name below "enrolment"; also written to the frame's
        'institution' column (added if absent, overwritten otherwise).
    filename : str
        Name of the CSV file inside that directory.
    dtypes : dict
        Column name -> dtype mapping handed to pandas.read_csv. Its keys, in
        order, are the columns the file is expected to have.
    not_found_label, columns_label : str
        Text fragments used in the printed diagnostics.

    Returns
    -------
    tuple
        (frame, columns_ok). frame is the loaded DataFrame, or None when the
        file does not exist. columns_ok is True only when the file was read
        and its columns equal the expected ones.
    """
    expected_columns = list(dtypes.keys())
    try:
        frame = pd.read_csv(os.path.join("enrolment", institution, filename), dtype=dtypes)
    except FileNotFoundError:
        print (not_found_label + " file Not Found for " + institution)
        return None, False
    frame['institution'] = institution
    columns_ok = frame.columns.tolist() == expected_columns
    if not columns_ok:
        print ("Wrong columns for " + columns_label + ": " + institution)
        print ("Should be ", expected_columns, " is ", frame.columns.tolist())
    return frame, columns_ok


data_checks['has_enrolment_data'] = False
data_checks['has_cohort_enrolment_data'] = False
inst_enrolment_dtypes = {"MCode": 'string', "Cohort": 'string', "AcademicYearStart":'int', "Women3": 'int', "Men3": 'int', "Cap": 'object', "institution": 'string'}
inst_enrolment_columns = list(inst_enrolment_dtypes.keys())
cohort_enrolment_dtypes = {"Cohort": 'string', "AcademicYearStart": 'int', "Women3": 'int', "Men3": 'int', "MaxElectives": 'object', "institution": 'string'}
cohort_enrolment_columns = list(cohort_enrolment_dtypes.keys())
inst_enrolments = []
cohort_enrolments = []
for institution in institutions:
    print ("loading " + institution)

    inst_enrolment, inst_columns_ok = _load_enrolment_csv(
        institution, "elective_enrolment_rounded.csv", inst_enrolment_dtypes,
        "enrolment", "elective_enrolment")
    if inst_enrolment is not None:
        # A file with unexpected columns is still kept for the later cells;
        # it is only reported as missing in data_checks.
        inst_enrolments.append(inst_enrolment)
        if inst_columns_ok:
            data_checks.at[institution,'has_enrolment_data'] = True

    cohort_enrolment, cohort_columns_ok = _load_enrolment_csv(
        institution, "cohort_enrolment_rounded.csv", cohort_enrolment_dtypes,
        "cohort enrolment", "cohort_enrolment")
    if cohort_enrolment is not None:
        cohort_enrolments.append(cohort_enrolment)
        if cohort_columns_ok:
            data_checks.at[institution,'has_cohort_enrolment_data'] = True

data_checks['missing_enrolment_data'] = ~data_checks.has_enrolment_data
data_checks['missing_cohort_data'] = ~data_checks.has_cohort_enrolment_data

# Register each check only once, so that re-running this cell does not put
# duplicate column names into check_should_be_false.
for check_name in ('missing_enrolment_data', 'missing_cohort_data'):
    if check_name not in check_should_be_false:
        check_should_be_false.append(check_name)

# pd.concat raises on an empty list; fall back to an empty frame with the
# expected columns so that the missing files are reported through data_checks
# instead of aborting the cell.
if inst_enrolments:
    all_enrolments = pd.concat(inst_enrolments)
else:
    all_enrolments = pd.DataFrame(columns=inst_enrolment_columns)
all_enrolments['Students3'] = all_enrolments['Women3'] + all_enrolments['Men3']

if cohort_enrolments:
    all_cohort_enrolments = pd.concat(cohort_enrolments)
else:
    all_cohort_enrolments = pd.DataFrame(columns=cohort_enrolment_columns)


loading CAU_Germany
loading Durham_England
loading Edinburgh_Scotland
loading Glasgow_Scotland
loading Kennesaw_USA
loading UOC_Spain
loading UofT_Canada
loading Uppsala_Sweden
loading VirginiaTech_USA


In [155]:
# all enrolment data modules should have curriculum defined
#
# Enrolment rows are joined to all_electives on (institution, MCode) ==
# (institution, elective). Enrolment rows without a matching elective come out
# of the outer join with a missing 'elective' value; those are reported.
enrolments_electives_outer = all_enrolments.merge(all_electives, left_on=['institution', 'MCode'], right_on=['institution', 'elective'], how='outer')
missing_elective_descriptions = enrolments_electives_outer[enrolments_electives_outer.elective.isna()]
data_checks['count_missing_elective_descriptions'] = missing_elective_descriptions.groupby('institution')['MCode'].count()
# Institutions with no unmatched module get NaN from the index alignment
# above; count those as zero.
data_checks['count_missing_elective_descriptions'] = data_checks['count_missing_elective_descriptions'].fillna(0)
data_checks['missing_elective_descriptions'] = data_checks.count_missing_elective_descriptions > 0
# Register the check only once, so that re-running this cell does not put a
# duplicate column name into check_should_be_false.
if 'missing_elective_descriptions' not in check_should_be_false:
    check_should_be_false.append('missing_elective_descriptions')
missing_elective_descriptions


Unnamed: 0,MCode,Cohort,AcademicYearStart,Women3,Men3,Cap,institution,Students3,elective,overview,title,summary,content,ilo


In [156]:
# all curriculum should have summary, content or ilo
#
# An elective is reported only when all three of content, ilo and summary
# are missing.
missing_elective_detail = all_electives[all_electives.content.isna() & all_electives.ilo.isna() & all_electives.summary.isna() ]
data_checks['missing_elective_detail'] = missing_elective_detail.groupby('institution')['elective'].count()
# Institutions with no such elective get NaN from the index alignment above;
# count those as zero.
data_checks['missing_elective_detail'] = data_checks['missing_elective_detail'].fillna(0)
data_checks['incomplete_elective_descriptions'] = data_checks.missing_elective_detail > 0
# Register the check only once, so that re-running this cell does not put a
# duplicate column name into check_should_be_false.
if 'incomplete_elective_descriptions' not in check_should_be_false:
    check_should_be_false.append('incomplete_elective_descriptions')
missing_elective_detail

Unnamed: 0,institution,elective,overview,title,summary,content,ilo
100,Glasgow_Scotland,COMPSCI4071,<h2>Glasgow_Scotland COMPSCI4071</h2><h2>title...,Course information not available,,,
110,Glasgow_Scotland,COMPSCI4082,<h2>Glasgow_Scotland COMPSCI4082</h2><h2>title...,Course information not available,,,
129,Uppsala_Sweden,1TD403,<h2>Uppsala_Sweden 1TD403</h2><h2>title</h2>Nu...,Numerical Methods and Simulation,,,
134,Uppsala_Sweden,1MD001,<h2>Uppsala_Sweden 1MD001</h2><h2>title</h2>Ad...,Advanced Interaction Design,,,


In [157]:
# no module total (Students3) should exceed the module's cap; a total equal
# to the cap is accepted.
#
# Cap is read as object dtype, so it is converted to float for the comparison.
# A missing Cap is replaced by the row's own Students3, which makes the
# comparison False, i.e. modules without a cap are never reported.
exceeded_cap = all_enrolments[all_enrolments['Students3'] > all_enrolments['Cap'].astype('float').fillna(all_enrolments['Students3'])]
data_checks['exceeded_cap'] = exceeded_cap.groupby('institution')['MCode'].count()
# Institutions with no offending module get NaN from the index alignment
# above; count those as zero.
data_checks['exceeded_cap'] = data_checks['exceeded_cap'].fillna(0)
data_checks['enrolment_exceeded_cap'] = data_checks.exceeded_cap > 0
# Register the check only once, so that re-running this cell does not put a
# duplicate column name into check_should_be_false.
if 'enrolment_exceeded_cap' not in check_should_be_false:
    check_should_be_false.append('enrolment_exceeded_cap')
exceeded_cap

Unnamed: 0,MCode,Cohort,AcademicYearStart,Women3,Men3,Cap,institution,Students3


In [158]:
# Expectation: 2020 <= AcademicYearStart <= 2022 (not enforced by code here;
# read it off the printed table).
# Per institution and academic year: number of enrolment rows (module_count)
# and the summed Students3, Women3 and Men3.

enrolment_group = all_enrolments.groupby(['institution', 'AcademicYearStart'])
enrolment_years = enrolment_group.agg(
    module_count=('Students3', 'count'),
    Students3=('Students3', 'sum'),
    Women3=('Women3', 'sum'),
    Men3=('Men3', 'sum'),
)
#print(enrolment_years.to_latex())
print(enrolment_years)

                                      module_count  Students3  Women3  Men3
institution        AcademicYearStart                                       
CAU_Germany        2022                         13        549      81   468
Durham_England     2020                         24       1485     225  1260
                   2021                         27       1053     129   924
                   2022                         27       1356     354  1002
Edinburgh_Scotland 2020                         23       1695     384  1311
                   2021                         37       1956     510  1446
                   2022                         42       2115     501  1614
Glasgow_Scotland   2020                         24       1851     375  1476
                   2021                         24       2319     519  1800
                   2022                         26       2034     414  1620
Kennesaw_USA       2020                         53       3174     651  2523
            

In [159]:
# elective enrolment does not exceed cohort enrolment
#
# Every elective enrolment row is joined to the cohort enrolment row with the
# same institution, Cohort and AcademicYearStart. In the merged frame the "_x"
# columns come from the elective data and the "_y" columns from the cohort
# data. A module is reported when its Women3 OR its Men3 count is larger than
# the corresponding cohort count (previously only Women3 was compared, so an
# excess of men went unnoticed). Rows without a partner on the other side
# compare against NaN, which is never "greater", so they are not reported.

enrolments_joint = all_enrolments.merge(all_cohort_enrolments, on=['institution', 'Cohort', 'AcademicYearStart'], how='outer')
electives_enrolment_exceeded_cohort = enrolments_joint[
    (enrolments_joint.Women3_x > enrolments_joint.Women3_y)
    | (enrolments_joint.Men3_x > enrolments_joint.Men3_y)
]
data_checks['enrolment_exceeded_cohort'] = electives_enrolment_exceeded_cohort.groupby('institution')['MCode'].count()
# Institutions with no offending module get NaN from the index alignment
# above; count those as zero.
data_checks['enrolment_exceeded_cohort'] = data_checks['enrolment_exceeded_cohort'].fillna(0)
data_checks['enrolment_exceeds_cohort'] = data_checks.enrolment_exceeded_cohort > 0
# Register the check only once, so that re-running this cell does not put a
# duplicate column name into check_should_be_false.
if 'enrolment_exceeds_cohort' not in check_should_be_false:
    check_should_be_false.append('enrolment_exceeds_cohort')

In [160]:
# all enrolment data should be integer, >=0 and multiple of 3
# graph of proportion of module enrolments by gender by institution
# count of distinct cohorts per institution: check against text description
# all curriculum modules should have enrolment data?
# cohortdescription exists
# cohortenrolment exists

In [161]:
# Summary of the registered checks: every cell shown is expected to be False.
data_checks.loc[:, check_should_be_false]

Unnamed: 0_level_0,missing_curriculum_data,missing_enrolment_data,missing_cohort_data,missing_elective_descriptions,incomplete_elective_descriptions,enrolment_exceeded_cap,enrolment_exceeds_cohort
institution,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
CAU_Germany,False,False,False,False,False,False,False
Durham_England,False,False,False,False,False,False,False
Edinburgh_Scotland,False,False,False,False,False,False,False
Glasgow_Scotland,False,False,False,False,True,False,False
Kennesaw_USA,False,False,False,False,False,False,False
UOC_Spain,False,False,False,False,False,False,False
UofT_Canada,False,False,False,False,False,False,False
Uppsala_Sweden,False,False,False,False,True,False,False
VirginiaTech_USA,False,False,False,False,False,False,False


In [162]:
# Sum of Students3 over all loaded enrolment rows.
all_enrolments.Students3.sum()

51072

In [166]:
# Elective codes present in all_electives that never occur as an MCode in the
# enrolment data. Codes are compared on their own, without the institution.
set(all_electives['elective']).difference(all_enrolments['MCode'])

{'1DL231',
 '1DL301',
 '1DL311',
 '1DL321',
 'COMP3637',
 'COMPSCI4094',
 'CS4824',
 'CS4884',
 'cxinfr08010',
 'cxinfr08020',
 'cxinfr08033',
 'cxinfr10084',
 'cxinfr11129',
 'cxinfr11158'}

In [167]:
# Assign integer ids to cohorts and to modules, and derive per-cohort totals.
#
# NOTE(review): `cohorts` and `enrolments` below are plain aliases, not
# copies, so every column added here (Students3, icy, cohort_id, icym,
# module_id) and the in-place sorts also change all_cohort_enrolments and
# all_enrolments.
from sklearn import preprocessing

le_cohort = preprocessing.LabelEncoder()
le_elective = preprocessing.LabelEncoder()


cohorts = all_cohort_enrolments
cohorts['Students3'] = cohorts['Men3'] + cohorts['Women3']
# 'icy' = institution + Cohort + AcademicYearStart, concatenated without a
# separator; it is the label from which cohort_id is encoded.
cohorts['icy'] = cohorts['institution'] + cohorts['Cohort'].astype('string')  + cohorts['AcademicYearStart'].astype('string')
cohorts['cohort_id'] = le_cohort.fit_transform(cohorts['icy'])

enrolments = all_enrolments
enrolments['icy'] = enrolments['institution'] + enrolments['Cohort'].astype('string')  + enrolments['AcademicYearStart'].astype('string')
# Reuses the encoder fitted on the cohort table so both tables share ids.
# NOTE(review): LabelEncoder.transform is expected to raise for an 'icy'
# value that does not occur in the cohort table -- confirm against sklearn.
enrolments['cohort_id'] = le_cohort.transform(enrolments['icy'])
# 'icym' = icy + MCode; it is the label from which module_id is encoded.
enrolments['icym'] = enrolments['icy'] + enrolments['MCode'].astype('string')
enrolments['module_id'] = le_elective.fit_transform(enrolments['icym'])
enrolments.sort_values('module_id', inplace=True)
# Per cohort_id: number of enrolment rows and summed Students3.
cohort_module_count = enrolments.groupby('cohort_id')['MCode'].count().rename('module_count')
cohort_enrolments_total = enrolments.groupby('cohort_id')['Students3'].sum().rename('enrolments_total')

# From here on `cohorts` is a new frame and no longer aliases
# all_cohort_enrolments. merge defaults to an inner join, so cohorts without
# any enrolment row are dropped.
cohorts = cohorts.merge(cohort_module_count, on ='cohort_id')
cohorts = cohorts.merge(cohort_enrolments_total, on ='cohort_id')
cohorts.sort_values('cohort_id', inplace=True)

# Summed elective enrolments of a cohort divided by the cohort's Students3.
cohorts['mu_modules'] = cohorts.enrolments_total.astype('float')/cohorts.Students3

# module_count of every remaining cohort, in cohort_id order, and its maximum.
MODULES=list(cohorts['module_count'])
maxModules = max(MODULES)
# NOTE(review): repeats the sort already applied to enrolments above.
enrolments.sort_values('module_id', inplace=True)


In [38]:
# Topic code set, read with the pandas 'string' dtype; its 'Abbrev' column
# supplies the labels that le_topics encodes as integer topic ids.
code_set = pd.read_csv(
    os.path.join("curriculum_content", "ACM_2023_CAH_codes.csv"),
    dtype="string",
)
le_topics = preprocessing.LabelEncoder().fit(code_set['Abbrev'])

# Topic coding of the electives; mcode_has_topic looks rows up through the
# 'elective' column and reads one column per topic abbreviation.
elective_topics = pd.read_csv(
    os.path.join("curriculum_content", "coded", "stage2_coded_binary.csv")
)

# This is a bit naughty because two electives in different institutions
# might have the same mcode
def mcode_has_topic(mcode, topic_id):
    """Return the coded value of one topic for the elective with code ``mcode``.

    ``topic_id`` is decoded to a topic abbreviation with ``le_topics``. When
    ``elective_topics`` has no column for that abbreviation the result is 0.
    Otherwise the value stored in that column for the row whose 'elective'
    equals ``mcode`` is returned.

    Raises
    ------
    ValueError
        If ``elective_topics`` holds no row, or more than one row, for
        ``mcode``. The lookup uses the module code alone, so the same code
        used by two institutions ends up here.
    """
    [topic_code] = le_topics.inverse_transform([topic_id])
    if topic_code not in elective_topics.columns:
        return 0
    matches = elective_topics.loc[elective_topics['elective'] == mcode, topic_code].tolist()
    if len(matches) != 1:
        # Same exception type as the former list unpacking, with a message
        # that names the offending module code.
        raise ValueError(
            "expected exactly one row in elective_topics for elective "
            + repr(mcode) + ", found " + str(len(matches))
        )
    return matches[0]

def _cohort_module_row(cohort_num, module_num):
    """Return one enrolment row of a cohort, selected by position.

    The rows of ``enrolments`` whose cohort_id equals ``cohort_num`` are
    ordered by module_id and the row at position ``module_num`` is returned.
    A position outside that selection raises IndexError (from ``iloc``).
    """
    modules = enrolments[enrolments['cohort_id'] == cohort_num].sort_values('module_id')
    return modules.iloc[module_num]


def module_num_has_topic(cohort_num, module_num, topic_id):
    """Return mcode_has_topic for the MCode of a cohort's ``module_num``-th module."""
    mcode = _cohort_module_row(cohort_num, module_num)['MCode']
    return mcode_has_topic(mcode, topic_id)


def module_num_women(cohort_num, module_num):
    """Return the Women3 value of a cohort's ``module_num``-th module."""
    return _cohort_module_row(cohort_num, module_num)['Women3']


def module_num_men(cohort_num, module_num):
    """Return the Men3 value of a cohort's ``module_num``-th module."""
    return _cohort_module_row(cohort_num, module_num)['Men3']

In [39]:
# cohort_module_topic_mapping = np.zeros([len(cohorts), maxModules, len(code_set)])
# cohort_module_women = np.zeros([len(cohorts), maxModules])
# cohort_module_men = np.zeros([len(cohorts), maxModules])


# for cohort_id in range(len(cohorts)):
#     cohort_modules = enrolments[enrolments['cohort_id']==cohort_id].sort_values('module_id')
#     for module_num in range(len(cohort_modules.index)):
#         cohort_module_women[cohort_id,module_num] = module_num_women(cohort_id, module_num)
#         cohort_module_men[cohort_id,module_num] = module_num_men(cohort_id, module_num)
#         for topic_id in range(len(code_set)):
#             print ("cohort", cohort_id, "module_num", module_num, "topic_id", topic_id)
#             has_topic = module_num_has_topic(cohort_id, module_num, topic_id)
#             cohort_module_topic_mapping[cohort_id,module_num,topic_id] = has_topic
            


In [40]:
# print (cohort_module_topic_mapping[0,0,21])

# print (module_num_women(0,0))
# print (cohort_module_men[0,0])

In [20]:
# print(le_topics.inverse_transform(range(len(code_set.index))))