In [1]:
import pandas as pd
import os


In [2]:
# Institution identifiers; each one is also used as the name of that
# institution's sub-directory when enrolment files are loaded.
institutions = [
    "CAU_Germany",
    "Durham_England",
    "Edinburgh_Scotland",
    "Glasgow_Scotland",
    "Kennesaw_USA",
    "UOC_Spain",
    "UofT_Canada",
    "Uppsala_Sweden",
    "VirginiaTech_USA",
]

In [3]:
# data_checks: one row per institution (the index), one column per test.
# check_should_be_false collects the names of the boolean columns that must
# all be False for an institution's data to pass.
data_checks = pd.DataFrame({"institution": institutions}).set_index("institution")
check_should_be_false = []
data_checks

CAU_Germany
Durham_England
Edinburgh_Scotland
Glasgow_Scotland
Kennesaw_USA
UOC_Spain
UofT_Canada
Uppsala_Sweden
VirginiaTech_USA


In [4]:
# Load the combined elective descriptions; every column is read with the
# pandas 'string' dtype.
electives_csv_path = os.path.join("curriculum_content", "all_electives.csv")
all_electives = pd.read_csv(electives_csv_path, dtype='string')
all_electives

Unnamed: 0,institution,elective,overview,title,summary,content,ilo
0,Durham_England,COMP3421,<h2>Durham_England COMP3421</h2><h2>title</h2>...,COMP3421: COMPUTER SCIENCE INTO SCHOOLS,<li>To develop a range of key skills in th...,<li>A competitive interview system will be...,<li>On successful completion of this modu...
1,Durham_England,COMP3477,<h2>Durham_England COMP3477</h2><h2>title</h2>...,COMP3477: ALGORITHMIC GAME THEORY,<li> The aim of the module is to introduce...,<li>Introduction to Game Theory: what is a...,"<li>On completion of the module, students..."
2,Durham_England,COMP3487,<h2>Durham_England COMP3487</h2><h2>title</h2>...,COMP3487: BIOINFORMATICS,<li>To introduce students to applications ...,<li>Dynamic programming algorithms for seq...,"<li>On completion of the module, students..."
3,Durham_England,COMP3467,<h2>Durham_England COMP3467</h2><h2>title</h2>...,COMP3467: ADVANCED COMPUTER SYSTEMS,<li> To provide understanding of the relat...,<li> Computer Architecture including curre...,"<li>On completion of the module, students..."
4,Durham_England,COMP3491,<h2>Durham_England COMP3491</h2><h2>title</h2>...,COMP3491: CODES AND CRYPTOGRAPHY,<li>To give an introduction to the fundame...,<li>Data compression: lossless compression...,"<li>On completion of the module, students..."
...,...,...,...,...,...,...,...
270,VirginiaTech_USA,CS4704,<h2>VirginiaTech_USA CS4704</h2><h2>title</h2>...,Software Engineering Capstone,,Senior project course integrating software eng...,
271,VirginiaTech_USA,CS4784,<h2>VirginiaTech_USA CS4784</h2><h2>title</h2>...,Human-Computer Interaction Capstone,,"Advanced, project-based course in Human-Comput...",
272,VirginiaTech_USA,CS4804,<h2>VirginiaTech_USA CS4804</h2><h2>title</h2>...,Introduction to Artificial Intelligence,,"Overview of the areas of problem solving, game...",
273,VirginiaTech_USA,CS4824,<h2>VirginiaTech_USA CS4824</h2><h2>title</h2>...,Machine Learning,,Algorithms and principles involved in machine ...,


In [5]:
# Number of non-missing elective codes recorded for each institution.
all_electives.groupby('institution').elective.count()

institution
CAU_Germany           13
Durham_England        32
Edinburgh_Scotland    45
Glasgow_Scotland      27
Kennesaw_USA          67
UOC_Spain             29
UofT_Canada           12
Uppsala_Sweden        18
VirginiaTech_USA      32
Name: elective, dtype: int64

In [6]:
# Curriculum-data check: count the electives described for each institution.
# An institution that does not appear in all_electives gets a count of 0 and
# is flagged as missing curriculum data.

electives_per_institution = all_electives.groupby('institution')['elective'].count()
data_checks['elective_curriculum_count'] = electives_per_institution
data_checks['elective_curriculum_count'] = data_checks['elective_curriculum_count'].fillna(0)
data_checks['missing_curriculum_data'] = data_checks['elective_curriculum_count'].le(0)
check_should_be_false.append('missing_curriculum_data')
data_checks

Unnamed: 0_level_0,elective_curriculum_count,missing_curriculum_data
institution,Unnamed: 1_level_1,Unnamed: 2_level_1
CAU_Germany,13,False
Durham_England,32,False
Edinburgh_Scotland,45,False
Glasgow_Scotland,27,False
Kennesaw_USA,67,False
UOC_Spain,29,False
UofT_Canada,12,False
Uppsala_Sweden,18,False
VirginiaTech_USA,32,False


In [7]:
# Check if there is enrolment data
#
# For every institution, try to load its elective enrolment file and its cohort
# enrolment file, record in data_checks whether each file exists with exactly
# the expected columns, and combine whatever was loaded into all_enrolments and
# all_cohort_enrolments.


def _load_enrolment_csv(institution, filename, dtypes, label, not_found_label):
    """Read one per-institution enrolment CSV and validate its column layout.

    Parameters:
        institution: institution identifier; also the sub-directory of
            "enrolment" that holds the file.
        filename: name of the CSV file inside that sub-directory.
        dtypes: mapping of expected column name -> dtype, passed to
            pd.read_csv; its key order is the expected column order.
        label: name used in the "Wrong columns for ..." message.
        not_found_label: prefix used in the "... file Not Found for" message.

    Returns:
        (frame, columns_ok): frame is the loaded DataFrame with its
        'institution' column set to `institution`, or None when the file does
        not exist; columns_ok is True only when the file exists and its
        columns equal list(dtypes) exactly (same names, same order).
    """
    expected_columns = list(dtypes.keys())
    try:
        frame = pd.read_csv(os.path.join("enrolment", institution, filename), dtype=dtypes)
    except FileNotFoundError:
        print(not_found_label + " file Not Found for " + institution)
        return None, False
    frame['institution'] = institution
    actual_columns = frame.columns.tolist()
    if actual_columns == expected_columns:
        return frame, True
    print("Wrong columns for " + label + ": " + institution)
    print("Should be ", expected_columns, " is ", actual_columns)
    return frame, False


data_checks['has_enrolment_data'] = False
data_checks['has_cohort_enrolment_data'] = False
# Cap and MaxElectives are read as 'object'; Cap is converted to float where it is used.
inst_enrolment_dtypes = {"MCode": 'string', "Cohort": 'string', "AcademicYearStart": 'int', "Women3": 'int', "Men3": 'int', "Cap": 'object', "institution": 'string'}
inst_enrolment_columns = list(inst_enrolment_dtypes.keys())
cohort_enrolment_dtypes = {"Cohort": 'string', "AcademicYearStart": 'int', "Women3": 'int', "Men3": 'int', "MaxElectives": 'object', "institution": 'string'}
cohort_enrolment_columns = list(cohort_enrolment_dtypes.keys())
inst_enrolments = []
cohort_enrolments = []
for institution in institutions:
    print("loading " + institution)

    inst_enrolment, inst_columns_ok = _load_enrolment_csv(
        institution, "elective_enrolment_rounded.csv", inst_enrolment_dtypes,
        "elective_enrolment", "enrolment")
    if inst_enrolment is not None:
        # A file with unexpected columns is still kept; it is only not marked as valid.
        inst_enrolments.append(inst_enrolment)
    if inst_columns_ok:
        data_checks.at[institution, 'has_enrolment_data'] = True

    cohort_enrolment, cohort_columns_ok = _load_enrolment_csv(
        institution, "cohort_enrolment_rounded.csv", cohort_enrolment_dtypes,
        "cohort_enrolment", "cohort enrolment")
    if cohort_enrolment is not None:
        cohort_enrolments.append(cohort_enrolment)
    if cohort_columns_ok:
        data_checks.at[institution, 'has_cohort_enrolment_data'] = True

data_checks['missing_enrolment_data'] = ~data_checks.has_enrolment_data
data_checks['missing_cohort_data'] = ~data_checks.has_cohort_enrolment_data

check_should_be_false.append('missing_enrolment_data')
check_should_be_false.append('missing_cohort_data')

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# with the expected columns when no file could be loaded at all; the missing
# data is then reported through data_checks instead of crashing the notebook.
if inst_enrolments:
    all_enrolments = pd.concat(inst_enrolments)
else:
    all_enrolments = pd.DataFrame(columns=inst_enrolment_columns)
all_enrolments['Students3'] = all_enrolments['Women3'] + all_enrolments['Men3']

if cohort_enrolments:
    all_cohort_enrolments = pd.concat(cohort_enrolments)
else:
    all_cohort_enrolments = pd.DataFrame(columns=cohort_enrolment_columns)
# Only the final expression of a cell is rendered, so just this frame is shown.
all_cohort_enrolments

loading CAU_Germany
loading Durham_England
loading Edinburgh_Scotland
loading Glasgow_Scotland
loading Kennesaw_USA
loading UOC_Spain
loading UofT_Canada
loading Uppsala_Sweden
loading VirginiaTech_USA


Unnamed: 0,Cohort,AcademicYearStart,Women3,Men3,MaxElectives,institution
0,3,2020,39,168,,CAU_Germany
1,3,2021,48,204,,CAU_Germany
2,3,2022,39,168,,CAU_Germany
0,3,2020,30,153,10.0,Durham_England
1,3,2021,12,84,10.0,Durham_England
2,3,2022,54,123,10.0,Durham_England
3,4,2020,6,36,6.0,Durham_England
4,4,2021,9,63,6.0,Durham_England
5,4,2022,3,27,6.0,Durham_England
0,3/4/5/,2020,126,399,12.0,Edinburgh_Scotland


In [8]:
# Every module that has enrolment data should also have a curriculum entry.
# After the outer merge, rows whose `elective` is missing are enrolment rows
# for which no matching (institution, elective) description was found.
enrolments_electives_outer = pd.merge(
    all_enrolments,
    all_electives,
    how='outer',
    left_on=['institution', 'MCode'],
    right_on=['institution', 'elective'],
)
has_no_description = enrolments_electives_outer['elective'].isna()
missing_elective_descriptions = enrolments_electives_outer[has_no_description]
missing_description_counts = missing_elective_descriptions.groupby('institution')['MCode'].count()
data_checks['count_missing_elective_descriptions'] = missing_description_counts
data_checks['count_missing_elective_descriptions'] = data_checks['count_missing_elective_descriptions'].fillna(0)
data_checks['missing_elective_descriptions'] = data_checks['count_missing_elective_descriptions'].gt(0)
check_should_be_false.append('missing_elective_descriptions')
missing_elective_descriptions


Unnamed: 0,MCode,Cohort,AcademicYearStart,Women3,Men3,Cap,institution,Students3,elective,overview,title,summary,content,ilo


In [9]:
# Every elective should have at least one of: summary, content or ilo.
# An elective is reported only when all three fields are missing.
detail_columns = ['content', 'ilo', 'summary']
has_no_detail = all_electives[detail_columns].isna().all(axis=1)
missing_elective_detail = all_electives[has_no_detail]
missing_detail_counts = missing_elective_detail.groupby('institution')['elective'].count()
data_checks['missing_elective_detail'] = missing_detail_counts
data_checks['missing_elective_detail'] = data_checks['missing_elective_detail'].fillna(0)
data_checks['incomplete_elective_descriptions'] = data_checks['missing_elective_detail'].gt(0)
check_should_be_false.append('incomplete_elective_descriptions')
missing_elective_detail

Unnamed: 0,institution,elective,overview,title,summary,content,ilo
107,Uppsala_Sweden,1TD403,<h2>Uppsala_Sweden 1TD403</h2><h2>title</h2>Nu...,Numerical Methods and Simulation,,,
112,Uppsala_Sweden,1MD001,<h2>Uppsala_Sweden 1MD001</h2><h2>title</h2>Ad...,Advanced Interaction Design,,,


In [10]:
# No module's total enrolment should exceed its cap.
# A missing Cap is replaced by the module's own total, so modules without a
# cap can never be reported as exceeding it.
module_totals = all_enrolments['Students3']
effective_cap = all_enrolments['Cap'].astype('float').fillna(module_totals)
exceeded_cap = all_enrolments[module_totals > effective_cap]
exceeded_cap_counts = exceeded_cap.groupby('institution')['MCode'].count()
data_checks['exceeded_cap'] = exceeded_cap_counts
data_checks['exceeded_cap'] = data_checks['exceeded_cap'].fillna(0)
data_checks['enrolment_exceeded_cap'] = data_checks['exceeded_cap'].gt(0)
check_should_be_false.append('enrolment_exceeded_cap')
exceeded_cap

Unnamed: 0,MCode,Cohort,AcademicYearStart,Women3,Men3,Cap,institution,Students3


In [11]:
# enrolment data  2020 <= AY <= 2022
# Per institution and academic year: the number of module enrolment rows
# (module_count) and the summed enrolment across those rows (Student3).


enrolment_group = all_enrolments.groupby(['institution', 'AcademicYearStart'])
students_per_group = enrolment_group['Students3']
enrolment_years = students_per_group.agg(module_count='count', Student3='sum')
enrolment_years

Unnamed: 0_level_0,Unnamed: 1_level_0,module_count,Student3
institution,AcademicYearStart,Unnamed: 2_level_1,Unnamed: 3_level_1
CAU_Germany,2022,13,549
Durham_England,2020,24,1485
Durham_England,2021,27,1053
Durham_England,2022,27,1356
Edinburgh_Scotland,2020,11,1053
Edinburgh_Scotland,2021,19,1275
Edinburgh_Scotland,2022,42,2115
Glasgow_Scotland,2020,24,1851
Glasgow_Scotland,2021,24,2319
Glasgow_Scotland,2022,26,2034


In [12]:
# elective enrolment does not exceed cohort enrolment
#
# Each module enrolment row is matched with the cohort enrolment row for the
# same institution, cohort and academic year. Both frames carry Women3 and
# Men3, so after the merge the `_x` columns hold the module's counts and the
# `_y` columns hold the cohort's counts.

enrolments_joint = all_enrolments.merge(all_cohort_enrolments, on=['institution', 'Cohort', 'AcademicYearStart'], how='outer')
# Compare both genders: checking Women3 alone would let a module whose Men3
# count is larger than the cohort's Men3 count pass unnoticed.
# Rows without a matching cohort row compare against NaN and are not flagged.
women_exceed_cohort = enrolments_joint.Women3_x > enrolments_joint.Women3_y
men_exceed_cohort = enrolments_joint.Men3_x > enrolments_joint.Men3_y
electives_enrolment_exceeded_cohort = enrolments_joint[women_exceed_cohort | men_exceed_cohort]
data_checks['enrolment_exceeded_cohort'] = electives_enrolment_exceeded_cohort.groupby('institution')['MCode'].count()
data_checks['enrolment_exceeded_cohort'] = data_checks['enrolment_exceeded_cohort'].fillna(0)
data_checks['enrolment_exceeds_cohort'] = data_checks.enrolment_exceeded_cohort > 0
check_should_be_false.append('enrolment_exceeds_cohort')

In [13]:
# all enrolment data should be integer, >=0 and multiple of 3
# graph of proportion of module enrolments by gender by institution
# count of distinct cohorts per institution: check against text description
# all curriculum modules should have enrolment data?
# cohortdescription exists
# cohortenrolment exists

In [14]:
# Summary: one boolean column per registered check; True marks a failed check.
data_checks.loc[:, check_should_be_false]

Unnamed: 0_level_0,missing_curriculum_data,missing_enrolment_data,missing_cohort_data,missing_elective_descriptions,incomplete_elective_descriptions,enrolment_exceeded_cap,enrolment_exceeds_cohort
institution,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
CAU_Germany,False,False,False,False,False,False,False
Durham_England,False,False,False,False,False,False,False
Edinburgh_Scotland,False,False,False,False,False,False,False
Glasgow_Scotland,False,False,False,False,False,False,False
Kennesaw_USA,False,False,False,False,False,False,False
UOC_Spain,False,False,False,False,False,False,False
UofT_Canada,False,False,False,False,False,False,False
Uppsala_Sweden,False,False,False,False,True,False,True
VirginiaTech_USA,False,False,False,False,False,False,False
