In [None]:
import pandas as pd
import numpy as np

In [84]:
isa = pd.read_csv('./data/ISA_romain/subject_coursecode_mapping_2015_2016.csv', sep='|')
isa_courses = set(isa.CourseCode.values)
isa.head()

Unnamed: 0,SubjectName,SubjectID,CourseCode
0,Histoire de l'architecture I,1773962,AR-123
1,Stéréotomie,1769400055,AR-211
2,Introduction aux outils CAO en architecture,1888400347,AR-219
3,Théorie de l'architecture III,1776787,AR-221
4,Histoire de l'architecture III,1773972,AR-223


In [85]:
max_courses = pd.read_csv('./data/Max/course_list.csv')
max_courses.head()

Unnamed: 0,Code
0,AR-101
1,AR-102
2,AR-111
3,AR-112
4,AR-121


In [86]:
# Note that this contains each course twice, once in english and once in french
max_courses_full = pd.read_csv('./data/Max/courses.csv')[['Code', 'Title']]
max_courses_full.head()

Unnamed: 0,Code,Title
0,AR-101,Studio BA1
1,AR-101,Théorie et critique du projet BA1
2,AR-102,Studio BA2
3,AR-102,Théorie et critique du projet BA2
4,AR-111,Figuration graphique I


In [87]:
# Baseline edges scraped by Max
baseline_edges = pd.read_csv('./data/Max/edges/pre_obl.csv')[['Source', 'Target']]
baseline_edges.head()

Unnamed: 0,Source,Target
0,AR-112,AR-111
1,AR-211,MATH-121
2,AR-219,MATH-126
3,AR-231,CIVIL-122
4,AR-239,CIVIL-122


In [88]:
# How many baseline edges would we loose with courses from ISA? Here we try to match the courses by course codes
n_drops = 0
for row in baseline_edges.values:
    if(row[0] not in isa_courses or row[1] not in isa_courses):
        n_drops += 1
print('We would have to drop ' + str(n_drops) +  '/' + str(baseline_edges.size) + " baseline edges with ISA courses 15-16")

We would have to drop 161/452 baseline edges with ISA courses 15-16


In [89]:
# let's try to join the courses' titles from ISA with the titles from Max:
# This is not better...
remaining_courses = set()
dropped_courses = dict()
for row in max_courses_full.values:
    title_max = row[1]
    code_max = row[0]
    if title_max in set(isa.SubjectName.values):
        remaining_courses.add(code_max)
    else:
        dropped_courses[code_max]= title_max
print('Remaining courses from Max: ' + str(len(remaining_courses)))

# let's check how many baseline edges we would have to drop with this course set
n_drops = 0
for row in baseline_edges.values:
    if(row[0] not in remaining_courses or row[1] not in remaining_courses):
        n_drops += 1
print('Edges to drop: ' + str(n_drops) + '/' + str(baseline_edges.size))

Remaining courses from Max: 733
Edges to drop: 163/452


# Match Max's data with Kshitij's data

The problem here is that Max's data was scraped from the web, and thus the courses do not have a subject ID. Also, the data was scraped in 2017, but the data from ISA that Kshitij used only goes until 2016.
Therefore, the goal is to match the data by course code and/or course name as best as possible, considering that course codes/names can change over the years. Since this might create inconsistencies, I decided to process as follow: 
- The baseline (Max's scraped data) should stay true no matter what
- Add as much edges from Kshitij's data as you can. It's fine if some are missing
- It's okay if Kshitij's data has mistakes/inconsistencies, but try to avoid them as much as possible

### Data Exploration: Kshitij's data

In [90]:
courses_ksh = pd.read_csv('./data/Kshitij_new/isa-subject-id-name-code-hash.csv')
courses_ksh.head()

Unnamed: 0,SubjectID,SubjectName,AssociatedCourseCodes
0,1179233,"Analyse I, II (allemand)",
1,1179240,Algèbre linéaire I,MATH-110; MATH-110(b); MATH-112; MATH-112(a); ...
2,1179245,Analyse numérique,MATH-150; MATH-250; MATH-251(a); MATH-251(b); ...
3,1179253,Plan BA/MA,
4,1186202,Algèbre linéaire II,MATH-115; MATH-117; MATH-117(a); MATH-117(b)


In [91]:
print("There are " + str(courses_ksh.AssociatedCourseCodes.isnull().sum()) + "/" + str(len(courses_ksh)) + " courses with no codes")

There are 3278/6365 courses with no codes


In [73]:
edges_ksh = pd.read_csv('./data/Kshitij_new/correlation-subject-pair.csv')[['sub1', 'sub2', 'cor1', 'pval1', 'cor2', 'pval2', 'sameSemester']]
edges_ksh.head()

Unnamed: 0,sub1,sub2,cor1,pval1,cor2,pval2,sameSemester
0,1179240,1179245,0.479456,1.053786e-71,0.578663,1.057758e-13,0
1,1179240,1186202,0.638191,2.904612e-168,0.562914,1.9252170000000002e-17,2
2,1179240,1705532,0.491199,3.31385e-08,0.534409,7.964026e-13,594
3,1179240,1705538,0.601198,1.168207e-63,0.611785,6.559217e-15,0
4,1179240,1705590,0.488534,1.368027e-23,0.44471,6.189562e-12,1186


In [94]:
# Let's see how many edges we would have to drop if we drop all courses with no courseCode
courses_ksh_no_null = set(courses_ksh.dropna(subset=['AssociatedCourseCodes']).SubjectID.values)
#courses_ksh_no_null = courses_ksh[pd.notnull(courses_ksh['AssociatedCourseCodes'])]
print(len(courses_ksh_no_null))
n_drops_edges = 0
for row in edges_ksh.values:
    if row[0] not in courses_ksh_no_null or row[1] not in courses_ksh_no_null:
        n_drops_edges += 1
print("We would drop " + str(n_drops_edges) + "/" + str(len(edges_ksh)) + " edges")

3087
We would drop 0/5828 edges


It seems there is no edge from/to courses that have no course code. Thus, we can discard all the courses that have no code. \o/

In [76]:
# Drop the data with no course code
courses_ksh.dropna(subset=['AssociatedCourseCodes'], inplace=True)

### Matching the courses
The idea here is to try and assign each scraped course a subjectID, then filter k's edges and keep only those with two endpoits in the scraped courses


In [77]:
courses_ksh_join = courses_ksh
courses_ksh_join.index = courses_ksh.SubjectName
courses_ksh_join
join_courses = max_courses_full.join(courses_ksh, on='Title')
join_courses = join_courses[pd.notnull(join_courses['SubjectName'])]
join_courses.drop(columns=['SubjectName'], inplace=True)
join_courses.head()

Unnamed: 0,Code,Title,SubjectID,AssociatedCourseCodes
9,AR-121,Théorie de l'architecture I,1776777.0,AR-121
11,AR-122,Théorie de l'architecture II,1776782.0,AR-122
12,AR-123,Histoire de l'architecture I,1773962.0,AR-123
14,AR-124,Histoire de l'architecture II,1773967.0,AR-124
17,AR-131,Construction et durabilité I,837374049.0,AR-131


In [78]:
print(max_courses_full.shape)
print(len(join_courses.Code.unique()))

(3078, 2)
1446


In [79]:
join_courses[join_courses.AssociatedCourseCodes == join_courses.Code].shape

(1774, 4)

In [80]:
join_courses.shape

(2144, 4)

In [81]:
print(len(join_courses.Code.unique()))
print(len(max_courses_full.Code.unique()))

1446
1546


In [115]:
# Number of baseline edges we would drop after removing the courses from Max that are not in Kshitij's courses (i.e. after the join)
count = 0
for row in baseline_edges.values:
    if row[0] not in join_courses.Code.values or row[1] not in join_courses.Code.values:
        count += 1
print(str(count) + "/" + str(baseline_edges.shape[0]))

14/226


In [117]:
# Number of k's edges we would keep
count = 0
for row in edges_ksh.values:
    if row[0] in join_courses.SubjectID.values and row[1] in join_courses.values:
        count += 1
print(str(count) + "/" + str(edges_ksh.shape[0]))

3276/5828


In [138]:
thres = 0.5
edges_ksh[(edges_ksh.cor1 >= thres) | (edges_ksh.cor1 <= -thres) | (edges_ksh.cor2 >= thres) | (edges_ksh.cor2 <= -thres)].shape[0]

891