In [1]:
import json
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
import numpy as np
import numpy.linalg as la

In [44]:
# Load the course catalogue into `catalog`.
# Later cells treat it as a list of dicts and read the 'course_name' and
# 'description' keys of each entry.
# Parsing straight from the file handle avoids leaving a throwaway `data`
# string in the notebook namespace that the next cell would silently rebind.
with open('Catalogue/Catalogue_course.json') as f:
    catalog = json.load(f)

In [3]:
# Load the merged MOOC records into `mooc`.
# Later cells read the 'name', 'description' and 'id' keys of each record.
# Parsing straight from the file handle avoids rebinding a shared throwaway
# `data` string in the notebook namespace.
with open('All_Mooc/Mooc_merge.json') as f:
    mooc = json.load(f)

In [20]:
# Drop MOOC records that have no 'name' field; the title corpus built later
# indexes every remaining record by that key.
mooc = [record for record in mooc if 'name' in record]

In [23]:
# Build the two text corpora. Catalogue entries always come first and MOOC
# entries follow, so index k refers to a catalogue course when
# k < len(catalog) and to MOOC number k - len(catalog) otherwise.
titles = [course['course_name'] for course in catalog]
titles.extend(course['name'] for course in mooc)

descriptions = [entry['description'] for entry in catalog + mooc]

In [24]:
# Remove noisy course codes such as 'BUAD 280 ' from the titles.
# `titles` holds the catalogue titles followed by the MOOC titles, and the
# whole combined list is cleaned here.
# Pattern: two or more capitals, a space, two or more digits, an optional
# letter, then a trailing space.
course_code = re.compile('[A-Z][A-Z]+ [0-9][0-9]+[a-zA-Z]? ')
# Slice assignment keeps the same list object, as the original in-place loop did.
titles[:] = [course_code.sub('', title) for title in titles]

In [25]:
# Turn both corpora into dense TF-IDF matrices (one row per text, in the same
# order as `titles` / `descriptions`).
stopWords = stopwords.words('english')

# Token counts with English stop words removed, then TF-IDF weighting.
# The same two objects are refitted for each corpus, so after this cell they
# hold the state fitted on the descriptions.
vectorizer = CountVectorizer(stop_words = stopWords)
transformer = TfidfTransformer()

# --- titles ---
Vectorizer_titles = vectorizer.fit_transform(titles).toarray()
matrix_titles = transformer.fit_transform(Vectorizer_titles).toarray()

# --- descriptions ---
Vectorizer_descriptions = vectorizer.fit_transform(descriptions).toarray()
matrix_descriptions = transformer.fit_transform(Vectorizer_descriptions).toarray()

In [45]:
# Cosine similarity between every catalogue course and every MOOC.
def cx(a, b):
    """Return the cosine similarity of vectors `a` and `b`, rounded to 3 decimals.

    An all-zero vector (a text for which the vectorizer kept no terms) has no
    direction, so the pair is scored 0.0 instead of dividing by zero and
    propagating NaN into the combined score.
    """
    denom = la.norm(a) * la.norm(b)
    if denom == 0:
        return 0.0
    return round(np.inner(a, b) / denom, 3)


# MOOC rows sit directly after the catalogue rows in both matrices, because
# `titles` and `descriptions` were built as catalogue entries + MOOC entries.
# Deriving the offset from the data keeps it right when the catalogue changes.
mooc_offset = len(catalog)

# Fail loudly if the matrices were built from differently sized lists
# (e.g. cells executed out of order) instead of pairing the wrong rows.
assert matrix_titles.shape[0] == mooc_offset + len(mooc), \
    "matrix_titles rows do not match len(catalog) + len(mooc)"
assert matrix_descriptions.shape[0] == mooc_offset + len(mooc), \
    "matrix_descriptions rows do not match len(catalog) + len(mooc)"

for i in range(len(catalog)):
    sims = []
    for j in range(len(mooc)):
        cos_titles = cx(matrix_titles[i], matrix_titles[mooc_offset + j])
        cos_descriptions = cx(matrix_descriptions[i], matrix_descriptions[mooc_offset + j])
        # Equal-weight blend of description and title similarity.
        score = 0.5*cos_descriptions + 0.5*cos_titles
        # Only keep pairs whose blended score clears the 0.3 threshold.
        if score > 0.3:
            sims.append((mooc[j]['id'], score))
    # Best matches first; the sort is stable, so ties keep MOOC order.
    sims.sort(key=lambda tup: tup[1],reverse=True)
    print(sims[:10])
    # Store at most the ten best (mooc id, score) pairs on the catalogue entry.
    catalog[i]['similarity'] = sims[:10]

  


[('csr00919', 0.64700000000000002), ('csr01036', 0.60750000000000004), ('csr01893', 0.48599999999999999), ('csr00310', 0.32300000000000001), ('csr00881', 0.32300000000000001), ('csr01870', 0.32300000000000001), ('csr00964', 0.30099999999999999), ('csr02533', 0.30099999999999999)]
[('csr00919', 0.34849999999999998), ('csr01036', 0.31599999999999995), ('csr01886', 0.313), ('csr01887', 0.3075)]
[]
[('csr02516', 0.45600000000000002), ('csr01815', 0.42399999999999999), ('csr01800', 0.38750000000000001)]
[('csr01840', 0.40150000000000002), ('csr00964', 0.34899999999999998), ('csr02533', 0.34899999999999998), ('csr01038', 0.32950000000000002)]
[('udc00198', 0.48349999999999999), ('csr01049', 0.42100000000000004), ('csr01825', 0.36649999999999999), ('csr01831', 0.33999999999999997), ('csr01891', 0.33699999999999997), ('csr01035', 0.31900000000000001), ('csr01834', 0.3125)]
[('csr01601', 0.3725), ('csr00706', 0.35500000000000004), ('csr00837', 0.33200000000000002), ('csr00693', 0.31599999999999

[]
[]
[]
[('csr00540', 0.36200000000000004), ('csr02129', 0.36200000000000004), ('csr00645', 0.34000000000000002), ('csr01168', 0.34000000000000002), ('csr01445', 0.34000000000000002), ('csr01581', 0.34000000000000002), ('csr02261', 0.34000000000000002)]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[('udc00135', 0.3125)]
[]
[]
[]
[('csr02557', 0.35250000000000004), ('csr02558', 0.35250000000000004), ('csr02556', 0.34450000000000003)]
[]
[]
[]
[]
[('csr00021', 0.32000000000000001), ('csr00281', 0.32000000000000001), ('udc00047', 0.30249999999999999), ('csr01554', 0.30199999999999999)]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[('udc00045', 0.54500000000000004), ('khn00246', 0.46599999999999997), ('khn00276', 0.46599999999999997), ('khn00286', 0.46599999999999997), ('khn00296', 0.46599999999999997), ('khn00256', 0.42949999999999999), ('khn00266', 0.42949999999999999), ('khn00306', 0.42949999999999999), ('khn00316', 0.42

[('csr00565', 0.39149999999999996), ('csr01223', 0.39149999999999996), ('csr01606', 0.39149999999999996), ('csr01969', 0.39149999999999996), ('csr01969', 0.39149999999999996), ('csr02057', 0.39149999999999996), ('csr02410', 0.39149999999999996), ('csr00588', 0.38300000000000001), ('csr01939', 0.38300000000000001)]
[]
[]
[('csr00392', 0.30100000000000005)]
[('udc00136', 0.4375), ('csr01998', 0.39200000000000002), ('csr01998', 0.39200000000000002), ('udc00137', 0.38100000000000001), ('csr02430', 0.35549999999999998), ('csr00549', 0.34200000000000003), ('csr02136', 0.34200000000000003), ('csr01633', 0.33300000000000002), ('csr00543', 0.33100000000000002), ('csr02033', 0.33100000000000002)]
[]
[]
[]
[('csr02289', 0.46100000000000002), ('csr02265', 0.39700000000000002), ('csr02267', 0.3775), ('csr00333', 0.37), ('csr02270', 0.36549999999999999), ('csr02268', 0.35449999999999998), ('csr00328', 0.34499999999999997), ('csr02269', 0.34300000000000003), ('udc00138', 0.318), ('udc00009', 0.3115)]

[]
[('csr00861', 0.35949999999999999), ('csr00884', 0.35949999999999999), ('csr02017', 0.30599999999999999)]
[]
[]
[]
[]
[]
[('csr01336', 0.31)]
[('csr01336', 0.30349999999999999)]
[]
[]
[]
[]
[]
[('csr00672', 0.33099999999999996), ('csr00770', 0.33099999999999996)]
[]
[('csr01782', 0.316)]
[]
[('csr01336', 0.3075)]
[]
[]
[]
[]
[]
[('csr02379', 0.34250000000000003)]
[]
[]
[('csr02379', 0.30649999999999999)]
[]
[]
[]
[]
[]
[('csr01115', 0.32150000000000001)]
[]
[]
[]
[]
[('csr01378', 0.33100000000000002)]
[('csr00848', 0.44450000000000001), ('csr01567', 0.44450000000000001), ('csr01059', 0.31850000000000001), ('csr01662', 0.31850000000000001), ('csr01736', 0.3085), ('csr01115', 0.30299999999999999)]
[('csr01115', 0.32150000000000001)]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[('csr00990', 0.32350000000000001)]
[]
[]
[]
[]
[]
[]
[('csr01520', 0.309)]
[('csr01114', 0.34550000000000003), ('csr01118', 0.32200000000000001), ('csr01115', 0.32100000000000001)]

[]
[('csr01115', 0.32150000000000001)]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[('csr01339', 0.30349999999999999)]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[('csr00992', 0.34250000000000003), ('csr01517', 0.33250000000000002), ('csr01500', 0.32550000000000001), ('csr01502', 0.3125)]
[]
[('csr01500', 0.31950000000000001), ('csr01517', 0.3135)]
[]
[('khn01262', 0.40999999999999998), ('khn01272', 0.40999999999999998), ('khn01282', 0.40999999999999998), ('khn01292', 0.40999999999999998), ('khn01302', 0.40999999999999998), ('khn01312', 0.40999999999999998), ('khn01322', 0.40999999999999998)]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[('khn00245', 0.311), ('khn00275', 0.311), ('khn00285', 0.311), ('khn00295', 0.311), ('khn00255', 0.309), ('khn00265', 0.309), ('khn00305', 0.309), ('khn00315', 0.309)]
[]
[('khn00328', 0.30049999999999999), ('khn00338', 0.30049999999999999), ('khn00347', 0.30049999999999999), ('khn00357', 0.30049999999999999), ('khn00367', 0.30049999999999999), ('khn00377', 0.30049999999999999), ('

[('csr01489', 0.44700000000000001), ('udc00171', 0.3695)]
[]
[]
[]
[]
[]
[]
[]
[('csr00296', 0.33200000000000002), ('csr01181', 0.31850000000000001)]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[('csr01175', 0.30449999999999999), ('khn01112', 0.30299999999999999), ('khn01122', 0.30299999999999999), ('khn01132', 0.30299999999999999), ('khn01152', 0.30299999999999999)]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[('csr01787', 0.35399999999999998)]
[]
[('csr02557', 0.33850000000000002), ('csr02558', 0.33850000000000002), ('csr02556', 0.33400000000000002)]
[]
[]
[('csr02557', 0.33450000000000002), ('csr02558', 0.33450000000000002), ('csr02556', 0.33050000000000002)]
[]
[]
[]
[]
[('csr02557', 0.39900000000000002), ('csr02558', 0.39900000000000002), ('csr02556', 0.39100000000000001), ('csr02559', 0.313)]
[('csr02557', 0.311), ('csr02558', 0.311), ('csr02556', 0.30649999999999999)]
[]
[]
[]
[]
[('csr01064', 0.32150000000000001), ('khn00328'

[]
[]
[('csr01877', 0.5), ('csr02342', 0.5), ('csr01839', 0.35199999999999998), ('csr02209', 0.35150000000000003)]
[('csr01877', 0.41249999999999998), ('csr02342', 0.41249999999999998), ('csr02209', 0.33349999999999996), ('csr01831', 0.314), ('csr01839', 0.311), ('csr01891', 0.30349999999999999)]
[('csr01877', 0.5), ('csr02342', 0.5), ('csr02209', 0.38400000000000001), ('csr01839', 0.35199999999999998)]
[('csr01877', 0.41249999999999998), ('csr02342', 0.41249999999999998), ('csr01831', 0.30349999999999999), ('csr01891', 0.30349999999999999)]
[('csr00963', 0.308), ('csr02532', 0.308)]
[('csr00868', 0.33799999999999997), ('csr02233', 0.33799999999999997), ('csr00872', 0.33200000000000002), ('csr02237', 0.33200000000000002), ('csr01990', 0.32350000000000001), ('csr01990', 0.32350000000000001), ('csr01963', 0.31950000000000001), ('csr01963', 0.31950000000000001), ('csr02250', 0.30349999999999999)]
[('csr00408', 0.41800000000000004), ('csr01907', 0.41800000000000004), ('udc00183', 0.4040000

[]
[('csr01888', 0.59399999999999997), ('csr01039', 0.48099999999999998), ('csr00308', 0.33550000000000002), ('csr00879', 0.33550000000000002), ('csr01868', 0.33550000000000002), ('csr00465', 0.32750000000000001)]
[]
[]
[]
[]
[]
[]
[('csr02002', 0.51500000000000001), ('csr02002', 0.51500000000000001), ('csr01399', 0.48550000000000004), ('csr01432', 0.46500000000000002), ('csr01420', 0.41100000000000003), ('csr00854', 0.39950000000000002), ('csr01424', 0.38750000000000001), ('csr01417', 0.36849999999999999), ('csr01381', 0.36649999999999999), ('csr01980', 0.36649999999999999)]
[('csr00910', 0.30250000000000005)]
[('csr00676', 0.32250000000000001), ('csr00774', 0.32250000000000001), ('csr02099', 0.32000000000000001), ('csr00677', 0.30099999999999999), ('csr00775', 0.30099999999999999)]
[]
[('csr01392', 0.30049999999999999)]
[]
[]
[]
[('csr02099', 0.375), ('csr00706', 0.32399999999999995)]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[('csr01518', 0.35349999999999998), ('csr01505', 0.3230000000000

[]
[]
[]
[]
[('csr00172', 0.47949999999999998), ('csr02028', 0.47949999999999998), ('csr00109', 0.45550000000000002), ('csr00959', 0.45550000000000002), ('udc00161', 0.42099999999999999), ('csr00030', 0.40150000000000002), ('csr01315', 0.40150000000000002), ('csr01534', 0.40150000000000002), ('csr00041', 0.39000000000000001), ('csr00397', 0.39000000000000001)]
[('csr00026', 0.4325), ('csr00286', 0.4325), ('csr00133', 0.33200000000000002), ('csr00249', 0.33200000000000002), ('csr02393', 0.32450000000000001), ('csr00107', 0.30599999999999999), ('csr01555', 0.30599999999999999), ('csr00024', 0.30249999999999999), ('csr00284', 0.30249999999999999)]
[('csr00091', 0.35249999999999998), ('csr00021', 0.34200000000000003), ('csr00281', 0.34200000000000003), ('csr02571', 0.33399999999999996), ('csr01554', 0.33150000000000002), ('csr00865', 0.316), ('csr00888', 0.316), ('csr00115', 0.314), ('csr00402', 0.314), ('csr02037', 0.314)]
[]
[]
[('csr02097', 0.32500000000000001), ('csr02311', 0.325000000

[('csr00874', 0.37949999999999995), ('csr00864', 0.378), ('csr00887', 0.378)]
[]
[('csr00392', 0.4995)]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[('csr00032', 0.33099999999999996), ('csr01317', 0.33099999999999996), ('csr01536', 0.33099999999999996), ('csr01964', 0.32000000000000001), ('csr01964', 0.32000000000000001), ('csr01077', 0.317), ('csr01688', 0.3075), ('csr01723', 0.3075)]
[('csr00266', 0.41349999999999998), ('csr01437', 0.41349999999999998), ('csr00032', 0.40399999999999997), ('csr01317', 0.40399999999999997), ('csr01536', 0.40399999999999997), ('csr01688', 0.40300000000000002), ('csr01723', 0.40300000000000002), ('csr01077', 0.40250000000000002), ('csr00843', 0.39399999999999996), ('csr01057', 0.39399999999999996)]
[('csr01506', 0.35149999999999998), ('csr01505', 0.33949999999999997), ('csr01507', 0.31)]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[('khn00255', 0.35000000000000003), ('khn00265', 0.35000000000000003), ('khn00305', 0.35000000000000003), ('khn00315', 0

[('csr00320', 0.42299999999999999), ('csr00660', 0.40149999999999997), ('csr00316', 0.3795), ('csr00318', 0.35049999999999998), ('csr00319', 0.32400000000000001)]
[('csr00320', 0.76400000000000001), ('csr00660', 0.73150000000000004), ('csr00318', 0.61549999999999994), ('csr00319', 0.57499999999999996), ('csr00316', 0.47099999999999997), ('csr01892', 0.41849999999999998), ('csr00317', 0.38650000000000001), ('csr00458', 0.35199999999999998)]
[]
[('csr00672', 0.34650000000000003), ('csr00770', 0.34650000000000003)]
[('csr00319', 0.35050000000000003)]
[('csr00316', 0.42799999999999994), ('csr00457', 0.34399999999999997)]
[]
[('csr00965', 0.62149999999999994), ('csr02534', 0.62149999999999994), ('csr00300', 0.48599999999999999), ('csr01498', 0.48599999999999999), ('csr01638', 0.40400000000000003), ('csr00312', 0.38300000000000001), ('csr02137', 0.35250000000000004), ('csr01620', 0.34000000000000002), ('csr00481', 0.32550000000000001), ('csr01386', 0.32199999999999995)]
[]
[('csr00313', 0.31

[]
[]
[]
[]
[]
[]
[('csr02209', 0.39599999999999996), ('csr01800', 0.38400000000000001)]
[]
[]
[]
[]
[]
[]
[]
[]
[('csr01634', 0.33150000000000002)]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[('csr00763', 0.30149999999999999), ('csr00827', 0.30149999999999999), ('csr01355', 0.30149999999999999)]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[('csr01835', 0.30499999999999999)]
[]
[]
[]
[]
[('csr00874', 0.46500000000000002), ('csr01115', 0.39400000000000002), ('csr01059', 0.34050000000000002), ('csr01662', 0.34050000000000002), ('csr01058', 0.30099999999999999), ('csr01661', 0.30099999999999999)]
[]
[]
[]
[]
[('udc00117', 0.32050000000000001)]
[]
[]
[]
[]
[]
[('csr00868', 0.39500000000000002), ('csr02233', 0.39500000000000002), ('csr01689', 0.36850000000000005), ('csr02210', 0.36850000000000005), ('csr00554', 0.31)]
[('khn00877', 0.35949999999999999), ('khn00897', 0.35949999999999999), ('khn00927', 0.35949999999999999), ('khn0

[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[('csr00898', 0.32100000000000001), ('csr01320', 0.32100000000000001), ('csr01467', 0.32100000000000001), ('csr02353', 0.32100000000000001)]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[('csr01063', 0.30149999999999999)]
[]
[]
[]
[]
[]
[('csr01798', 0.313), ('csr01829', 0.307)]
[]
[('csr01063', 0.309)]
[]
[]
[]
[]
[('csr00674', 0.30049999999999999), ('csr00772', 0.30049999999999999)]
[('csr01045', 0.34099999999999997), ('csr02445', 0.34099999999999997), ('csr01830', 0.30349999999999999)]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[('csr01830', 0.35350000000000004), ('csr01045', 0.30349999999999999), ('csr02445', 0.30349999999999999)]
[]
[]
[]
[]
[]
[]
[]
[('csr02002', 0.3145), ('csr02002', 0.3145)]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[('csr00717', 0.51849999999999996), ('csr00972', 0.51849999999999996), ('csr00993', 0.48950000000000005), ('csr00718', 0.3785), ('csr00973', 0.3785), ('csr00715', 0.37), ('csr00970', 0.37), ('csr00999', 0.36649999999999999), ('csr00713', 

In [46]:
# Preview the first enriched records only; echoing the whole catalogue
# floods the cell output and bloats the saved notebook.
catalog[:2]

[{'corequisite': [],
  'course_id': 'BUAD 280',
  'course_name': 'BUAD 280 Introduction to Financial Accounting',
  'crosslist': [],
  'description': 'Emphasis on understanding financial statements and the impact of business transactions on information presented to management and interested stakeholders .',
  'duplicate': ['BUAD 250',
   'BUAD 250',
   'BUAD 285',
   'BUAD 286',
   'BUAD 286',
   'BUAD 305'],
  'preparation': [],
  'prerequisite': [],
  'similarity': [('csr00919', 0.64700000000000002),
   ('csr01036', 0.60750000000000004),
   ('csr01893', 0.48599999999999999),
   ('csr00310', 0.32300000000000001),
   ('csr00881', 0.32300000000000001),
   ('csr01870', 0.32300000000000001),
   ('csr00964', 0.30099999999999999),
   ('csr02533', 0.30099999999999999)]},
 {'corequisite': [],
  'course_id': 'BUAD 281',
  'course_name': 'BUAD 281 Introduction to Managerial Accounting',
  'crosslist': [],
  'description': 'An emphasis on how accounting information , combined with a variety of t

In [47]:
# Write the catalogue (each entry now carries its 'similarity' list) to disk
# as JSON indented by four spaces.
with open('Catalogue_sim.json', 'w') as out_file:
    json.dump(catalog, out_file, indent=4)