Get personalized recommendations for universities offering programs similar to your chosen course. These suggestions are based on the course subjects, syllabus, and program content, helping you find the best option for your education and career goals.

In [1]:
import pandas as pd
import numpy as np 

In [2]:
import json 

In [3]:

json_path = '../datasets/program_details.json'

with open(json_path, 'r', encoding="utf8") as file:
    program_details = json.load(file)

valid_program_details = [
    entry for entry in program_details if entry is not None and isinstance(entry, dict)
]

pdf = pd.DataFrame(valid_program_details)




In [4]:
pdf

Unnamed: 0,id,Description/content,Course Organisation
0,8305,The Master's in Computer Science offers a comp...,"In the Master's programme, the schedule can be..."
1,4439,Combining the breadth of a traditional course ...,"In the first semester, the module theoretical ..."
2,4870,The Master's in Applied Computer Science progr...,Three semesters:Semesters one and two: theory ...
3,5616,Based on their undergraduate experience the st...,"The programme is organised in modules, with an..."
4,4455,Computer Science is one of the drivers of tech...,You will choose modules from the following fiv...
...,...,...,...
137,6262,The MSc Cognitive Systems programme is a two-y...,The study programme starts off with lectures t...
138,4407,Medical technology is one of the main research...,Besides compulsory and elective modules with a...
139,9595,New digital technologies provide companies wit...,The MSc in Information Engineering programme c...
140,3724,Environmental information technologies such as...,The first semester takes place at the Eberswal...


In [5]:
pdf.isnull().sum()

id                     0
Description/content    0
Course Organisation    0
dtype: int64

In [6]:
pdf.duplicated().sum()

0

# Merge both 


In [7]:
pdf['tags'] = pdf['Course Organisation'] + pdf['Description/content']

In [8]:
pdf.head()

Unnamed: 0,id,Description/content,Course Organisation,tags
0,8305,The Master's in Computer Science offers a comp...,"In the Master's programme, the schedule can be...","In the Master's programme, the schedule can be..."
1,4439,Combining the breadth of a traditional course ...,"In the first semester, the module theoretical ...","In the first semester, the module theoretical ..."
2,4870,The Master's in Applied Computer Science progr...,Three semesters:Semesters one and two: theory ...,Three semesters:Semesters one and two: theory ...
3,5616,Based on their undergraduate experience the st...,"The programme is organised in modules, with an...","The programme is organised in modules, with an..."
4,4455,Computer Science is one of the drivers of tech...,You will choose modules from the following fiv...,You will choose modules from the following fiv...


In [9]:
pdf['tags'][0]

'In the Master\'s programme, the schedule can be arranged flexibly. Students individually choose their areas of specialisation from a wide variety of subjects.Mandatory Modules:Computer ScienceComputer Science LabResearch Methods & EthicsCritical Reading and DiscussionElective Modules: Professional SkillsChoose from:Design ThinkingEntrepreneurship and InnovationLaw and ComplianceManagement and LeadershipTechnology Communication and TransferFocus AreasData and AIAlgorithms and FoundationsSystemsDigital HealthSecurity EngineeringOpen TrackMaster\'s thesisThe Master\'s in Computer Science offers a comprehensive and challenging education. The course content is tailor-made for students who want to take their IT expertise to a new level.Students can choose from six different tracks. This\xa0track structure allows our students to discover the diversity of computer science while specialising in the area that excites them the most. The tracks are based on our interdisciplinary research clusters

In [10]:
df = pd.read_csv("../datasets/MergedData.csv")
df
df.rename(columns={'Id':'id'}, inplace=True)

In [11]:
final_df = pd.merge(pdf,df, on = 'id')

In [12]:
Basic_Df = final_df[['id','tags','CourseNameShort','Academy']]

In [13]:
Basic_Df

Unnamed: 0,id,tags,CourseNameShort,Academy
0,8305,"In the Master's programme, the schedule can be...",Computer Science,University Of Potsdam
1,8305,"In the Master's programme, the schedule can be...",Computer Science,University Of Potsdam
2,8305,"In the Master's programme, the schedule can be...",Computer Science,University Of Potsdam
3,4439,"In the first semester, the module theoretical ...",Computer Science,University Of Stuttgart
4,4439,"In the first semester, the module theoretical ...",Computer Science,University Of Stuttgart
...,...,...,...,...
637,9595,The MSc in Information Engineering programme c...,Information Engineering,Technical University Of Munich
638,9595,The MSc in Information Engineering programme c...,Information Engineering,Technical University Of Munich
639,3724,The first semester takes place at the Eberswal...,Forest Information Technology (FIT),Eberswalde University For Sustainable Development
640,3724,The first semester takes place at the Eberswal...,Forest Information Technology (FIT),Eberswalde University For Sustainable Development


In [14]:
Basic_Df.duplicated().sum()

500

In [15]:
Basic_Df.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Basic_Df.drop_duplicates(inplace=True)


# Text Vectorization 

!pip install scikit-learn

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')

In [17]:
vectors = cv.fit_transform(Basic_Df['tags']).toarray()

In [18]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [19]:
vectors[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [20]:
cv.get_feature_names_out()

array(['000', '0non', '0the', ..., 'yourmaster', 'yourscientific',
       'zwickau'], dtype=object)

## NLTK 

!pip install nltk

In [21]:
import nltk 

In [22]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [23]:
def stem(text):
    y = []

    for i in text.split():
        y.append(ps.stem(i))

    return " ".join(y)

In [24]:
Basic_Df['tags'] = Basic_Df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Basic_Df['tags'] = Basic_Df['tags'].apply(stem)


# Repeat Vectorization Once again 

In [25]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')

In [26]:
vectors = cv.fit_transform(Basic_Df['tags']).toarray()

In [27]:
vectors[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [28]:
cv.get_feature_names_out()

array(['000', '0non', '0the', ..., 'yourmaster', 'yourscientific',
       'zwickau'], dtype=object)

### Co sine distance

In [29]:
from sklearn.metrics.pairwise import cosine_similarity

In [30]:
cosine_similarity(vectors).shape

(142, 142)

In [31]:
similarity = cosine_similarity(vectors)


In [32]:
type(similarity)

numpy.ndarray

In [33]:
sorted(list(enumerate(similarity[0])),reverse=True,key = lambda x:x[1])[0:15]

[(0, 1.0),
 (22, 0.4555347757702877),
 (5, 0.4358936772164252),
 (8, 0.43464342883147966),
 (24, 0.43017207307923444),
 (3, 0.42508156776537165),
 (10, 0.4243459418190091),
 (9, 0.4162747405813562),
 (126, 0.4155495499102746),
 (34, 0.4014057314990032),
 (11, 0.39213455619214155),
 (14, 0.3804371616993746),
 (117, 0.37398178954605743),
 (35, 0.37271413032215434),
 (1, 0.3719688824533881)]

In [34]:
Basic_Df
Basic_Df.reset_index(drop=True, inplace=True)


In [35]:
Basic_Df

Unnamed: 0,id,tags,CourseNameShort,Academy
0,8305,"in the master' programme, the schedul can be a...",Computer Science,University Of Potsdam
1,4439,"in the first semester, the modul theoret and m...",Computer Science,University Of Stuttgart
2,4870,three semesters:semest one and two: theori and...,MSc Applied Computer Science,Deggendorf Institute Of Technology
3,5616,"the programm is organis in modules, with an ov...",Master Of Science In Computer Science,Paderborn University
4,4455,you will choos modul from the follow five subj...,MSc Computer Science,University Of Passau
...,...,...,...,...
137,6262,the studi programm start off with lectur that ...,"Cognitive Systems: Language, Learning, And Rea...",University Of Potsdam
138,4407,besid compulsori and elect modul with advanc t...,"Medical Engineering: Imaging, Health & Data An...",FAU Erlangen-Nürnberg
139,9595,the msc in inform engin programm consist of co...,Information Engineering,Technical University Of Munich
140,3724,the first semest take place at the eberswald u...,Forest Information Technology (FIT),Eberswalde University For Sustainable Development


In [36]:
Basic_Df[(Basic_Df['Academy'] == 'University Of Stuttgart') & (Basic_Df['CourseNameShort'] == 'Computer Science')]

Unnamed: 0,id,tags,CourseNameShort,Academy
1,4439,"in the first semester, the modul theoret and m...",Computer Science,University Of Stuttgart


In [37]:
Basic_Df[(Basic_Df['Academy'] == 'University Of Stuttgart') & (Basic_Df['CourseNameShort'] == 'Computer Science')].index[0]

1

In [44]:
def recommend(uni, course_name):
    uni_index = Basic_Df[(Basic_Df['Academy'] == uni) & (Basic_Df['CourseNameShort'] == course_name)].index[0]
    distances = similarity[uni_index]
    uni_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:15]
    print(f"{'ID':<10} {'Course Name':<70} {'Academy':<70} {'Similarity (%)':<20}")
    print("-" * 160)
    
    for i in uni_list:
        course_id = Basic_Df.iloc[i[0]].id
        course_name_short = Basic_Df.iloc[i[0]].CourseNameShort
        academy = Basic_Df.iloc[i[0]].Academy
        similarity_percentage = round(i[1] * 100, 2)  
        
    
        print(f"{course_id:<10} {course_name_short:<70} {academy:<70} {similarity_percentage:<20.2f}")


In [45]:
# recommend('Deggendorf Institute Of Technology')
# recommend('University Of Stuttgart', 'Computer Science')
recommend('Bauhaus-Universität Weimar', 'Human-Computer Interaction (HCI)')

ID         Course Name                                                            Academy                                                                Similarity (%)      
----------------------------------------------------------------------------------------------------------------------------------------------------------------
4238       Computer Science For Digital Media                                     Bauhaus-Universität Weimar                                             61.13               
3727       Computational Engineering (CE), MSc                                    FAU Erlangen-Nürnberg                                                  54.96               
5245       Computer Science (MSc)                                                 University Of Bayreuth                                                 54.58               
3728       Media Informatics                                                      RWTH Aachen University                                       

# TFID

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
vectors_tfidf = tfidf.fit_transform(Basic_Df['tags']).toarray()

similarity_tfidf = cosine_similarity(vectors_tfidf)

def recommend_tfid(uni, course_name):
    uni_index = Basic_Df[(Basic_Df['Academy'] == uni) & (Basic_Df['CourseNameShort'] == course_name)].index[0]
    
    distances = similarity_tfidf[uni_index]
    
    uni_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:15]
    
    print(f"{'ID':<10} {'Course Name':<70} {'Academy':<70} {'Similarity (%)':<20}")
    print("-" * 160)
    
    for i in uni_list:
        course_id = Basic_Df.iloc[i[0]].id
        course_name_short = Basic_Df.iloc[i[0]].CourseNameShort
        academy = Basic_Df.iloc[i[0]].Academy
        similarity_percentage = round(i[1] * 100, 2)  
        
        print(f"{course_id:<10} {course_name_short:<70} {academy:<70} {similarity_percentage:<20.2f}")


In [47]:
# recommend_tfid('University Of Stuttgart', 'Computer Science')
recommend_tfid('Bauhaus-Universität Weimar', 'Human-Computer Interaction (HCI)')

ID         Course Name                                                            Academy                                                                Similarity (%)      
----------------------------------------------------------------------------------------------------------------------------------------------------------------
4686       Human Computer Interaction (HCI)                                       University Of Siegen                                                   41.66               
4238       Computer Science For Digital Media                                     Bauhaus-Universität Weimar                                             32.30               
7040       Engineering Of Socio-Technical Systems                                 Carl Von Ossietzky University Of Oldenburg                             28.22               
4521       MSc In International Software Systems Science                          University Of Bamberg                                        

# For Frontend 


In [None]:
import pickle

In [None]:
pickle.dump(Basic_Df,open('../frontend/university_df.pkl','wb'))
pickle.dump(similarity,open('../frontend/similarity.pkl','wb'))

In [None]:
pickle.dump(Basic_Df.to_dict() ,open('university_dict.pkl','wb'))
