In [9]:
import pandas as pd # Needed for data processing
import numpy as np
import texthero as hero # Needed for data cleaning
from texthero import preprocessing
from nltk.corpus import stopwords # Needed for stopwords
from HanTa import HanoverTagger as ht # Needed for lemmatization
from nltk.tokenize import word_tokenize # Needed for tokenization
import pickle # Needed for object export
import sys # Needed for system settings
from sklearn.feature_extraction.text import TfidfVectorizer # Needed for NLP TF-IDF algorithm
from sklearn.metrics.pairwise import cosine_similarity # Needed for cosine similarity

# Display configuration: show at most 50 DataFrame rows, and raise NumPy's summarisation
# threshold to sys.maxsize so printed arrays are not abbreviated
pd.set_option('display.max_rows', 50)
np.set_printoptions(threshold = sys.maxsize)

# Tagger instance used by tokenizer_lemmatizer (via hannover.analyze) to lemmatize tokens
hannover = ht.HanoverTagger("morphmodel_ger.pgz") # Needed for German lemmatization

In [2]:
# Loads the preprocessed data table; studybuddy selects rows from it by position (df.iloc)
df = pd.read_csv("../data/processed_data.csv", encoding="utf-8")

In [3]:
# Placeholder stop-word list; the actual list is loaded from stopwords.pkl in a later cell
token_stop = []
# Defines symbols and other common terms that add no information gain to the algorithm (to clean user input)
# Order matters: multi-character entries such as "z.B." must come before the single characters they contain
characters = ["z.B.", "(", ")", ":", ".", ",", "|", "*", "&", "+", " I ", " II ", " III ", " IV ", " V ", " VI ", " x ", "\x96", "Semester", "ECTS", "Bachelorarbeit", "Abschlussarbeit", "Bachelor", "Studium", "Grundlagen", "Wochen", "Auslandssemester", "Berufspraktikum", "Wahlfach"]

# Defines tokenization and lemmatization function
def tokenizer_lemmatizer(text, stopwords = None, lemmatize = True, user_input = False):
    """Tokenize ``text``, optionally lemmatize every token, and drop stop words.

    Parameters
    ----------
    text : str
        Raw text to process.
    stopwords : collection of str, optional
        Tokens to discard. When omitted, the module-level ``token_stop`` is looked
        up at call time, so a list loaded after this function was defined is honoured.
    lemmatize : bool, default True
        If True, each token is replaced by ``hannover.analyze(token)[0]``;
        otherwise the raw tokens from ``word_tokenize`` are used.
    user_input : bool, default False
        If True, the entries of ``characters`` are stripped from ``text`` and
        ``/`` and ``-`` are turned into spaces before tokenizing.

    Returns
    -------
    list
        The remaining tokens, in their original order.
    """
    # Resolve the default at call time. Binding ``token_stop`` in the signature froze the
    # empty placeholder list, so stop words loaded later were silently never applied.
    if stopwords is None:
        stopwords = token_stop

    # Cleans data if it is user input, else skips this step
    if user_input:
        for marker in characters:
            # Space-delimited markers (e.g. " I ") must leave a separator behind,
            # otherwise the two neighbouring words are fused into one unknown token.
            spaced = marker.startswith(" ") and marker.endswith(" ")
            text = text.replace(marker, " " if spaced else "")
        text = text.replace("/", " ")
        text = text.replace("-", " ")

    # Lemmatizes data using the hannover.analyze lemmatization model, else only tokenizes
    raw_tokens = word_tokenize(text)
    if lemmatize:
        tokens = [hannover.analyze(w)[0] for w in raw_tokens]
    else:
        tokens = list(raw_tokens)

    return [w for w in tokens if w not in stopwords]

In [4]:
# Imports preprocessed pickle objects
# NOTE(security): pickle.load can execute arbitrary code contained in the file, so only
# load artefacts that were produced by this project and are stored in a trusted location.
# Context managers make sure each file handle is closed right after loading.
with open("../data/vectorizer.pkl", "rb") as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)
with open("../data/tfidf_mat.pkl", "rb") as tfidf_file:
    tfidf_mat = pickle.load(tfidf_file)
with open("../data/stopwords.pkl", "rb") as stopwords_file:
    token_stop = pickle.load(stopwords_file)

In [5]:
# Defines indices extraction function
def extract_best_indices(m, topk, mask = None):
    """Rank entries by similarity and return the best non-zero ones.

    Parameters
    ----------
    m : numpy.ndarray
        Similarity scores. A 1-D array is used as is; an array with more than one
        dimension is averaged over axis 0, giving one score per column.
    topk : int
        Maximum number of entries to return.
    mask : array-like of bool, optional
        One flag per score (same shape as the averaged scores). Entries whose flag
        is False are excluded. By default every entry is eligible.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Indices of the selected entries and their scores, both ordered from the
        highest to the lowest score. Entries with a score of exactly 0 are dropped,
        so fewer than ``topk`` items may be returned.
    """
    # Mean (not sum) over axis 0 when several score rows are supplied
    if len(m.shape) > 1:
        cos_sim = np.mean(m, axis = 0)
    else:
        cos_sim = m

    # Positions ordered from highest to lowest score; reuse the order instead of sorting twice
    index = np.argsort(cos_sim)[::-1]
    similarities = cos_sim[index]

    if mask is not None:
        mask = np.asarray(mask, dtype = bool)
        # The mask is applied per score, so it has to match the averaged vector, not the raw matrix
        assert mask.shape == cos_sim.shape, "mask must contain one entry per score"
        keep = mask[index]
    else:
        keep = np.ones(len(cos_sim), dtype = bool)

    # An entry survives only if it is allowed by the mask AND has a non-zero similarity.
    # (logical_or kept everything with the default mask and ignored a caller-supplied one.)
    keep = np.logical_and(similarities != 0, keep)
    best_index = index[keep][:topk]
    best = similarities[keep][:topk]

    return best_index, best

In [6]:
# Defines cosine similarity function
def get_recommendations_tfidf(query, tfidf_mat):
    """Rank the rows of ``tfidf_mat`` by their similarity to a free-text query.

    The query is cleaned and tokenized with ``tokenizer_lemmatizer`` (user-input
    mode), every token is lower-cased and embedded separately with the module-level
    ``vectorizer``, and the cosine similarity of each token to each row of
    ``tfidf_mat`` is computed. ``extract_best_indices`` then reduces these scores
    to one ranking.

    Parameters
    ----------
    query : str
        Free-text user query.
    tfidf_mat : matrix
        Matrix the query is compared against; passed to ``cosine_similarity``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Row indices into ``tfidf_mat`` and their similarity scores, best first.
        Both arrays are empty when the query contains no usable token.
    """
    # Embeds the query sentence (one row per token)
    tokens_query = [str(tok).lower() for tok in tokenizer_lemmatizer(text = query, user_input = True)]

    # Nothing left after cleaning: return an empty ranking instead of handing an empty batch to the vectorizer
    if not tokens_query:
        return np.array([], dtype = int), np.array([], dtype = float)

    embed_query = vectorizer.transform(tokens_query)

    # Similarity between every query token (rows) and every row of tfidf_mat (columns)
    mat = cosine_similarity(embed_query, tfidf_mat)

    # Rank once and unpack both results; the candidate count comes from the similarity
    # matrix itself, so the function does not depend on the global DataFrame
    best_index, best = extract_best_indices(mat, topk = mat.shape[1])
    return best_index, best

In [7]:
# Defines main function
def studybuddy(user_input, n_results):
    """Return the ``n_results`` rows of ``df`` that best match a free-text query.

    Parameters
    ----------
    user_input : str
        Free-text query, forwarded to ``get_recommendations_tfidf``.
    n_results : int
        Number of rows to keep from the top of the ranking.

    Returns
    -------
    pandas.DataFrame
        The selected rows of the module-level ``df`` in ranking order, with a
        ``rank`` column (1-based) inserted at position 0 and a ``similarity``
        column (score rounded to two decimals) inserted at position 2 of the
        original column layout.

    Side effects
    ------------
    Writes the returned frame to ``results.pkl`` in the current working directory.
    """
    # Run the recommendation pipeline once and unpack both results
    # (calling it separately for each element repeated the whole computation)
    best_index, best = get_recommendations_tfidf(query = user_input, tfidf_mat = tfidf_mat)

    # Explicit copy so that the column inserts below can never touch the global df
    results = df.iloc[best_index].copy()

    # Creates columns similarity and rank for datatable return
    results.insert(2, "similarity", np.round(best, 2))
    results.insert(0, 'rank', range(1, len(results) + 1))
    results = results[:n_results]

    # Exports results as pickle
    results.to_pickle("results.pkl")

    return results

In [8]:
# Example recommendation based on free-text user input - change the query or n_results to experiment.
# Note: every call also overwrites results.pkl in the working directory (see studybuddy).
studybuddy("Data Science Machine Learning Künstliche Intelligenz", n_results = 25)

Unnamed: 0,rank,major_id,is_list,similarity,major_name,text,major_description,studycheck_link,university_link,category,...,major_category,university,location,degree_type,degree_label,language,duration_of_study,rating_amount,rating,recommendation_rate
1973,1,2048,True,0.19,Data Science & Management,data science management informatik mathematik ...,\r\n 1. Semester Einführung in das Studium Dat...,https://www.studycheck.de/studium/datenanalyse...,https://dbuas.de/studium/data-science-manageme...,Informatik & Mathematik,...,Datenanalyse und Datenmanagement,DBU Digital Business University of Applied Sci...,Berlin,M.Sc.,Master of Science,Deutsch,4 Semester,1.0,4.7,1.0
1962,2,2037,True,0.16,Computational Business Analytics,computational business analytics informatik ma...,\r\n Semester 1 Principles of Management Finan...,https://www.studycheck.de/studium/computer-sci...,https://www.frankfurt-school.de/en/home/progra...,Informatik & Mathematik,...,Computer Science,Frankfurt School of Finance & Management,Frankfurt am Main,B.Sc.,Bachelor of Science,Englisch,7 Semester,3.0,4.7,1.0
1979,3,2054,True,0.15,Data Science,data science informatik mathematik informatik ...,\r\n 1. Semester Grundlagen der Mathematik Ana...,https://www.studycheck.de/studium/datenwissens...,https://www.hs-aalen.de/courses/103/,Informatik & Mathematik,...,Datenwissenschaft,Hochschule Aalen,Aalen,B.Sc.,Bachelor of Science,Deutsch,7 Semester,5.0,4.3,0.8
1978,4,2053,True,0.15,Data Science,data science informatik mathematik informatik ...,\r\n Data Science Data Structures und Programm...,https://www.studycheck.de/studium/datenwissens...,https://xu-university.com/bachelor-studiengaen...,Informatik & Mathematik,...,Datenwissenschaft,XU Exponential University,Potsdam,B.Sc.,Bachelor of Science,Englisch,6 Semester,0.0,,
1967,5,2042,True,0.13,Data Engineering and Consulting,data engineering and consulting informatik mat...,"\r\n 1. Semester IT-Consulting Python, Java Ma...",https://www.studycheck.de/studium/data-web-eng...,https://www.hs-albsig.de/studienangebot/master...,Informatik & Mathematik,...,Data & Web Engineering,Hochschule Albstadt-Sigmaringen,Albstadt,M.Sc.,Master of Science,Deutsch,3 - 5 Semester,4.0,4.3,1.0
1951,6,2026,False,0.13,Künstliche Intelligenz und Data Science,künstliche intelligenz und data science inform...,\r\n Der Master-Studiengang hat eine Regelstud...,https://www.studycheck.de/studium/angewandte-i...,https://www.hochschule-trier.de/informatik/stu...,Informatik & Mathematik,...,Angewandte Informatik,Hochschule Trier,Trier,M.Sc.,Master of Science,Deutsch,4 Semester,,,
1977,7,2052,True,0.12,Business Intelligence and Data Science,business intelligence and data science informa...,\r\n 1. Semester Datenmanagement Einführung in...,https://www.studycheck.de/studium/datenwissens...,https://www.ism.de/studium-vollzeit/master/bus...,Informatik & Mathematik,...,Datenwissenschaft,ISM - International School of Management,4 Standorte,M.Sc.,Master of Science,Englisch,3 - 4 Semester,2.0,4.8,1.0
474,8,502,True,0.11,Business Analytics,business analytics wirtschaft recht betriebswi...,"\r\n Quantitative Methoden: Statistk, Operatio...",https://www.studycheck.de/studium/business-adm...,https://hs-aalen.de/courses/108,Wirtschaft & Recht,...,Business Administration,Hochschule Aalen,Aalen,B.Sc.,Bachelor of Science,Deutsch,7 Semester,0.0,,
1970,9,2045,True,0.11,Business Analytics,business analytics informatik mathematik infor...,"\r\n Quantitative Methoden: Statistk, Operatio...",https://www.studycheck.de/studium/business-adm...,https://hs-aalen.de/courses/108,Informatik & Mathematik,...,Datenanalyse und Datenmanagement,Hochschule Aalen,Aalen,B.Sc.,Bachelor of Science,Deutsch,7 Semester,0.0,,
2031,10,2106,True,0.11,Künstliche Intelligenz,künstliche intelligenz informatik mathematik i...,\r\n 1. Semester: Einführungsprojekt Programmi...,https://www.studycheck.de/studium/informatik/t...,https://www.thi.de/studium/studienangebote/det...,Informatik & Mathematik,...,Informatik,Technische Hochschule Ingolstadt,Ingolstadt,B.Sc.,Bachelor of Science,Deutsch,7 Semester,10.0,4.4,1.0
