In [1]:
import pandas as pd # Needed for data processing
import numpy as np
import texthero as hero # Needed for data cleaning
from texthero import preprocessing
from nltk.corpus import stopwords # Needed for stopwords
from HanTa import HanoverTagger as ht # Needed for lemmatization
from nltk.tokenize import word_tokenize # Needed for tokenization
import pickle # Needed for object export
import sys # Needed for system settings
from sklearn.feature_extraction.text import TfidfVectorizer # Needed for NLP TF-IDF algorithm
from sklearn.metrics.pairwise import cosine_similarity # Needed for cosine similarity

# Output settings: NumPy arrays are printed in full (no summarisation), DataFrame previews are capped at 50 rows
np.set_printoptions(threshold=sys.maxsize)
pd.set_option("display.max_rows", 50)

# Loads the HanTa morphology model used for German lemmatization
hannover = ht.HanoverTagger("morphmodel_ger.pgz")

In [2]:
# Loads the raw dataset
df = pd.read_csv("../data/raw_data.csv", encoding="utf-8")

# Keeps only the part of each degree_label that precedes the first carriage return
df["degree_label"] = df["degree_label"].str.split("\r").str.get(0)

In [3]:
# Creates the id column; ids are assigned before any row is dropped, so they reflect the position in the raw data
df.insert(0, 'major_id', range(0, len(df)))

# Removes rows without a description. The explicit copy makes the result an independent DataFrame,
# so the column assignments in later cells do not trigger pandas' SettingWithCopyWarning.
df = df[df["major_description"].notna()].copy()

# Concatenates the course summary / description of each major with its name, categories, university and location into the new "text" column.
# Many major descriptions only list specific courses and never mention the major's name. A search for e.g. "Computer Science" alone
# might therefore not return accurate recommendations, since there is usually no course or module named like the major itself.
# Including the major's name, category and university/location keeps this information available for the recommendation, but it is
# downweighted by the number of similar entries in the data, so it is no automatic guarantee for a top recommendation.
text_columns = ["major_name", "category", "subcategory", "major_category", "university", "location", "major_description"]

# Missing values are replaced by empty strings first: astype(str) alone would turn them into the literal word "nan",
# which would then end up as a token in the text corpus.
df.insert(3, "text", df[text_columns].fillna("").astype(str).agg(" ".join, axis = 1))

In [4]:
# Defines symbols and other common terms that add no information gain to the algorithm
characters = ["z.B.", "(", ")", ":", ".", ",", "|", "*", "&", "+", " I ", " II ", " III ", " IV ", " V ", " VI ", " x ", "\x96", "Semester", "ECTS", "Bachelorarbeit", "Abschlussarbeit", "Bachelor", "Studium", "Grundlagen", "Wochen", "Auslandssemester", "Berufspraktikum", "Wahlfach"]

# Cleans text corpus based on previously defined characters
for token in characters:
    # Entries padded with spaces (e.g. " I ") sit between two words: they are replaced by a single space,
    # because replacing them with "" would fuse the neighbours ("Mathematik I Physik" -> "MathematikPhysik").
    filler = " " if token.startswith(" ") and token.endswith(" ") else ""
    # regex = False: the entries are literal strings, several of them ("(", ".", "*", "+", "|") are regex metacharacters
    df["text"] = df["text"].str.replace(token, filler, regex = False)

# Slashes and hyphens separate words, so they become spaces instead of being deleted
for separator in ("/", "-"):
    df["text"] = df["text"].str.replace(separator, " ", regex = False)

# Removes all digit sequences (raw string so that "\d" is not treated as an invalid string escape)
df["text"] = df["text"].str.replace(r'\d+', '', regex = True)

# Further cleans and prepares text corpus: fills missing values, lowercases, collapses whitespace
custom_pipeline = [preprocessing.fillna,
                  preprocessing.lowercase,
                  preprocessing.remove_whitespace
                  ]
df["text"] = hero.clean(df["text"], custom_pipeline)

In [5]:
# Module-level list of lemmatized stop words; it is filled once the function below has been defined
token_stop = []

# Defines tokenization and lemmatization function
def tokenizer_lemmatizer(text, stopwords = None, lemmatize = True, user_input = False):
    """Tokenize a text, optionally lemmatize the tokens, and drop stop words.

    Parameters
    ----------
    text : str
        Text to process.
    stopwords : container of str, optional
        Tokens to discard. When omitted (None), the module-level ``token_stop``
        is looked up at call time. A default of ``stopwords = token_stop`` would
        instead be evaluated once, when the ``def`` statement runs, and would
        keep pointing at the list that was empty at that moment.
    lemmatize : bool
        If True, each token is replaced by the first element of
        ``hannover.analyze(token)``; if False the tokens are left as they are.
    user_input : bool
        If True, the entries of the module-level ``characters`` list are removed
        from ``text`` and "/" and "-" are turned into spaces before tokenizing.

    Returns
    -------
    list of str
        The remaining tokens in their original order.
    """
    # Resolve the default at call time so the current stop word list is used
    if stopwords is None:
        stopwords = token_stop

    # Cleans data if it is user input, else skips this step
    if user_input:
        for token in characters:
            # Space-padded entries (e.g. " I ") stand between two words; a single space keeps the neighbours apart
            filler = " " if token.startswith(" ") and token.endswith(" ") else ""
            text = text.replace(token, filler)
        text = text.replace("/", " ").replace("-", " ")

    tokens = word_tokenize(text)

    # Lemmatizes tokens using the hannover.analyze lemmatization model for German language, else only tokenizes
    if lemmatize:
        tokens = [hannover.analyze(w)[0] for w in tokens]

    return [w for w in tokens if w not in stopwords]

# Defines stopwords
german_stop_words = stopwords.words("german")

# Adapts stop words: they are run through the same tokenizer/lemmatizer as the corpus (with stop word filtering switched off).
# The existing list is filled in place instead of rebinding the name. Default argument values are evaluated once, when a
# function is defined, so anything that captured the list object created earlier would otherwise keep seeing an empty list.
token_stop[:] = tokenizer_lemmatizer(' '.join(german_stop_words), stopwords = "")

In [6]:
# Sets up the TF-IDF vectorizer with the custom tokenizer/lemmatizer defined above
vectorizer = TfidfVectorizer(tokenizer=tokenizer_lemmatizer)

# Fits the vectorizer on the "text" column (tokenizing and lemmatizing every entry) and builds the sparse matrix of TF-IDF scores
tfidf_mat = vectorizer.fit_transform(df["text"].to_numpy())

In [8]:
# Renumbers the rows from 0 (rows were dropped earlier) without keeping the old index as a column, then saves df to csv
df = df.reset_index(drop = True)
df.to_csv("../data/processed_data.csv", encoding='utf-8', index = False)

In [9]:
# Exports TF-IDF model as preprocessed pickle objects.
# Each file is opened in a with-block so the handle is flushed and closed as soon as the dump is done.
with open("../data/vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)
with open("../data/tfidf_mat.pkl", "wb") as tfidf_file:
    pickle.dump(tfidf_mat, tfidf_file)

# Exports lemmatized stopwords as pickle object
with open("../data/stopwords.pkl", "wb") as stopwords_file:
    pickle.dump(token_stop, stopwords_file)