# Content-based Recommendation System for Data Programs

In [2]:
# Standard library
import sys

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from spacy.lang.en.stop_words import STOP_WORDS

# Project-local code: ../src must be on the import path before importing from it.
sys.path.insert(0, '../src')
from text_operations import TextPreprocessor

# spaCy English pipeline, loaded once for the whole notebook.
nlp = spacy.load("en_core_web_md")

## 1. Prepare Data

- Separate program descriptions and GDS scores for all 111 programs
- Process program descriptions
- Combine processed program descriptions and GDS scores to make the main dataset (called `dataset`)

In [3]:
# Load the labelled programs table.
df = pd.read_csv("../../data_collection/data/labelled/masters_data_programs_india_usa.csv")

# Score columns, kept together with the `id` column from the CSV.
df_gds = df.loc[:, [
    "id",
    "Data Gathering, Preparation and Exploration",
    "Data Representation and Transformation",
    "Computing with Data",
    "Data Modeling",
    "Data Visualization and Presentation",
    "Science about Data Science",
]]

# Raw descriptions and their preprocessed counterpart. Rows whose description
# is the placeholder 'Not inferred' are passed through unchanged.
# NOTE(review): the placeholder text is therefore vectorized later like any
# other description -- confirm that this is intended.
descr = df['descr']
text = descr.apply(
    lambda raw: raw if raw == 'Not inferred' else TextPreprocessor(raw).preprocess_text()
)

# Readable program key of the form "<uni_name> - <pgm_name>".
pgm_id = df['uni_name'] + " - " + df['pgm_name']

df_descr = pd.DataFrame({'id': pgm_id, 'descr': descr, 'text': text})

### Vectorize the program descriptions
- Using TF-IDF vectorizer

In [4]:
# TF-IDF vectorizer over the preprocessed program descriptions.
# scikit-learn accepts `stop_words` only as 'english', a list, or None
# (enforced by parameter validation from version 1.2 on). spaCy's STOP_WORDS
# is a set, so it is converted to a list; sorting keeps the order stable.
tfidf = TfidfVectorizer(
    stop_words=sorted(STOP_WORDS)
)
# Fit the vocabulary and transform the descriptions in one step
# (sparse matrix: one row per description, one column per term).
tfidf_df_descr = tfidf.fit_transform(df_descr['text'])



In [5]:
# make the main dataset
# `get_feature_names()` was removed in scikit-learn 1.2. Prefer
# `get_feature_names_out()` (available since 1.0) and fall back to the old
# name only on versions that predate it.
if hasattr(tfidf, "get_feature_names_out"):
    tfidf_terms = tfidf.get_feature_names_out()
else:
    tfidf_terms = tfidf.get_feature_names()

# `.toarray()` returns a plain ndarray; `.todense()` returns np.matrix, which
# NumPy discourages for new code.
prog_descr_df = pd.DataFrame(tfidf_df_descr.toarray(), columns=tfidf_terms)

# Column-wise concat aligns rows on the index; both frames carry the default
# 0..n-1 index, so row i of the scores matches row i of the TF-IDF weights.
dataset = pd.concat([df_gds, prog_descr_df], axis=1)
dataset.shape

(111, 2490)

## 2. Compute cosine similarities among programs

In [6]:
# Every column except the program identifier is used as a feature.
# NOTE(review): score columns and TF-IDF weights are combined without
# rescaling -- confirm their scales are comparable.
features = dataset.drop(columns=['id'])

# Pairwise similarity matrix: entry [i, j] compares row i with row j of `features`.
cosine_sim_programs = cosine_similarity(features)

## 3. Extract most similar programs

In [7]:
# helper functions
# source: https://medium.com/code-heroku/building-a-movie-recommendation-engine-in-python-using-scikit-learn-c7489d7cb145
def get_id_from_index(df, idx):
    """Return the ``id`` value of the row whose index label equals ``idx``.

    Args:
        df: DataFrame with an ``id`` column.
        idx: Index label of the wanted row.

    Returns:
        The ``id`` of the first row whose index label matches.

    Raises:
        IndexError: If no row has index label ``idx``.
    """
    matches = df.loc[df.index == idx, 'id'].values
    if len(matches) == 0:
        raise IndexError(f"no row with index {idx!r} in dataset")
    return matches[0]


def get_index_from_id(df, pgm_id):
    """Return the index label of the first row whose ``id`` equals ``pgm_id``.

    Args:
        df: DataFrame with an ``id`` column.
        pgm_id: Program identifier to look up.

    Returns:
        The index label of the first matching row.

    Raises:
        IndexError: If no row has ``id == pgm_id``.
    """
    matches = df.index[df['id'] == pgm_id].values
    if len(matches) == 0:
        raise IndexError(f"no program with id {pgm_id!r} in dataset")
    return matches[0]


def get_most_similar(pgm_name, dataset, sim_matrix, npgms=5):
    """Return the ids of the ``npgms`` programs most similar to ``pgm_name``.

    Args:
        pgm_name: ``id`` of the query program.
        dataset: DataFrame with an ``id`` column. Its index labels are used
            directly as row/column positions in ``sim_matrix``, so it must
            carry the default 0..n-1 index.
        sim_matrix: Square similarity matrix; row ``i`` holds the similarity
            of program ``i`` to every program.
        npgms: Maximum number of programs to return.

    Returns:
        List of program ids ordered from most to least similar. The query
        program itself is never included.

    Raises:
        IndexError: If ``pgm_name`` is not present in ``dataset``.
    """
    pgm_index = get_index_from_id(dataset, pgm_name)

    # Exclude the query by index rather than by dropping the top-ranked entry:
    # with ties (e.g. two programs with identical feature rows) the query is
    # not guaranteed to sort first, and it would otherwise be recommended to
    # itself while a genuine neighbour is discarded.
    candidates = [
        (idx, score)
        for idx, score in enumerate(sim_matrix[pgm_index])
        if idx != pgm_index
    ]
    # Stable sort: programs with equal similarity keep their dataset order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    return [get_id_from_index(dataset, idx) for idx, _ in candidates[:npgms]]

In [8]:
# Example query: the five programs closest to one Boston University program.
pgm = "Boston University - Master of Science in Applied Data Analytics"
get_most_similar(pgm, dataset, cosine_sim_programs, npgms=5)

['Boston University - Master of Science in Applied Business Analytics',
 'University of Southern California - Master of Science in Applied Data Science',
 'IIIT Allahabad - M.Tech Data Science And Analytics',
 'Georgetown University - Master of Science in Data Science and Analytics',
 'University of Southern California - Master of Science in Analytics']