# CCR - Cal Class Recommender

> Chaya Bakshi, Kalina Huynh

In [1]:
import numpy as np
import pandas as pd
import re

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, KBinsDiscretizer, FunctionTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings('ignore')

# The Data:

### Cleaning Survey Results:

In [2]:
ccr = pd.read_csv('ccr2.csv')
score_cols = [x for x in ccr.columns.to_list() if 'Score' in x]
feedback_cols = [x for x in ccr.columns.to_list() if 'Feedback' in x]
ccr.drop(columns=score_cols, inplace=True)
ccr.drop(columns=feedback_cols, inplace=True)
ccr.drop(columns={'Total score'}, inplace=True)
ccr.rename(columns={'What is your major/ intended major? (i.e. Data Science)':'Major', 
                  'What classes did you take last semester? Enter as course code ( i.e. Data 100, Stat 140, etc.)':'Last Semester',
                  "What classes are you taking this semester? (follow the same format as above)":'This Semester', 
                  "Out of the classes you've taken above, select three of which you would recommend to another student. (follow the same format as above)":'Recommendations'},
          inplace=True)
ccr.head()

Unnamed: 0,Timestamp,Username,Major,Last Semester,This Semester,Recommendations
0,2020/10/13 12:06:37 PM PDT,,Data Science,"Data 100, CS 61B, Engin 120","Ind Eng 135, Econ 140, Data C104, Ind Eng 95","Econ 140, Data 100, Data C104"
1,2020/10/13 12:41:12 PM PDT,,Data Science,"Data 100, CS 61B, Data 8, ETHSTD 21AC","Data 104, CS 169A, Math 128A, CYPLAN 198","ETHSTD 21AC, CS61B, Data 100"
2,2020/10/13 7:04:16 PM PDT,,MCB and Econ,"Poli Sci 179, Hist 136, Bio 1A/L, Chem 3A, Che...","Econ 140, Physics 8A, MCB 102, Bio 1B, Chem 3BL","Poli Sci 179, Econ 140, Econ 100A"
3,2020/10/13 8:32:07 PM PDT,,Applied Math,"Data 8, Math 128A, Math 104, Stat 33A","Data 100, IndEng 135, Math 185, Stat 33B","Data 8, Data 100, Stat 33B"
4,2020/10/14 3:47:24 AM PDT,,Data Science,"stat 134, ISF 100j,CHICANO 180AC, Data 100","Data 102, Econ 140, Stat 150, Stat133","Data 100, Stat133,Stat 150"


In [3]:
# removing whitespace
ccr['Last Semester'] = ccr['Last Semester'].apply(lambda x: x.replace(" ", ""))
ccr['This Semester'] = ccr['This Semester'].apply(lambda x: x.replace(" ", ""))
ccr['Recommendations'] = ccr['Recommendations'].apply(lambda x: x.replace(" ", ""))

# lower-casing the data
ccr['Last Semester'] = ccr['Last Semester'].apply(lambda x: x.lower())
ccr['This Semester'] = ccr['This Semester'].apply(lambda x: x.lower())
ccr['Recommendations'] = ccr['Recommendations'].apply(lambda x: x.lower())

# A few Troublesome entries:
ccr['Last Semester'] = ccr['Last Semester'].str.replace("datac8", "data8")
ccr['This Semester'] = ccr['This Semester'].str.replace("datac8", "data8")
ccr['Recommendations'] = ccr['Recommendations'].str.replace("datac8", "data8")

ccr['Last Semester'] = ccr['Last Semester'].str.replace("datac104", "data104")
ccr['This Semester'] = ccr['This Semester'].str.replace("datac104", "data104")
ccr['Recommendations'] = ccr['Recommendations'].str.replace("datac104", "data104")

ccr['Last Semester'] = ccr['Last Semester'].str.replace("ds100", "data100")
ccr['This Semester'] = ccr['This Semester'].str.replace("ds100", "data100")
ccr['Recommendations'] = ccr['Recommendations'].str.replace("ds100", "data100")

ccr['Last Semester'] = ccr['Last Semester'].str.replace("datax", "indeng135")
ccr['This Semester'] = ccr['This Semester'].str.replace("datax", "indeng135")
ccr['Recommendations'] = ccr['Recommendations'].str.replace("datax", "indeng135")

ccr['Last Semester'] = ccr['Last Semester'].str.replace("compsci", "cs")
ccr['This Semester'] = ccr['This Semester'].str.replace("compsci", "cs")
ccr['Recommendations'] = ccr['Recommendations'].str.replace("compsci", "cs")

ccr['Last Semester'] = ccr['Last Semester'].str.replace("ethstd195nutritionseminar", "ethstd195")
ccr['This Semester'] = ccr['This Semester'].str.replace("ethstd195nutritionseminar", "ethstd195")
ccr['Recommendations'] = ccr['Recommendations'].str.replace("ethstd195nutritionseminar", "ethstd195")

ccr['Last Semester'] = ccr['Last Semester'].str.replace("stay88", "stat88")
ccr['This Semester'] = ccr['This Semester'].str.replace("stay88", "stat88")
ccr['Recommendations'] = ccr['Recommendations'].str.replace("stay88", "stat88")

# transform into list of classes
ccr['Last Semester'] = ccr['Last Semester'].str.split(",")
ccr['This Semester'] = ccr['This Semester'].str.split(",")
ccr['Recommendations'] = ccr['Recommendations'].str.split(",")

ccr.drop(columns={'Username'}, inplace=True)

# create column of combined classes taken last and this semester
ccr['Classes'] = ccr['Last Semester'] + ccr['This Semester']

ccr.head()

Unnamed: 0,Timestamp,Major,Last Semester,This Semester,Recommendations,Classes
0,2020/10/13 12:06:37 PM PDT,Data Science,"[data100, cs61b, engin120]","[indeng135, econ140, data104, indeng95]","[econ140, data100, data104]","[data100, cs61b, engin120, indeng135, econ140,..."
1,2020/10/13 12:41:12 PM PDT,Data Science,"[data100, cs61b, data8, ethstd21ac]","[data104, cs169a, math128a, cyplan198]","[ethstd21ac, cs61b, data100]","[data100, cs61b, data8, ethstd21ac, data104, c..."
2,2020/10/13 7:04:16 PM PDT,MCB and Econ,"[polisci179, hist136, bio1a/l, chem3a, chem3b,...","[econ140, physics8a, mcb102, bio1b, chem3bl]","[polisci179, econ140, econ100a]","[polisci179, hist136, bio1a/l, chem3a, chem3b,..."
3,2020/10/13 8:32:07 PM PDT,Applied Math,"[data8, math128a, math104, stat33a]","[data100, indeng135, math185, stat33b]","[data8, data100, stat33b]","[data8, math128a, math104, stat33a, data100, i..."
4,2020/10/14 3:47:24 AM PDT,Data Science,"[stat134, isf100j, chicano180ac, data100]","[data102, econ140, stat150, stat133]","[data100, stat133, stat150]","[stat134, isf100j, chicano180ac, data100, data..."


### Transforming Classes as Features:

In [4]:
features = ['LS Class 1', 'LS Class 2', 'LS Class 3', 'LS Class 4', 'LS Class 5',
           'TS Class 1', 'TS Class 2', 'TS Class 3', 'TS Class 4', 'TS Class 5',
           'R Class 1', 'R Class 2', 'R Class 3']

In [5]:
# Function for combining the feature values into one string
def combine_features(row):
    return row['LS Class 1'] + ' ' + row['LS Class 2'] + ' ' + row['LS Class 3'] + ' ' + row['LS Class 4'] + ' ' + row['LS Class 5'] + ' ' + row['TS Class 1'] + ' ' + row['TS Class 2'] + ' ' + row['TS Class 3'] + ' ' + row['TS Class 4'] + ' ' + row['TS Class 5'] + ' ' + row['R Class 1'] + ' ' + row['R Class 2'] + ' ' + row['R Class 3']

In [6]:
def transform(df):
    cal_df = df.copy(deep=True)
    
    # Initalizing feature columns
    for i in np.arange(1,6):
        ls_col_name = 'LS Class ' + str(i)
        ts_col_name = 'TS Class ' + str(i)
        cal_df[ls_col_name] = np.nan
        cal_df[ts_col_name] = np.nan
    for i in np.arange(1,4):
        rec_col = 'R Class ' + str(i)
        cal_df[rec_col] = np.nan

    
    for i in range(len(cal_df['Last Semester'])):
        c = cal_df['Last Semester'][i]
        if len(c) > 5:
            c = c[:5]
        elif len(c) == 1:
            cal_df['LS Class 1'][i] = c[0]
            cal_df['LS Class 2'][i] = np.nan
            cal_df['LS Class 3'][i] = np.nan
            cal_df['LS Class 4'][i] = np.nan
            cal_df['LS Class 5'][i] = np.nan
        elif len(c) == 2:
            cal_df['LS Class 1'][i] = c[0]
            cal_df['LS Class 2'][i] = c[1]
            cal_df['LS Class 3'][i] = np.nan
            cal_df['LS Class 4'][i] = np.nan
            cal_df['LS Class 5'][i] = np.nan
        elif len(c) == 3:
            cal_df['LS Class 1'][i] = c[0]
            cal_df['LS Class 2'][i] = c[1]
            cal_df['LS Class 3'][i] = c[2]
            cal_df['LS Class 4'][i] = np.nan
            cal_df['LS Class 5'][i] = np.nan
        elif len(c) == 4:
            cal_df['LS Class 1'][i] = c[0]
            cal_df['LS Class 2'][i] = c[1]
            cal_df['LS Class 3'][i] = c[2]
            cal_df['LS Class 4'][i] = c[3]
        else:
            cal_df['LS Class 1'][i] = c[0]
            cal_df['LS Class 2'][i] = c[1]
            cal_df['LS Class 3'][i] = c[2]
            cal_df['LS Class 4'][i] = c[3]
            cal_df['LS Class 5'][i] = c[4]      


    for i in range(len(cal_df['This Semester'])):
        c = cal_df['This Semester'][i]
        if len(c) == 1:
            cal_df['TS Class 1'][i] = c[0]
            cal_df['TS Class 2'][i] = np.nan
            cal_df['TS Class 3'][i] = np.nan
            cal_df['TS Class 4'][i] = np.nan
            cal_df['TS Class 5'][i] = np.nan
        if len(c) == 2:
            cal_df['TS Class 1'][i] = c[0]
            cal_df['TS Class 2'][i] = c[1]
            cal_df['TS Class 3'][i] = np.nan
            cal_df['TS Class 4'][i] = np.nan
            cal_df['TS Class 5'][i] = np.nan
        elif len(c) == 3:
            cal_df['TS Class 1'][i] = c[0]
            cal_df['TS Class 2'][i] = c[1]
            cal_df['TS Class 3'][i] = c[2]
            cal_df['TS Class 4'][i] = np.nan
            cal_df['TS Class 5'][i] = np.nan
        elif len(c) == 4:
            cal_df['TS Class 1'][i] = c[0]
            cal_df['TS Class 2'][i] = c[1]
            cal_df['TS Class 3'][i] = c[2]
            cal_df['TS Class 4'][i] = c[3]
        else:
            cal_df['TS Class 1'][i] = c[0]
            cal_df['TS Class 2'][i] = c[1]
            cal_df['TS Class 3'][i] = c[2]
            cal_df['TS Class 4'][i] = c[3]
            cal_df['TS Class 5'][i] = c[4]   


    for i in range(len(cal_df['Recommendations'])):
        c = cal_df['Recommendations'][i]
        if len(c) == 2:
            cal_df['R Class 1'][i] = c[0]
            cal_df['R Class 2'][i] = c[1]
        else:
            cal_df['R Class 1'][i] = c[0]
            cal_df['R Class 2'][i] = c[1]
            cal_df['R Class 3'][i] = c[2]
            
    # Data Cleaning and Pre-Processing
    for feature in features:
        cal_df[feature] = cal_df[feature].fillna('') 
    cal_df['combined_features'] = cal_df.apply(combine_features,axis=1)  
            
    return cal_df.reset_index()

In [7]:
df = transform(ccr)
df.drop(index=2, inplace=True)
df.head()

Unnamed: 0,index,Timestamp,Major,Last Semester,This Semester,Recommendations,Classes,LS Class 1,TS Class 1,LS Class 2,...,LS Class 3,TS Class 3,LS Class 4,TS Class 4,LS Class 5,TS Class 5,R Class 1,R Class 2,R Class 3,combined_features
0,0,2020/10/13 12:06:37 PM PDT,Data Science,"[data100, cs61b, engin120]","[indeng135, econ140, data104, indeng95]","[econ140, data100, data104]","[data100, cs61b, engin120, indeng135, econ140,...",data100,indeng135,cs61b,...,engin120,data104,,indeng95,,,econ140,data100,data104,data100 cs61b engin120 indeng135 econ140 dat...
1,1,2020/10/13 12:41:12 PM PDT,Data Science,"[data100, cs61b, data8, ethstd21ac]","[data104, cs169a, math128a, cyplan198]","[ethstd21ac, cs61b, data100]","[data100, cs61b, data8, ethstd21ac, data104, c...",data100,data104,cs61b,...,data8,math128a,ethstd21ac,cyplan198,,,ethstd21ac,cs61b,data100,data100 cs61b data8 ethstd21ac data104 cs169a...
3,3,2020/10/13 8:32:07 PM PDT,Applied Math,"[data8, math128a, math104, stat33a]","[data100, indeng135, math185, stat33b]","[data8, data100, stat33b]","[data8, math128a, math104, stat33a, data100, i...",data8,data100,math128a,...,math104,math185,stat33a,stat33b,,,data8,data100,stat33b,data8 math128a math104 stat33a data100 indeng...
4,4,2020/10/14 3:47:24 AM PDT,Data Science,"[stat134, isf100j, chicano180ac, data100]","[data102, econ140, stat150, stat133]","[data100, stat133, stat150]","[stat134, isf100j, chicano180ac, data100, data...",stat134,data102,isf100j,...,chicano180ac,stat150,data100,stat133,,,data100,stat133,stat150,stat134 isf100j chicano180ac data100 data102 ...
5,5,2020/10/14 9:10:03 AM PDT,"Economics, Legal Studies","[econ100a, data8, stat88, info98, envdes1, pp101]","[econ140, econ162, legal149, cs61a, stat33a, u...","[data8, envdes1, econ140]","[econ100a, data8, stat88, info98, envdes1, pp1...",,econ140,,...,,legal149,,cs61a,,stat33a,data8,envdes1,econ140,econ140 econ162 legal149 cs61a stat33a da...


## CCR System:

In [8]:
# Function to find the best matching student for the user's recommendations to be based off
def find_best_match(title, d, user_classes):
    
    # if there are users that have taken the class
    if (d[d['combined_features'].str.contains(str(title))].shape[0] != 0):
        
        # get the indices of the users who have taken the class
        similar_users = d[d['combined_features'].str.contains(str(title))]['index'].values
        
        # if there is more than one similar user to consider
        if (len(similar_users > 1)):
            return find_most_similar(title, d, similar_users, user_classes)
        
        else:
            # return the only similar user
            return similar_users[0]
        
    # no similar user was found, recommend random user
    return np.random.choice(list(d['index']), 1, replace=False)[0]

In [9]:
# Helper function for find_best_match
# Returns the most similar student to the user when there are multiple options
def find_most_similar(courseTitle, d, users, user_clsses):
    
    # the most similar user will have the max amount of classes in common
    max_classes_common = 0
    
    # default set the most similar user to first option
    most_similar_user = users[0]
    
    # loop thru similar users
    for user_idx in users:
        
        # set their current classes in common count to 0
        cur_common_count = 0
        
        # loop thru the CCR user's classes
        for u_class in user_clsses:
            
            # if the similar user has taken that class, increment their classes in common count
            if d[d['index'] == user_idx]['combined_features'].str.contains(str(u_class)).any():
                cur_common_count += 1
                
        # if a new max classes in common count has been found
        if cur_common_count > max_classes_common:
            
            # update max classes in common seen
            max_classes_common = cur_common_count
            
            # update the most similar user to be the current one
            most_similar_user = user_idx
            
    return most_similar_user

In [10]:
# Helper funtion that provides a check that CCR is not recommending classes that
# are lower in number than what the user has already taken
# i.e. data8 is not a valid rec for a user who has taken data 100
def numerical_check(recc, u_classes):
    
    # split the rec into code and number
    rec_arr = re.split('(\d+)', recc)
    
    # loop thru the user_classes,
    for c in u_classes:
        
        # split each individually
        c_arr = re.split('(\d+)', c)
        
        # if code is same & number is less, return false that it is not a valid match
        if (c_arr[0] == rec_arr[0]):
            if (int(c_arr[1]) >= int(rec_arr[1])):
                return False
            
    # else continue looping to be sure the user did not take a diff class 
    # that makes the rec invalid
    return True

# re.split('(\d+)', 'data100') ---> ['data', '100', '']

In [11]:
# Funtion to turn the similarity scores to a more user intuitive star system
def star_rating(ss):
    stars = np.round(ss * 5, 2)
    strng = str(stars) + " stars / 5 stars"
    return strng

In [12]:
# The CCR System
def ccr_recommendation(user_classes, major='Data Science'):
    
    # Cleaning user input
    user_classes = [x.replace(" ", "") for x in user_classes]
    user_classes = [x.lower() for x in user_classes]
    
    # list to keep track of classes that have already been recommended to the user to avoid duplicates
    used_rec = []
    
    # filter the possible users to those of the same study
    same_majors_df = df[df['Major'].str.contains(major)]
    
    # loop thru the users classes to provide a recommendation for all
    for c in user_classes:
        rec_found = False
        
        # retrieve the unique id of the most similar user
        best_user_index = find_best_match(c, same_majors_df, user_classes)
        
        # One-Hot Encoding
        cv = CountVectorizer()
        count_matrix = cv.fit_transform(same_majors_df['combined_features']) 

        # Find similarities among users
        cosine_sim = cosine_similarity(count_matrix)

        # Finding Similar Classes
        similar_users =  list(enumerate(cosine_sim[best_user_index]))
        sorted_similar_users = sorted(similar_users,key=lambda x:x[1],reverse=True)[1:]
        
        
        # while we have not found a rec yet
        while(rec_found == False):
            
            # looping through sorted similar users
            for option in sorted_similar_users:
                
                # grab the similar user's recs
                    top_recs = df[df['index'] == option[0]]['Recommendations'].get(option[0])
                    
                    # loop through the recs
                    for rec in top_recs:
                        
                        # check that rec hasn't been taken by the user and hasn't been recommended already
                        if ((rec not in user_classes) and (rec not in used_rec)):
                            if (numerical_check(rec, user_classes)):
                                
                                # add rec to the used list
                                used_rec.append(rec)
                                rec_found = True;
                                break;
                                
                    # reached if provided recs have been used or taken
                    # continue to the next most similar user
                    if (rec_found == False):
                        continue;
                    else: # rec was found, exit to next rec based on a different class
                        break;
            break;
    return used_rec

In [13]:
# The CCR System with ad hoc printing
def ccr_recommendation_with_print(user_classes, major='Data Science'):
    
    # Cleaning user input
    user_classes = [x.replace(" ", "") for x in user_classes]
    user_classes = [x.lower() for x in user_classes]
    
    # list to keep track of classes that have already been recommended to the user to avoid duplicates
    used_rec = []
    
    # filter the possible users to those of the same study
    same_majors_df = df[df['Major'].str.contains(major)]
    
    print("CCR's Top Recommendations after taking "+str(user_classes)+" are:\n")
    
    # loop thru the users classes to provide a recommendation for all
    for c in user_classes:
        rec_found = False
        
        # retrieve the unique id of the most similar user
        best_user_index = find_best_match(c, same_majors_df, user_classes)
        
        # One-Hot Encoding
        cv = CountVectorizer()
        count_matrix = cv.fit_transform(same_majors_df['combined_features']) 

        # Find similarities among users
        cosine_sim = cosine_similarity(count_matrix)

        # Finding Similar Classes
        similar_users =  list(enumerate(cosine_sim[best_user_index]))
        sorted_similar_users = sorted(similar_users,key=lambda x:x[1],reverse=True)[1:]
        
        
        # while we have not found a rec yet
        while(rec_found == False):
            
            # looping through sorted similar users
            for option in sorted_similar_users:
                
                # grab the similar user's recs
                    top_recs = df[df['index'] == option[0]]['Recommendations'].get(option[0])
                    
                    # loop through the recs
                    for rec in top_recs:
                        
                        # check that rec hasn't been taken by the user and hasn't been recommended already
                        if ((rec not in user_classes) and (rec not in used_rec)):
                            if (numerical_check(rec, user_classes)):
                                
                                # add rec to the used list
                                used_rec.append(rec)
                                print("Recommendation based on " + str(c) + ":\n\t\t\t\t", 
                                      rec, "\t", star_rating(option[1]))
                                rec_found = True;
                                break;
                    
                    # reached if provided recs have been used or taken
                    # continue to the next most similar user
                    if (rec_found == False):
                        continue;
                    else: # rec was found, exit to next rec based on a different class
                        break;
            break;
    return used_rec

### Example: What does CCR recommend after taking Data 8, CS 61B, & Econ 140?

In [25]:
example_user_classes = ['Data 8', 'CS 61B', 'Econ 140']
example_major = 'Data Science'
examples_recs = ccr_recommendation_with_print(example_user_classes, example_major)

CCR's Top Recommendations after taking ['data8', 'cs61b', 'econ140'] are:

Recommendation based on data8:
				 data100 	 3.13 stars / 5 stars
Recommendation based on cs61b:
				 stat33b 	 3.87 stars / 5 stars
Recommendation based on econ140:
				 stat133 	 3.75 stars / 5 stars


In [15]:
examples_recs

['data100', 'stat33b', 'stat133']

### Example: What does CCR recommend after taking Data 100, Stat 134, & Ind Eng 135?

In [16]:
example_user_classes2 = ['Data 100', 'Stat 134', 'Ind Eng 135']
examples_recs2 = ccr_recommendation_with_print(example_user_classes2, example_major)

CCR's Top Recommendations after taking ['data100', 'stat134', 'indeng135'] are:

Recommendation based on data100:
				 stat150 	 3.75 stars / 5 stars
Recommendation based on stat134:
				 econ140 	 3.75 stars / 5 stars
Recommendation based on indeng135:
				 chicano161 	 2.58 stars / 5 stars
