In [1]:
!pip install surprise

import heapq
from collections import defaultdict
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from scipy.spatial.distance import cosine

from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
rating_df = pd.read_csv('courserating.csv')

In [3]:
rating_df

Unnamed: 0.1,Unnamed: 0,SQL,Spatial,PA1,DM in R,Python,Forecast,R Prog,Hadoop,Regression
0,LN,4.0,,,,3.0,2.0,4.0,,2.0
1,MH,3.0,4.0,,,4.0,,,,
2,JH,2.0,2.0,,,,,,,
3,EN,4.0,,,4.0,,,4.0,,3.0
4,DU,4.0,4.0,,,,,,,
5,FL,,4.0,,,,,,,
6,GL,,4.0,,,,,,,
7,AH,,3.0,,,,,,,
8,SA,,,4.0,,,,,,
9,RW,,,2.0,,,,,4.0,


In [4]:
# Q1 Upon visual examination, it seems that users LN and DS are most similar to EN.

In [5]:
rating_df.rename(columns={rating_df.columns[0]: 'Customer'}, inplace=True)

In [6]:
rating_df.set_index('Customer', inplace=True)

In [7]:
rating_df

Unnamed: 0_level_0,SQL,Spatial,PA1,DM in R,Python,Forecast,R Prog,Hadoop,Regression
Customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
LN,4.0,,,,3.0,2.0,4.0,,2.0
MH,3.0,4.0,,,4.0,,,,
JH,2.0,2.0,,,,,,,
EN,4.0,,,4.0,,,4.0,,3.0
DU,4.0,4.0,,,,,,,
FL,,4.0,,,,,,,
GL,,4.0,,,,,,,
AH,,3.0,,,,,,,
SA,,,4.0,,,,,,
RW,,,2.0,,,,,4.0,


In [8]:
def cosine_similarity_NA(data):
    m = data.shape[0]
    result = np.full((m, m), np.nan)
    for i in range(m):
        maski = ~np.isnan(data.iloc[i])
        for j in range(i, m):
            maskij = maski & ~np.isnan(data.iloc[j])
            if np.any(maskij):
                result[i, j] = 1 - cosine(data.iloc[i][maskij], data.iloc[j][maskij])
                result[j, i] = result[i, j]
    return pd.DataFrame(result, columns=data.index, index=data.index)

In [9]:
def commonRating_NA(data):
    m = data.shape[0]
    result = np.zeros((m,m))
    for i in range(m):
        maski = ~np.isnan(data.iloc[i])
        for j in range(i, m):
            maskij = maski & ~np.isnan(data.iloc[j])
            if np.any(maskij):
                result[i, j] = np.sum(maskij)
                result[j, i] = result[i, j]
    return pd.DataFrame(result, columns=data.index, index=data.index)

In [10]:
cosine_similarity_NA(rating_df)

Customer,LN,MH,JH,EN,DU,FL,GL,AH,SA,RW,BA,MG,AF,KG,DS
Customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
LN,1.0,0.96,1.0,0.9891,1.0,,,,,,,1.0,,,1.0
MH,0.96,1.0,0.989949,1.0,0.989949,1.0,1.0,1.0,,,,,,,1.0
JH,1.0,0.989949,1.0,1.0,1.0,1.0,1.0,1.0,,,,,,,1.0
EN,0.9891,1.0,1.0,1.0,1.0,,,,,,,,,,0.96225
DU,1.0,0.989949,1.0,1.0,1.0,1.0,1.0,1.0,,,,,,,1.0
FL,,1.0,1.0,,1.0,1.0,1.0,1.0,,,,,,,
GL,,1.0,1.0,,1.0,1.0,1.0,1.0,,,,,,,
AH,,1.0,1.0,,1.0,1.0,1.0,1.0,,,,,,,
SA,,,,,,,,,1.0,1.0,1.0,1.0,1.0,1.0,
RW,,,,,,,,,1.0,1.0,1.0,1.0,1.0,1.0,


In [11]:
# Q3 By cosine similarity alone, it would seem that users MH, JH, and DU are the most similar to EN.

In [21]:
# Q4 Just from a simple observation, it could be useful to find correlations between courses SQL/Spatial and R Prog/Regression
# From this observation, I would also recommend the Spatial and and Python courses to EN.

In [15]:
cosine_similarity_NA(rating_df.transpose())

Unnamed: 0,SQL,Spatial,PA1,DM in R,Python,Forecast,R Prog,Hadoop,Regression
SQL,1.0,0.990375,,0.948683,0.96,1.0,1.0,,0.980581
Spatial,0.990375,1.0,,,1.0,,,,
PA1,,,1.0,,,1.0,,1.0,
DM in R,0.948683,,,1.0,,,0.948683,,1.0
Python,0.96,1.0,,,1.0,1.0,1.0,,1.0
Forecast,1.0,,1.0,,1.0,1.0,1.0,,1.0
R Prog,1.0,,,0.948683,1.0,1.0,1.0,,0.980581
Hadoop,,,1.0,,,,,1.0,
Regression,0.980581,,,1.0,1.0,1.0,0.980581,,1.0


In [12]:
ratings = []
for customer, row in rating_df.iterrows():
    for course, value in row.iteritems():
        if np.isnan(value): continue
        ratings.append([customer, course, value])
ratings = pd.DataFrame(ratings, columns=['customer', 'course', 'rating'])

In [13]:
ratings

Unnamed: 0,customer,course,rating
0,LN,SQL,4.0
1,LN,Python,3.0
2,LN,Forecast,2.0
3,LN,R Prog,4.0
4,LN,Regression,2.0
5,MH,SQL,3.0
6,MH,Spatial,4.0
7,MH,Python,4.0
8,JH,SQL,2.0
9,JH,Spatial,2.0


In [17]:
commonRating_NA(rating_df)

Customer,LN,MH,JH,EN,DU,FL,GL,AH,SA,RW,BA,MG,AF,KG,DS
Customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
LN,5.0,2.0,1.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0
MH,2.0,3.0,2.0,1.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
JH,1.0,2.0,2.0,1.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
EN,3.0,1.0,1.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
DU,1.0,2.0,2.0,1.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
FL,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GL,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AH,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
RW,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0,1.0,1.0,1.0,0.0


In [22]:
rating_df.corr()

Unnamed: 0,SQL,Spatial,PA1,DM in R,Python,Forecast,R Prog,Hadoop,Regression
SQL,1.0,0.866025,,,-1.0,,,,
Spatial,0.866025,1.0,,,,,,,
PA1,,,1.0,,,,,,
DM in R,,,,1.0,,,,,
Python,-1.0,,,,1.0,,,,
Forecast,,,,,,1.0,,,
R Prog,,,,,,,,,
Hadoop,,,,,,,,,
Regression,,,,,,,,,1.0


In [24]:
# Q4 After calculating the correlation between SQL and Spatial using the formula given in the book, I found the correlation to be .8660
# Correlation between R Prog and Regression courses could not be found due to the formula resulting in division by zero. 
# This is due to the fact that all ratings of R Prog were equal to the mean. When subtracting the mean rating from any one rating, the result was zero.
# Both of these conclusions are checked against the df.corr() function.

In [25]:
reader = Reader(rating_scale=(1,5))

In [26]:
data = Dataset.load_from_df(ratings, reader)

In [27]:
trainset, testset = train_test_split(data, test_size = .25, random_state=1)

In [34]:
sim_options = {'name': 'cosine','user_based':'False'}
algo = KNNBasic(sim_options=sim_options)
algo.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7f2506c3a7d0>

In [35]:
predictions = []
for user in rating_df.index:
    predictions.append([algo.predict(user, course).est for course in rating_df])
predictions = pd.DataFrame(predictions, columns=rating_df.columns)

In [36]:
predictions

Unnamed: 0,SQL,Spatial,PA1,DM in R,Python,Forecast,R Prog,Hadoop,Regression
0,3.502446,3.324324,4.0,2.99452,3.489796,3.0,4.0,3.433333,2.49726
1,3.498325,3.501681,3.433333,3.0,3.510204,2.0,4.0,3.433333,2.510204
2,3.500839,3.499161,3.433333,3.0,3.497475,2.0,4.0,3.433333,2.5
3,3.495913,3.333333,3.433333,3.019238,3.50274,2.0,4.0,3.433333,2.50274
4,3.500839,3.499161,3.433333,3.0,3.497475,2.0,4.0,3.433333,2.5
5,3.0,3.5,3.433333,3.433333,4.0,3.433333,3.433333,3.433333,3.433333
6,3.0,3.5,3.433333,3.433333,4.0,3.433333,3.433333,3.433333,3.433333
7,3.0,3.5,3.433333,3.433333,4.0,3.433333,3.433333,3.433333,3.433333
8,3.433333,3.433333,3.5,3.433333,3.433333,4.0,3.433333,4.0,3.433333
9,3.433333,3.433333,3.5,3.433333,3.433333,4.0,3.433333,4.0,3.433333


In [37]:
trainset = data.build_full_trainset()

In [38]:
sim_options = {'name': 'cosine','user_based':'False'}
algo = KNNBasic(sim_options=sim_options)
algo.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7f2506c3aed0>

In [39]:
courses = rating_df.columns
for course in courses: 
    print(course, algo.predict('EN', course).est)

SQL 3.4959127717492273
Spatial 3.3333333333333335
PA1 3.433333333333333
DM in R 3.0192378864668408
Python 3.5027398082909524
Forecast 2.0
R Prog 4.0
Hadoop 3.433333333333333
Regression 2.502739808290953


In [40]:
# Q5 The model seems to suggest that Spatial, PA1, Python, and Hadoop would all be suitable courses for EN.
# I personally would recommend Spatial since it seems to be best supported by the similarity and correlation tests.