In [3]:
# Imports 
import pandas as pd
from collections import defaultdict
from surprise import KNNBasic
from surprise import Dataset
from surprise import Reader
from sklearn import preprocessing
import pymysql
import numpy as np

In [4]:
# Reading Data from csv files
courseTagsData = pd.read_csv('data_files_ml_engineer/course_tags.csv')
userAssessmentData = pd.read_csv('data_files_ml_engineer/user_assessment_scores.csv')
userInterestsData = pd.read_csv('data_files_ml_engineer/user_interests.csv')
courseViewData = pd.read_csv('data_files_ml_engineer/user_course_views.csv')

In [5]:
# This method takes Trainset and returns similarity matrix based on Pearson Similarity 
def calculateViewSimilarity(trainSet):
    sim_options = {'name': 'pearson',
                   'user_based': True
                   }
 
    model = KNNBasic(sim_options=sim_options)
    model.fit(trainSet)
    simsMatrix = model.compute_similarities()
    return simsMatrix

In [6]:
# Checking the calculateViewSimilarity  method
course = courseViewData[['user_handle', 'course_id', 'view_time_seconds']]
reader = Reader()
data = Dataset.load_from_df(course[['user_handle', 'course_id', 'view_time_seconds']], reader)
trainSet = data.build_full_trainset()


In [7]:
# Testing the method 
simsMatrix = calculateViewSimilarity(trainSet)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.


In [8]:
# To sort users based on their score. Using heap for sorting
import heapq

test_ID = 1
testUserInnerID = trainSet.to_inner_uid(test_ID)
similarityRow = simsMatrix[testUserInnerID]
similarUsers = []
for innerID, score in enumerate(similarityRow):
    if(innerID != testUserInnerID):
        similarUsers.append((innerID,score))
kNeighbors = heapq.nlargest(10000, similarUsers, key=lambda t: t[1])

In [9]:
# Changing tags to the numbers suing Label Encoder
from sklearn import preprocessing
courseEncoder = preprocessing.LabelEncoder() 
courseEncoder.fit(courseTagsData['course_id'])
courseTagsData['course_id'] = courseEncoder.transform(courseTagsData['course_id'])

In [10]:
userInterestsData = userInterestsData.drop(['date_followed'],axis=1)

In [11]:
# Changing tags to the numbers suing Label Encoder
interestEncoder = preprocessing.LabelEncoder()
interestEncoder.fit(userInterestsData['interest_tag'])

userInterestsData['interest_tag'] = interestEncoder.transform(userInterestsData['interest_tag'])
userAssessmentData['user_assessment_score'] = userAssessmentData['user_assessment_score']/298


userAssessmentData = userAssessmentData.drop(['user_assessment_date'],axis=1)


arrUsers = courseViewData['user_handle'].unique()


In [12]:
tagEncoder = preprocessing.LabelEncoder()
tagEncoder.fit(userAssessmentData['assessment_tag'])

LabelEncoder()

In [13]:
userAssessmentData['assessment_tag'] = tagEncoder.transform(userAssessmentData['assessment_tag'])

In [14]:
# SParse matrix for interest and assesment data,
# total users = 10000,
interest_matrix = np.zeros((10000, len(userInterestsData['interest_tag'].unique()))),
assessment_matrix = np.zeros((10000,len(userAssessmentData['assessment_tag'].unique())))


In [15]:
userAssessmentData.head(2)

Unnamed: 0,user_handle,assessment_tag,user_assessment_score
0,7487,6,0.449664
1,7487,15,0.127517


In [16]:
userInterestsData.head(2)

Unnamed: 0,user_handle,interest_tag
0,1,423
1,1,425


In [17]:
# Creating a dictionary for each user and his tags
interest_dict = {}
for row in userInterestsData.itertuples():
    user_id = row[1]
    user_tag = row[2]
    try:
        interest_dict[user_id].append(user_tag)
    except KeyError:
        interest_dict[user_id] = [user_tag]

In [18]:
assessment_dict = {}
for row in userAssessmentData.itertuples():
    user_id = row[1]
    user_tag = row[2]
    user_score = row[3]
    user_tuple = (user_tag, user_score)
    try:
        assessment_dict[user_id].append(user_tuple)
    except KeyError:
        assessment_dict[user_id] = [user_tuple]

In [19]:
# SParse matrix for interest and assesment data,
# total users = 10000,
interest_matrix = np.zeros((10000, len(userInterestsData['interest_tag'].unique())))
assessment_matrix = np.zeros((10000,len(userAssessmentData['assessment_tag'].unique())))

for user_handle, tags in interest_dict.items():
    for tag in tags:
        interest_matrix[user_handle-1][tag] = 1
        
for user_handle, assessments in assessment_dict.items():
    for (assessment_tag_id, score) in assessments:
        assessment_matrix[user_handle-1][assessment_tag_id] = score


In [31]:
# Converting matrix to sparse for future use
import scipy.sparse as sp

interest = sp.csc_matrix(interest_matrix)
assessment = sp.csc_matrix(assessment_matrix)

In [20]:

import sklearn.preprocessing as pp
def cosine_similarities(mat):
    col_normed_mat = pp.normalize(mat.tocsc(), axis=0)
    return  col_normed_mat * col_normed_mat.T

In [37]:
# Finding similarities between the users based on tags using cosine similarity
interest_cosine = cosine_similarities(interest)
assessment_cosine = cosine_similarities(assessment)


assessment_mat = assessment_cosine.toarray()
interest_mat = interest_cosine.toarray()


In [73]:
def knnScores(rawID):
    listUsers = []
    listScore = []
    similarUsers = []
    try:
        UserInnerID = trainSet.to_inner_uid(rawID)
        similarityRow = simsMatrix[UserInnerID]
        for innerID, score in enumerate(similarityRow):
            if((innerID != UserInnerID) and score >0):
                listUsers.append(trainSet.to_raw_uid(innerID))
                listScore.append(score)
    except:
        print("Error in key so returning empty list")
    return listUsers,listScore

In [61]:
interest_mat.shape

(10000, 10000)

In [62]:
assessment_mat.shape

(10000, 10000)

In [63]:
len(courseViewData['user_handle'].unique())

8760

In [228]:
# MySQL database connection 
def connectDatabase():
    host="mysqlinstance.cjm8qag6rwgx.us-east-1.rds.amazonaws.com"
    port=3306
    dbname="pluralsight"
    user ="mydb"
    pwd = "9542582841"
    conn = pymysql.connect(host, user=user,port=port,
                           passwd=pwd, db=dbname)
    return conn

In [None]:
# Calculating the similarity from views, interest and assessment and storing them in MySQL RDS
conn = connectDatabase()

sql = "INSERT INTO similarityScores (user_id,other_id,score) VALUES (%s,%s,%s)"
for i in range(1,10001):
    #print(i)
    listViewUsers,listScores = knnScores(i)
    listAssessmentUsers = assessmentNonZero(i-1)
    listInterestUsers = interestNonZero(i-1)
    finalList = set(listViewUsers +listAssessmentUsers+listInterestUsers)
    data = []
    for j in finalList:
        score = 0
        score += assessment_mat[i-1][j-1]
        score += interest_mat[i-1][j-1]
        if j in listViewUsers:
            score += listScores[listViewUsers.index(j)]
        score = score/3
        if i != j :
            data.append((i,j,float(score)))
    curObj = conn.cursor()
    curObj.executemany(sql,data)
    conn.commit()
    curObj.close()
conn.close()

In [210]:
def assessmentNonZero(index):
    arr = np.nonzero(assessment_mat[index])
    listMat=[]
    for i in arr:
        listMat.append(i.tolist())
    resList = [i+1 for i in listMat[0]]
    return resList

In [212]:
def interestNonZero(index):
    arr = np.nonzero(interest_mat[index])
    listMat=[]
    for i in arr:
        listMat.append(i.tolist())
    resList = [i+1 for i in listMat[0]]
    return resList

In [111]:
assessment_mat[5000][25]

0.0005056407920374583

In [144]:
len(listMat[0])

612

In [197]:
courseViewData['user_handle'].unique()

array([    1,     2,     3, ...,  9998,  9999, 10000])