## Import Libraries and Read Data:

In [1]:
import numpy as np
import pandas as pd
import pickle
import sqlite3
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KDTree

In [2]:
# Load the four raw tables from ./data_files (relative to the notebook's working directory).
# Assessment results: user_handle, assessment_tag, user_assessment_date, user_assessment_score.
user_scores = pd.read_csv("./data_files/user_assessment_scores.csv")
# Course views: user_handle, view_date, course_id, author_handle, level, view_time_seconds.
user_courses = pd.read_csv("./data_files/user_course_views.csv")
# Course metadata: course_id, course_tags.
courses = pd.read_csv("./data_files/course_tags.csv")
# Followed interests: user_handle, interest_tag, date_followed.
user_interests = pd.read_csv("./data_files/user_interests.csv")

### Checking unique values in each file:

In [3]:
# Distinct values per column of the assessment-scores table.
user_scores.nunique()

user_handle              3114
assessment_tag             54
user_assessment_date     6570
user_assessment_score     282
dtype: int64

In [4]:
# Distinct values per column of the course-views table.
user_courses.nunique()

user_handle           8760
view_date              212
course_id             5942
author_handle         1412
level                    3
view_time_seconds    11631
dtype: int64

In [5]:
# Distinct values per column of the course-tags table.
courses.nunique()

course_id      5942
course_tags     998
dtype: int64

In [6]:
# Distinct values per column of the user-interests table.
user_interests.nunique()

user_handle      10000
interest_tag       748
date_followed    12869
dtype: int64

In [7]:
# Ordered label lists; a label's position in its list is its row/column
# position in the matrices built further down.
# Users are taken from user_interests, the table with the most distinct handles.
userhandle_main = list(user_interests['user_handle'].unique())
course_ids = list(user_courses['course_id'].unique())
assess_tags = list(user_scores['assessment_tag'].unique())
interest_tags = list(user_interests['interest_tag'].unique())

for label, values in (
    ("Total number of users: ", userhandle_main),
    ("Total number of courses: ", course_ids),
    ("Total number of assessment tags: ", assess_tags),
    ("Total number of interest tags: ", interest_tags),
):
    print(label + str(len(values)))

Total number of users: 10000
Total number of courses: 5942
Total number of assessment tags: 54
Total number of interest tags: 748


## Creating User-Item matrices:

### 1. For User_course_views:

In [8]:
# One row per user, one column per course id.
courses_mat = np.zeros((len(userhandle_main), len(course_ids)))

# Resolve each user handle / course id to its matrix position once, instead
# of scanning the label lists with list.index() for every view record.
user_row = {handle: row for row, handle in enumerate(userhandle_main)}
course_col = {course_id: col for col, course_id in enumerate(course_ids)}

# Values in course_views matrix are filled using view_time_seconds; several
# view records for the same (user, course) pair are summed.
# The columns are iterated directly rather than bound to a variable named
# `courses`, so the course_tags DataFrame loaded earlier is not overwritten.
for handle, course_id, seconds in zip(user_courses['user_handle'],
                                      user_courses['course_id'],
                                      user_courses['view_time_seconds']):
    courses_mat[user_row[handle], course_col[course_id]] += float(seconds)

courses_mat

array([[4884., 4959.,  102., ...,    0.,    0.,    0.],
       [   0.,    0.,    0., ...,    0.,    0.,    0.],
       [   0.,    0.,    0., ...,    0.,    0.,    0.],
       ...,
       [   0.,    0.,    0., ...,    0.,    0.,    0.],
       [   0.,    0.,    0., ...,    0.,    0.,    0.],
       [   0.,    0.,    0., ...,    0.,    0.,    0.]])

In [9]:
from sklearn.preprocessing import MinMaxScaler
# Rescale the view-time matrix into the [0, 1] range so it is comparable with
# the 0/1 interest and assessment indicators it is stacked with later.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(courses_mat)
courses_mat_scaled = scaler.transform(courses_mat)
courses_mat_scaled

array([[0.93923077, 1.        , 0.00369258, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

### 2. For user_interests:

In [10]:
# One row per user, one column per interest tag.
interests_mat = np.zeros((len(userhandle_main), len(interest_tags)))

# Resolve labels to matrix positions once, instead of scanning the label
# lists with list.index() for every record.
user_row = {handle: row for row, handle in enumerate(userhandle_main)}
interest_col = {tag: col for col, tag in enumerate(interest_tags)}

# Values in user_interests matrix are 1 where the user has a record for the
# tag in user_interests, and 0 otherwise.
for handle, tag in zip(user_interests['user_handle'],
                       user_interests['interest_tag']):
    interests_mat[user_row[handle], interest_col[tag]] = 1

interests_mat

array([[1., 1., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       ...,
       [1., 1., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### 3. For user_assessment_scores:

In [11]:
# One row per user, one column per assessment tag.
assess_mat = np.zeros((len(userhandle_main), len(assess_tags)))

# Resolve labels to matrix positions once, instead of scanning the label
# lists with list.index() for every record.
user_row = {handle: row for row, handle in enumerate(userhandle_main)}
assess_col = {tag: col for col, tag in enumerate(assess_tags)}

# Values in user_assessment matrix are 1 where the user has at least one
# assessment record for the tag, and 0 otherwise; the score is not used.
for handle, tag in zip(user_scores['user_handle'],
                       user_scores['assessment_tag']):
    assess_mat[user_row[handle], assess_col[tag]] = 1

assess_mat

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [12]:
# Column labels follow the same left-to-right order as the stacked blocks:
# courses, then interest tags, then assessment tags.
feature_labels = course_ids + interest_tags + assess_tags
final_mat = np.concatenate((courses_mat_scaled, interests_mat, assess_mat), axis=1)
final_df = pd.DataFrame(data=final_mat, index=userhandle_main, columns=feature_labels)

In [13]:
# Expect one row per user and one column per course id, interest tag and assessment tag.
final_df.shape

(10000, 6744)

## Finding Similarity using Euclidean distance and determining neighbors using K-D tree:

In [14]:
from sklearn.neighbors import KDTree

In [15]:
# Build a KD-tree over the user feature rows using Euclidean distance.
tree = KDTree(final_df, metric = 'euclidean')
# Round-trip the tree through pickle in memory; the queries below run on the
# restored copy (presumably to confirm the tree survives serialization).
s = pickle.dumps(tree)                     
tree_copy = pickle.loads(s)  

In [16]:
# Ask for the 6 nearest rows to the first row of final_df. The query row is
# itself part of the tree and comes back as the first hit, leaving 5 other users.
dist, index = tree_copy.query([final_df.iloc[0]], k=6)
print(index)

[[   0 6530 3556  328 1711 4390]]


In [17]:
# Neighbour row positions with the first entry (the query row) dropped.
# The result is only displayed here; `index` itself is left unchanged.
np.delete(index[0],0)

array([6530, 3556,  328, 1711, 4390])

In [18]:
# Course columns come first in final_df (courses, then interest tags, then
# assessment tags), so any column position below this bound is a course id.
n_course_cols = len(course_ids)

# One single-entry dict per returned row: the query row is keyed "Courses",
# every other neighbour is keyed by its row position in final_df.
similar_users = []
for i in index[0]:
    # Series.nonzero() no longer exists in current pandas, so locate the
    # non-zero features on the underlying NumPy array instead.
    nonzero_cols = np.flatnonzero(final_df.iloc[i].values)
    sim_courses = nonzero_cols[nonzero_cols < n_course_cols]
    key = "Courses" if i == index[0][0] else i
    similar_users.append({key: final_df.columns[sim_courses].values})
    

In [19]:
# Show the query user's viewed courses followed by those of each neighbour.
similar_users

[{'Courses': array(['cpt-sp2010-web-designers-branding-intro',
         'cpt-sp2010-web-designers-css',
         'aws-certified-solutions-architect-professional',
         'aws-certified-sysops-admin-associate',
         'aws-system-admin-fundamentals', 'react-js-getting-started'],
        dtype=object)},
 {6530: array(['angular-2-end-to-end'], dtype=object)},
 {3556: array(['npm-playbook', 'learning-programming-javascript'], dtype=object)},
 {328: array([], dtype=object)},
 {1711: array(['angular-2-getting-started-update'], dtype=object)},
 {4390: array(['react-js-getting-started', 'java-fundamentals-language',
         'python-understanding-machine-learning', 'rest-fundamentals',
         'typescript', 'python-getting-started',
         'understanding-machine-learning', 'html5-fundamentals',
         'what-is-programming', 'javascript-programming-basics',
         'ionic2-angular2-typescript-mobile-apps',
         'cloud-computing-fundamentals', 'android-start-developing',
         '

## Finding Similarity using Cosine similarity:

In [20]:
from sklearn.neighbors import NearestNeighbors

# Exhaustive nearest-neighbour search over the same user feature rows,
# this time ranked by cosine distance.
usersim_cosine = NearestNeighbors(
    metric='cosine',
    algorithm='brute',
    n_neighbors=6,
)
usersim_cosine.fit(final_df)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=6, p=2, radius=1.0)

In [22]:
# Query the neighbours of the first row of final_df; the number returned (6)
# comes from n_neighbors set when the model was constructed.
dist, ind = usersim_cosine.kneighbors([final_df.iloc[0]]) 
print(ind)

[[   0 3556 6530  328 1711 4390]]


### We can observe that both Euclidean distance and cosine similarity returned the same set of neighbouring users for this sample (although in a slightly different order).
However, this need not hold for every sample.

### Putting everything together:

In [5]:
import numpy as np
import pandas as pd
import pickle
import sqlite3
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KDTree

def _position_lookup(labels):
    """Return a dict mapping each label to its position in ``labels``."""
    return {label: position for position, label in enumerate(labels)}


def read_and_process():
    """Build the user feature matrix and persist it together with a KD-tree.

    Reads the assessment-score, course-view and interest CSV files from
    ``./data_files`` and builds one row per user handle found in the
    interests file, with three column groups in this order:

    * total ``view_time_seconds`` per course, min-max scaled to [0, 1];
    * a 0/1 flag per interest tag;
    * a 0/1 flag per assessment tag (the score itself is not used).

    The combined frame is written to ``finaldata.csv`` and a Euclidean
    ``KDTree`` built on it is pickled to ``user_similarity.pkl``, both in the
    current working directory.

    Returns:
        None. The results are the two files written.

    Raises:
        KeyError: if a course-view or assessment record refers to a user
            handle that does not appear in the interests file.
    """
    user_scores = pd.read_csv("./data_files/user_assessment_scores.csv")
    user_courses = pd.read_csv("./data_files/user_course_views.csv")
    user_interests = pd.read_csv("./data_files/user_interests.csv")

    # Ordered label lists; a label's position is its row/column in the matrices.
    course_ids = list(user_courses.course_id.unique())
    interest_tags = list(user_interests.interest_tag.unique())
    userhandle_main = list(user_interests.user_handle.unique())
    assess_tags = list(user_scores.assessment_tag.unique())

    # Resolve labels through dicts built once, rather than scanning the
    # label lists with list.index() for every record.
    user_row = _position_lookup(userhandle_main)
    course_col = _position_lookup(course_ids)
    interest_col = _position_lookup(interest_tags)
    assess_col = _position_lookup(assess_tags)

    # Values in course_views matrix are filled using view_time_seconds;
    # several records for the same (user, course) pair are summed.
    courses_mat = np.zeros((len(userhandle_main), len(course_ids)))
    for handle, course_id, seconds in zip(user_courses['user_handle'],
                                          user_courses['course_id'],
                                          user_courses['view_time_seconds']):
        courses_mat[user_row[handle], course_col[course_id]] += float(seconds)

    scaler = MinMaxScaler(feature_range=(0, 1))
    courses_mat_scaled = scaler.fit_transform(courses_mat)

    # Values in user_interests matrix are 1 where the user has a record for
    # the tag, and 0 otherwise.
    interests_mat = np.zeros((len(userhandle_main), len(interest_tags)))
    for handle, tag in zip(user_interests['user_handle'],
                           user_interests['interest_tag']):
        interests_mat[user_row[handle], interest_col[tag]] = 1

    # Values in user_assessment matrix are 1 where the user has at least one
    # assessment record for the tag, and 0 otherwise.
    assess_mat = np.zeros((len(userhandle_main), len(assess_tags)))
    for handle, tag in zip(user_scores['user_handle'],
                           user_scores['assessment_tag']):
        assess_mat[user_row[handle], assess_col[tag]] = 1

    final_mat = np.hstack((courses_mat_scaled, interests_mat, assess_mat))
    final_df = pd.DataFrame(final_mat, index = userhandle_main, columns = course_ids + interest_tags + assess_tags)
    final_df.to_csv("finaldata.csv")

    tree = KDTree(final_df, metric = 'euclidean')
    # Context manager so the pickle file is flushed and closed deterministically.
    with open("user_similarity.pkl", "wb") as tree_file:
        pickle.dump(tree, tree_file)


In [4]:
# Run the full pipeline; writes finaldata.csv and user_similarity.pkl to the working directory.
read_and_process()