# Data Processing


Data Cleaning / reducing the data set
Data set has 3 or more columns needed columsn are userId courseId and rating (mandatory)

In [5]:
from __future__ import print_function, division
from builtins import range, input

In [6]:
#all imports

import pandas as pd
import pickle
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

In [7]:
df = pd.read_csv('C:/Users/Acer/Desktop/Hackathon_Project/rating.csv')

In [8]:
df.head()



Unnamed: 0,userId,courseId,rating,timestamp
0,1,2,3.5,02-04-2005 23:53
1,1,29,3.5,02-04-2005 23:31
2,1,32,3.5,02-04-2005 23:33
3,1,47,3.5,02-04-2005 23:32
4,1,50,3.5,02-04-2005 23:29


In [9]:
#Unique course ids
unique_course_ids = set(df.courseId.values)
course2idx = {}
count = 0
for course_id in unique_course_ids:
  course2idx[course_id] = count
  count += 1

In [10]:
#adding to dataframe
df['course_idx'] = df.apply(lambda row: course2idx[row.courseId], axis=1)

In [9]:
df.to_csv('C:/Users/Acer/Desktop/Hackathon_Project/edited_rating.csv', index=False)

In [10]:
#checking the length

df = pd.read_csv('C:/Users/Acer/Desktop/Hackathon_Project/edited_rating.csv')
print("original dataframe size:", len(df))

original dataframe size: 1048575


In [11]:
N = df.userId.max() + 1 # number of users
M = df.course_idx.max() + 1 # number of courses

In [16]:
user_ids_count = Counter(df.userId)
course_ids_count = Counter(df.course_idx)


In [17]:
# number of users and courses we would like to keep
n = 10000
m = 2000

In [18]:
user_ids = [u for u, c in user_ids_count.most_common(n)]
course_ids = [m for m, c in course_ids_count.most_common(m)]

In [19]:
# make a copy, otherwise ids won't be overwritten
df_small = df[df.userId.isin(user_ids) & df.course_idx.isin(course_ids)].copy()

In [20]:
# need to remake user ids and course ids since they are no longer sequential
new_user_id_map = {}
i = 0
for old in user_ids:
  new_user_id_map[old] = i
  i += 1
print("i:", i)

new_course_id_map = {}
j = 0
for old in course_ids:
  new_course_id_map[old] = j
  j += 1
print("j:", j)

print("Setting new ids")

i: 7120
j: 2000
Setting new ids


In [21]:
df_small.loc[:, 'userId'] = df_small.apply(lambda row: new_user_id_map[row.userId], axis=1)
df_small.loc[:, 'course_idx'] = df_small.apply(lambda row: new_course_id_map[row.course_idx], axis=1)

print("max user id:", df_small.userId.max())
print("max course id:", df_small.course_idx.max())

max user id: 7119
max course id: 1999


In [22]:
print("small dataframe size:", len(df_small))
df_small.to_csv('C:/Users/Acer/Desktop/Hackathon_Project/final_rating.csv', index=False)

small dataframe size: 849323


In [23]:
#loading the file above

df = pd.read_csv('C:/Users/Acer/Desktop/Hackathon_Project/final_rating.csv')

N = df.userId.max() + 1 # number of users
M = df.course_idx.max() + 1 # number of courses

In [26]:
# split into train and test
df = shuffle(df)
cutoff = int(0.8*len(df))
df_train = df.iloc[:cutoff]
df_test = df.iloc[cutoff:]

In [28]:
# a dictionary to tell us which users have rated which courses
user2course = {}
# a dicationary to tell us which courses have been rated by which users
course2user = {}
# a dictionary to look up ratings
usercourse2rating = {}

In [29]:
print("Calling: update_user2course_and_course2user")
count = 0
def update_user2course_and_course2user(row):
  global count
  count += 1
  if count % 100000 == 0:
    print("processed: %.3f" % (float(count)/cutoff))

  i = int(row.userId)
  j = int(row.course_idx)
  if i not in user2course:
    user2course[i] = [j]
  else:
    user2course[i].append(j)

  if j not in course2user:
    course2user[j] = [i]
  else:
    course2user[j].append(i)

  usercourse2rating[(i,j)] = row.rating

Calling: update_user2course_and_course2user


In [30]:
df_train.apply(update_user2course_and_course2user, axis=1)

processed: 0.147
processed: 0.294
processed: 0.442
processed: 0.589
processed: 0.736
processed: 0.883


390789    None
221571    None
100678    None
537482    None
501231    None
151717    None
768109    None
515538    None
609081    None
252348    None
328582    None
828859    None
762205    None
810809    None
623162    None
437027    None
609025    None
255437    None
800666    None
789403    None
478679    None
96142     None
186673    None
301602    None
803112    None
105422    None
226727    None
209795    None
154004    None
568066    None
          ... 
563834    None
562760    None
848058    None
201115    None
155502    None
72768     None
604072    None
191237    None
14635     None
27923     None
552980    None
351385    None
718414    None
449317    None
825512    None
339185    None
760266    None
98487     None
746839    None
389653    None
437352    None
60938     None
229461    None
766992    None
69930     None
835049    None
418684    None
615624    None
113974    None
602277    None
Length: 679458, dtype: object

In [31]:
#Using pickle for json

# test ratings dictionary
usercourse2rating_test = {}
print("Calling: update_usercourse2rating_test")
count = 0
def update_usercourse2rating_test(row):
  global count
  count += 1
  if count % 100000 == 0:
    print("processed: %.3f" % (float(count)/len(df_test)))

  i = int(row.userId)
  j = int(row.course_idx)
  usercourse2rating_test[(i,j)] = row.rating
df_test.apply(update_usercourse2rating_test, axis=1)

# note: these are not really JSONs
with open('user2course.json', 'wb') as f:
  pickle.dump(user2course, f)

with open('course2user.json', 'wb') as f:
  pickle.dump(course2user, f)

with open('usercourse2rating.json', 'wb') as f:
  pickle.dump(usercourse2rating, f)

with open('usercourse2rating_test.json', 'wb') as f:
  pickle.dump(usercourse2rating_test, f)

Calling: update_usercourse2rating_test
processed: 0.589


# User User Code

In [1]:
from datetime import datetime
from sortedcontainers import SortedList

In [13]:
import sys
import os
os.getcwd()

'C:\\Users\\Acer\\Desktop\\final hackathon\\Jupyter\\files'

In [14]:
import os
if not os.path.exists('user2course.json') or \
   not os.path.exists('course2user.json') or \
   not os.path.exists('usercourse2rating.json') or \
   not os.path.exists('usercourse2rating_test.json'):
    print("File not found")



with open('user2course.json', 'rb') as f:
  user2course = pickle.load(f)

with open('course2user.json', 'rb') as f:
  course2user = pickle.load(f)

with open('usercourse2rating.json', 'rb') as f:
  usercourse2rating = pickle.load(f)

with open('usercourse2rating_test.json', 'rb') as f:
  usercourse2rating_test = pickle.load(f)


In [16]:
N = np.max(list(user2course.keys())) + 1
# the test set may contain course the train set doesn't have data on
m1 = np.max(list(course2user.keys()))
m2 = np.max([m for (u, m), r in usercourse2rating_test.items()])
M = max(m1, m2) + 1
print("N:", N, "M:", M)

#if N > 10000:
 # print("N =", N, "are you sure you want to continue?")
  #print("Comment out these lines if so...")
  #exit()

N: 7120 M: 2000


In [None]:
#Neghbour and course handling

K = 25 # number of neighbors we'd like to consider
limit = 5 # number of common courses users must have in common in order to consider
neighbors = [] # store neighbors in this list
averages = [] # each user's average rating for later use
deviations = [] # each user's deviation for later use
for i in range(N):
  # find the 25 closest users to user i
  courses_i = user2course[i]
  courses_i_set = set(courses_i)

  # calculate avg and deviation
  ratings_i = { course:usercourse2rating[(i, course)] for course in courses_i }
  avg_i = np.mean(list(ratings_i.values()))
  dev_i = { course:(rating - avg_i) for course, rating in ratings_i.items() }
  dev_i_values = np.array(list(dev_i.values()))
  sigma_i = np.sqrt(dev_i_values.dot(dev_i_values))

  # save these for later use
  averages.append(avg_i)
  deviations.append(dev_i)

  sl = SortedList()
  for j in range(N):
    # don't include yourself
    if j != i:
      courses_j = user2course[j]
      courses_j_set = set(courses_j)
      common_courses = (courses_i_set & courses_j_set) # intersection
      if len(common_courses) > limit:
        # calculate avg and deviation
        ratings_j = { course:usercourse2rating[(j, course)] for course in courses_j }
        avg_j = np.mean(list(ratings_j.values()))
        dev_j = { course:(rating - avg_j) for course, rating in ratings_j.items() }
        dev_j_values = np.array(list(dev_j.values()))
        sigma_j = np.sqrt(dev_j_values.dot(dev_j_values))

        # calculate correlation coefficient
        numerator = sum(dev_i[m]*dev_j[m] for m in common_courses)
        w_ij = numerator / (sigma_i * sigma_j)

        # insert into sorted list and truncate
        # negate weight, because list is sorted ascending
        # maximum value (1) is "closest"
        sl.add((-w_ij, j))
        if len(sl) > K:
          del sl[-1]

  # store the neighbors
  neighbors.append(sl)

  # print out useful things
 # if i % 1 == 0:
  #  print(i)
