In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


# **Collaborative Filtering**
---

### Reading comma-separated values using pandas.

In [None]:
import pandas as pd
df = pd.read_csv("/content/drive/MyDrive/DATASET/TrainingRatings.txt", sep = ',', header = None,)
df_test = pd.read_csv("/content/drive/MyDrive/DATASET/TestingRatings.txt", sep = ',', header = None,)

### Encoding the user ids and movie ids so there are no missing rows and columns, its easy to form a matrix.

In [None]:
from sklearn.preprocessing import LabelEncoder
le_mov, le_usr = [LabelEncoder(),LabelEncoder()] 

df[0] = le_mov.fit_transform(df[0])
df_test[0] = le_mov.transform(df_test[0])
df[1] = le_usr.fit_transform(df[1])
df_test[1] = le_usr.transform(df_test[1])

### Grouping by user ids and get their average rating.

In [None]:
grouped_by_user_test = df_test.groupby(1, group_keys=True)
userid_keys_test = grouped_by_user_test.groups.keys()
temp = []
for uids in userid_keys_test:
  temp.append([uids, len(grouped_by_user_test.get_group(uids))]) 

grouped_by_user = df.groupby(1, group_keys=True)
userid_keys = grouped_by_user.groups.keys()
(len(set(df[0])),len(userid_keys))

(1821, 28978)

### Creating Average rating vector for each user id.

In [None]:
import numpy as np
avg_rating_user_vi = np.zeros((len(userid_keys),1), dtype = 'float32')
movie_id_userid_matrix_vij = np.zeros((len(set(df[0])),len(userid_keys),), dtype='float32')
for userid in userid_keys:
  avg_rating_user_vi[userid,0] = grouped_by_user.get_group(userid)[2].sum()/(len(grouped_by_user.get_group(userid)))
print(avg_rating_user_vi.shape)

(28978, 1)


### Creating user id x movie id matrix.

In [None]:
# import pickle
for index, row in df.iterrows():
  movie_id_userid_matrix_vij[int(row[0]),int(row[1])] = row[2]

# op_path = '/content/drive/MyDrive/DATASET/'
# p_out = open(op_path + 'train_data.pickle','wb')
# pickle.dump(movie_id_userid_matrix_vij, p_out)
# p_out.close()

In [None]:
# import pickle
# op_path = '/content/drive/MyDrive/DATASET/'
# movie_id_userid_matrix_vij = pickle.load(open(op_path + 'train_data.pickle','rb'))

### **Using KD Tree to get approximate 10000 neighbors** (users with similar ratings based on euclidean distance). 
### Takes about 60 minutes to finish.

In [None]:
from sklearn.neighbors import KDTree, BallTree
import sklearn.metrics.pairwise as smp
kdt_pearson = KDTree(np.transpose(movie_id_userid_matrix_vij), leaf_size=30, metric='euclidean')

In [None]:
from time import time
nearest_neighbors = [0]*30
distances = [0]*30

# op_path = '/content/drive/MyDrive/DATASET/'
for i in range(29):
  st = time()
  distances[i], nearest_neighbors[i] = kdt_pearson.query(np.transpose(movie_id_userid_matrix_vij)[1000*i:1000*i+1000,:], k=10000, return_distance=True)
  print(i,":",(time() - st)/60)
  # p_out = open(op_path + 'nearest_neighbors1.pickle','wb')
  # pickle.dump(nearest_neighbors, p_out)
  # p_out.close()
  # p_out = open(op_path + 'distances1.pickle','wb')
  # pickle.dump(distances, p_out)
  # p_out.close()


14 : 2.3744153022766112
15 : 2.4343807339668273
16 : 2.505748999118805
17 : 2.415492955843608
18 : 2.336666436990102
19 : 2.31311270793279
20 : 2.3396354635556538
21 : 2.490177396933238
22 : 2.3689947287241617
23 : 2.3420947154362994
24 : 2.3773436307907105
25 : 2.5081828037897744
26 : 2.3523050904273988
27 : 2.4273357431093854
28 : 2.3359150886535645


### Calculating weight matrix based on the pearson coefficient and cosine distance between the 100 nearest users found. Takes about 28~30 minutes.

In [None]:
# import pickle
# op_path = '/content/drive/MyDrive/DATASET/'
# distancs = pickle.load(open(op_path + 'distances.pickle','rb'))
# distancs1 = pickle.load(open(op_path + 'distances1.pickle','rb'))
# nearest_neighbors = distancs[:14]+distancs1[14:29]

In [None]:
import numpy as np
from time import time
from scipy import spatial
from scipy.stats import pearsonr  
from sklearn.metrics.pairwise import cosine_similarity
weights_corr = np.zeros((28978, 101, 1))
weights_cosign = np.zeros((28978, 101, 1))
vals = 0
st = time()
for part in nearest_neighbors:
  for nns in part:
    for i, nn in enumerate(nns[1:102]):
      weights_corr[nns[0],i] = (np.corrcoef(np.transpose(movie_id_userid_matrix_vij)[nn,:],np.transpose(movie_id_userid_matrix_vij)[nns[0],:]))[0,1]
      weights_cosign[nns[0], i] = spatial.distance.cosine(np.transpose(movie_id_userid_matrix_vij)[nn,:],np.transpose(movie_id_userid_matrix_vij)[nns[0],:])
      
      vals += 1
      if(vals > 100000):
        vals = 0
        print('Time Taken:', (time()-st)/60, "min")
        # p_out = open(op_path + 'bkp_corr.pickle','wb')
        # pickle.dump(weights_corr, p_out)
        # p_out.close()
        # p_out = open(op_path + 'bkp_cos.pickle','wb')
        # pickle.dump(weights_cosign, p_out)
        # p_out.close()
        st = time()

### Calculating normalizing factor K for correlation weights and cosine weights.

In [None]:
# p_out = open(op_path + 'bkp_cos.pickle','rb')
# weights_cosign = pickle.load(p_out)
# p_out.close()

# p_out = open(op_path + 'bkp_corr.pickle','rb')
# weights_corr = pickle.load(p_out)
# p_out.close()

In [None]:
norm_weights_corr = np.zeros((28978, 101, 1))
norm_weights_cosine = np.zeros((28978, 101, 1))
# weights_cosign = -(weights_cosign-1)
K_corr = 0
K_cos = 0
for i, wt_arr in enumerate(weights_corr):
  norm_weights_corr[i,:] = abs(wt_arr)/np.sum(abs(wt_arr))
  K_corr += np.sum(abs(wt_arr))

for i, wt_arr in enumerate(weights_cosign):
  norm_weights_cosine[i,:] = abs(wt_arr)/np.sum(abs(wt_arr))
  K_cos += np.sum(abs(wt_arr))

  norm_weights_corr[i,:] = abs(wt_arr)/np.sum(abs(wt_arr))


### Calculating the difference between a users movie rating and their average rating.

In [None]:
v_diff_matrix = np.zeros((len(set(df[0])),len(userid_keys),), dtype='float32')
for i in range(len(avg_rating_user_vi)):
  v_diff_matrix[:, i] = movie_id_userid_matrix_vij[:,i] - avg_rating_user_vi[i]

### **Predicting values for user id and movie id pairs in the test dataset and compare with actual rating.**
### 1.   Predicting using Correlation and Cosine weights.
### 2.   Finding accuracy based on exact value, 1.0 error, 1.5 error and binary classification just for reccommendation purposes.



In [None]:
import math
row_num = -1
t, f = [0,0,0,0], [0,0,0,0]
n = len(df_test[0])
err_mae = 0
err_rmse = 0
for idx, row in df_test.iterrows():
  # row_num += 1
  mid, uid, act_rat = int(row[0]), int(row[1]), row[2]
  influence = 0
  for i, nn in enumerate(nearest_neighbors[uid//1000][uid%1000][1:101]):
    influence += abs(weights_corr[uid,i])*(movie_id_userid_matrix_vij[mid, nn] - avg_rating_user_vi[nn])
  predict_rat = max(0, min(5.0, float(avg_rating_user_vi[uid] + (1/K_corr)*influence)))
  
  err_mae += abs(act_rat - predict_rat)
  err_rmse += (act_rat - predict_rat)*(act_rat - predict_rat)

  if(round(predict_rat)==act_rat):
    t[0] += 1
  else:
    f[0] += 1

  if(abs(act_rat - predict_rat) <= 1):
    t[1] += 1
  else:
    f[1] += 1

  if(abs(act_rat - predict_rat) <= 1.5):
    t[2] += 1
  else:
    f[2] += 1

  if(predict_rat <= 2.5 and act_rat <= 2.5) or (predict_rat > 2.5 and act_rat > 2.5):
    t[3] += 1
  else:
    f[3] += 1

print("For Correlation Weights:")
print("Mean Absolute Error",err_mae/n)
print("Root Mean Squared Error:",math.sqrt(err_rmse/n))
print("Accuracy for predicting the Exact Value:", 100*t[0]/(t[0]+f[0]))
print("Accuracy for predicting with error of 1.0:", 100*t[1]/(t[1]+f[1]))
print("Accuracy for predicting with error of 1.5:", 100*t[2]/(t[2]+f[2]))
print("Accuracy for predicting Binary rating <>2.5:", 100*t[3]/(t[3]+f[3]))

For Correlation Weights:
Mean Absolute Error 0.7902306948553648
Root Mean Squared Error: 0.9886975382052426
Accuracy for predicting the Exact Value: 38.74380461394534
Accuracy for predicting with error of 1.0: 69.03401739684308
Accuracy for predicting with error of 1.5: 87.00113457672326
Accuracy for predicting Binary rating <>2.5: 83.94374888035192


In [None]:
import math
row_num = -1
t, f = [0,0,0,0], [0,0,0,0]
n = len(df_test[0])
err_mae = 0
err_rmse = 0

for idx, row in df_test.iterrows():
  mid, uid, act_rat = int(row[0]), int(row[1]), row[2]
  influence = 0
  for i, nn in enumerate(nearest_neighbors[uid//1000][uid%1000][1:101]):
    influence += abs(weights_cosign[uid,i])*(movie_id_userid_matrix_vij[mid, nn] - avg_rating_user_vi[nn])
  predict_rat = max(0, min(5.0, float(avg_rating_user_vi[uid] + (1/K_cos)*influence)))
  
  err_mae += abs(act_rat - predict_rat)
  err_rmse += (act_rat - predict_rat)*(act_rat - predict_rat)

  if(round(predict_rat)==act_rat):
    t[0] += 1
  else:
    f[0] += 1

  if(abs(act_rat - predict_rat) <= 1):
    t[1] += 1
  else:
    f[1] += 1

  if(abs(act_rat - predict_rat) <= 1.5):
    t[2] += 1
  else:
    f[2] += 1

  if(predict_rat <= 2.5 and act_rat <= 2.5) or (predict_rat > 2.5 and act_rat > 2.5):
    t[3] += 1
  else:
    f[3] += 1

print("For Cosine Similarity Weights:")
print("Mean Absolute Error",err_mae/n)
print("Root Mean Squared Error:",math.sqrt(err_rmse/n))
print("Accuracy for predicting the Exact Value:", 100*t[0]/(t[0]+f[0]))
print("Accuracy for predicting with error of 1.0:", 100*t[1]/(t[1]+f[1]))
print("Accuracy for predicting with error of 1.5:", 100*t[2]/(t[2]+f[2]))
print("Accuracy for predicting Binary rating <>2.5:", 100*t[3]/(t[3]+f[3]))

For Cosine Similarity Weights:
Mean Absolute Error 1.520541810147495
Root Mean Squared Error: 1.8681189547001045
Accuracy for predicting the Exact Value: 19.028045940404866
Accuracy for predicting with error of 1.0: 50.97135691395131
Accuracy for predicting with error of 1.5: 50.97135691395131
Accuracy for predicting Binary rating <>2.5: 83.27693624475009
