In [56]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn import preprocessing

In [107]:
def create_train_test(data, t):
    training_df = data.iloc[0:int(data.shape[0]*t)]    
    validation_df = data.iloc[int(data.shape[0]*t):]    
    return training_df, validation_df

names = ['userId', 'movieId', 'rating', 'timestamp']
data = pd.read_csv('ml-100k/u.data', '\t', names=names,
                       engine='python')
# https://medium.com/@gazzaazhari/model-based-collaborative-filtering-systems-with-machine-learning-algorithm-d5994ae0f53b
columns = ['item_id', 'movie title', 'release date', 'video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
          'Animation', 'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
          'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
movies = pd.read_csv('ml-100k/u.item', sep='|', names=columns, encoding='latin-1')
movie_names = movies[['item_id', 'movie title']]
movie_names.head()



data['userId'] = data['userId'].astype('str')
data['movieId'] = data['movieId'].astype('str')
users = data['userId'].unique() 
movies = data['movieId'].unique() 

print(data.shape[0]*0.9)
training_df, validation_df = create_train_test(data,0.9)
print(data.shape)
print(training_df.shape)
print(validation_df.shape)

print("Total Unique Users", len(users))
# should be 943
print("Total Unique Movies", len(movies))
# should be 1682
print(data.head())
data = training_df.pivot_table(index='userId', columns='movieId', values='rating',fill_value=0)
# https://predictivehacks.com/item-based-collaborative-filtering-in-python/
data2 = validation_df.pivot_table(index='userId', columns='movieId', values='rating',fill_value=0)
print("TRAIN")
print(data.shape)
print(data.head())
print("TEST")
print(data2.shape)
print(data2.head())

90000.0
(100000, 4)
(90000, 4)
(10000, 4)
Total Unique Users 943
Total Unique Movies 1682
  userId movieId  rating  timestamp
0    196     242       3  881250949
1    186     302       3  891717742
2     22     377       1  878887116
3    244      51       2  880606923
4    166     346       1  886397596
TRAIN
(943, 1663)
movieId  1  10  100  1000  1001  1002  1003  1004  1005  1006  ...  990  991  \
userId                                                         ...             
1        5   3    5     0     0     0     0     0     0     0  ...    0    0   
10       4   0    5     0     0     0     0     0     0     0  ...    0    0   
100      0   0    0     0     0     0     0     0     0     0  ...    3    0   
101      3   0    0     0     0     0     0     0     0     0  ...    0    0   
102      0   0    0     0     0     0     0     0     0     0  ...    0    0   

movieId  992  993  994  995  996  997  998  999  
userId                                           
1          0   

In [58]:
data.shape

(943, 1682)

In [59]:
# Item based filtering
X = data.T
# user based filtering
# X = data
X.shape

(1682, 943)

In [60]:
# https://medium.com/@gazzaazhari/model-based-collaborative-filtering-systems-with-machine-learning-algorithm-d5994ae0f53b
SVD = TruncatedSVD(n_components=12, random_state=5)
resultant_matrix = SVD.fit_transform(X)
resultant_matrix.shape
# print(resultant_matrix)

(1682, 12)

In [61]:
corr_mat = np.corrcoef(resultant_matrix)
corr_mat.shape

(1682, 1682)

In [88]:
movies_list =list(movie_names['movie title'])
print(movies_list[:20])
toy_story = movies_list.index('Toy Story (1995)')
print(toy_story)

['Toy Story (1995)', 'GoldenEye (1995)', 'Four Rooms (1995)', 'Get Shorty (1995)', 'Copycat (1995)', 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)', 'Twelve Monkeys (1995)', 'Babe (1995)', 'Dead Man Walking (1995)', 'Richard III (1995)', 'Seven (Se7en) (1995)', 'Usual Suspects, The (1995)', 'Mighty Aphrodite (1995)', 'Postino, Il (1994)', "Mr. Holland's Opus (1995)", 'French Twist (Gazon maudit) (1995)', 'From Dusk Till Dawn (1996)', 'White Balloon, The (1995)', "Antonia's Line (1995)", 'Angels and Insects (1995)']
0


In [87]:
corr_toy_story = corr_mat[toy_story]
print(len(corr_toy_story))
print(len(movies_list))
print(corr_toy_story[0])
print(movies_list[:20])
mList = []
for i in range(len(movies_list)):
    if ((corr_toy_story[i]<1.0) & (corr_toy_story[i]>0.9)):
        mList.append(movies_list[i])
# list(movies_list[(corr_toy_story<1.0) & (corr_toy_story > 0.95)])
print("\n\n\n\n\n")
print("RECOMMENDED LIST HERE")
print(mList)

1682
1682
0.9999999999999998
['Toy Story (1995)', 'GoldenEye (1995)', 'Four Rooms (1995)', 'Get Shorty (1995)', 'Copycat (1995)', 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)', 'Twelve Monkeys (1995)', 'Babe (1995)', 'Dead Man Walking (1995)', 'Richard III (1995)', 'Seven (Se7en) (1995)', 'Usual Suspects, The (1995)', 'Mighty Aphrodite (1995)', 'Postino, Il (1994)', "Mr. Holland's Opus (1995)", 'French Twist (Gazon maudit) (1995)', 'From Dusk Till Dawn (1996)', 'White Balloon, The (1995)', "Antonia's Line (1995)", 'Angels and Insects (1995)']






RECOMMENDED LIST HERE
['Toy Story (1995)', 'Amadeus (1984)', 'Citizen Ruth (1996)', 'Wolf (1994)', 'Something to Talk About (1995)', 'Mrs. Winterbourne (1996)', 'Last Man Standing (1996)', 'Bad Taste (1987)', "April Fool's Day (1986)", 'Mrs. Dalloway (1997)', 'Chungking Express (1994)']


In [64]:
data = data.to_numpy()

In [92]:
def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())
def SVD(learning_rate=0.2,latent_features=6, epochs = 50):

    p = np.random.uniform(0,1.1,size=(data.shape[0],latent_features))
    q = np.random.uniform(0,1.1,size=(latent_features, data.shape[1]))

    
# print(p)
# print(q)
    sse_accum = 0
    pred = p.dot(q)
    print("PRED")
    print(pred)
    print("DATA")
    print(data)
    print("DELTA")
    for epoch in range(epochs):
        for i in range(data.shape[0]):
            for j in range(data.shape[1]):
                sse_accum = 0
#                 unsure if this is necessary
        # if the rating exists
        
#             print(data[i,j])
                if data[i, j] > 0:
#          verified by debugger and annie wangliu
                    diff = data[i, j] - np.dot(p[i, :], q[:, j])
                    sse_accum += diff**2 #keep tracking the sum of square error for the matrix
                    for k in range(latent_features):
#                         update is
#                         verified by wiki, annie wangliu;
#                         verified by debugger
                        p[i, k] += learning_rate * (2*diff*q[k, j])
                        q[k, j] += learning_rate * (2*diff*p[i, k])
        learning_rate*=.95
        print("RMSE for epoch "+str(epoch+1))
        print(rmse(p.dot(q),data))
    return(p,q)

p,q=SVD()
# print("RMSE: ")
print(p.dot(q)-data)


PRED
[[2.12004072 2.55897857 2.82557894 ... 1.79760591 1.57904582 2.0744154 ]
 [1.46763223 1.56292607 1.73006782 ... 1.27836313 1.04614455 0.99559578]
 [2.40785835 2.51195133 2.47636523 ... 2.58190663 2.08716489 2.70484126]
 ...
 [1.94214976 2.35111712 2.15515117 ... 1.76963405 1.82714526 2.20542185]
 [2.37155757 2.43241685 2.45316945 ... 1.80696045 1.63966072 2.29429626]
 [1.80545034 2.03450804 1.79587103 ... 1.40638195 1.632797   2.24262222]]
DATA
[[5 3 5 ... 0 0 0]
 [4 0 5 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [4 0 2 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [4 0 5 ... 0 0 0]]
DELTA


  sse_accum += diff**2 #keep tracking the sum of square error for the matrix
  q[k, j] += learning_rate * (2*diff*p[i, k])
  p[i, k] += learning_rate * (2*diff*q[k, j])
  p[i, k] += learning_rate * (2*diff*q[k, j])


RMSE for epoch 1
nan
RMSE for epoch 2
nan
RMSE for epoch 3
nan
RMSE for epoch 4
nan
RMSE for epoch 5
nan
RMSE for epoch 6
nan
RMSE for epoch 7
nan
RMSE for epoch 8
nan
RMSE for epoch 9
nan
RMSE for epoch 10
nan
RMSE for epoch 11
nan
RMSE for epoch 12
nan
RMSE for epoch 13
nan
RMSE for epoch 14
nan
RMSE for epoch 15
nan
RMSE for epoch 16
nan
RMSE for epoch 17
nan
RMSE for epoch 18
nan
RMSE for epoch 19
nan
RMSE for epoch 20
nan
RMSE for epoch 21
nan
RMSE for epoch 22
nan
RMSE for epoch 23
nan
RMSE for epoch 24
nan
RMSE for epoch 25
nan
RMSE for epoch 26
nan
RMSE for epoch 27
nan
RMSE for epoch 28
nan
RMSE for epoch 29
nan
RMSE for epoch 30
nan
RMSE for epoch 31
nan
RMSE for epoch 32
nan
RMSE for epoch 33
nan
RMSE for epoch 34
nan
RMSE for epoch 35
nan
RMSE for epoch 36
nan
RMSE for epoch 37
nan
RMSE for epoch 38
nan
RMSE for epoch 39
nan
RMSE for epoch 40
nan
RMSE for epoch 41
nan
RMSE for epoch 42
nan
RMSE for epoch 43
nan
RMSE for epoch 44
nan
RMSE for epoch 45
nan
RMSE for epoch 46
n