In [32]:
import os 
import numpy as np 
import pandas as pd 

import torch 
import torch.nn as nn 
import torch.nn.functional as F

import random
from matplotlib import pyplot as plt


In [27]:
dpath = 'archive-2'
data_list = []
for i in os.listdir(dpath):
    if 'combined_data' in i:
        data_list.append(i)
data_list

['combined_data_1.txt',
 'combined_data_3.txt',
 'combined_data_2.txt',
 'combined_data_4.txt']

In [28]:
df = pd.DataFrame({'Cust_ID','Rating'}) 

for data in data_list:
    temp_df = pd.read_csv(os.path.join(dpath, data), header = None, names = ['Cust_ID', 'Rating'], usecols = [0,1])
    temp_df['Rating'] = temp_df['Rating'].astype(float)
    df = pd.concat([df, temp_df])
    print("Loaded: ", data)

df.index = np.arange(0,len(df))

Loaded:  combined_data_1.txt
Loaded:  combined_data_3.txt
Loaded:  combined_data_2.txt
Loaded:  combined_data_4.txt


In [29]:
df_nan = pd.DataFrame(pd.isnull(df.Rating))
df_nan = df_nan[df_nan['Rating'] == True]
df_nan = df_nan.reset_index()
df_nan

Unnamed: 0,index,Rating
0,0,True
1,1,True
2,2,True
3,550,True
4,696,True
...,...,...
17767,100488433,True
17768,100489039,True
17769,100489244,True
17770,100490607,True


In [30]:
movie_np = []
movie_id = 1

for i,j in zip(df_nan['index'][1:],df_nan['index'][:-1]):
    # numpy approach
    temp = np.full((1,i-j-1), movie_id)
    movie_np = np.append(movie_np, temp)
    movie_id += 1

# Account for last record and corresponding length
# numpy approach
last_record = np.full((1,len(df) - df_nan.iloc[-1, 0] - 1),movie_id)
movie_np = np.append(movie_np, last_record)

print('Movie numpy: {}'.format(movie_np))
print('Length: {}'.format(len(movie_np)))

Movie numpy: [3.0000e+00 3.0000e+00 3.0000e+00 ... 1.7772e+04 1.7772e+04 1.7772e+04]
Length: 100480507


In [34]:
df = df[pd.notnull(df['Rating'])]
df['Movie_Id'] = movie_np.astype(int)
df['Cust_ID'] = df['Cust_ID'].astype(int)
print(df.iloc[::5000000, :])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Movie_Id'] = movie_np.astype(int)


             0  Cust_ID  Rating  Movie_Id
3          NaN  1488844     3.0         3
5000998    NaN   501954     2.0       998
10001964   NaN   404654     5.0      1964
15002878   NaN   886608     2.0      2878
20003827   NaN  1193835     2.0      3827
25004724   NaN    35179     4.0      4724
30005670   NaN  2561536     3.0      5670
35006570   NaN   200362     4.0      6570
40007523   NaN   686629     4.0      7523
45008341   NaN  1494017     2.0      8341
50009228   NaN   437784     4.0      9228
55010054   NaN   788058     4.0     10054
60010826   NaN   433661     4.0     10826
65011672   NaN  2402781     4.0     11672
70012700   NaN   182620     4.0     12700
75013584   NaN   506044     4.0     13584
80014455   NaN   353605     2.0     14455
85015118   NaN   664606     3.0     15118
90016010   NaN  2213715     3.0     16010
95016881   NaN  1589401     5.0     16881
100017629  NaN  2314006     4.0     17629


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Cust_ID'] = df['Cust_ID'].astype(int)


In [36]:
df.to_csv('preprocessed_df.csv', index=False)

## MF

In [None]:
def predict(P, Q, mu, b_u, b_i, user, item):
    pred = mu + b_u[user] + b_i[item] + P[user, :].T.dot(Q[item, :])
    return pred

In [31]:
def sgd(P, Q, mu, b_u, b_i, samples, lr, reg):
    for user, item, rating in samples:
        pred = predict(P, Q, mu, b_u, b_i, user, item)
        
        error = rating - pred
        
        b_u[user] += lr * (error - reg * b_u[user])
        b_i[item] += lr * (error - reg * b_i[item])
        
        P[user, :] += lr * (error * Q[item, :] - reg * P[user, :])
        Q[item, :] += lr * (error * P[user, :] - reg * Q[item, :])

In [None]:
def rmse(samples, P, Q, mu, b_u, b_i):
    error = [] 
    for user,item,rating in samples:
        square_error = (rating - predict(P, Q, mu, b_u, b_i, user, item))**2
        error.append(square_error)
    rmse = np.sqrt(np.array(error).mean())
    return rmse

In [None]:
class MF_with_sgd(object):
    
    def __init__(self, df ,num_users, num_items, F, lr, reg, epochs):
        self.df = df
        self.num_users, self.num_items = num_users, num_items
        self.F = F
        self.lr = lr 
        self.reg = reg 
        self.epochs = epochs
        
        self.summary = pd.DataFrame(columns = ['epoch','rmse'])
    
    def train(self): 
        self.P = np.random.normal(scale = 1/self.F,size = (self.num_users, self.F))
        self.Q = np.random.normal(scale = 1/self.F,size = (self.num_items, self.F))
        
        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_items)
        
        self.mu = np.mean(self.R[np.where(self.R != 0)])
        
        self.samples = [] 
        for idx in range(len(self.df)):
            user, item, rating = df.iloc[idx]
            self.samples.append((user,item,rating))
        
        for epoch in range(self.epochs): 
            np.random.shuffle(self.samples)
            sgd(self.P, self.Q, self.mu, self.b_u, self.b_i, self.samples, self.lr, self.reg)
            rmse(self.samples, self.P, self.Q, self.mu, self.b_u, self.b_i)

            print("Epoch: %d ; error = %.4f" % (epoch+1, rmse))
            self.summary.loc[epoch] = [epoch, rmse]
            

In [None]:
K = 10 
lr = 0.01 
reg = 0.2 
epochs = 30 

mf = MF(user_num,item_num, K, lr, reg, epochs)
mf.train()

In [None]:
result_df = mf.summary 

x = result_df['epoch'].values
y = result_df['rmse'].values

plt.plot(x, y)
plt.xticks(x,x)
plt.xlabel('Epoch')
plt.ylabel('RMSE')
plt.grid(axis = 'y')
plt.show()

## ALS

In [None]:
def als(R, P, Q, F, reg):
    for user in len(R):
        QT_Q = np.matmul(Q.T, Q)
        li = reg * np.eye(F)
        QT_ru = np.matmul(Q.T,R[user])
        P[user] = np.linalg.solve(QT_Q + li,QT_ru)
        
    for item in len(R.T):
        PT_P = np.matmul(P.T, P)
        li = reg * np.eye(F)
        PT_ri = np.matmul(P.T,R[:,item])
        Q[item] = np.linalg.solve(PT_P + li,PT_ri)

In [None]:
def als_loss(sample, R, P, Q, reg):
    loss = 0
    for user, item, rating in sample: 
        loss += (rating - np.matmul(P[user],Q[item]))**2
    for user in len(R):
        loss += reg * np.matmul(P[user],P[user])
    for item in len(R.T):
        loss += reg * np.matmul(Q[item],Q[item])
        
    return loss

In [None]:
class MF_with_als(object):
    def __init__(self, df, R, F, reg, epochs):
        self.df = df
        self.R = R
        self.num_users, self.num_items = R.shape 
        self.F = F 
        self.reg = reg 
        self.epochs = epochs 
        
        self.summary = pd.DataFrame(columns = ['epoch','loss'])
        
    def train(self):
        self.P = np.random.normal(scale = 1/self.F, size = (self.num_users, self.F))
        self.Q = np.random.normal(scale = 1/self.F, size = (self.num_items, self.F))
        self.samples = []
        
        for idx in range(len(self.df)):
            user, item, rating = df.iloc[idx]
            self.samples.append((user,item,rating))
            
        for epoch in range(self.epochs): 
            als(self.R, self.P, self.Q, self.F, self.reg)
            loss = als_loss(self.samples, self.R, self.P, self.Q, self.reg)
            self.summary.loc[epoch] = [epoch, loss]
            
    

In [None]:
R = df.pivot_table('rating','userID','movieID').fillna(0) 

F = 10 
reg = 0.2
epochs = 500 

In [None]:
MF_als = MF_with_als(df, R, F, reg, epochs)
MF_als.train()