In [1]:
import csv
import numpy as np
import os
import math
import random
# Seed both the stdlib and the NumPy global generators with the same value so
# that anything drawing from them is repeatable across runs.
for seed_fn in (random.seed, np.random.seed):
    seed_fn(42)

In [2]:
from sklearn.decomposition import NMF
from surprise import Reader
from surprise import AlgoBase
from surprise import Dataset
from surprise import evaluate
from surprise import accuracy
from sklearn.model_selection import KFold

In [3]:
# Locations of the ratings and movies CSV files inside the ml-latest-small folder.
file_path_ratings = os.path.expanduser('ml-latest-small/ratings.csv')
file_path_movies = os.path.expanduser('ml-latest-small/movies.csv')

# Starts empty here; a later cell fills it with a movieId -> column index mapping.
movie_list = dict()
# Parallel lists: position i in each list comes from the same row of ratings.csv.
user_id = []
movie_id = []
ratings = []

# Load userId, movieId and rating from every row of the ratings file.
# newline='' hands newline handling to the csv module, as its documentation
# recommends for any file object passed to a csv reader.
with open(file_path_ratings, newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        user_id.append(int(row['userId']))
        movie_id.append(int(row['movieId']))
        ratings.append(float(row['rating']))

In [4]:
def _unique_in_order(values):
    """Return the distinct items of `values`, ordered by first appearance.

    A set tracks what has been seen so each membership test is O(1); the
    previous list-based check made the whole pass quadratic.
    """
    seen = set()
    ordered = []
    for value in values:
        if value not in seen:
            seen.add(value)
            ordered.append(value)
    return ordered


# Distinct user ids, in the order they first appear in the ratings file.
user_id_u = _unique_in_order(user_id)
user_number = len(user_id_u)
print('number of user=',user_number)

# Distinct movie ids; a movie's position in movie_id_u is its column index.
movie_id_u = _unique_in_order(movie_id)
movie_number = len(movie_id_u)
# movieId -> column index, rebuilt from scratch so re-running the cell is safe.
movie_list = {raw_id: column for column, raw_id in enumerate(movie_id_u)}
print('number of rated movie=',movie_number)

# Fraction of the user x movie grid that actually holds a rating.
total_available_rating = len(user_id)
total_possible_rating = user_number * movie_number
print('Sparsity=',float(total_available_rating) / total_possible_rating)

number of user= 610
number of rated movie= 9724
Sparsity= 0.016999683055613623


In [5]:
# R holds the rating for each (user row, movie column); W flags which cells
# were filled in. Cells that never receive a rating stay 0 in both matrices.
R = np.zeros((user_number, movie_number))
W = np.zeros_like(R)
for uid, mid, score in zip(user_id, movie_id, ratings):
    # Row index is userId - 1; column index comes from the movie_list mapping.
    # NOTE(review): this presumes userIds run 1..user_number without gaps -- confirm.
    row, col = uid - 1, int(movie_list[mid])
    R[row, col] = score
    W[row, col] = 1

In [6]:
# Number of ratings and mean rating per movie column.
sss = W.sum(axis=0)
rating = R.sum(axis=0) / sss
# Movies with fewer than 200 ratings get the value 5 so they cannot be
# selected as the minimum below.
rating[sss < 200] = 5
# movieId whose mean rating is lowest among the remaining movies.
movie_id_u[np.argmin(rating)]

780

# Question 30

In [7]:
print(R.shape)
# Mean rating per user row: sum of that user's ratings divided by how many
# cells of the row are flagged in W.
rating_totals = R.sum(axis=1)
rating_counts = W.sum(axis=1)
avg = rating_totals / rating_counts
print(avg.shape)

(610, 9724)
(610,)


In [8]:
# One row per rating, columns = (userId, movieId, rating). Stacking the int
# and float lists into one array makes every column floating point.
data = np.array([user_id, movie_id, ratings]).T
data

array([[1.00000e+00, 1.00000e+00, 4.00000e+00],
       [1.00000e+00, 3.00000e+00, 4.00000e+00],
       [1.00000e+00, 6.00000e+00, 4.00000e+00],
       ...,
       [6.10000e+02, 1.68250e+05, 5.00000e+00],
       [6.10000e+02, 1.68252e+05, 5.00000e+00],
       [6.10000e+02, 1.70875e+05, 3.00000e+00]])

In [9]:
# 10-fold split over the rows of `data` (one row per rating), shuffled with a
# fixed seed. The prediction for a rating is simply its user's mean rating.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
RMSE_list = []
RMSE_avg = 0
for train_index, test_index in kf.split(data):
    print("TRAIN_index:", train_index, "TEST_index:", test_index)
    squared_error = 0
    for index in test_index:
        user_row = int(data[index, 0] - 1)
        residual = data[index, 2] - avg[user_row]
        squared_error = squared_error + residual * residual
    fold_rmse = math.sqrt(squared_error / len(test_index))
    RMSE_list.append(fold_rmse)
    RMSE_avg = RMSE_avg + fold_rmse
# Average the per-fold RMSE over the 10 folds.
RMSE_avg = RMSE_avg / 10
print('average RMSE=',RMSE_avg)

TRAIN_index: [     0      1      2 ... 100833 100834 100835] TEST_index: [    23     35     53 ... 100811 100814 100820]
TRAIN_index: [     0      1      2 ... 100832 100833 100834] TEST_index: [     8     39     75 ... 100803 100807 100835]
TRAIN_index: [     0      1      2 ... 100833 100834 100835] TEST_index: [    12     34     54 ... 100792 100799 100808]
TRAIN_index: [     0      1      2 ... 100833 100834 100835] TEST_index: [     4      6      7 ... 100823 100827 100829]
TRAIN_index: [     0      1      2 ... 100833 100834 100835] TEST_index: [    21     32     33 ... 100801 100806 100813]
TRAIN_index: [     1      2      3 ... 100833 100834 100835] TEST_index: [     0     11     13 ... 100824 100830 100832]
TRAIN_index: [     0      2      4 ... 100833 100834 100835] TEST_index: [     1      3      9 ... 100800 100819 100825]
TRAIN_index: [     0      1      2 ... 100832 100833 100835] TEST_index: [    14     16     29 ... 100812 100828 100834]
TRAIN_index: [     0      1     

# Question 31

In [10]:
# user defined functions
def PopTrim(test_index, movieFreq):
    """Keep only the test rows whose movie has more than 2 ratings.

    test_index: iterable of row indices into the global `data` array.
    movieFreq:  per-column rating counts, indexed through the global
                `movie_list` (movieId -> column index).
    Returns a list with the indices that pass the filter, in input order.
    """
    return [
        idx for idx in test_index
        if movieFreq[movie_list[data[idx, 1]]] > 2
    ]

def UnpopTrim(test_index, movieFreq):
    """Keep only the test rows whose movie has at most 2 ratings.

    test_index: iterable of row indices into the global `data` array.
    movieFreq:  per-column rating counts, indexed through the global
                `movie_list` (movieId -> column index).
    Returns a list with the indices that pass the filter, in input order.
    """
    return [
        idx for idx in test_index
        if movieFreq[movie_list[data[idx, 1]]] <= 2
    ]

def HighVarTrim(testset, movieFreq, movieVar):
    """Keep only the test rows whose movie has variance >= 2 and >= 5 ratings.

    testset:   iterable of row indices into the global `data` array.
    movieFreq: per-column rating counts.
    movieVar:  per-column rating variances.
    Both statistics are indexed through the global `movie_list`
    (movieId -> column index).
    Returns a list with the indices that pass the filter, in input order.
    """
    new_index = []
    # Iterate the argument itself; the earlier version looped over a global
    # named `test_index` and silently ignored whatever was passed in.
    for index in testset:
        column = movie_list[data[index, 1]]
        if movieVar[column] >= 2 and movieFreq[column] >= 5:
            new_index.append(index)
    return new_index
# Per-movie statistics over the columns of R, treating 0 as "no rating".
# Number of non-zero entries in each movie column.
movieFreq = (R != 0).sum(axis=0)
movieFreq_dict = {m: movieFreq[col] for m, col in movie_list.items()}
# Variance of the non-zero entries in each movie column.
movieVar = [np.var(ratings_col[ratings_col != 0]) for ratings_col in R.T]
movieVar_dict = {m: movieVar[col] for m, col in movie_list.items()}

In [11]:
# Same naive per-user-mean evaluation as before, but each test fold is first
# reduced by PopTrim to movies with more than 2 ratings.
RMSE_list = []
RMSE_avg = 0
for train_index, test_index in kf.split(data):
    print("TRAIN_index:", train_index, "TEST_index:", test_index)
    trimmed_index = PopTrim(test_index, movieFreq)
    squared_error = 0
    for index in trimmed_index:
        user_row = int(data[index, 0] - 1)
        residual = data[index, 2] - avg[user_row]
        squared_error = squared_error + residual * residual
    fold_rmse = math.sqrt(squared_error / len(trimmed_index))
    RMSE_list.append(fold_rmse)
    RMSE_avg = RMSE_avg + fold_rmse
# Average the per-fold RMSE over the 10 folds.
RMSE_avg = RMSE_avg / 10
print('average RMSE=',RMSE_avg)

TRAIN_index: [     0      1      2 ... 100833 100834 100835] TEST_index: [    23     35     53 ... 100811 100814 100820]
TRAIN_index: [     0      1      2 ... 100832 100833 100834] TEST_index: [     8     39     75 ... 100803 100807 100835]
TRAIN_index: [     0      1      2 ... 100833 100834 100835] TEST_index: [    12     34     54 ... 100792 100799 100808]
TRAIN_index: [     0      1      2 ... 100833 100834 100835] TEST_index: [     4      6      7 ... 100823 100827 100829]
TRAIN_index: [     0      1      2 ... 100833 100834 100835] TEST_index: [    21     32     33 ... 100801 100806 100813]
TRAIN_index: [     1      2      3 ... 100833 100834 100835] TEST_index: [     0     11     13 ... 100824 100830 100832]
TRAIN_index: [     0      2      4 ... 100833 100834 100835] TEST_index: [     1      3      9 ... 100800 100819 100825]
TRAIN_index: [     0      1      2 ... 100832 100833 100835] TEST_index: [    14     16     29 ... 100812 100828 100834]
TRAIN_index: [     0      1     

# Question 32

In [12]:
# Same naive per-user-mean evaluation, with each test fold reduced by
# UnpopTrim to movies that have at most 2 ratings.
RMSE_list = []
RMSE_avg = 0
for train_index, test_index in kf.split(data):
    print("TRAIN_index:", train_index, "TEST_index:", test_index)
    trimmed_index = UnpopTrim(test_index, movieFreq)
    squared_error = 0
    for index in trimmed_index:
        user_row = int(data[index, 0] - 1)
        residual = data[index, 2] - avg[user_row]
        squared_error = squared_error + residual * residual
    fold_rmse = math.sqrt(squared_error / len(trimmed_index))
    RMSE_list.append(fold_rmse)
    RMSE_avg = RMSE_avg + fold_rmse
# Average the per-fold RMSE over the 10 folds.
RMSE_avg = RMSE_avg / 10
print('average RMSE=',RMSE_avg)

TRAIN_index: [     0      1      2 ... 100833 100834 100835] TEST_index: [    23     35     53 ... 100811 100814 100820]
TRAIN_index: [     0      1      2 ... 100832 100833 100834] TEST_index: [     8     39     75 ... 100803 100807 100835]
TRAIN_index: [     0      1      2 ... 100833 100834 100835] TEST_index: [    12     34     54 ... 100792 100799 100808]
TRAIN_index: [     0      1      2 ... 100833 100834 100835] TEST_index: [     4      6      7 ... 100823 100827 100829]
TRAIN_index: [     0      1      2 ... 100833 100834 100835] TEST_index: [    21     32     33 ... 100801 100806 100813]
TRAIN_index: [     1      2      3 ... 100833 100834 100835] TEST_index: [     0     11     13 ... 100824 100830 100832]
TRAIN_index: [     0      2      4 ... 100833 100834 100835] TEST_index: [     1      3      9 ... 100800 100819 100825]
TRAIN_index: [     0      1      2 ... 100832 100833 100835] TEST_index: [    14     16     29 ... 100812 100828 100834]
TRAIN_index: [     0      1     

# Question 33

In [13]:
# Same naive per-user-mean evaluation, with each test fold reduced by
# HighVarTrim to movies with variance >= 2 and at least 5 ratings.
RMSE_list = []
RMSE_avg = 0
for train_index, test_index in kf.split(data):
    print("TRAIN_index:", train_index, "TEST_index:", test_index)
    trimmed_index = HighVarTrim(test_index, movieFreq,movieVar)
    squared_error = 0
    for index in trimmed_index:
        user_row = int(data[index, 0] - 1)
        residual = data[index, 2] - avg[user_row]
        squared_error = squared_error + residual * residual
    fold_rmse = math.sqrt(squared_error / len(trimmed_index))
    RMSE_list.append(fold_rmse)
    RMSE_avg = RMSE_avg + fold_rmse
# Average the per-fold RMSE over the 10 folds.
RMSE_avg = RMSE_avg / 10
print('average RMSE=',RMSE_avg)

TRAIN_index: [     0      1      2 ... 100833 100834 100835] TEST_index: [    23     35     53 ... 100811 100814 100820]
TRAIN_index: [     0      1      2 ... 100832 100833 100834] TEST_index: [     8     39     75 ... 100803 100807 100835]
TRAIN_index: [     0      1      2 ... 100833 100834 100835] TEST_index: [    12     34     54 ... 100792 100799 100808]
TRAIN_index: [     0      1      2 ... 100833 100834 100835] TEST_index: [     4      6      7 ... 100823 100827 100829]
TRAIN_index: [     0      1      2 ... 100833 100834 100835] TEST_index: [    21     32     33 ... 100801 100806 100813]
TRAIN_index: [     1      2      3 ... 100833 100834 100835] TEST_index: [     0     11     13 ... 100824 100830 100832]
TRAIN_index: [     0      2      4 ... 100833 100834 100835] TEST_index: [     1      3      9 ... 100800 100819 100825]
TRAIN_index: [     0      1      2 ... 100832 100833 100835] TEST_index: [    14     16     29 ... 100812 100828 100834]
TRAIN_index: [     0      1     

# The code below is only for testing

In [None]:
# Describe the ratings file layout for Surprise: comma-separated columns in
# the order user, item, rating, timestamp, with one header line to skip.
reader = Reader(
    sep=',',
    skip_lines=1,
    line_format='user item rating timestamp',
)
# NOTE: this rebinds `data` (until now the NumPy ratings array) and `reader`
# (until now the csv.DictReader) to Surprise objects.
data = Dataset.load_from_file('ml-latest-small/ratings.csv', reader)

In [None]:
# Request a 10-fold split from `data`.
# NOTE(review): assumes `data` is the Surprise Dataset loaded in the previous
# cell, not the NumPy array of the same name -- confirm before running.
data.split(n_folds=10)
#kf = KFold(n_splits=10)

In [None]:
class MyOwnAlgorithm(AlgoBase):
    """Naive predictor: every estimate is the user's mean rating from `avg`.

    `avg` is the notebook-level array of per-user means, indexed by
    raw userId - 1. The algorithm must be trained on a trainset before
    predicting, because `estimate` uses `self.trainset` to translate ids.
    """

    def __init__(self):
        # Always call base method before doing anything.
        AlgoBase.__init__(self)

    def estimate(self, uid, iid):
        """Return the mean rating of the user behind inner id `uid`.

        Surprise calls `estimate` with *inner* ids, not the raw ids found in
        the ratings file, so the inner id is mapped back to the raw userId
        before indexing `avg`. `iid` is unused: the prediction depends only
        on the user.

        Raises PredictionImpossible when the user is not part of the trainset
        (Surprise then passes a placeholder id that cannot be mapped back).
        """
        # Local import keeps this scratch cell self-contained.
        from surprise import PredictionImpossible

        if not self.trainset.knows_user(uid):
            raise PredictionImpossible('User is unknown.')
        # Raw ids loaded from a file are strings; avg is indexed by userId - 1.
        raw_uid = int(self.trainset.to_raw_uid(uid))
        return avg[raw_uid - 1]
    
algo = MyOwnAlgorithm()

for trainset, testset in data.folds(): #kf.split(data):
    # Train first: AlgoBase.predict() needs algo.trainset to translate the raw
    # ids in the testset, so calling test() on an untrained algorithm fails.
    # train() matches the Surprise API generation that still provides
    # data.folds(); on releases that deprecate it, algo.fit(trainset) is the
    # equivalent call.
    algo.train(trainset)
    # Predict every rating of the held-out fold.
    predictions = algo.test(testset)
    print(predictions)
    # Compute and print Root Mean Squared Error
    rmse = accuracy.rmse(predictions, verbose=True)

In [None]:
# Spot-check two (user row, movieId) pairs: print the user's mean rating next
# to the rating stored in R for that movie.
for user_row, raw_movie_id in ((11, 6132), (318, 1421)):
    print(avg[user_row], R[user_row, int(movie_list[raw_movie_id])])

In [None]:
# Raw user and item ids, as they appear in the ratings file. They are **strings**!
uid, iid = str(20), str(3114)

# Request one prediction for this (user, item) pair with verbose output on.
pred = algo.predict(uid, iid, verbose=True)

In [None]:
# Two-fold split over `data`, printing the shape of each training index array.
# NOTE: this rebinds `kf`, the name the Question 30-33 cells use for their
# shuffled 10-fold splitter; re-run the Question 30 cell before re-running those.
kf = KFold(n_splits=2)
for fold_train, _fold_test in kf.split(data):
    print(fold_train.shape)

In [None]:
# Unshuffled 10-fold split over the rows of R (one row per user, not per
# rating); print the training-set size and the test index shape of each fold.
# NOTE: this rebinds `kf`, the name the Question 30-33 cells rely on.
kf = KFold(n_splits=10)
kf.get_n_splits(R)
for train_index, test_index in kf.split(R):
    print(len(train_index), test_index.shape)

In [None]:
# NOTE(review): scikit-learn's KFold documents `split` and `get_n_splits`;
# `Splits` looks like a typo and presumably raises AttributeError -- confirm,
# then fix or delete this cell.
kf.Splits(R)

In [None]:
# Display the user x movie rating matrix; cells that never received a rating
# still hold the 0 they were initialised with.
R