In [None]:
!pip install scikit-surprise

# SET LIBRARY

In [1]:
import numpy as np
import pandas as pd

from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise.model_selection import train_test_split
from surprise.model_selection import LeaveOneOut

# LOAD DATA

In [None]:
#!pwd
!ls

In [2]:
tab = pd.read_csv("./ratings.csv", names = ['uid','iid','rating','timestamp'], header=1)
#tab = tab.sample(30000)

In [3]:
tab.head()

Unnamed: 0,uid,iid,rating,timestamp
0,1,1029,3.0,1260759179
1,1,1061,3.0,1260759182
2,1,1129,2.0,1260759185
3,1,1172,4.0,1260759205
4,1,1263,2.0,1260759151


In [4]:
tab.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100003 entries, 0 to 100002
Data columns (total 4 columns):
uid          100003 non-null int64
iid          100003 non-null int64
rating       100003 non-null float64
timestamp    100003 non-null int64
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [6]:
print("# of User: ", data.df['uid'].nunique())
print("# of Item: ", data.df['iid'].nunique())

# of User:  671
# of Item:  9066


In [18]:
iid = np.random.choice(tab['iid'].unique(),2000)

In [20]:
tab = tab[tab['iid'].isin(iid)]

In [27]:
reader = Reader(line_format='user item rating', rating_scale=(1, 5))
data = Dataset.load_from_df(tab[['uid', 'iid', 'rating']], reader)
train, test = train_test_split(data, test_size=.25, random_state=1)

In [28]:
gph = SVD(random_state=10)
gph.fit(train)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7ffa3e4ee390>

In [29]:
pred = gph.test(test)

In [23]:
from surprise import accuracy
from collections import defaultdict

class Metrics:
    def MAE(pred):
        return accuracy.mae(pred, verbose=False)
    
    def RMSE(pred):
        return accuracy.rmse(pred, verbose=False)
    
    def TopN(pred, n=10, minRating=4.0):
        '''Return the top-N recommendation for each user from a set of predictions.
        Args:
            predictions(list of Prediction objects): The list of predictions, as
                returned by the test method of an algorithm.
            n(int): The number of recommendation to output for each user. Default
                is 10.

        Returns:
        A dict where keys are user (raw) ids and values are lists of tuples:
            [(raw item id, rating estimation), ...] of size n.
        '''
        topN = defaultdict(list)
        
        # First map the predictions to each user.
        for uid, iid, obsRating, prdRating, _ in pred:
            if (obsRating >= minRating):
                topN[uid].append((iid, prdRating))
                
        # Then sort the predictions for each user and retrieve the k highest ones.
        for uid, ratings in topN.items():
            ratings.sort(key=lambda x: x[1], reverse=True)
            topN[uid] = ratings[:n]
            
        return topN
        
    def Hitrate(prdTopN, prdLeftOut):
        hits = 0
        total = 0
        
        for leftout in prdLeftOut:
            user_leftout = leftout[0]
            item_leftout = leftout[1]
            
            hit = False
            for item_TopN, Rating_TopN in prdTopN[int(user_leftout)]:
                if(int(item_TopN) == int(item_leftout)):
                    hit = True
                    break
            if (hit):
                hits += 1
            total += 1
            
        return hits/total
    
    def CumeHitrate(prdTopN, prdLeftOut, cutRating=0):
        hits = 0
        total = 0
        
        for leftout in prdLeftOut:
            user_leftout = leftout[0]
            item_leftout = leftout[1]
            obsRating = leftout[2]
            
            if (obsRating >= cutRating):
                hit = False
                for item_TopN, Rating_TopN in prdTopN[int(user_leftout)]:
                    if(int(item_TopN) == int(item_leftout)):
                        hit = True
                        break
                if (hit):
                    hits += 1
                total += 1
       
        return hits/total                
            

In [30]:
print("MAE: ", round(Metrics.MAE(pred),4))
print("RMSE: ", round(Metrics.RMSE(pred),4))

MAE:  0.7135
RMSE:  0.9228


In [31]:
LOOCV = LeaveOneOut(n_splits=1, random_state=1)

# SANITY CHECK

In [32]:
print("# of User: ", data.df['uid'].nunique())
print("# of Item: ", data.df['iid'].nunique())
print("# of Every Combinations: ", data.df['uid'].nunique(), "*", data.df['iid'].nunique(),"=", data.df['uid'].nunique()*data.df['iid'].nunique())
print("# of Represented Instances:", data.df.shape[0])
#print("# of Represented Instances:", data.df.loc[:,['uid','iid']].apply(lambda df: str(df['uid'])+":"+str(df['iid']), axis=1).nunique())
for train, test in LOOCV.split(data):
    train_leftout, test_leftout = train, test    
    gph.fit(train_leftout)
    test0 = train_leftout.build_anti_testset()
    break
print("# of Left-out Testset:", pd.DataFrame(test_leftout).shape[0])
cnt = 0
for i in list(train_leftout.ur.values()):
    cnt += len(i)
print("# of Left-out Trainset:", cnt)
print("# of Every Combination Except Trainset:", pd.DataFrame(test0).shape[0])
#print("# of Every Combination Except Trainset:", pd.DataFrame(test0,columns=['uid','iid','rating']).loc[:,['uid','iid']].apply(lambda df: str(df['uid'])+":"+str(df['iid']), axis=1).nunique())

# of User:  671
# of Item:  1810
# of Every Combinations:  671 * 1810 = 1214510
# of Represented Instances: 20652
# of Left-out Testset: 671
# of Left-out Trainset: 19981
# of Every Combination Except Trainset: 1177684


# CHECK BUILD_ANTI_TESTSET

In [None]:
uid = pd.DataFrame({'uid':["u{}".format(i) for i in range(100)]})
iid = pd.DataFrame({'iid':["i{}".format(i) for i in range(10)]})

uid['key'] = 0
iid['key'] = 0
tab = uid.merge(iid, how='outer').drop("key", axis=1)
tab['rating'] = np.random.randint(1, 6, tab.shape[0])

reader = Reader(line_format='user item rating', rating_scale=(1, 5))
data = Dataset.load_from_df(tab[['uid', 'iid', 'rating']], reader)

In [None]:
print("# of User: ", data.df['uid'].nunique())
print("# of Item: ", data.df['iid'].nunique())
print("# of Every Combinations: ", data.df['uid'].nunique(), "*", data.df['iid'].nunique(),"=", data.df['uid'].nunique()*data.df['iid'].nunique())
print("# of Represented Instances:", data.df.shape[0])
#print("# of Represented Instances:", data.df.loc[:,['uid','iid']].apply(lambda df: str(df['uid'])+":"+str(df['iid']), axis=1).nunique())
for train, test in LOOCV.split(data):
    train_leftout, test_leftout = train, test    
    gph.fit(train_leftout)
    test0 = train_leftout.build_anti_testset()
    break
print("# of Left-out Testset:", pd.DataFrame(test_leftout).shape[0])
cnt = 0
for i in list(train_leftout.ur.values()):
    cnt += len(i)
print("# of Left-out Trainset:", cnt)
print("# of Every Combination Except Trainset:", pd.DataFrame(test0).shape[0])
#print("# of Every Combination Except Trainset:", pd.DataFrame(test0,columns=['uid','iid','rating']).loc[:,['uid','iid']].apply(lambda df: str(df['uid'])+":"+str(df['iid']), axis=1).nunique())

In [33]:
for train, test in LOOCV.split(data):
    gph.fit(train)
    
    prdLeftOut = gph.test(test)
    
    testset = train.build_anti_testset()
    prdTestSet = gph.test(testset)
    
    prdTopN = Metrics.TopN(prdTestSet, n=10)
    print(Metrics.Hitrate(prdTopN, prdLeftOut))

0.0


In [None]:
prdTopN