In [1]:
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise.model_selection import KFold
from surprise.model_selection import GridSearchCV
from surprise import SVD, NMF, KNNWithZScore, KNNWithMeans
from surprise import AlgoBase

from scipy import sparse

import matplotlib.pyplot as plt
%matplotlib inline

import os
# import sys
import math
import statistics
import collections

import sklearn as sk
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import NMF, TruncatedSVD
import numpy as np
import pandas as pd

import random
my_seed = 0
random.seed(my_seed)
np.random.seed(my_seed)

import datetime
from datetime import datetime
from time import time
import timeit

import math

In [2]:
class RatingDataset:
    import numpy as np
    from scipy import sparse
    
    def __init__(self):
        self.rating_mat = None
        # self.time_mat = None
        self._data_file_path = ''
        
        # list of raw user_IDs (dataset IDs)
        self.items = []
        self.users = []
        self.item_n = 0
        self.user_n = 0
        
        # maps raw user_id to user_iid(or inner id)
        self.user_to_iid = {}
        # maps user inner id to dataset raw ID
        self.user_to_ID = {}
        # maps raw item_id (dataset) to item_iid(or inner id)
        self.item_to_iid = {}
        # maps item inner id to dataset raw ID
        self.item_to_ID = {}
        
        # list of triples of (item, rating, timestamp) for each user_iid. 
        # TODO: In case there were no Timestamp in the data, pairs of (item, rating) will be kept
        self.user_ratings = []
        # list of pair of (user, rating) for each item_iid
        self.item_ratings = []
        
        
    def __get_line_format_indices(self, line_format):
        # specifying the order of 'user, item, rating, timestamp' in each line 
        lf_sp = line_format.split(' ')
        # if len(lf_sp) != 4:
        #     raise Exception('''Bad line format!
        #     line_format should be space-separated and it should always specified by 
        #     "user item rating timestamp" with any order!''')
        user_idx = -1
        item_idx = -1
        rating_idx = -1
        # timestamp_idx = -1
        for c in range(len(lf_sp)):
            if lf_sp[c] == 'user':
                user_idx = c
            elif lf_sp[c] == 'item':
                item_idx = c
            elif lf_sp[c] == 'rating':
                rating_idx = c
            # elif lf_sp[c] == 'timestamp':
            #     timestamp_idx = c
            else:
                raise Exception('line_format must be exactly dictated by one of: (user/item/rating/timestamp) separated by sep!')
        
        # return user_idx, item_idx, rating_idx, timestamp_idx
        return user_idx, item_idx, rating_idx
    
    
    '''
        Read the rating data from file and parse it and then make the dataset.
    '''
    # def read_from_file(self, data_fn, skip_lines=0, sep=',', line_format='user item rating timestamp'):
    def read_from_file(self, data_fn, skip_lines=0, sep=',', line_format='user item rating'):
        
        # user_fmt_idx, item_fmt_idx, rating_fmt_idx, timestamp_fmt_idx = self.__get_line_format_indices(line_format)
        user_fmt_idx, item_fmt_idx, rating_fmt_idx = self.__get_line_format_indices(line_format)
        
        file = open(data_fn, 'r')
        
        # skip lines that are specified from input
        for _ in range(skip_lines):
            file.readline()
            
        # users list as in input file
        users_lin = []
        items_lin = []
        ratings_lin = []
        # timestamps_lin = []
        for l in file:
            lsp = l.split(sep)
            user_id = lsp[user_fmt_idx]
            item_id = lsp[item_fmt_idx]
            rating = float(lsp[rating_fmt_idx])
            # timestamp = int(lsp[timestamp_fmt_idx].strip('\n'))
            
            users_lin.append(user_id)
            items_lin.append(item_id)
            ratings_lin.append(rating)
            # timestamps_lin.append(timestamp)
            
        self.users = list(set(users_lin))
        self.items = list(set(items_lin))
        
        self.user_n = len(self.users)
        self.item_n = len(self.items)
        
        '''note that raw ids are in STRING format, and the iid in INTEGER format!'''
        # set the mappings
        for idx in range(self.user_n): 
            self.user_to_iid[self.users[idx]] = idx
            
        for idx in range(self.user_n):
            self.user_to_ID[idx] = self.users[idx] 
            
        for idx in range(self.item_n):
            self.item_to_iid[self.items[idx]] = idx 
            
        for idx in range(self.item_n):
            self.item_to_ID[idx] = self.items[idx] 
        
        # init rating matrix
        self.rating_mat = sparse.lil_matrix((self.user_n, self.item_n))
        # self.time_mat = sparse.lil_matrix((self.user_n, self.item_n))
        for idx in range(len(users_lin)):
            user_iid = self.user_to_iid[users_lin[idx]]
            item_iid = self.item_to_iid[items_lin[idx]]
            rating = ratings_lin[idx]
            self.rating_mat[user_iid, item_iid] = rating
            # self.time_mat[user_iid, item_iid] = timestamps_lin[idx]
            
            
    def list_users_ratings(self, rating_matrix):
        # finding the user and item ratings
        user_ratings = []
        for user_iid in range(self.user_n):
            # append a list for this user
            user_ratings.append([])
            user_nonze = np.nonzero(rating_matrix[user_iid])
            for item_iid in user_nonze[1]:
                # add items and its rating into the last user added to the list
                user_ratings[-1].append((item_iid, rating_matrix[user_iid, item_iid]))
                if rating_matrix[user_iid, item_iid] == 0:
                    raise Exception('Found zero rating in nonzero ratings of user with inner id %d and item iid %d!' % (user_iid, item_iid))
        return user_ratings
    
            
    def list_items_ratings(self, rating_matrix):
        item_ratings = []
        for item_iid in range(self.item_n):
            # append a list for this item
            item_ratings.append([])
            item_nonze = np.nonzero(rating_matrix.T[item_iid])
            for user_iid in item_nonze[1]:
                # add users and its rating into the last item added to the list
                item_ratings[-1].append((user_iid, rating_matrix[user_iid, item_iid]))
                if rating_matrix[user_iid, item_iid] == 0:
                    raise Exception('Found zero rating in nonzero ratings of user with inner id %d and item iid %d!' % (user_iid, item_iid))
        return item_ratings
        
            
    def train_test_split(self, test_percent=0.2, least_userlen_test=10):
        if test_percent > 1:
            raise Exception('test_percent should be between 0 and 1.')
            
        user_ratings = self.list_users_ratings(self.rating_mat)
        
        mat = sparse.lil_matrix((self.user_n, self.item_n))
        user_tests = {}
        n_users_in_test = 0
        n_ratings_in_test = 0
        n_ratings_in_train = 0
        
        for user_iid in range(self.user_n):
            len_u = len(user_ratings[user_iid])
            if len_u >= least_userlen_test:
                n_users_in_test += 1
                test_len = int(len_u * test_percent)
                test_set_u = list(range(len_u))
#                 print(test_len, len_u)
                random.shuffle(test_set_u)
                
                train_set_u = test_set_u[test_len:][:]
                test_set_u = test_set_u[:test_len][:]
                
#                 print(len(train_set_u))
                
                for ir_idx in train_set_u:
                    # ir = the pair of (item, rating)
                    ir = user_ratings[user_iid][ir_idx]
                    mat[user_iid, ir[0]] = ir[1]
                    n_ratings_in_train += 1
                
                user_tests[user_iid] = []
                for ir_idx in test_set_u:
                    # ir = the pair of (item, rating)
                    ir = user_ratings[user_iid][ir_idx]
                    user_tests[user_iid].append(ir)
                    n_ratings_in_test += 1
                    
            else: # if no test set should be seprated from ratings of this user
                for ir in user_ratings[user_iid]:
                    # ir = the pair of (item, rating)
                    mat[user_iid, ir[0]] = ir[1]
                    n_ratings_in_train += 1
    
        print('\nNumber of users with some items in testset: %d' % n_users_in_test)
        print('Number of ratings in trainset: %d \t Number of ratings in testset: %d\n' % (n_ratings_in_train, n_ratings_in_test))
        return mat, user_tests
    
    def cross_validate(self, n_splits=5):

        user_ratings = self.list_users_ratings(self.rating_mat)
        len_user_ratings = {u:len(user_ratings[0]) for u in range(self.user_n)}
        user_indices = {}
        for u in range(self.user_n):
            indices = list(range(len_user_ratings[u]))
            random.shuffle(indices)
            user_indices[u] = indices

        
        user_start = {}
        user_stop = {}
        for u in range(self.user_n):
            user_start[u] = 0
            user_stop[u] = 0


        for fold_i in range(n_splits):
            train_mat = sparse.lil_matrix((self.user_n, self.item_n))
            user_tests = {}

            for u in range(self.user_n):
                user_start[u] = user_stop[u]
                user_stop += len_user_ratings[u] // n_splits
                if fold_i < len_user_ratings[u] % n_splits:
                    user_stop += 1
                
                train_set_indices = user_indices[u][:user_start[u]] + user_indices[u][user_stop[u]:]
                test_set_indices = user_indices[u][user_start[u]:user_stop[u]]

                for ir_idx in train_set_indices:
                    ir = user_ratings[u][ir_idx]
                    train_mat[u, ir[0]] = ir[1]
                
                user_tests[u] = []
                for ir_idx in test_set_indices:
                    ir = user_ratings[u][ir_idx]
                    user_tests[u].append(ir)

            yield train_mat, user_tests
                



In [3]:
def seperate_preds(preds):
    
    pro_preds = []
    unpro_preds = []

    for row in preds:
        # check here: pro_users_index consists of inner ids
        if iid_to_gender[row.uid] == 'F':
            pro_preds.append(row)
        else:
            unpro_preds.append(row)
            
    return pro_preds, unpro_preds

In [2]:
df = pd.read_csv('data/ml-100k/udata.csv', sep=';', header=0, engine='python', names=['user', 'item', 'rating'])
user = pd.read_csv('data/ml-100k/uuser.csv', sep=';', header=0, engine='python', names=['id', 'age', 'gender', 'occupation', 'zipcode'])
user = user[['id', 'gender']]

In [3]:
user['is_pro'] = 0
for i, row in user.iterrows():
    if row['gender'] == 'F':
        user.loc[i, 'is_pro'] = 1

In [4]:
df = df.merge(user, left_on='user', right_on='id')

In [5]:
m_df = df[df['gender'] == 'M']
f_df = df[df['gender'] == 'F']

In [6]:
m_df = m_df[['user', 'item', 'rating']].sample(n=25000, random_state=1)
f_df = f_df[['user', 'item', 'rating']].sample(n=25000, random_state=1)

In [7]:
df = pd.concat([m_df[:25000], f_df[:25000]]).sample(frac=1, axis=0).reset_index(drop=True)

In [10]:
df.to_csv('data/ml-100k/udata_sampled.csv', index=False)

In [8]:
m_df.to_csv('data/ml-100k/udata_male_sampled.csv', index=False)
f_df.to_csv('data/ml-100k/udata_female_sampled.csv', index=False)

## Male only

In [145]:
dataset = RatingDataset()
dataset.read_from_file(data_fn='./data/ml-100k/udata_male_sampled.csv', skip_lines=1)

In [146]:
train_mat, test_mat = dataset.train_test_split(test_percent=0.2, least_userlen_test=10)


Number of users with some items in testset: 346
Number of ratings in trainset: 8441 	 Number of ratings in testset: 1559



In [147]:
user_ratings = dataset.list_users_ratings(train_mat)
# user_ratings[0]

tr_lst = []

for user_iid in range(dataset.user_n): 
    # trainset or dataset.user_n?? we keep the users the same (user-fixed) so they are equal.
    
    if user_ratings[user_iid]:
        base_rec = pd.DataFrame(user_ratings[user_iid])
        base_rec[2] = user_iid

        tr_lst.append(base_rec[[2,0,1]])
    

train_df = pd.concat(tr_lst, ignore_index=True)
train_df.columns = ['user','item','rating']
train_df.head()

Unnamed: 0,user,item,rating
0,0,103,4.0
1,0,306,3.0
2,0,327,1.0
3,0,384,3.0
4,0,414,2.0


In [148]:
# test set to a dataframe
test_lst = []

for uiid in test_mat.keys():
    base_rec = pd.DataFrame(test_mat[uiid])
    base_rec[2] = uiid
    test_lst.append(base_rec[[2,0,1]])


test_df = pd.concat(test_lst, ignore_index=True)
test_df.columns = ['user','item','rating']
test_df.head()

Unnamed: 0,user,item,rating
0,0,749,2.0
1,0,1138,3.0
2,1,609,4.0
3,1,888,2.0
4,1,880,3.0


In [149]:
reader = Reader(rating_scale=(1, 5))

# got the best(lowest) results of 0.86
svd_sup = SVD(n_epochs = 100, n_factors = 100, lr_all=0.005, reg_all=0.1, verbose = False, random_state=41023)


In [150]:
testset = Dataset.load_from_df(test_df[['user', 'item', 'rating']], reader).build_full_trainset().build_testset()

train_sup = Dataset.load_from_df(train_df[['user', 'item', 'rating']], reader).build_full_trainset()
svd_sup.fit(train_sup)
preds = svd_sup.test(testset)
rmse = accuracy.rmse(preds, verbose=False)

In [151]:
rmse

0.9580273750170519

## Female only

In [152]:
dataset = RatingDataset()
dataset.read_from_file(data_fn='./data/ml-100k/udata_female_sampled.csv', skip_lines=1)

In [153]:
train_mat, test_mat = dataset.train_test_split(test_percent=0.2, least_userlen_test=10)


Number of users with some items in testset: 220
Number of ratings in trainset: 8167 	 Number of ratings in testset: 1833



In [154]:
user_ratings = dataset.list_users_ratings(train_mat)
# user_ratings[0]

tr_lst = []

for user_iid in range(dataset.user_n): 
    # trainset or dataset.user_n?? we keep the users the same (user-fixed) so they are equal.
    
    if user_ratings[user_iid]:
        base_rec = pd.DataFrame(user_ratings[user_iid])
        base_rec[2] = user_iid

        tr_lst.append(base_rec[[2,0,1]])
    

train_df = pd.concat(tr_lst, ignore_index=True)
train_df.columns = ['user','item','rating']
train_df.head()

Unnamed: 0,user,item,rating
0,0,11,1.0
1,0,72,1.0
2,0,105,3.0
3,0,119,5.0
4,0,201,4.0


In [155]:
# test set to a dataframe
test_lst = []

for uiid in test_mat.keys():
    base_rec = pd.DataFrame(test_mat[uiid])
    base_rec[2] = uiid
    test_lst.append(base_rec[[2,0,1]])


test_df = pd.concat(test_lst, ignore_index=True)
test_df.columns = ['user','item','rating']
test_df.head()

Unnamed: 0,user,item,rating
0,0,220,5.0
1,0,680,4.0
2,0,618,5.0
3,0,1217,5.0
4,0,1172,5.0


In [156]:
reader = Reader(rating_scale=(1, 5))

# got the best(lowest) results of 0.86
svd_sup = SVD(n_epochs = 100, n_factors = 100, lr_all=0.005, reg_all=0.1, verbose = False, random_state=41023)


In [157]:
testset = Dataset.load_from_df(test_df[['user', 'item', 'rating']], reader).build_full_trainset().build_testset()

train_sup = Dataset.load_from_df(train_df[['user', 'item', 'rating']], reader).build_full_trainset()
svd_sup.fit(train_sup)
preds = svd_sup.test(testset)
rmse = accuracy.rmse(preds, verbose=False)

In [158]:
rmse

1.0228167253388734

## Join genders

In [12]:
dataset = RatingDataset()
dataset.read_from_file(data_fn='./data/ml-100k/udata_sampled.csv', skip_lines=1)

In [13]:
'pro & unpro data structures'

global pro_users_index
global unpro_users_index

pro_users_index = []
unpro_users_index = []


# initializing a column
# this is inner id or index
user['iid'] = 0.1


skip_c = 0

for i, row in user[['id', 'is_pro']].iterrows():
    try: 
        # user id to inner_id (index)
        u_index = dataset.user_to_iid[str(row['id'])]
        # print(row['id'], u_index, dataset.user_to_ID[u_index])

        # just add the inner id (index) to the user dataframe
        user.loc[user['id'] == row['id'], 'iid'] = u_index


        if row['is_pro'] == 1:
            # pro_users_id.append(row['id'])
            pro_users_index.append(u_index)

        else:
            # unpro_users_id.append(row['id'])
            unpro_users_index.append(u_index)
    except KeyError as err:
        skip_c += 1
        
        

print('users skipped', skip_c, '\n')
print('protected users', len(pro_users_index))
print('unprotected users', len(unpro_users_index))
print()

user.head(5)


users skipped 35 

protected users 273
unprotected users 635



Unnamed: 0,id,gender,is_pro,iid
0,1,M,0,275.0
1,2,F,1,142.0
2,3,M,0,798.0
3,4,M,0,752.0
4,5,F,1,5.0


In [14]:
user = user[user['iid']!=0.1]
user

Unnamed: 0,id,gender,is_pro,iid
0,1,M,0,275.0
1,2,F,1,142.0
2,3,M,0,798.0
3,4,M,0,752.0
4,5,F,1,5.0
...,...,...,...,...
937,938,F,1,513.0
938,939,F,1,666.0
939,940,M,0,521.0
941,942,F,1,528.0


In [15]:
iid_to_gender = dict(zip(user.iid, user.gender))
id_to_gender = dict(zip(user.id, user.gender))

In [16]:
train_mat, test_mat = dataset.train_test_split(test_percent=0.2, least_userlen_test=10)


Number of users with some items in testset: 347
Number of ratings in trainset: 8608 	 Number of ratings in testset: 1392



In [17]:
user_ratings = dataset.list_users_ratings(train_mat)
# user_ratings[0]

tr_lst = []

for user_iid in range(dataset.user_n): 
    # trainset or dataset.user_n?? we keep the users the same (user-fixed) so they are equal.
    
    if user_ratings[user_iid]:
        base_rec = pd.DataFrame(user_ratings[user_iid])
        base_rec[2] = user_iid

        tr_lst.append(base_rec[[2,0,1]])
    

train_df = pd.concat(tr_lst, ignore_index=True)
train_df.columns = ['user','item','rating']
train_df.head()

Unnamed: 0,user,item,rating
0,0,32,3.0
1,0,41,3.0
2,0,43,3.0
3,0,53,4.0
4,0,70,1.0


In [18]:
# test set to a dataframe
test_lst = []

for uiid in test_mat.keys():
    base_rec = pd.DataFrame(test_mat[uiid])
    base_rec[2] = uiid
    test_lst.append(base_rec[[2,0,1]])


test_df = pd.concat(test_lst, ignore_index=True)
test_df.columns = ['user','item','rating']
test_df.head()

Unnamed: 0,user,item,rating
0,0,11,4.0
1,0,1219,3.0
2,0,318,3.0
3,0,1069,4.0
4,0,775,3.0


In [19]:
reader = Reader(rating_scale=(1, 5))

# got the best(lowest) results of 0.86
svd_sup = SVD(n_epochs = 100, n_factors = 100, lr_all=0.005, reg_all=0.1, verbose = False, random_state=41023)


In [20]:
testset = Dataset.load_from_df(test_df[['user', 'item', 'rating']], reader).build_full_trainset().build_testset()
train_sup = Dataset.load_from_df(train_df[['user', 'item', 'rating']], reader).build_full_trainset()

In [21]:
svd_sup.fit(train_sup)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1078556d0>

In [None]:
svd_sup.

In [167]:

svd_sup.fit(train_sup)
preds = svd_sup.test(testset)
rmse = accuracy.rmse(preds, verbose=False)
pro_, unpro_ = seperate_preds(preds)

In [168]:
accuracy.rmse(unpro_, verbose=False)

0.982410071566697

In [169]:
accuracy.rmse(pro_, verbose=False)

1.0513857790677856