In [1]:
%matplotlib inline

import csv
import heapq
import math

import matplotlib.pyplot as plt
import numpy as np
import scipy
import scipy.io
import scipy.sparse as sp

from helpers import *

%load_ext autoreload
%autoreload 2

In [2]:
from helpers import load_data, preprocess_data

# Input files: the training ratings and the sample submission.
path_dataset = "47b05e70-6076-44e8-96da-2530dc2187de_data_train.csv"
path_submission = "9b4d32bb-f99a-466f-95a1-0ab80048971c_sample_submission (2).csv"

# Rating matrix; its first axis is unpacked as items and its second as users.
ratings = load_data(path_dataset)
num_item, num_user = ratings.shape
ratings_ = ratings.toarray()  # dense copy of the same matrix

# The first two entries of the loaded submission are kept under separate names:
# entry 0 is iterated as (row, col) pairs by the prediction step, entry 1 is
# handed to the submission writer.
submission = load_submission(path_submission)
submission_row_col, submission_pos = submission[0], submission[1]

number of items: 10000, number of users: 1000


In [3]:
# Coordinates of every non-zero rating, obtained from a single nonzero() pass
# (the matrix is indexed as [item, user]).
items, users = ratings.nonzero()

# Both counts equal the number of non-zero ratings.
nb_nzi = len(items)
nb_nzu = len(users)

# Fetch all rating values with one vectorised lookup instead of one scalar
# sparse lookup per rating; stars[n] is the rating at (items[n], users[n]).
stars = list(np.asarray(ratings.tocsr()[items, users]).ravel())

In [4]:
# Dump the (item, user, rating) triplets as ';'-separated rows; no header row
# is written. newline='' is required by the csv module so that it controls the
# line endings itself (otherwise rows end in '\r\r\n' on Windows).
with open('surprise', 'w', newline='') as csvfile:
    fieldnames = ['item', 'user','ratings']
    writer = csv.DictWriter(csvfile, delimiter=";", fieldnames=fieldnames)
    writer.writerows(
        {'item': item, 'user': user, 'ratings': star}
        for item, user, star in zip(items, users, stars)
    )

### mean calculation

In [5]:
def extract_user_mean(train):
    """Compute the mean rating of every user (column) of a sparse rating matrix.

    Parameters
    ----------
    train : scipy sparse matrix
        Rating matrix indexed as [item, user]; must provide ``getnnz`` and
        ``sum``. Stored entries are treated as ratings.

    Returns
    -------
    numpy.ndarray of shape (1, number_of_users)
        Per-user mean over that user's stored ratings. A user without any
        stored rating gets the mean of all stored ratings (0.0 if the matrix
        holds none) instead of a division by zero.
    """
    # Size comes from the matrix itself, not from a notebook-level variable.
    n_users = train.shape[1]

    counts = np.asarray(train.getnnz(axis=0)).ravel()
    sums = np.asarray(train.sum(axis=0), dtype=float).ravel()

    total = counts.sum()
    fallback = sums.sum() / total if total else 0.0

    user_mean = np.full((1, n_users), fallback, dtype=float)
    rated = counts > 0
    user_mean[0, rated] = sums[rated] / counts[rated]
    return user_mean
def extract_global_mean(train):
    """Return the mean taken over the non-zero entries of ``train``."""
    rated_rows, rated_cols = train.nonzero()
    observed = train[rated_rows, rated_cols]
    return observed.mean()
# Rating statistics of the full matrix, reused by the later cells.
global_mean = extract_global_mean(ratings)
user_mean = extract_user_mean(ratings)

### baseline estimate

In [6]:
def baseline_estimate(train,lamda_i,lamda_u,epochs,global_mean=None):
    """Estimate user and item rating biases with alternating updates.

    Each sweep first refits every item bias with the user biases held fixed,
    then every user bias with the freshly updated item biases held fixed.

    Parameters
    ----------
    train : scipy sparse matrix or 2-D array
        Rating matrix indexed as [item, user]; zero entries count as unrated.
    lamda_i, lamda_u : number
        Added to the rating count in the denominator of the item / user bias
        update, shrinking the biases of sparsely rated items / users.
    epochs : int
        Number of sweeps.
    global_mean : float, optional
        Mean subtracted from every rating. When omitted it is computed as the
        mean of the non-zero entries of ``train`` (0.0 if there are none).

    Returns
    -------
    (bu, bi) : tuple of numpy.ndarray
        User biases (length = number of users) and item biases (length =
        number of items). Items / users without ratings keep a bias of 0.
    """
    if global_mean is None:
        rated_rows, rated_cols = train.nonzero()
        if len(rated_rows):
            global_mean = train[rated_rows, rated_cols].mean()
        else:
            global_mean = 0.0

    # Dense copy: scalar lookups on a sparse matrix are very slow inside the
    # nested loops below.
    dense = train.toarray() if hasattr(train, "toarray") else np.asarray(train)
    n_items, n_users = dense.shape

    # set the user and item baselines
    bu = np.zeros(n_users)
    bi = np.zeros(n_items)

    # group the rated indices by item (row) and by user (column); entries
    # without any rating are left out so the denominators count real ratings
    rated = dense != 0
    item_users = [(i, np.flatnonzero(rated[i])) for i in range(n_items)]
    item_users = [(i, idx) for i, idx in item_users if idx.size]
    user_items = [(u, np.flatnonzero(rated[:, u])) for u in range(n_users)]
    user_items = [(u, idx) for u, idx in user_items if idx.size]

    for _ in range(epochs):
        # item step: uses the user biases from the previous sweep
        for i, i_users in item_users:
            dev_i = 0
            for u in i_users:
                dev_i += dense[i, u] - global_mean - bu[u]
            bi[i] = dev_i / (lamda_i + len(i_users))

        # user step: uses the item biases that were just updated
        for u, u_items in user_items:
            dev_u = 0
            for i in u_items:
                dev_u += dense[i, u] - global_mean - bi[i]
            bu[u] = dev_u / (lamda_u + len(u_items))

    return bu,bi

In [7]:
# lamda_i / lamda_u are the damping terms of the item / user bias updates,
# epochs is the number of alternating sweeps.
lamda_i, lamda_u, epochs = 10, 15, 10

# Fit the user and item biases on the full rating matrix.
bu, bi = baseline_estimate(ratings, lamda_i, lamda_u, epochs)

### Similarity with Pearson baseline

In [8]:
def user_based_similarity_by_pearson_baseline(train,min_support,global_mean, user_biases, item_biases, shrinkage=100):
    """Compute shrunk user-user similarities from baseline-centred ratings.

    For every pair of users the ratings on their common items are centred on
    ``global_mean + item_bias + user_bias``; the similarity is the normalised
    dot product of the two centred vectors, multiplied by
    ``(n - 1) / (n - 1 + shrinkage)`` where ``n`` is the number of common items.

    Parameters
    ----------
    train : scipy sparse matrix or 2-D array
        Rating matrix indexed as [item, user]; zero entries count as unrated.
    min_support : int
        Minimum number of common items for a non-zero similarity; values
        below 2 are raised to 2.
    global_mean : float
        Global rating mean used in the baseline.
    user_biases, item_biases : array-like
        Per-user and per-item baseline offsets.
    shrinkage : number, default 100
        Shrinkage constant; larger values pull similarities towards 0.

    Returns
    -------
    numpy.ndarray of shape (number_of_users, number_of_users)
        Symmetric similarity matrix. The diagonal is 1 for users with at least
        one rating. A pair gets 0 when it has too few common items or when one
        of the centred vectors is all zeros (similarity undefined).
    """
    dense = train.toarray() if hasattr(train, "toarray") else np.asarray(train)
    n_users = dense.shape[1]
    user_biases = np.asarray(user_biases)
    item_biases = np.asarray(item_biases)

    sim = np.zeros((n_users, n_users))  # matrix of similarities

    # With a single common item the shrinkage factor (n - 1) is zero and the
    # similarity carries no information, so at least two common items are
    # always required.
    min_support = max(2, min_support)

    # Baseline-centred ratings r_ui - (mu + b_i + b_u), computed once instead
    # of once per user pair.
    resid = dense - (global_mean + item_biases[:, None] + user_biases[None, :])

    # Rated item indices per user; users without ratings are skipped.
    rated_items = [np.flatnonzero(dense[:, u]) for u in range(n_users)]
    active_users = [u for u in range(n_users) if rated_items[u].size]

    # Pair every user with the users that FOLLOW it in the list, by list
    # position rather than by user id, so no pair is skipped when some user
    # has no ratings.
    for pos, u in enumerate(active_users):
        sim[u, u] = 1
        for v in active_users[pos + 1:]:
            com_items = np.intersect1d(rated_items[u], rated_items[v])
            support = len(com_items)
            if support < min_support:
                continue  # similarity stays 0

            diff_u = resid[com_items, u]
            diff_v = resid[com_items, v]
            denom = np.sqrt((diff_u @ diff_u) * (diff_v @ diff_v))
            if denom == 0:
                continue  # undefined correlation: keep 0 rather than NaN

            value = (diff_u @ diff_v) / denom
            # shrunk similarity
            value *= (support - 1) / (support - 1 + shrinkage)

            sim[u, v] = value
            sim[v, u] = value

    return sim

In [9]:
# Similarity settings. The function raises min_support to at least 2, so the
# value 1 given here effectively means "two common items".
min_support, shrinkage = 1, 1000

sim = user_based_similarity_by_pearson_baseline(
    ratings, min_support, global_mean, bu, bi, shrinkage
)

### KNN with means 

In [25]:
def KNN_with_user_means(train,sim_matrix,k,min_k,user_mean,positions=None):
    """Predict ratings with a mean-centred user-based k-nearest-neighbour rule.

    For every requested (item, user) position the prediction starts from the
    user's mean rating and adds the similarity-weighted average deviation of
    the most similar users who rated the item from their own means.

    Parameters
    ----------
    train : 2-D array or scipy sparse matrix
        Rating matrix indexed as [item, user]; entries > 0 count as ratings.
    sim_matrix : numpy.ndarray
        User-user similarity matrix.
    k : int
        Maximum number of neighbours taken per prediction.
    min_k : int
        Minimum number of positively similar neighbours required; with fewer,
        the prediction falls back to the user's mean.
    user_mean : 1-D array
        Mean rating per user.
    positions : iterable of (row, col), optional
        1-based (item, user) positions to predict. Defaults to the
        notebook-level ``submission_row_col``.

    Returns
    -------
    list
        One prediction per position, clamped to [1, 5] and rounded to the
        nearest integer value.
    """
    if positions is None:
        positions = submission_row_col
    if hasattr(train, "toarray"):
        train = train.toarray()
    else:
        train = np.asarray(train)

    pred = []

    for row, col in positions:
        # positions are 1-based, the matrices are 0-based
        i = row - 1
        u = col - 1

        # All users who rated item i, in ascending user order, as
        # (user, similarity to u, rating) triples.
        raters = np.flatnonzero(train[i] > 0)
        neighbors = list(zip(raters, sim_matrix[u, raters], train[i, raters]))

        # Extract the top-K most-similar ratings
        k_neighbors = heapq.nlargest(k, neighbors, key=lambda t: t[1])

        # initial setting
        est = user_mean[u]
        sum_sim = 0
        sum_ratings = 0
        actual_k = 0

        # compute weighted average over positively similar neighbours only
        for (nb, sim_, r) in k_neighbors:
            if sim_ > 0:
                sum_sim += sim_
                sum_ratings += (sim_ * (r - user_mean[nb]))
                actual_k += 1

        # too few usable neighbours: keep the plain user mean
        if actual_k < min_k:
            sum_ratings = 0
        if sum_sim > 0:
            est += sum_ratings / sum_sim

        # clamp to the rating scale, otherwise round
        if est < 1:
            est = 1
        elif est > 5:
            est = 5
        else:
            est = np.round(est)
        pred.append(est)
    return pred


In [26]:
# Neighbourhood settings: at most k neighbours per prediction, and at least
# min_k positively similar ones before the neighbourhood is used at all.
k = 200
min_k = 1

# The prediction step works on a dense copy of the ratings and on the per-user
# means as a 1-D array (the first row of the (1, num_user) array).
ratings_ = ratings.toarray()
pred = KNN_with_user_means(ratings_, sim, k, min_k, user_mean[0])

### submission

In [27]:
# Hand the predictions and the submission positions to the helper; presumably
# it writes them to "pred_fuxian.csv" — confirm against helpers.create_csv_submission.
create_csv_submission(submission_pos, pred, "pred_fuxian.csv")