In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the training data from CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer='resources/filtered_train_ex.csv')

In [5]:
# Reduce memory usage: run every column through pd.to_numeric, asking it to
# downcast to the smallest unsigned integer dtype where that is possible.
df = df.apply(lambda column: pd.to_numeric(column, downcast='unsigned'))

In [7]:
# Number of distinct users and restaurants in the training data,
# printed one per line.
n_users, n_restaurants = (df[key].nunique() for key in ('user_id', 'restaurant_id'))
print(n_users, n_restaurants, sep='\n')

53801
3386


In [8]:
from scipy.sparse import csr_matrix
from pandas.api.types import CategoricalDtype

### Create rows and columns from user_id and restaurant_id

In [9]:
# Ordered categorical dtypes whose categories are the sorted unique ids.
user_id_c = CategoricalDtype(categories=sorted(df['user_id'].unique()), ordered=True)
restaurant_id_c = CategoricalDtype(categories=sorted(df['restaurant_id'].unique()), ordered=True)

# The category code of an id is its 0-based position within the sorted
# categories; these codes serve as the row / column coordinates of each score.
row = df['user_id'].astype(user_id_c).cat.codes
col = df['restaurant_id'].astype(restaurant_id_c).cat.codes
value = df['score']

### Create score matrix

In [10]:
# One row per user category, one column per restaurant category;
# each score is placed at its (row, col) code pair.
matrix_shape = (len(user_id_c.categories), len(restaurant_id_c.categories))
sparse_matrix = csr_matrix((value, (row, col)), shape=matrix_shape)

In [11]:
# Cast the stored scores to single-precision floats ('f' is the float32 type code).
sparse_matrix = sparse_matrix.astype(np.float32)

In [12]:
# Dense (users x restaurants) buffer of float64 zeros.
score_matrix = np.zeros(shape=(n_users, n_restaurants), dtype=np.float64)

In [13]:
# Copy the sparse scores into the pre-allocated dense buffer
# (positions without a stored score become 0).
score_matrix[...] = sparse_matrix.toarray()

### Demean the score matrix

In [14]:
# Mean score per user as a 1-D array; groupby sorts its keys by default,
# so entries come out in ascending user_id order.
mean_score = df['score'].groupby(df['user_id']).mean().to_numpy()

In [15]:
# Subtract each user's mean from every entry of that user's row
# (the mean is broadcast as a column vector).
demeaned_score = score_matrix - mean_score[:, np.newaxis]

### Calculate latent factors and reconstruct the score matrix

In [16]:
import scipy.sparse as sp
from scipy.sparse.linalg import svds

# Truncated SVD keeping 50 singular triplets, then rebuild the (demeaned)
# score matrix as U * diag(s) * Vt.
left_vectors, singular_values, right_vectors_t = svds(demeaned_score, k=50)
singular_diag = np.diag(singular_values)
X_pred = (left_vectors @ singular_diag) @ right_vectors_t

# Drop the factor matrices now that the reconstruction is available.
del left_vectors, singular_values, right_vectors_t, singular_diag

### Add the mean score back

In [17]:
# Restore the per-user mean on every row of the reconstruction.
# NOTE: re-running this cell adds the mean a second time.
X_pred = np.add(X_pred, mean_score[:, np.newaxis])

In [18]:
# Label the predicted scores: rows by user id, columns by restaurant id.
score_df = pd.DataFrame(
    data=X_pred,
    index=user_id_c.categories,
    columns=restaurant_id_c.categories,
)

In [19]:
# The test file has no header row; name its two columns while reading it.
test_df = pd.read_csv(
    'resources/test.csv',
    header=None,
    names=['user_id', 'num_restaurant'],
)

### Create a default restaurant ranking for new users

In [26]:
# Fallback ranking for users that have no row in score_df.
# For every restaurant, count how many users have a predicted score above 1,
# in a single vectorised pass instead of a Python loop over columns.
# NOTE: despite its name, `avg_score` holds these per-restaurant counts
# (not averages); the name is kept so any later cell using it still works.
avg_score = (score_df.to_numpy() > 1).sum(axis=0).tolist()

# One row per restaurant id, most-often-liked first (single column labelled 0).
default_restaurant_df = pd.DataFrame(avg_score, index=score_df.columns).sort_values(
    by=0, ascending=False
)

default_restaurant_df.to_csv('resources/default_restaurant.csv')

### Predict the test data

In [33]:
# Write one line per test row to output.txt: the top `num_restaurant`
# restaurant ids for that user, comma-separated.
#  - Users present in score_df get their restaurants ordered by predicted
#    score, highest first.
#  - Users absent from score_df get the head of the default ranking.
# The loop deliberately avoids the name `row`, which earlier cells use for the
# sparse-matrix row codes; clobbering it would break re-running those cells.
known_users = score_df.index
default_ranking = default_restaurant_df.index.values

with open('output.txt', 'w') as file:
    test_rows = test_df[['user_id', 'num_restaurant']].itertuples(index=False, name=None)
    for user, num_restaurant in test_rows:
        if user in known_users:
            # Single-row frame -> transpose so restaurants become the index,
            # then sort on the (only) column, which is labelled by the user id.
            ranked_restaurants = (
                score_df.loc[[user]]
                .transpose()
                .sort_values(by=user, ascending=False)
                .index.values
            )
        else:
            ranked_restaurants = default_ranking

        ranked_restaurants[0:num_restaurant].tofile(file, sep=',')
        file.write('\n')