In [3]:
import numpy as np
import pandas as pd

In [4]:
df = pd.read_csv('resources/filtered_train.csv')

In [5]:
df.head()

Unnamed: 0,restaurant_id,user_id,score
0,2,14,1
1,2,2087,1
2,2,4625,1
3,2,4626,1
4,2,4826,1


In [6]:
df.sort_values('user_id').head()

Unnamed: 0,restaurant_id,user_id,score
95014,5649,1,1
165979,9595,1,1
80183,3622,1,1
322352,28653,1,1
225125,12894,1,1


In [7]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 446845 entries, 0 to 446844
Data columns (total 3 columns):
restaurant_id    446845 non-null int64
user_id          446845 non-null int64
score            446845 non-null int64
dtypes: int64(3)
memory usage: 10.2 MB


In [8]:
df = df.apply(pd.to_numeric,downcast='unsigned')

In [9]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 446845 entries, 0 to 446844
Data columns (total 3 columns):
restaurant_id    446845 non-null uint32
user_id          446845 non-null uint32
score            446845 non-null uint8
dtypes: uint32(2), uint8(1)
memory usage: 3.8 MB


In [10]:
n_users = df['user_id'].nunique()
n_restaurants = df['restaurant_id'].nunique()
print(n_users)
print(n_restaurants)

90340
3368


In [11]:
from scipy.sparse import csr_matrix
from pandas.api.types import CategoricalDtype

In [12]:
user_id_c = CategoricalDtype(sorted(df.user_id.unique()), ordered=True)
restaurant_id_c = CategoricalDtype(sorted(df.restaurant_id.unique()), ordered=True)
value = df['score']
row = df.user_id.astype(user_id_c).cat.codes
col = df.restaurant_id.astype(restaurant_id_c).cat.codes

In [13]:
sparse_matrix = csr_matrix((value, (row, col)), shape=(user_id_c.categories.size, restaurant_id_c.categories.size))

In [14]:
sparse_matrix = sparse_matrix.astype('f')

In [36]:
print(sparse_matrix.toarray())
print(sparse_matrix.shape)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 3. 0. ... 0. 0. 0.]
 [0. 2. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(90340, 3368)


In [16]:
import scipy.sparse as sp
from scipy.sparse.linalg import svds

#get SVD components from train matrix. Choose k.
u, s, vt = svds(sparse_matrix, k = 20)
s_diag_matrix=np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)

In [17]:
from sklearn.metrics.pairwise import pairwise_distances

In [18]:
test = pd.read_csv('resources/test.csv')

In [19]:
test.head()

Unnamed: 0,2,20
0,3,30
1,4,373
2,5,30
3,9,190
4,10,212


In [20]:
test.columns=['user_id', 'number_of_restaurants']

In [21]:
test.head()

Unnamed: 0,user_id,number_of_restaurants
0,3,30
1,4,373
2,5,30
3,9,190
4,10,212


In [43]:
# we need to map user_id to our matrix's rows
test['converted_user_id'] = test['user_id'].astype(user_id_c).cat.codes

In [45]:
user_id_c

CategoricalDtype(categories=[      1,       2,       3,       4,       5,       6,
                        8,       9,      10,      12,
                  ...
                  3003674, 3004473, 3006030, 3006874, 3006908, 3007727,
                  3007902, 3008961, 3010623, 3011215],
                 ordered=True)

In [47]:
test.head(20) # we can see here some converted_user_ids are -1 that mean they are not in our train data

Unnamed: 0,user_id,number_of_restaurants,converted_user_id
0,3,30,2
1,4,373,3
2,5,30,4
3,9,190,7
4,10,212,8
5,11,19,-1
6,14,267,10
7,17,74,13
8,18,1,-1
9,21,3,-1


In [68]:
popular_restaurant = df.groupby('restaurant_id').sum().drop('user_id', axis=1).sort_values('score', ascending = False).drop('score', axis=1)
popular_restaurant.head()

12340
10183
126062
2785
5290


In [75]:
def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
    return abs(a-b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)

def nmostpopularrestaurant(n):
    return popular_restaurant[0:n]

def getmaxnelem(converted_user_id, n):
    if(converted_user_id != -1):
        return sorted(X_pred[converted_user_id], reverse=True)[:n]
    else:
        return nmostpopularrestaurant(n)

def getindexofmaxelem(fromtrain, frommaxelem):
    l = []
    for j in range(0, len(frommaxelem)):
        for i in range(0, len(fromtrain)):
            if(isclose(fromtrain[i],frommaxelem[j])):
                l.append(i)
    return l

def convertbacktorestaurantid(restaurant_id, tobeconverted):
    l = []
    for i in tobeconverted:
        l.append(restaurant_id_c.categories[i])
    return l

In [77]:
nmostpopularrestaurant(5) #will come to fix bugs 0.0

AttributeError: 'DataFrame' object has no attribute 'to_array'

In [27]:
file = open('output.txt', 'w')

In [62]:
# does it still have some bugs?
# some user_ids are not in our train data but why those are in the test file T_T
test = test.sort_values('user_id')
for i in range (0, test.shape[0]):
    maxelems = getmaxnelem(test['converted_user_id'][i], test['number_of_restaurants'][i])
    l = getindexofmaxelem(X_pred[i], maxelems)
    l = convertbacktorestaurantid(restaurant_id_c, l)
    for j in range(0, len(l)):
        file.write(str(l[j]))
        if(j != len(l) - 1):
            file.write(', ')
    file.write('\n')
    print(str(i + 1) + " out of " + str(test.shape[0]))
file.close()

1 out of 167556
2 out of 167556
3 out of 167556
4 out of 167556
5 out of 167556


KeyError: 'restaurant_id'