In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [14]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import sparse
import gc

In [3]:
from load_functions import *
from transform_functions import *
from score_functions import *
from i2i_functions import *
from pandarallel import pandarallel
pandarallel.initialize(nb_workers=2)

INFO: Pandarallel will run on 2 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [4]:
test_users = pd.read_csv('test_users.csv', dtype=str, names=['user_id'],header=0)
users_df = load_users('users.csv', test_users)
orgs_df = load_orgs('organisations.csv')
test_users_with_city = users_df[users_df.in_test==1][['user_id','city']]
print(len(test_users),len(users_df))

16967 1252801


In [5]:
reviews, user_agg, org_agg = load_reviews(users_df=users_df, orgs_df=orgs_df)
orgs_df = orgs_df.merge(org_agg, on='org_id')
users_df = users_df.merge(user_agg, on='user_id', how='left')
print(f"{len(reviews)}, {len(orgs_df)}, {len(users_df)}")

3640835, 66405, 1252801


In [6]:
test_users_df = test_users_with_city\
    .merge(reviews[reviews.rating>=4.0][['user_id','org_id']], on='user_id', how='left')\
    .groupby(['user_id','city'])\
    .aggregate(list)\
    .reset_index()

In [7]:
train_reviews, test_reviews = train_test_split(reviews, users_df, 
                                               min_user_reviews=2,
                                               min_ts=700,
                                               frac=0.5)
print(len(train_reviews), len(test_reviews))

3616942 14686


In [13]:
features = load_features(orgs_df=orgs_df)

In [23]:
f_enc = LabelEncoder()
f_enc.fit(features.feature_id)
idxs = f_enc.transform(features.feature_id)
weights = np.zeros_like(f_enc.classes_)
weights[idxs] = features.count_normed

In [19]:
pandarallel.initialize(nb_workers=8)
mr=50
l2=1e5
result, encs = prepare_reviews_i2i(train_reviews, users_df, orgs_df,
                                   min_reviews_per_user=2,
                                   min_org_reviews=mr,
                                   min_travels_reviews=2,
                                   min_org_score=3.0)
m = reviews_matrix(result, encs)
cc_mat = m.T * m
print(f"mr: {mr} {len(result)}") 
print(f"l2: {l2}")
ease_sim = ease_solution(cc_mat, l2=l2).T
del m
del cc_mat
gc.collect()
ease_preds = i2i_predict(ease_sim, test_reviews, encs, N=60)
for N in [20,60]:
    print(f"{N} {recall(test_reviews, ease_preds, N=N)}")
for t_size in [1,3,5,10,20]:
    test_preds = test_reviews[test_reviews.org_id.str.len()>=t_size]
    print(f">={t_size} {recall(test_preds, ease_preds)}")
    test_preds = test_reviews[test_reviews.org_id.str.len()<=t_size]
    print(f"<={t_size} {recall(test_preds, ease_preds)}")

mnap_score = mnap(test_preds, ease_preds)
print_score(mnap_score)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
mr: 50 1774478
l2: 100000.0
20 0.1676577870897921
60 0.24035385654388797
>=1 0.1676577870897921
<=1 0.09922359502149074
>=3 0.19452237661395413
<=3 0.13201188819457785
>=5 0.19820403544762222
<=5 0.1469637790466356
>=10 0.1982298933159379
<=10 0.16024459868244623
>=20 0.19268483952302176
<=20 0.1658836985589191
MNAP-score: 101.81


In [16]:
ease_preds = i2i_predict(ease_sim, test_users_df, encs)
validate_preds(fallback_with_top_recs(ease_preds, orgs_df), orgs_df, users_df)

All good


In [17]:
save_predictions(fallback_with_top_recs(ease_preds, orgs_df), path='ease_ans.csv')

In [22]:
aspects = load_aspects()
rubrics = load_rubrics(orgs_df=orgs_df)
features = load_features(orgs_df=orgs_df)

In [20]:
from attributes_recs import *

In [23]:
attr_enc = AttrEncoders(orgs_df, features)

In [24]:
attr_enc.features_mat

<66405x63 sparse matrix of type '<class 'numpy.int64'>'
	with 385749 stored elements in Compressed Sparse Row format>

In [32]:
idxs = range(ease_sim.shape[0])
org_ids = encs.decode_orgs(idxs)
org_idxs = attr_enc.org_enc.transform(org_ids)
features = attr_enc.features_mat[org_idxs,:]

In [46]:
np.matmul(ease_sim,features.toarray()).T.shape

(63, 7296)

In [42]:
features.T.shape

(63, 7296)