In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import sparse
import gc

In [3]:
# Project-local helper modules.
# NOTE(review): the wildcard imports hide which module provides each helper
# used in the cells below; consider explicit imports once the owning modules
# are confirmed.
from load_functions import *
from transform_functions import *
from score_functions import *
from i2i_functions import *
from attributes_recs import *
from pandarallel import pandarallel
# Initialize pandarallel with progress bars enabled; later cells call
# `initialize` again to switch the progress bar off and on.
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [4]:
# Read the test-user ids as strings; the file's own header row is replaced by
# the single column name `user_id`.
test_users = pd.read_csv(
    'test_users.csv',
    names=['user_id'],
    header=0,
    dtype=str,
)

# User and organisation tables come from the project loaders.
users_df = load_users('users.csv', test_users)
orgs_df = load_orgs('organisations.csv')

# Rows flagged with in_test == 1, reduced to the two columns used later.
test_users_with_city = users_df.loc[users_df['in_test'] == 1, ['user_id', 'city']]
print(len(test_users), len(users_df))

16967 1252801


In [5]:
# Load the reviews plus the `user_agg` / `org_agg` frames and attach those
# frames to the user and organisation tables.
# NOTE(review): `orgs_df` and `users_df` are overwritten here, so running this
# cell twice merges the aggregates a second time; re-run the loading cell first.
reviews, user_agg, org_agg = load_reviews(users_df=users_df, orgs_df=orgs_df)
orgs_df = pd.merge(orgs_df, org_agg, on='org_id')  # default (inner) join
users_df = pd.merge(users_df, user_agg, how='left', on='user_id')
print(f"{len(reviews)}, {len(orgs_df)}, {len(users_df)}")

3640835, 66405, 1252801


In [6]:
# For each (user_id, city) of the test users, gather the org_ids of their
# reviews with rating >= 4.0 into a list. The left join keeps test users that
# have no such review (their org_id is NaN before grouping).
test_users_df = (
    test_users_with_city
    .merge(
        reviews.loc[reviews['rating'] >= 4.0, ['user_id', 'org_id']],
        how='left',
        on='user_id',
    )
    .groupby(['user_id', 'city'])
    .agg(list)
    .reset_index()
)

In [7]:
# Split the reviews into a training part and a hold-out part with the
# project's splitter.
# NOTE(review): no random seed is passed; if `frac` drives random sampling the
# split (and every metric below) will differ between runs — confirm.
train_reviews, test_reviews = train_test_split(
    reviews,
    users_df,
    min_user_reviews=2,
    min_ts=700,
    frac=0.5,
)
print(len(train_reviews), len(test_reviews))

3617038 14583


In [8]:
# Organisation attribute tables consumed by the AttrEncoders cells below.
features, rubrics = load_features(orgs_df=orgs_df), load_rubrics(orgs_df=orgs_df)

In [10]:
# Fit the item-to-item model on the training reviews and predict for the
# hold-out users.
pandarallel.initialize(progress_bar=False)

# Tunables: minimum number of reviews per organisation and the `l2` value
# handed to `ease_solution`.
mr=50
l2=1e5

result, encs = prepare_reviews_i2i(
    train_reviews,
    users_df,
    orgs_df,
    min_reviews_per_user=2,
    min_org_reviews=mr,
    min_travels_reviews=4,
    min_org_score=4.0,
)
m = reviews_matrix(result, encs)

# Use `@` for the matrix product: `*` only means matrix multiplication for the
# legacy scipy.sparse matrix / np.matrix types and is element-wise for sparse
# arrays and ndarrays.
cc_mat = m.T @ m

ease_sim = ease_solution(cc_mat, l2=l2).T
ease_preds = i2i_predict(ease_sim, test_reviews, encs, N=60)

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [13]:
# Attribute-based recommender driven by organisation rubrics.
rubrics_enc = AttrEncoders(
    orgs_df,
    rubrics,
    colname='rubric_id',
    org_colname='rubrics_id',
)
rubrics_enc.build_attr_org_mat(ease_sim, encs)

# Progress bar is switched on only for the prediction step.
pandarallel.initialize(progress_bar=True)
rubrics_recs = rubrics_enc.attr_predict(test_reviews, encs, N=60)
pandarallel.initialize(progress_bar=False)

# Recall at two cut-offs over the whole hold-out set.
for N in (20, 60):
    print(f"{N} {recall(test_reviews, rubrics_recs, N=N)}")

# Recall (with recall's default N) split by the length of each user's
# `org_id` list; the length series is computed once and reused.
org_list_len = test_reviews.org_id.str.len()
for t_size in (1, 3, 5, 10, 20):
    test_preds = test_reviews[org_list_len >= t_size]
    print(f">={t_size} {recall(test_preds, rubrics_recs)}")
    test_preds = test_reviews[org_list_len <= t_size]
    print(f"<={t_size} {recall(test_preds, rubrics_recs)}")

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3646), Label(value='0 / 3646'))), …

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.
20 0.1309772600739849
60 0.20746913555994764
>=1 0.1309772600739849
<=1 0.12382545981062901
>=3 0.13071293226369457
<=3 0.13307796843744685
>=5 0.1283614103024054
<=5 0.1323972005891526
>=10 0.11931865723469234
<=10 0.1345211036820662
>=20 0.10889422399056625
<=20 0.1331644535391358


In [16]:
# Attribute-based recommender driven by organisation features.
feature_enc = AttrEncoders(
    orgs_df,
    features,
    colname='feature_id',
    org_colname='features_id',
)
feature_enc.build_attr_org_mat(ease_sim, encs)
feature_preds = feature_enc.attr_predict(test_reviews, encs, N=60)

# Recall at two cut-offs over the whole hold-out set.
for N in (20, 60):
    print(f"{N} {recall(test_reviews, feature_preds, N=N)}")

# Recall (with recall's default N) split by the length of each user's
# `org_id` list; the length series is computed once and reused.
org_list_len = test_reviews.org_id.str.len()
for t_size in (1, 3, 5, 10, 20):
    test_preds = test_reviews[org_list_len >= t_size]
    print(f">={t_size} {recall(test_preds, feature_preds)}")
    test_preds = test_reviews[org_list_len <= t_size]
    print(f"<={t_size} {recall(test_preds, feature_preds)}")

20 0.13065904776002113
60 0.20394763114360046
>=1 0.13065904776002113
<=1 0.12069887998975178
>=3 0.13150241013473077
<=3 0.13162712593976408
>=5 0.12893773279238033
<=5 0.13155598770521038
>=10 0.11921733641077853
<=10 0.1342451719206995
>=20 0.1089949639100563
<=20 0.13283549105969455


In [None]:
# Refit the item-to-item model, logging the settings, and free the large
# intermediate matrices before predicting for the hold-out users.
mr=50
l2=1e5

result, encs = prepare_reviews_i2i(
    train_reviews,
    users_df,
    orgs_df,
    min_reviews_per_user=2,
    min_org_reviews=mr,
    min_travels_reviews=4,
    min_org_score=4.0,
)
m = reviews_matrix(result, encs)

# Use `@` for the matrix product: `*` only means matrix multiplication for the
# legacy scipy.sparse matrix / np.matrix types and is element-wise for sparse
# arrays and ndarrays.
cc_mat = m.T @ m
print(f"mr: {mr} {len(result)}")
print(f"l2: {l2}")
ease_sim = ease_solution(cc_mat, l2=l2).T

# The interaction and co-occurrence matrices are no longer needed once the
# similarity matrix exists; drop them before predicting.
del m, cc_mat
gc.collect()

ease_preds = i2i_predict(ease_sim, test_reviews, encs, N=60)

# Disabled evaluation, kept for reference.
# NOTE(review): the `mnap` call below reads `test_preds`, which is only set
# inside the commented-out loop — define it explicitly before re-enabling.
# for N in [20,60]:
#     print(f"{N} {recall(test_reviews, ease_preds, N=N)}")
# for t_size in [1,3,5,10,20]:
#     test_preds = test_reviews[test_reviews.org_id.str.len()>=t_size]
#     print(f">={t_size} {recall(test_preds, ease_preds)}")
#     test_preds = test_reviews[test_reviews.org_id.str.len()<=t_size]
#     print(f"<={t_size} {recall(test_preds, ease_preds)}")

# mnap_score = mnap(test_preds, ease_preds)
# print_score(mnap_score)

In [None]:
ease_preds = i2i_predict(ease_sim, test_users_df, encs)
validate_preds(fallback_with_top_recs(ease_preds, orgs_df), orgs_df, users_df)

In [None]:
save_predictions(fallback_with_top_recs(ease_preds, orgs_df), path='ease_ans.csv')

In [21]:
# Peek at the first rows of the hold-out frame.
test_reviews.head(n=5)

Unnamed: 0,user_id,org_id,target,city
0,10001706002943608531,"[15250345250621165867, 13567855329363715883, 6...",[8333731978912449416],msk
1,10002747919095316231,[12710385033822863571],[16781089025309172995],msk
2,10004407154823998155,[6086962021284491292],[17100111766319823484],msk
3,10005883421366692807,"[9104453017196776235, 16785419493666881395, 16...","[17214645308532312848, 18082340557576449859]",msk
4,10006413168486885833,"[338111669548333930, 10953068897945996446, 170...",[8022186758461891078],msk


In [22]:
# Compare EASE and feature-based predictions on hold-out users whose `org_id`
# list has fewer than 3 entries, and summarise the resulting columns.
# NOTE(review): `ease_preds` is expected to hold predictions for the
# `test_reviews` users here — confirm it has not been reassigned since.
compare_ranks(
    test_reviews[test_reviews.org_id.str.len() < 3],
    ease_preds,
    feature_preds,
    ['ease', 'features'],
).describe()

Unnamed: 0,ease,features,ease_features,imp_ease,imp_features
count,5268.0,5268.0,5268.0,5268.0,5268.0
mean,0.190585,0.254556,0.317008,0.062453,0.126424
std,0.441499,0.49521,0.543284,0.259417,0.355542
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,1.0,0.0,0.0
max,4.0,4.0,4.0,3.0,4.0


In [24]:
# Compare EASE and rubric-based predictions on hold-out users whose `org_id`
# list has fewer than 3 entries, and summarise the resulting columns.
# NOTE(review): `ease_preds` is expected to hold predictions for the
# `test_reviews` users here — confirm it has not been reassigned since.
compare_ranks(
    test_reviews[test_reviews.org_id.str.len() < 3],
    ease_preds,
    rubrics_recs,
    ['ease', 'rubrics'],
).describe()

Unnamed: 0,ease,rubrics,ease_rubrics,imp_ease,imp_rubrics
count,5268.0,5268.0,5268.0,5268.0,5268.0
mean,0.190585,0.264047,0.324412,0.060364,0.133827
std,0.441499,0.505469,0.550572,0.252124,0.368354
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,1.0,0.0,0.0
max,4.0,4.0,4.0,3.0,4.0
