In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# (Re)load the autoreload extension and set it to mode 2, so edits to the
# project's helper modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import sparse
import gc

In [3]:
# Raise the displayed-column limit to 1000 so wide frames are shown without column truncation.
pd.set_option("display.max_columns", 1000)

In [1]:
from load_functions import *
from transform_functions import *
from score_functions import *
from i2i_functions import *
from attributes_recs import *
from learn_to_rank import *
from pandarallel import pandarallel
# Initialise pandarallel with progress bars enabled; later cells re-initialise it
# with different settings (progress bar off, explicit worker count).
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [5]:
# Evaluation users: a single string column named user_id (the file's own header row is replaced).
test_users = pd.read_csv('test_users.csv', names=['user_id'], header=0, dtype=str)

# Project loaders for the user, organisation and combined attribute tables.
users_df = load_users('users.csv', test_users)
orgs_df = load_orgs('organisations.csv')
combined_features = load_combined(["rubrics.csv", "features.csv"], orgs_df=orgs_df)

# Users flagged in_test == 1, reduced to the two columns used downstream.
is_test_user = users_df.in_test == 1
test_users_with_city = users_df.loc[is_test_user, ['user_id', 'city']]
print(len(test_users), len(users_df))

16967 1252801


In [6]:
# Load the reviews joined against the user and organisation tables.
reviews = load_reviews(
    "reviews.csv",
    users_df=users_df,
    orgs_df=orgs_df,
)
# NOTE: orgs_df is rebound to its enriched version, so re-running this cell
# passes the already-enriched frame back into enrich_orgs.
orgs_df = enrich_orgs(orgs_df, reviews)
print(f"{len(reviews)}")
reviews.head(n=5)

3640835


Unnamed: 0,user_id,org_id,rating,ts,aspects,good,user_city,in_test,org_city,travel
0,16998268288908323644,7184895086928047809,2.0,105,[],0,msk,0,msk,0
1,3121447338909258868,7184895086928047809,5.0,464,[],1,msk,0,msk,0
2,1970649778250883025,7184895086928047809,3.0,789,[],0,msk,0,msk,0
3,7554889464530643866,7184895086928047809,4.0,936,[],1,msk,0,msk,0
4,15907910894057053620,7184895086928047809,1.0,1143,[],0,msk,0,msk,0


In [7]:
# Build one row per test user with the list of organisations they rated as "good";
# users with no such review get an empty list.
liked_pairs = reviews.loc[reviews.good > 0, ['user_id', 'org_id']]
test_users_df = test_users_with_city.merge(liked_pairs, on='user_id', how='left', indicator=True)

# Users that matched at least one good review: collapse their org ids into a list.
good_test_users = (
    test_users_df[test_users_df["_merge"] == 'both']
    .groupby(['user_id', 'city'])
    .agg(org_id=pd.NamedAgg("org_id", aggfunc=list))
    .reset_index()
)

# Users without any good review. Take an explicit copy: assigning a column to a
# slice of test_users_df would trigger SettingWithCopyWarning and is not
# guaranteed to write to an independent frame.
empty_test_users = test_users_df.loc[
    test_users_df["_merge"] == 'left_only', ['user_id', 'city']
].copy()
# One distinct empty list per row (a shared list object would alias across rows).
empty_test_users['org_id'] = [[] for _ in range(len(empty_test_users))]

final_test_users_df = pd.concat([good_test_users, empty_test_users], ignore_index=True)

# Free the intermediates; only final_test_users_df is used afterwards.
del liked_pairs
del test_users_df
del good_test_users
del empty_test_users

# Every test user must appear exactly once (e.g. groupby silently drops rows with a NaN key).
assert len(final_test_users_df) == len(test_users_with_city), \
    "final_test_users_df lost or duplicated test users"
print(len(final_test_users_df), len(test_users_with_city))

16967 16967


In [12]:
# Split the reviews into draft train / test parts using the project helper (min_ts=1000).
draft_split = train_test_split(reviews, min_ts=1000)
draft_train_reviews, draft_test_reviews = draft_split
print(len(draft_train_reviews), len(draft_test_reviews))

9642 9642


In [35]:
# Attach each test user's city. The cell rebinds its own input, so it is guarded:
# running it a second time would otherwise merge again and produce city_x / city_y.
if 'city' not in draft_test_reviews.columns:
    draft_test_reviews = draft_test_reviews.merge(users_df[['user_id','city']],on='user_id')

In [19]:
# Flatten the list-valued columns into one (user, org) pair per row.
train_pairs = (
    draft_train_reviews
    .explode("org_id")
    .drop(columns='city')
)
test_pairs = (
    draft_test_reviews
    .explode("target")
    .rename(columns={'target':'org_id'})
)

# Filtering thresholds forwarded to prepare_reviews_i2i.
i2i_filters = {
    'min_reviews_per_user': 2,
    'min_org_reviews': 5,
    'min_travels_reviews': 1,
    'min_org_score': 3.5,
}
result, encs = prepare_reviews_i2i(reviews, orgs_df, train_pairs, test_pairs, **i2i_filters)

# Coverage check: which target organisations survived into the org encoder.
may_be_in_draft = set(encs.orgs_enc.classes_)
should_be_in_draft = set(draft_test_reviews.target.explode())
print(len(result))
print(f"{len(may_be_in_draft)},{len(should_be_in_draft)},{len(may_be_in_draft&should_be_in_draft)/len(should_be_in_draft)}")

1747354
17918,4627,0.8113248325048628


In [20]:
# Fit the EASE item-item model on the filtered reviews and evaluate it on the draft split.
pandarallel.initialize(progress_bar=False, nb_workers=2)
l2=1.0e5  # regularisation strength passed to ease_solution
m = reviews_matrix(result, encs)
# Matrix product of the transposed matrix with itself. `@` is used instead of `*`
# because `*` is only a matrix product for scipy.sparse *matrix* classes; for
# sparse arrays / ndarrays it is element-wise.
cc_mat = m.T @ m
ease_sim = ease_solution(cc_mat, l2=l2).T
# Drop the large intermediates; only ease_sim is needed from here on.
del m
del cc_mat
draft_ease_preds = i2i_predict(ease_sim, draft_train_reviews, encs, N=60)
print(f"MNAP: {mnap(draft_test_reviews, draft_ease_preds, N=20)}")
for N in [20,60]:
    print(f"{N} {recall(draft_test_reviews, draft_ease_preds, N=N).mean()}")
# for t_size in [1,3,5,10,20,50]:
#     draft_test_preds = draft_test_reviews[draft_train_reviews.org_id.str.len()<=t_size]
#     print(f"<={t_size} {recall(draft_test_preds, draft_ease_preds).mean()}")

INFO: Pandarallel will run on 2 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
MNAP: 0.04311796675660804
20 0.1275304053081298
60 0.1945152264416404
<=1 0.08361167527872167
<=3 0.10107414876987618
<=5 0.11308687368534295
<=10 0.12143733049164172
<=20 0.12525100761809815
<=50 0.12702712169116087


In [21]:
# EASE-only submission: predict 20 items per test user, pad with fallback
# recommendations, validate, and write the answer file.
ease_top_n = 20
draft_final_ease_preds = i2i_predict(ease_sim, final_test_users_df, encs, N=ease_top_n)
draft_final_preds = fallback_with_top_recs(draft_final_ease_preds, orgs_df)
validate_preds(draft_final_preds, orgs_df, users_df, N=ease_top_n)
save_predictions(draft_final_preds, "ease_draft_ans.csv")

All good


In [None]:
# Attribute-based recommender built on top of the EASE similarities, evaluated on the draft split.
combined_enc = AttrEncoders(orgs_df, combined_features,
                        colname='combined_id',
                        org_colname='combined_id')
combined_enc.build_attr_org_mat(ease_sim, encs)
pandarallel.initialize(progress_bar=False)
draft_combined_recs = combined_enc.attr_predict(draft_test_reviews, encs, N=60)
# Score the attribute recommendations themselves (previously this line scored
# draft_ease_preds and relied on an `N` leaked from an earlier cell's loop).
print(f"{mnap(draft_test_reviews, draft_combined_recs, N=20)}")
for N in [20,60]:
    # .mean() to report a single number, matching the other evaluation cells.
    print(f"{N} {recall(draft_test_reviews, draft_combined_recs, N=N).mean()}")
for t_size in [1,3,5,10,20]:
    # NOTE(review): other cells read draft_test_reviews.target and draft_train_reviews.org_id;
    # confirm draft_test_reviews actually has an org_id column here.
    draft_test_preds = draft_test_reviews[draft_test_reviews.org_id.str.len()<=t_size]
    print(f"<={t_size} {recall(draft_test_preds, draft_combined_recs).mean()}")

In [None]:
# Attribute-recommender submission: predict 20 items per test user, pad with
# fallback recommendations, validate, and write the answer file.
attr_top_n = 20
draft_final_comb_preds = combined_enc.attr_predict(final_test_users_df, encs, N=attr_top_n)
draft_final_preds = fallback_with_top_recs(draft_final_comb_preds, orgs_df)
validate_preds(draft_final_preds, orgs_df, users_df, N=attr_top_n)
save_predictions(draft_final_preds, "attr_draft_ans.csv")

In [None]:
# Submission from the combination of the EASE and attribute predictions.
merged_drafts = combine_preds(
    final_test_users_df,
    draft_final_ease_preds,
    draft_final_comb_preds,
)
draft_final_preds = fallback_with_top_recs(merged_drafts, orgs_df)
validate_preds(draft_final_preds, orgs_df, users_df, N=20)
save_predictions(draft_final_preds, "merged_draft_ans.csv")

In [None]:
# Evaluate the combined EASE + attribute predictions on the draft split.
test_merged_drafts = combine_preds(draft_test_reviews, draft_ease_preds, draft_combined_recs)
print(f"mnap {mnap(draft_test_reviews, test_merged_drafts, N=20)}")
# .mean() so a single number is printed, consistent with the other evaluation cells.
print(f"recall {recall(draft_test_reviews, test_merged_drafts, N=20).mean()}")

In [None]:
# Print MNAP@20 and mean recall@20, first for the EASE predictions and then for
# the attribute-based ones.
pandarallel.initialize(progress_bar=False)
for candidate_preds in (draft_ease_preds, draft_combined_recs):
    print(f"mnap {mnap(draft_test_reviews, candidate_preds, N=20)}")
    print(f"recall {recall(draft_test_reviews, candidate_preds, N=20).mean()}")

In [None]:
# Concatenate the EASE and attribute candidates (concat_preds called with N=15),
# validate them with N=-1, and report the ranking metrics at 20.
draft_train_preds = concat_preds(draft_ease_preds, draft_combined_recs, N=15)
validate_preds(draft_train_preds, orgs_df, users_df, N=-1)
concat_mnap = mnap(draft_test_reviews, draft_train_preds, N=20)
print(f"mnap {concat_mnap}")
concat_recall = recall(draft_test_reviews, draft_train_preds, N=20).mean()
print(f"recall {concat_recall}")

In [24]:
# Build the ranking pool first: the coverage ratio below reads final_train_data,
# so on a fresh kernel it cannot run before the pool exists.
pool_builder = PoolBuilder(orgs_df, reviews)
final_train_data = pool_builder.build_big_pool(draft_ease_preds, draft_test_reviews)
# Distinct pool groups relative to distinct users that have EASE predictions.
len(np.unique(final_train_data.groups))/len(draft_ease_preds.user_id.unique())

0.17641568139390168

In [37]:
# Split the pooled ranking data into a training part and a validation part.
ranking_parts = split_4_ranking(final_train_data)
train_ranking, valid_ranking = ranking_parts

In [38]:
# Convert both ranking splits to pools, train split first, then validation.
train_pool, valid_pool = (
    train_data_to_pool(ranking_part)
    for ranking_part in (train_ranking, valid_ranking)
)

In [44]:
def cb_params():
    """Return a fresh dict of parameters for the CatBoost ranker.

    The dict sets the YetiRank loss function, PFound:top=2 as a custom
    metric, and a metric period of 50. A new dict (and a new metric list)
    is created on every call, so callers may modify the result freely.
    """
    params = dict(custom_metric=['PFound:top=2'])
    params['loss_function'] = "YetiRank"
    params['metric_period'] = 50
    return params

In [45]:
# Train the ranker on the training pool, using the validation pool as eval set;
# plot=True requests the training plot and verbose=False turns off log output.
ranker_params = cb_params()
model = CatBoost(ranker_params)
model.fit(
    train_pool,
    eval_set=valid_pool,
    plot=True,
    verbose=False,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoost at 0x7fa71ca06ee0>

In [46]:
# Score the reranked validation predictions, then the raw EASE predictions, with
# the same two metrics (MNAP@20 and mean recall@20).
# NOTE(review): valid_preds is built from the validation split only while
# draft_ease_preds is the full prediction set — confirm the two rows of numbers
# are computed over the same users before comparing them.
pandarallel.initialize(progress_bar=False)
valid_preds = cb_predict(model, valid_pool, valid_ranking.groups, valid_ranking.org_ids)
for scored_preds in (valid_preds, draft_ease_preds):
    print(f"mnap {mnap(draft_test_reviews, scored_preds, N=20)}")
    print(f"recall {recall(draft_test_reviews, scored_preds, N=20).mean()}")

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
mnap 0.662208087708563
recall 0.8964784271461503
mnap 0.04311796675660804
recall 0.1275304053081298


In [None]:
# Run a full garbage-collection pass before the next cell builds the test pool.
gc.collect()

In [48]:
# Build the pool the trained ranker will score for the real test users.
# NOTE(review): the recorded output of this cell ends in MemoryError; the output
# does not show which statement raised it.
pandarallel.initialize(progress_bar=False, nb_workers=2)

# Alternative candidate source kept for reference:
# draft_final_preds = concat_preds(draft_final_ease_preds, draft_final_comb_preds)
rerank_candidates_n = 60
draft_final_ease_preds = i2i_predict(ease_sim, final_test_users_df, encs, N=rerank_candidates_n)
draft_final_preds = fallback_with_top_recs(draft_final_ease_preds, orgs_df)
validate_preds(draft_final_preds, orgs_df, users_df, N=-1)

final_test_data = pool_builder.build_test_pool(draft_final_preds)
final_test_pool = test_data_to_pool(final_test_data)

INFO: Pandarallel will run on 2 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


MemoryError: 

In [None]:
# Score the test pool with the trained ranker and write the reranked submission.
# final_scores is not used again in this cell; it is kept in case it is inspected separately.
final_scores = model.predict(final_test_pool)
final_preds = cb_predict(model, final_test_pool, final_test_data.groups, final_test_data.org_ids)
validate_preds(final_preds, orgs_df, users_df, N=20)
# Save the reranked predictions that were just validated. Previously this wrote
# draft_final_preds (the un-reranked candidates) under the "reranked" file name.
save_predictions(final_preds, "reranked_logit_ans.csv")

In [43]:
# Compare the reranked predictions with the EASE candidates they were built from;
# presumably shows one user's new vs. old lists side by side — confirm against compare_preds.
compare_preds(final_preds, draft_final_ease_preds)

1379606939281785570


Unnamed: 0,new,old
0,12784724396885152442,12046097390037935713
1,13351482607452884539,6838233943148091808
2,707903906558057894,14814427257061788801
3,6838233943148091808,3182217017434418454
4,8790571376065654670,5002407858008059043
5,12909766905446674254,2070377783033138991
6,14814427257061788801,12784724396885152442
7,11303035368518888098,707903906558057894
8,12046097390037935713,11303035368518888098
9,5756313314324569869,8790571376065654670
