In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# (Re)load the autoreload extension and switch it to mode 2, so edits to the
# imported project modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import sparse
import gc

In [136]:
# Let pandas display up to 1000 columns before truncating wide frames.
pd.set_option("display.max_columns", 1000)

In [115]:
from load_functions import *
from transform_functions import *
from score_functions import *
from i2i_functions import *
from attributes_recs import *
from learn_to_rank import *
from pandarallel import pandarallel
# Initialise pandarallel with its progress bar enabled.
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [6]:
# Read the test-user ids as strings; with header=0 and names=[...] the file's
# own header row is replaced by the single column name `user_id`.
test_users = pd.read_csv('test_users.csv', dtype=str, names=['user_id'],header=0)
users_df = load_users('users.csv', test_users)
orgs_df = load_orgs('organisations.csv')
# NOTE(review): `reviews` is first assigned in the following cell
# (load_reviews), which itself takes `orgs_df` as an argument. On a fresh
# kernel this line raises NameError; it only works with leftover kernel state.
# The intended load_reviews / enrich_orgs order must be confirmed and made
# explicit before this notebook can pass Restart & Run All.
orgs_df=enrich_orgs(orgs_df, reviews)
combined_features = load_combined(["rubrics.csv","features.csv"], orgs_df=orgs_df)
# Keep only the users flagged in_test == 1, with their id and city.
test_users_with_city = users_df[users_df.in_test==1][['user_id','city']]
# Sanity check: number of test users vs. number of all users.
print(len(test_users),len(users_df))

16967 1252801


Unnamed: 0,org_id,city,average_bill,rating,rubrics_id,features_id,combined_id
0,16848414477362211020,spb,1000.0,4.479702,"[30776, 31375]","[1018, 1509, 11177, 11617, 11629, 11704, 11867...","[30776, 31375, 1018, 1509, 11177, 11617, 11629..."
1,1430604733320164116,spb,1000.0,4.514509,"[30776, 30770]","[246, 1018, 11617, 11629, 11704, 11867, 20422,...","[30776, 30770, 246, 1018, 11617, 11629, 11704,..."
2,9880309324224147401,spb,1000.0,3.884615,"[30770, 30774]","[1018, 11177, 11617, 11629, 11704, 11867, 2042...","[30770, 30774, 1018, 11177, 11617, 11629, 1170..."
3,5617879987171966456,spb,1000.0,,"[30774, 30775]","[1018, 1509, 10596, 11177, 11629, 11634, 11704...","[30774, 30775, 1018, 1509, 10596, 11177, 11629..."
4,5241461680470612149,spb,1000.0,4.532468,[30776],"[1018, 11177, 11617, 11629, 11704, 11867, 2042...","[30776, 1018, 11177, 11617, 11629, 11704, 1186..."


In [7]:
# Load the reviews (the loader is given the users and organisations frames),
# print the row count and preview the first rows.
reviews = load_reviews(
    "reviews.csv",
    users_df=users_df,
    orgs_df=orgs_df,
)
print(len(reviews))
reviews.head()

3640835


Unnamed: 0,user_id,org_id,rating,ts,aspects,good,user_city,in_test,org_city,travel
0,16998268288908323644,7184895086928047809,2.0,105,[],0,msk,0,msk,0
1,3121447338909258868,7184895086928047809,5.0,464,[],1,msk,0,msk,0
2,1970649778250883025,7184895086928047809,3.0,789,[],0,msk,0,msk,0
3,7554889464530643866,7184895086928047809,4.0,936,[],1,msk,0,msk,0
4,15907910894057053620,7184895086928047809,1.0,1143,[],0,msk,0,msk,0


In [45]:
# Build one row per test user: user_id, city and the list of organisations the
# user reviewed positively (reviews.good > 0). Users without any such review
# get an empty list.
test_users_df = test_users_with_city.merge(
    reviews[reviews.good > 0][['user_id', 'org_id']],
    on='user_id',
    how='left',
    indicator=True,  # adds `_merge`: 'both' = has a good review, 'left_only' = none
)

# Users with at least one good review: collapse their org ids into a list.
# NOTE: groupby drops rows whose key is NaN by default, so a test user with a
# missing city would vanish here; the length check printed at the end of this
# cell is what guards against that.
good_test_users = (
    test_users_df[test_users_df["_merge"] == 'both']
    .groupby(['user_id', 'city'])
    .agg(org_id=pd.NamedAgg("org_id", aggfunc=list))
    .reset_index()
)

# Users with no good review. `.loc[...]` + `.copy()` yields an independent
# frame, so adding the column below cannot trigger SettingWithCopyWarning.
empty_test_users = test_users_df.loc[
    test_users_df["_merge"] == 'left_only', ['user_id', 'city']
].copy()
# One distinct empty list per row (a shared list object would be aliased).
empty_test_users['org_id'] = [[] for _ in range(len(empty_test_users))]

final_test_users_df = pd.concat([good_test_users, empty_test_users], ignore_index=True)

# Free the intermediates; only `final_test_users_df` survives this cell.
del test_users_df
del good_test_users
del empty_test_users

# Both numbers must match: every test user appears exactly once.
print(len(final_test_users_df), len(test_users_with_city))

16967 16967


In [117]:
# Split the reviews into a training part and a held-out part.
# `train_test_split` is called with the project-specific arguments
# (reviews, users_df, min_ts, frac), so it is presumably the project helper
# from the wildcard imports rather than scikit-learn's function — confirm.
train_reviews, draft_test_reviews = train_test_split(reviews, users_df,
                                               min_ts=900,
                                               frac=1)
# Report the sizes of the two parts just produced. The original printed
# `len(test_reviews)`, a name that is never defined in this notebook.
print(len(train_reviews), len(draft_test_reviews))

3610239 19009


In [126]:
# Fit the item-to-item similarity on the training reviews and measure recall
# of the resulting candidates on the held-out users.
pandarallel.initialize(progress_bar=False)

mr = 25   # passed as min_org_reviews below
l2 = 1e5  # passed as the l2 argument of ease_solution

result, encs = prepare_reviews_i2i(
    train_reviews,
    orgs_df,
    min_reviews_per_user=2,
    min_org_reviews=mr,
    min_travels_reviews=1,
    min_org_score=3.0,
)
print(len(result))

m = reviews_matrix(result, encs)
# NOTE(review): `*` is a matrix product only when `m` is a scipy sparse
# matrix (or np.matrix); confirm the type returned by reviews_matrix.
cc_mat = m.T * m
ease_sim = ease_solution(cc_mat, l2=l2).T

draft_ease_preds = i2i_predict(ease_sim, draft_test_reviews, encs, N=60)

# Recall at two cut-offs over all held-out users.
for N in (20, 60):
    print(f"{N} {recall(draft_test_reviews, draft_ease_preds, N=N)}")

# Recall restricted to users whose org_id entry has at most t_size elements.
for t_size in (1, 3, 5, 10, 20, 50):
    draft_test_preds = draft_test_reviews[
        draft_test_reviews.org_id.str.len() <= t_size
    ]
    print(f"<={t_size} {recall(draft_test_preds, draft_ease_preds)}")

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
1674332
20 0.13152327747075046
60 0.2000749030094278
<=1 0.06196100149040561
<=3 0.091753877736454
<=5 0.10568400108659293
<=10 0.12116398332259087
<=20 0.1273448294049524
<=50 0.13060700325430177


In [128]:
# Attribute-based candidates: build the encoder over the `combined_id`
# column, derive its attribute/organisation matrix from the i2i similarity,
# and measure recall on the held-out users.
combined_enc = AttrEncoders(
    orgs_df,
    combined_features,
    colname='combined_id',
    org_colname='combined_id',
)
combined_enc.build_attr_org_mat(ease_sim, encs)

pandarallel.initialize(progress_bar=False)
draft_combined_recs = combined_enc.attr_predict(draft_test_reviews, encs, N=60)

pandarallel.initialize(progress_bar=False)
# Recall at two cut-offs over all held-out users.
for N in (20, 60):
    print(f"{N} {recall(draft_test_reviews, draft_combined_recs, N=N)}")

# Recall restricted to users whose org_id entry has at most t_size elements.
for t_size in (1, 3, 5, 10, 20):
    draft_test_preds = draft_test_reviews[
        draft_test_reviews.org_id.str.len() <= t_size
    ]
    print(f"<={t_size} {recall(draft_test_preds, draft_combined_recs)}")

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
20 0.09772319639011641
60 0.17052159675644019
>=1 0.083498712084361
>=3 0.09304996097159847
>=5 0.0946435209899824
>=10 0.09673217614114872
>=20 0.09704418651696965


In [132]:
# Combine the i2i and attribute-based candidates, pass the result through
# fallback_with_top_recs, then validate it.
draft_train_preds = fallback_with_top_recs(
    concat_preds(draft_ease_preds, draft_combined_recs),
    orgs_df,
)
validate_preds(draft_train_preds, orgs_df, users_df, N=-1)

All good


In [138]:
# Build the learning-to-rank data from the candidate recommendations and the
# held-out reviews.
# NOTE(review): PoolBuilder receives the full `reviews` frame rather than
# `train_reviews`; if it derives features from reviews this could leak the
# held-out interactions into training — confirm.
pool_builder = PoolBuilder(orgs_df, reviews)
final_train_data = pool_builder.build_pool(draft_train_preds, draft_test_reviews)

In [140]:
# Split the ranking data into a training part and a validation part.
train_ranking, valid_ranking = split_4_ranking(final_train_data)

In [142]:
# Convert both ranking splits to pools with the same helper, training split first.
train_pool, valid_pool = map(train_data_to_pool, (train_ranking, valid_ranking))

In [143]:
def cb_params(iterations=2000, top=20, loss_function="YetiRank", metric_period=200):
    """Build the parameter dict passed to the CatBoost ranking model.

    Called without arguments it returns exactly the configuration this
    notebook was run with; the keyword arguments allow experiments without
    editing the function.

    Args:
        iterations: Value stored under ``'iterations'``.
        top: Cut-off inserted into the ``PFound:top=<top>`` custom metric.
        loss_function: Value stored under ``'loss_function'``.
        metric_period: Value stored under ``'metric_period'``.

    Returns:
        A new dict on every call, so callers may modify it freely.
    """
    return {
        'iterations': iterations,
        'custom_metric': [f'PFound:top={top}'],
        'loss_function': loss_function,
        'metric_period': metric_period,
    }

In [144]:
# NOTE(review): `CatBoost` is never imported by name in this notebook;
# presumably it arrives through one of the wildcard imports — confirm and
# import it explicitly.
model = CatBoost(cb_params())
# Train on train_pool with valid_pool as the evaluation set. plot=True shows
# the metric chart; verbose=False suppresses the text log.
model.fit(train_pool, plot=True,eval_set=valid_pool, verbose=False)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoost at 0x7efb73360b80>

In [145]:
# Score the validation pool with the trained ranker.
valid_scores = model.predict(valid_pool)

# Store the scores on the validation pairs (adds/overwrites the 'score'
# column of valid_ranking.pairs in place).
# Assumes predict returns one score per row of `pairs`, in the same order — TODO confirm.
valid_ranking.pairs['score'] = valid_scores


In [147]:
# Display the group ids attached to the validation ranking data.
valid_ranking.groups

array(['10007984015698932734', '10007984015698932734',
       '10007984015698932734', ..., '9994502366416070018',
       '9994502366416070018', '9994502366416070018'], dtype=object)

In [None]:
# i2i candidates for the real test users.
# `test_users_df` is deleted in the cell that builds `final_test_users_df`,
# so the original argument raised NameError on a fresh kernel; the per-user
# frame with the `org_id` lists is `final_test_users_df`.
# NOTE(review): the following cell repeats this exact call; one of the two
# can be dropped.
draft_final_ease_preds = i2i_predict(ease_sim, final_test_users_df, encs, N=40)

In [148]:
# Candidate generation for the real test users, mirroring the draft pipeline
# (i2i + attribute recs, merged, then passed through fallback_with_top_recs).
# `test_users_df` is deleted in the cell that builds `final_test_users_df`,
# so the per-user frame with the `org_id` lists is used here instead.
# NOTE(review): candidates for training were generated with N=60 but N=40 is
# used here — confirm this difference is intentional.
draft_final_ease_preds = i2i_predict(ease_sim, final_test_users_df, encs, N=40)
draft_final_combined_preds = combined_enc.attr_predict(final_test_users_df, encs, N=40)
draft_final_preds = concat_preds(draft_final_ease_preds, draft_final_combined_preds)
draft_final_preds = fallback_with_top_recs(draft_final_preds, orgs_df)
# Validate the frame built in this cell. The original re-validated
# `draft_train_preds`, so the final candidates were never checked.
validate_preds(draft_final_preds, orgs_df, users_df, N=-1)

Process ForkPoolWorker-1222:
Traceback (most recent call last):
  File "/opt/Anaconda3-5.2.0-Linux-x86_64/envs/py39/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/opt/Anaconda3-5.2.0-Linux-x86_64/envs/py39/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/Anaconda3-5.2.0-Linux-x86_64/envs/py39/lib/python3.9/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/opt/Anaconda3-5.2.0-Linux-x86_64/envs/py39/lib/python3.9/multiprocessing/queues.py", line 366, in get
    res = self._reader.recv_bytes()
  File "/opt/Anaconda3-5.2.0-Linux-x86_64/envs/py39/lib/python3.9/multiprocessing/connection.py", line 221, in recv_bytes
    buf = self._recv_bytes(maxlength)
  File "/opt/Anaconda3-5.2.0-Linux-x86_64/envs/py39/lib/python3.9/multiprocessing/connection.py", line 426, in _recv_bytes
    return self._recv(size)
  File "/opt/Anaconda3-5.2.0-Linux-x86_64/envs/py39/lib/p

KeyboardInterrupt: 

Process ForkPoolWorker-1227:
Process ForkPoolWorker-1224:
Process ForkPoolWorker-1229:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/Anaconda3-5.2.0-Linux-x86_64/envs/py39/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/opt/Anaconda3-5.2.0-Linux-x86_64/envs/py39/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/opt/Anaconda3-5.2.0-Linux-x86_64/envs/py39/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/opt/Anaconda3-5.2.0-Linux-x86_64/envs/py39/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/Anaconda3-5.2.0-Linux-x86_64/envs/py39/lib/python3.9/multiprocessing/process.py", line 315, in 

In [None]:
# Build the test pool from the final candidates and score it with the
# trained ranker.
final_test_data = pool_builder.build_test_pool(draft_final_preds)
final_test_pool = test_data_to_pool(final_test_data)
final_scores = model.predict(final_test_pool)

In [None]:
# Attach the model score, the user id and the candidate organisation id to
# every candidate pair (columns are written onto final_test_data.pairs).
final_test_data.pairs['score'] = final_scores
final_test_data.pairs['user_id'] = final_test_data.groups
final_test_data.pairs['target'] = final_test_data.org_ids

# Order all pairs by descending score, then collect per user the first 20
# entries of each remaining column as lists.
final_preds = (
    final_test_data.pairs[['user_id', 'target', 'score']]
    .sort_values(by='score', ascending=False)
    .groupby('user_id')
    .agg(lambda x: list(x)[:20])
    .reset_index()
)


In [None]:
# Validate the final recommendations with N=20, then write them to
# rerank_ans.csv.
validate_preds(final_preds, orgs_df, users_df, N=20)
save_predictions(final_preds, "rerank_ans.csv")