In [51]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# (Re)load the autoreload extension; mode 2 re-imports edited modules before each cell runs,
# so changes to the local helper modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [7]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import sparse
import gc
import catboost

In [147]:
# Show up to 1000 columns when a DataFrame is displayed, so wide frames are not truncated.
pd.options.display.max_columns = 1000

In [288]:
from load_functions import *
from transform_functions import *
from score_functions import *
from i2i_functions import *
from attributes_recs import *
from learn_to_rank import *
from pandarallel import pandarallel
# Initialise pandarallel (parallel pandas apply) with progress bars enabled.
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [5]:
# Test-set user ids, read as strings; header=0 together with names= replaces the
# file's own header row with the single column name `user_id`.
test_users = pd.read_csv('test_users.csv', names=['user_id'], header=0, dtype=str)

# Project loaders for the user and organisation tables.
users_df = load_users('users.csv', test_users)
orgs_df = load_orgs('organisations.csv')

# Id and city of every user flagged as belonging to the test set.
test_users_with_city = users_df.loc[users_df['in_test'] == 1, ['user_id', 'city']]

print(len(test_users), len(users_df))
orgs_df.head()

16967 1252801


Unnamed: 0,org_id,city,average_bill,rating,rubrics_id,features_id,combined_id
0,16848414477362211020,spb,1000.0,4.479702,"[30776, 31375]","[1018, 1509, 11177, 11617, 11629, 11704, 11867...","[30776, 31375, 1018, 1509, 11177, 11617, 11629..."
1,1430604733320164116,spb,1000.0,4.514509,"[30776, 30770]","[246, 1018, 11617, 11629, 11704, 11867, 20422,...","[30776, 30770, 246, 1018, 11617, 11629, 11704,..."
2,9880309324224147401,spb,1000.0,3.884615,"[30770, 30774]","[1018, 11177, 11617, 11629, 11704, 11867, 2042...","[30770, 30774, 1018, 11177, 11617, 11629, 1170..."
3,5617879987171966456,spb,1000.0,0.0,"[30774, 30775]","[1018, 1509, 10596, 11177, 11629, 11634, 11704...","[30774, 30775, 1018, 1509, 10596, 11177, 11629..."
4,5241461680470612149,spb,1000.0,4.532468,[30776],"[1018, 11177, 11617, 11629, 11704, 11867, 2042...","[30776, 1018, 11177, 11617, 11629, 11704, 1186..."


In [6]:
# Load all reviews via the project loader, which is also handed the user and organisation tables.
reviews = load_reviews("reviews.csv", users_df=users_df, orgs_df=orgs_df)
print(len(reviews))
reviews.head()

3640835


Unnamed: 0,user_id,org_id,rating,ts,aspects,good,user_city,in_test,org_city,travel
0,16998268288908323644,7184895086928047809,2.0,105,[],0,msk,0,msk,0
1,3121447338909258868,7184895086928047809,5.0,464,[],1,msk,0,msk,0
2,1970649778250883025,7184895086928047809,3.0,789,[],0,msk,0,msk,0
3,7554889464530643866,7184895086928047809,4.0,936,[],1,msk,0,msk,0
4,15907910894057053620,7184895086928047809,1.0,1143,[],0,msk,0,msk,0


In [178]:
# Attach every positively rated ("good") review to the test users. The left join with
# indicator=True marks users that have no good review at all as 'left_only'.
good_reviews = reviews.loc[reviews.good > 0, ['user_id', 'org_id']]
test_user_reviews = test_users_with_city.merge(
    good_reviews, on='user_id', how='left', indicator=True
)

# Users with at least one good review: collect their org ids into one list per user.
good_test_users = (
    test_user_reviews[test_user_reviews["_merge"] == 'both']
    .groupby(['user_id', 'city'])
    .agg(org_id=pd.NamedAgg("org_id", aggfunc=list))
    .reset_index()
)

# Users without any good review get an empty list. `.assign` builds a new frame instead of
# writing a column into a slice of another frame, which avoids SettingWithCopyWarning.
empty_test_users = (
    test_user_reviews
    .loc[test_user_reviews["_merge"] == 'left_only', ['user_id', 'city']]
    .assign(org_id=lambda frame: [[] for _ in range(len(frame))])
)

# One row per test user; the two printed counts are expected to match.
final_users_df = pd.concat([good_test_users, empty_test_users], ignore_index=True)
print(len(final_users_df), len(test_users_with_city))

16967 16967


In [179]:
# Preview the per-user table: one row per test user with the list of org ids they rated well.
final_users_df.head()

Unnamed: 0,user_id,city,org_id
0,10003390898943363405,msk,"[1703593138705417941, 7168943163090334846, 933..."
1,10003578788635255367,spb,"[12784724396885152442, 5756313314324569869, 97..."
2,10004443955392665333,msk,"[11165088907395841018, 7923904392511338698, 18..."
3,10004634860212544956,msk,"[13573322486152844808, 13573322486152844808, 5..."
4,10004870073377911470,msk,[14942944192619480320]


In [9]:
# Split the reviews into a training part and a held-out part for offline evaluation.
# NOTE(review): this is presumably the project helper pulled in by the star imports
# (the signature is not sklearn's) — confirm whether the split is seeded/deterministic.
train_reviews, test_reviews = train_test_split(
    reviews, users_df, min_ts=500, frac=0.5
)
print(len(train_reviews), len(test_reviews))

3602574 22562


In [87]:
# Re-initialise pandarallel without progress bars for the heavy item-to-item steps.
pandarallel.initialize(progress_bar=False)

# Hyper-parameters: minimum reviews an organisation needs to be kept, and the
# regularisation strength handed to ease_solution.
mr=50
l2=1e5

# Filter the training reviews and build the id encoders used by the matrix helpers.
result, encs = prepare_reviews_i2i(train_reviews, orgs_df,
                                   min_reviews_per_user=2,
                                   min_org_reviews=mr,
                                   min_travels_reviews=2,
                                   min_org_score=4.0)
print(len(result))

# Interaction matrix and its Gram (co-occurrence) matrix. `@` is always a matrix product,
# whereas `*` is only one for scipy.sparse *matrix* classes (it is element-wise for
# sparse arrays and ndarrays).
# NOTE(review): presumably `m` is users x orgs — confirm against reviews_matrix.
m = reviews_matrix(result, encs)
cc_mat = m.T @ m

# Item-to-item weights from the Gram matrix, then top-20 recommendations for the held-out users.
ease_sim = ease_solution(cc_mat, l2=l2).T
ease_preds = i2i_predict(ease_sim, test_reviews, encs, N=20)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
1275439


In [188]:
# Re-initialise pandarallel without progress bars, then produce 40 candidate
# recommendations per real test user from the item-to-item weights.
pandarallel.initialize(progress_bar=False)
draft_preds = i2i_predict(ease_sim, final_users_df, encs, N=40)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [189]:
# Post-process the candidates with the project fallback helper, using organisation data enriched from all reviews.
# NOTE(review): presumably fills users lacking candidates with top organisations — confirm in fallback_with_top_recs.
# NOTE(review): rebinds draft_preds, so re-running this cell applies the fallback to already-processed predictions.
draft_preds = fallback_with_top_recs(draft_preds, enrich_orgs(orgs_df, reviews))

In [15]:
# Join organisation attributes onto each review (inner join on org_id). The org `city` column is
# dropped first; other column names present in both frames keep the review's name and get `_org` for the org side.
# NOTE(review): reviews_orgs is not referenced by any other visible cell — confirm it is still needed.
reviews_orgs = reviews.merge(orgs_df.drop(columns='city'), on='org_id',suffixes=("","_org"))

In [275]:
# Builder for the ranking datasets, constructed from the organisation table and the full review table.
# NOTE(review): it receives all of `reviews`, while the ranking labels below come from `test_reviews`
# (presumably a subset of it) — confirm the features cannot leak the held-out reviews.
pool_builder = PoolBuilder(orgs_df, reviews)

In [276]:
# Build the ranking dataset from the EASE candidates for the held-out users and their held-out reviews.
train_data = pool_builder.build_pool(ease_preds, test_reviews)

In [294]:
# Split the ranking dataset into a training part and a validation part.
# NOTE(review): `train_data` is both the input and an output, so this cell is not idempotent —
# re-running it without re-running the build_pool cell splits the already-split training part again.
train_data, valid_data = split_4_ranking(train_data)

In [295]:
# Convert both splits into pool objects for the ranker (train first, then validation).
train_pool, valid_pool = (
    train_data_to_pool(split) for split in (train_data, valid_data)
)

In [296]:
def cb_params():
    """Return a fresh dict of CatBoost parameters for the ranking model.

    A new dict (with a new `custom_metric` list) is built on every call, so
    callers may mutate the result without affecting later calls.
    """
    params = dict(
        iterations=2000,
        custom_metric=['PFound:top=10'],
        loss_function="YetiRank",
        metric_period=100,
    )
    return params

In [297]:
# Train the CatBoost ranker on the training pool, tracking metrics on the validation pool.
# The class is reached through the explicitly imported `catboost` module, so this cell does
# not depend on the bare name `CatBoost` leaking in through one of the star imports.
model = catboost.CatBoost(cb_params())
model.fit(train_pool, plot=True, eval_set=valid_pool, verbose=False)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoost at 0x7f45bbc2afd0>

In [None]:
# Offline check: score the validation pool with the trained model and compute the two quality metrics.
# NOTE(review): the metrics are computed against all of test_reviews although only the validation
# split was scored — confirm mnap/recall restrict themselves to users present in valid_preds.
valid_preds = predict(model, valid_pool, valid_data.groups, valid_data.org_ids)
mnap(test_reviews, valid_preds), recall(test_reviews, valid_preds)

In [298]:
# Build the ranking dataset for the real test users from their candidate recommendations.
final_test_data = pool_builder.build_test_pool(draft_preds)

# `test_pool` was not defined by any visible cell, so on a fresh kernel the predict call
# raised NameError. Build it the same way the train/validation pools are built.
# NOTE(review): assumes train_data_to_pool accepts the output of build_test_pool — confirm.
test_pool = train_data_to_pool(final_test_data)

# Re-rank the candidates with the trained model.
final_preds = predict(model, test_pool, final_test_data.groups, final_test_data.org_ids)

MemoryError: Unable to allocate 1.02 GiB for an array with shape (227, 600417) and data type float64

In [233]:
# Sanity-check the final predictions with the project validator (N=20) before saving.
# NOTE(review): presumably verifies the submission format/constraints — confirm in validate_preds.
validate_preds(final_preds, orgs_df, users_df, N=20)

All good


In [234]:
# Write the re-ranked predictions to rerank_ans.csv via the project helper.
save_predictions(final_preds, "rerank_ans.csv")