In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import sparse
import gc

In [74]:
from load_functions import *
from transform_functions import *
from score_functions import *
from i2i_functions import *
from attributes_recs import *
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [6]:
test_users = pd.read_csv('test_users.csv', dtype=str, names=['user_id'],header=0)
users_df = load_users('users.csv', test_users)
orgs_df = load_orgs('organisations.csv')
orgs_df=enrich_orgs(orgs_df, reviews)
combined_features = load_combined(["rubrics.csv","features.csv"], orgs_df=orgs_df)
test_users_with_city = users_df[users_df.in_test==1][['user_id','city']]
print(len(test_users),len(users_df))
orgs_df.head()

16967 1252801


Unnamed: 0,org_id,city,average_bill,rating,rubrics_id,features_id,combined_id
0,16848414477362211020,spb,1000.0,4.479702,"[30776, 31375]","[1018, 1509, 11177, 11617, 11629, 11704, 11867...","[30776, 31375, 1018, 1509, 11177, 11617, 11629..."
1,1430604733320164116,spb,1000.0,4.514509,"[30776, 30770]","[246, 1018, 11617, 11629, 11704, 11867, 20422,...","[30776, 30770, 246, 1018, 11617, 11629, 11704,..."
2,9880309324224147401,spb,1000.0,3.884615,"[30770, 30774]","[1018, 11177, 11617, 11629, 11704, 11867, 2042...","[30770, 30774, 1018, 11177, 11617, 11629, 1170..."
3,5617879987171966456,spb,1000.0,,"[30774, 30775]","[1018, 1509, 10596, 11177, 11629, 11634, 11704...","[30774, 30775, 1018, 1509, 10596, 11177, 11629..."
4,5241461680470612149,spb,1000.0,4.532468,[30776],"[1018, 11177, 11617, 11629, 11704, 11867, 2042...","[30776, 1018, 11177, 11617, 11629, 11704, 1186..."


In [7]:
reviews = load_reviews("reviews.csv",users_df=users_df, orgs_df=orgs_df)
print(f"{len(reviews)}")
reviews.head()

3640835


Unnamed: 0,user_id,org_id,rating,ts,aspects,good,user_city,in_test,org_city,travel
0,16998268288908323644,7184895086928047809,2.0,105,[],0,msk,0,msk,0
1,3121447338909258868,7184895086928047809,5.0,464,[],1,msk,0,msk,0
2,1970649778250883025,7184895086928047809,3.0,789,[],0,msk,0,msk,0
3,7554889464530643866,7184895086928047809,4.0,936,[],1,msk,0,msk,0
4,15907910894057053620,7184895086928047809,1.0,1143,[],0,msk,0,msk,0


In [45]:
test_users_df = test_users_with_city\
    .merge(reviews[reviews.good>0][['user_id','org_id']], on='user_id', how='left', indicator=True)
good_test_users = test_users_df[test_users_df["_merge"]=='both']\
    .groupby(['user_id','city'])\
    .agg(org_id=pd.NamedAgg("org_id", aggfunc=list)).reset_index()

empty_test_users = test_users_df[test_users_df["_merge"]=='left_only'][['user_id','city']]
empty_test_users['org_id'] = [[] for _ in empty_test_users.itertuples()]
test_users_df = pd.concat([good_test_users, empty_test_users], ignore_index=True)
print(len(test_users_df), len(test_users_with_city))

16967 16967


In [92]:
train_reviews, test_reviews = train_test_split(reviews, users_df, 
                                               min_ts=900,
                                               frac=1)
print(len(train_reviews), len(test_reviews))

3610239 19009


In [94]:
pandarallel.initialize(progress_bar=False)
mr=50
l2=1e5
result, encs = prepare_reviews_i2i(train_reviews, orgs_df,
                                   min_reviews_per_user=2,
                                   min_org_reviews=mr,
                                   min_travels_reviews=2,
                                   min_org_score=3.0)
print(len(result))
m = reviews_matrix(result, encs)
cc_mat = m.T * m
ease_sim = ease_solution(cc_mat, l2=l2).T
ease_preds = i2i_predict(ease_sim, test_reviews, encs, N=60)
for N in [20,60]:
    print(f"{N} {recall(test_reviews, ease_preds, N=N)}")
for t_size in [1,3,5,10,20]:
    test_preds = test_reviews[test_reviews.org_id.str.len()>=t_size]
    print(f">={t_size} {recall(test_preds, ease_preds)}")
    test_preds = test_reviews[test_reviews.org_id.str.len()<=t_size]
    print(f"<={t_size} {recall(test_preds, ease_preds)}")

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
1398599
20 0.12794991083641055
60 0.19425207547114723
>=1 0.12794991083641055
<=1 0.05740080434857973
>=3 0.15880711171241363
<=3 0.08587515651018693
>=5 0.17125753513839348
<=5 0.10062228488826563
>=10 0.17297835812116868
<=10 0.11669372815013836
>=20 0.1811580406332944
<=20 0.12361280182875026


In [12]:
combined_enc = AttrEncoders(orgs_df, combined_features, 
                        colname='combined_id',
                        org_colname='combined_id')
combined_enc.build_attr_org_mat(ease_sim, encs)
# pandarallel.initialize(progress_bar=True)
# combined_recs = combined_enc.attr_predict(test_reviews, encs, N=60)
# pandarallel.initialize(progress_bar=False)
# for N in [20,60]:
#     print(f"{N} {recall(test_reviews, combined_recs, N=N)}")
# for t_size in [1,3,5,10,20]:
#     test_preds = test_reviews[test_reviews.org_id.str.len()>=t_size]
#     print(f">={t_size} {recall(test_preds, combined_recs)}")
#     test_preds = test_reviews[test_reviews.org_id.str.len()<=t_size]
#     print(f"<={t_size} {recall(test_preds, combined_recs)}")

In [23]:
test_users_df[test_users_df.org_id==[fl]]

NameError: name 'nan' is not defined

In [46]:
for row in test_users_df.itertuples():
    try:
        if len(row.org_id)> 0:
            features = combined_enc._get_features(row.org_id) 
            if features is None:
                print(row)
                break
    except:
        print(row)
        break

In [61]:
# pandarallel.initialize(progress_bar=False)
ease_preds = i2i_predict(ease_sim, test_users_df, encs)
# attr_preds = combined_enc.attr_predict(test_users_df, encs)
# validate_preds(fallback_with_top_recs(ease_preds, orgs_df), orgs_df, users_df)

In [65]:
test_users_df.head()

Unnamed: 0,user_id,city,org_id
0,10003390898943363405,msk,"[1703593138705417941, 7168943163090334846, 933..."
1,10003578788635255367,spb,"[12784724396885152442, 5756313314324569869, 97..."
2,10004443955392665333,msk,"[11165088907395841018, 7923904392511338698, 18..."
3,10004634860212544956,msk,"[13573322486152844808, 13573322486152844808, 5..."
4,10004870073377911470,msk,[14942944192619480320]


In [88]:
combined = combine_preds(test_users_df, ease_preds, attr_preds, min_len=3)
combined.head()

Unnamed: 0,user_id,city,target
0,10003390898943363405,msk,"[12046097390037935713, 14814427257061788801, 5..."
1,10003578788635255367,spb,"[15250345250621165867, 1625971115460696067, 57..."
2,10004443955392665333,msk,"[12046097390037935713, 14814427257061788801, 8..."
3,10004634860212544956,msk,"[12046097390037935713, 14814427257061788801, 5..."
4,10004870073377911470,msk,"[12046097390037935713, 14814427257061788801, 5..."


In [89]:
validate_preds(fallback_with_top_recs(combined, orgs_df), orgs_df, users_df)

All good


In [90]:
save_predictions(fallback_with_top_recs(combined, orgs_df), path='combined_ans.csv')

In [87]:
save_predictions(fallback_with_top_recs(ease_preds, orgs_df), path='ease_ans.csv')

In [None]:
test_reviews.head()

In [None]:
compare_ranks(test_reviews[test_reviews.org_id.str.len()<3], ease_preds, combined_recs, ['ease','features']).describe()

In [None]:
compare_ranks(test_reviews[test_reviews.org_id.str.len()<3], ease_preds, rubrics_recs, ['ease','rubrics']).describe()