In [3]:
import pandas as pd, numpy as np
from tqdm.notebook import tqdm
import os, sys, pickle, glob, gc
from collections import Counter
import itertools
pd.set_option('display.max_columns', None)
from otto_utils import *

In [4]:
# Every regular file directly under models_new/, ordered oldest -> newest by mtime.
files = sorted(
    (path for path in glob.glob("models_new/*") if os.path.isfile(path)),
    key=os.path.getmtime,
)

# Keep the nine most recently modified "lgb_*" files, skipping the "lgb_oof*" dumps.
_MODEL_PREFIX = "models_new/lgb_"
models_all = [
    path
    for path in files
    if path.startswith(_MODEL_PREFIX) and not path.startswith(_MODEL_PREFIX + "oof")
][-9:]

# Group the selected files per prediction target using the filename prefix.
models = {
    name: [path for path in models_all if path.startswith(_MODEL_PREFIX + name)]
    for name in ("clicks", "carts", "orders")
}

In [5]:
# Display the model files selected for the clicks target.
models['clicks']

['models_new/lgb_clicks_2000_tr_16_lv_gbdt_fold_0_0.9347_0.545548.pkl',
 'models_new/lgb_clicks_2000_tr_16_lv_gbdt_fold_1_0.9347_0.530706.pkl',
 'models_new/lgb_clicks_2000_tr_16_lv_gbdt_fold_2_0.9361_0.537066.pkl']

In [6]:
# Display the model files selected for the carts target.
models['carts']

['models_new/lgb_carts_2000_tr_36_lv_dart_fold_0_0.9274_0.423204.pkl',
 'models_new/lgb_carts_2000_tr_36_lv_dart_fold_1_0.9265_0.433516.pkl',
 'models_new/lgb_carts_2000_tr_36_lv_dart_fold_2_0.9271_0.404942.pkl']

In [7]:
# Display the model files selected for the orders target.
models['orders']

['models_new/lgb_orders_2000_tr_16_lv_gbdt_fold_0_0.9716_0.682353.pkl',
 'models_new/lgb_orders_2000_tr_16_lv_gbdt_fold_1_0.9650_0.637750.pkl',
 'models_new/lgb_orders_2000_tr_16_lv_gbdt_fold_2_0.9629_0.607176.pkl']

In [8]:
# Per-target lists of feature column names, keyed by 'orders' / 'carts' / 'clicks'.
# Each list is the base of the `features` selection handed to that target's models
# in the prediction cells below, and their union forms `full_feats`.
# NOTE(review): the ordering inside each list looks like an importance ranking from
# a feature-selection step -- confirm; the order is passed to `model.predict` as is.
out_dict = {
    'orders': [
        'a2s_actions_rel', 'a2s_best_action_type', 'a2s_actions_num', 'a2s_last_action_index', 'ts_diff',
        'a2s_carts_rel', 'a2s_last_click_index', 'a2s_last_cart_index', 'a2s_carts_num', 'wgt_v51ha_sum',
        'OR_estimation_frCA_trn', 'aid_CL2OR_trn', 'ts_diff_clicks', 'wgt_v21m_sum', 'session_actions',
        'session_carts', 'wgt_rel_v51ha_mean', 'wgt_v21k_sum', 'v51ha_indmin', 'wgt_rel_v51ha_sum',
        'wgt_rel_v21m_sum', 'wgt_v31m_sum', 'session_carts_avg_real', 'ts_diff_carts', 'ts_diff_orders_rel',
        'a2s_clicks_rel', 'OR_estimation_frCL_trn', 'session_clicks', 'carts_rating_full', 'ts_diff_clicks_rel',
        'session_carts_avg_hour', 'ts_diff_carts_rel', 'ts_diff_rel', 'wgt_rel_v31m_sum', 'wgt_rel_v21m_mean',
        'session_full_length', 'aid_CL2CA_tst', 'aid_CA2OR_trn', 'v21m_indmin', 'a2s_orders_rel',
        'session_items_carted', 'aid_CA_rank_int_tst_vs_trn', 'session_avg_real_length', 'ts_diff_orders',
        'carts_rating_train', 'v21k_num', 'wgt_v51ha_mean', 'orders_rating_full', 'aid_multi_orders_percent_train',
        'aid_multi_clicks_percent_full'
    ],
    'carts': [
        'a2s_last_action_index', 'ts_diff', 'a2s_actions_rel', 'ts_diff_rel', 'wgt_v51ha_sum', 'wgt_v21m_sum',
        'wgt_rel_v21m_sum', 'wgt_rel_v51ha_sum', 'session_clicks', 'wgt_v31m_sum', 'a2s_clicks_rel',
        'wgt_rel_v51ha_mean', 'wgt_v21k_sum', 'v51ha_indmin', 'session_items_clicked', 'wgt_rel_v31m_sum',
        'session_items', 'v21m_indmin', 'wgt_rel_v21k_sum', 'wgt_v11m_sum', 'a2s_last_click_index', 
        'wgt_rel_v21m_mean', 'aid_CL2CA_tst', 'ts_diff_carts_rel', 'CA_estimation_frCL_trn', 'aid_CL2CA_trn',
        'ts_diff_clicks_rel', 'clicks_rating_train', 'ts_diff_orders_rel', 'aid_CL_vs_mean_trn', 'ts_diff_clicks',
        'aid_CL_rank_int_tst_vs_trn', 'session_full_length', 'aid_multi_clicks_percent_full', 'wgt_v51ha_mean',
        'session_avg_real_items_num', 'a2s_actions_num', 'aid_CL_rank_int_trn', 'v31m_num', 'v31m_indmin',
        'a2s_clicks_num', 'CA_estimation_frCL_tst', 'wgt_rel_v21k_mean', 'aid_CL_vs_mean_tst_vs_trn',
        'aid_CL_rank_pct_tst_vs_trn', 'aid_CA_rank_int_tst_vs_trn', 'aid_CA_vs_mean_tst', 'clicks_rating_full',
        'session_click_diff_mean', 'aid_CA_vs_mean_tst_vs_trn'
    ],
    'clicks': [
        'a2s_last_action_index', 'a2s_actions_rel', 'ts_diff', 'session_actions', 'wgt_rel_v31m_mean', 
        'wgt_v31m_sum', 'wgt_rel_v11m_mean', 'v31m_indmin', 'wgt_rel_v31m_sum', 'wgt_v11m_sum', 'v11m_indmin',
        'session_items', 'session_clicks', 'session_full_length', 'wgt_v11m_mean', 'aid_CL_rank_int_tst_vs_trn',
        'wgt_v31m_mean', 'ts_diff_carts_rel', 'ts_diff_rel', 'aid_CL_vs_mean_tst_vs_trn', 'a2s_actions_num',
        'session_avg_real_items_num', 'v31m_num', 'ts_diff_orders_rel', 'ts_diff_clicks_rel', 
        'session_items_clicked', 'aid_multi_clicks_percent_full', 'aid_CL_rank_pct_tst_vs_trn', 'wgt_v51ha_sum',
        'aid_CL_vs_mean_trn', 'v21k_num', 'wgt_rel_v21k_sum', 'wgt_v21m_sum', 'carts_rating_train', 'wgt_v21m_mean', 
        'wgt_rel_v11m_sum', 'aid_CA_vs_mean_trn', 'ts_diff_clicks', 'wgt_v21k_sum', 'v11m_num', 
        'a2s_last_click_index', 'clicks_rating_full', 'wgt_rel_v21k_mean', 'clicks_rating_train', 
        'aid_clicks_favourite_dow_diff_test', 'a2s_carts_rel', 'wgt_rel_v21m_mean', 'wgt_v51ha_mean', 
        'wgt_rel_v51ha_mean', 'CA_estimation_frCL_trn'
    ]
}

In [9]:
# Union of the three per-target feature lists: every base feature column that any
# of the models needs from the feature files.
# Sorted because iterating a set of strings is not order-stable between interpreter
# sessions (string hash randomisation); sorting keeps the column order requested
# from parquet reproducible across kernel restarts.
full_feats = sorted(
    set().union(out_dict['clicks'], out_dict['carts'], out_dict['orders'])
)
len(full_feats)

82

In [10]:
# Bigram feature columns: one name per (normalisation kind, action type, aggregate)
# combination, generated in kind-major / aggregate-minor order.
bigram_feats = [
    f"bigram_{kind}_{action}_{agg}"
    for kind in ("normed", "more")
    for action in ("clicks", "carts")
    for agg in ("sum", "mean", "max", "min", "last")
]

In [11]:
# Embedding-derived feature columns, grouped by the source they are named after.
pub_sh1_1 = ["emb_diff_sh1_1_pub", "emb_angle_sh1_1_pub"]
pub_sh2_1 = ["emb_diff_sh2_1_pub", "emb_angle_sh2_1_pub"]
w2v = ["emb_diff_w2v_100", "emb_angle_w2v_100"]
bpr = ["bpr"]

# Everything appended to the per-target feature lists on top of the base features.
embs_feats = list(itertools.chain(pub_sh1_1, pub_sh2_1, w2v, bpr, bigram_feats))

In [12]:
# Columns requested from the per-batch feature files: row keys, all base features,
# the matrix summary columns and the embedding columns. `bpr` and the bigram
# columns are absent here because the prediction cells load them from other files.
needcols = [
    "session",
    "aid",
    *full_feats,
    "matrices_num",
    "matrices_numsum",
    "matrices_wgt_rel_mean",
    *pub_sh1_1,
    *pub_sh2_1,
    *w2v,
]

In [13]:
# Score every batch with the clicks models, average the per-model predictions,
# save the averaged vector and hand it to `prepare_submission`.
VER = '2'

target = 'clicks'

# Model inputs: the clicks feature list, the matrix summary columns and the
# embedding / bpr / bigram columns.
features = out_dict['clicks'] + [
    'matrices_num','matrices_numsum','matrices_wgt_rel_mean'
] + embs_feats

# Fail fast when no model file was discovered: otherwise the model loop below is
# skipped and a stale `preds` left over from an earlier run would be saved.
if not models[target]:
    raise RuntimeError(f"No model files found for target '{target}'")

N = len(models[target])
weight = 1 / N  # every model contributes equally to the average

print(f"Predict {target}:\n")
for batch in range(4):
    print(f"\nbatch {batch}:")
    print("_"*30)

    suffix = VER + 'abcd'[batch]  # e.g. '2a' for batch 0

    # NOTE: the frame is called `train`, but it holds the rows being scored
    # (the side files joined below are named *_test_*).
    train = pd.read_parquet(f"feats/feats_1_batch_{batch}.pqt", columns=needcols)
    n_rows = len(train)

    train = pd.concat(
        [
            train,
            pd.read_parquet(f"matrices/bigram_test_sirius_batch_{batch}.parquet", columns=bigram_feats)
        ],
        axis=1
    )
    # A column-wise concat aligns on the index; a mismatch would silently add
    # NaN-padded rows, so make sure the row count did not change.
    assert len(train) == n_rows, "bigram features are not row-aligned with the feature file"
    train['bpr'] = pd.read_parquet(f"matrices/bpr_test_batch_{batch}.parquet")['bpr']

    preds = None
    for i, model_file in enumerate(models[target]):
        # NOTE: pickle.load can execute arbitrary code; only load model files
        # produced by this project's own training run.
        with open(model_file, 'rb') as file:
            model = pickle.load(file)
        print(f"Predict {target} with model {i} ...")
        pred = model.predict(train[features])
        print("done")
        contribution = weight * pred
        preds = contribution if preds is None else preds + contribution

    save_object(preds, f"output/pred_{target}_v{suffix}.pkl")
    gc_clear()

    print(f"Make {target} submission...")

    prepare_submission(
        train[['session','aid']].copy(),
        preds,
        target,
        suffix
    )

Predict clicks:


batch 0:
______________________________
Predict clicks with model 0 ...
done
Predict clicks with model 1 ...
done
Predict clicks with model 2 ...
done
Make clicks submission...

batch 1:
______________________________
Predict clicks with model 0 ...
done
Predict clicks with model 1 ...
done
Predict clicks with model 2 ...
done
Make clicks submission...

batch 2:
______________________________
Predict clicks with model 0 ...
done
Predict clicks with model 1 ...
done
Predict clicks with model 2 ...
done
Make clicks submission...

batch 3:
______________________________
Predict clicks with model 0 ...
done
Predict clicks with model 1 ...
done
Predict clicks with model 2 ...
done
Make clicks submission...


In [14]:
VER = '2'


def run_target_inference(target, features, ver, add_carts_pred=False):
    """Score all four batches for one target and write predictions + submission.

    For each batch the feature file is read (only the columns in `needcols`),
    the bigram and bpr columns are joined from their own files, every model in
    `models[target]` predicts on `features`, and the equally weighted average is
    saved to ``output/pred_{target}_v{ver}{letter}.pkl`` and passed to
    `prepare_submission`.

    Args:
        target: key into `models`; also used in the output file names.
        features: column names passed to each model's `predict`.
        ver: version prefix; the batch letter ('a'..'d') is appended to it.
        add_carts_pred: when True, the saved carts predictions of the same
            version/batch are loaded and added as the `carts_pred` column
            before predicting.

    Raises:
        RuntimeError: if no model file is registered for `target`.
    """
    model_files = models[target]
    # Without this guard an empty model list would skip the prediction loop and
    # leave no averaged predictions to save.
    if not model_files:
        raise RuntimeError(f"No model files found for target '{target}'")
    weight = 1 / len(model_files)  # every model contributes equally

    print(f"Predict {target}:\n")
    for batch, letter in enumerate('abcd'):
        print(f"\nbatch {batch}:")
        print("_"*30)

        suffix = ver + letter

        # Read only the needed columns, as the clicks cell does, instead of the
        # whole feature file. The frame keeps the historical name `train`
        # although it holds the rows being scored.
        train = pd.read_parquet(f"feats/feats_1_batch_{batch}.pqt", columns=needcols)
        n_rows = len(train)

        if add_carts_pred:
            # NOTE: pickle.load can execute arbitrary code; these files are
            # expected to be the ones written by the carts run of this cell.
            with open(f"output/pred_carts_v{suffix}.pkl", 'rb') as file:
                train['carts_pred'] = pickle.load(file)
            gc_clear()

        train = pd.concat(
            [
                train,
                pd.read_parquet(f"matrices/bigram_test_sirius_batch_{batch}.parquet", columns=bigram_feats)
            ],
            axis=1
        )
        # A column-wise concat aligns on the index; a mismatch would silently
        # add NaN-padded rows, so make sure the row count did not change.
        assert len(train) == n_rows, "bigram features are not row-aligned with the feature file"
        train['bpr'] = pd.read_parquet(f"matrices/bpr_test_batch_{batch}.parquet")['bpr']

        preds = None
        for i, model_file in enumerate(model_files):
            # NOTE: pickle.load can execute arbitrary code; only load model
            # files produced by this project's own training run.
            with open(model_file, 'rb') as file:
                model = pickle.load(file)
            print(f"Predict {target} with model {i} ...")
            pred = model.predict(train[features])
            print("done")
            contribution = weight * pred
            preds = contribution if preds is None else preds + contribution

        save_object(preds, f"output/pred_{target}_v{suffix}.pkl")
        gc_clear()

        print(f"Make {target} submission...")

        prepare_submission(
            train[['session','aid']].copy(),
            preds,
            target,
            suffix
        )


# --- carts -------------------------------------------------------------------
target = 'carts'

features = out_dict['carts'] + [
    'matrices_num','matrices_numsum','matrices_wgt_rel_mean'
] + embs_feats

run_target_inference(target, features, VER)

# --- orders ------------------------------------------------------------------
# The orders models take the averaged carts prediction as an extra feature, so
# this stage must run after the carts stage above has written its pickles.
target = 'orders'

features = out_dict['orders'] + [
    'carts_pred'
] + embs_feats

run_target_inference(target, features, VER, add_carts_pred=True)

Predict carts:


batch 0:
______________________________
Predict carts with model 0 ...
done
Predict carts with model 1 ...
done
Predict carts with model 2 ...
done
Make carts submission...

batch 1:
______________________________
Predict carts with model 0 ...
done
Predict carts with model 1 ...
done
Predict carts with model 2 ...
done
Make carts submission...

batch 2:
______________________________
Predict carts with model 0 ...
done
Predict carts with model 1 ...
done
Predict carts with model 2 ...
done
Make carts submission...

batch 3:
______________________________
Predict carts with model 0 ...
done
Predict carts with model 1 ...
done
Predict carts with model 2 ...
done
Make carts submission...
Predict orders:


batch 0:
______________________________
Predict orders with model 0 ...
done
Predict orders with model 1 ...
done
Predict orders with model 2 ...
done
Make orders submission...

batch 1:
______________________________
Predict orders with model 0 ...
done
Predict orders 

In [15]:
# Version tag of the submission parts to use for each target.
VER = {'clicks':'2','carts':'2','orders':'2'}

# One part file per (target, batch letter), target-major: clicks a-d, carts a-d, orders a-d.
sub_paths = [
    f"output/sub_{target}_lgbm_v{VER[target] + v}.pqt"
    for target, v in itertools.product(['clicks','carts','orders'], 'abcd')
]

# Stack all parts into a single frame with a fresh 0..n-1 index.
sub = pd.concat((pd.read_parquet(path) for path in sub_paths), ignore_index=True)

sub.to_csv('output/submission2.csv',index=False)
sub

Unnamed: 0,session_type,labels
0,12900000_clicks,1635995 515459 606565 18262 1349330 196149 761...
1,12900400_clicks,1199617 1539309 349404 891513 666350 136822 15...
2,12900800_clicks,322935 87613 1260870 102466 1131560 728827 175...
3,12901200_clicks,324603 1321398 977147 1685607 461211 36276 996...
4,12901600_clicks,1033509 111668 27663 995815 1710980 1030029 12...
...,...,...
50143,14569603_orders,182696 375761 1380414 1557744 843567 278972 13...
50144,14570003_orders,27659 1063957 1725935 766922 884771 1104060 17...
50145,14570403_orders,866342 1592514 217213 1206098 619203 1084805 1...
50146,14570803_orders,1387843 210222 387793 1808913 282621 1653338 8...


In [16]:
# Batch index -> letter used in the saved prediction file names.
dct = {0:'a',1:'b',2:'c',3:'d'}

outs = []

for batch_idx, letter in dct.items():
    # Row keys of the batch; the saved prediction vectors are attached as columns.
    batch_frame = pd.read_parquet(f"feats/feats_1_batch_{batch_idx}.pqt", columns=['session','aid'])
    for tgt in ('clicks', 'carts', 'orders'):
        with open(f"output/pred_{tgt}_v2{letter}.pkl", 'rb') as fh:
            batch_frame[f"{tgt}_pred"] = pickle.load(fh)
    outs.append(batch_frame)

# All batches stacked in batch order with a fresh index.
out = pd.concat(outs, ignore_index=True)

out.to_parquet("output/alvor_raw_predictions_599.parquet", index=False)

In [None]:
# Attach the pickled clicks predictions (presumably out-of-fold, going by the file
# name) to the candidate rows and store the result as parquet.
oof_clicks = pd.read_parquet("output/alvor_candidates_big.parquet")
with open("models_new/lgb_oof_clicks.pkl", "rb") as fh:
    oof_clicks_pred = pickle.load(fh)
oof_clicks['clicks_pred'] = oof_clicks_pred
oof_clicks.to_parquet("output/alvor_oof_clicks_v2.parquet", index=False)

In [None]:
# Row keys of the four "small" feature batches, stacked in batch order.
oof_frame = pd.concat(
    [
        pd.read_parquet(f"feats/feats_0_batch_{k}_small.pqt", columns=['session','aid'])
        for k in range(4)
    ],
    ignore_index=True
)

# Pickled carts / orders predictions (presumably out-of-fold, going by the file names).
with open("models_new/lgb_oof_carts.pkl", "rb") as fh:
    oof_carts_pred = pickle.load(fh)
with open("models_new/lgb_oof_orders.pkl", "rb") as fh:
    oof_orders_pred = pickle.load(fh)

# Column order in the written file: session, aid, orders_pred, carts_pred.
oof_frame['orders_pred'] = oof_orders_pred
oof_frame['carts_pred'] = oof_carts_pred
oof_frame.to_parquet("output/alvor_oof_carts_orders_v2.parquet", index=False)