In [14]:
data_path_train_orig = 'data/semrush_cup_train_data.csv'
data_path_test_orig = 'data/semrush_cup_test_data.csv'
data_path_sample_orig = 'data/semrush_cup_sub.csv'
data_path_dict_orig = 'data/semrush_cup_categories_and_demo.csv'

cache_path = 'cache'

In [15]:
%pip install lightgbm

Note: you may need to restart the kernel to use updated packages.


In [26]:
import numpy as np
import pandas as pd
import os
import pickle
from pathlib import Path
import lightgbm as lgb

from sklearn.metrics import f1_score  
from sklearn.model_selection import train_test_split

In [17]:
# https://datascience.blog.wzb.eu/2016/08/12/a-tip-for-the-impatient-simple-caching-with-python-pickle-and-decorators/
def cached(cachefile):
    """
    A function that creates a decorator which will use "cachefile" for caching the results of the decorated function "fn".
    """
    def decorator(fn):  # define a decorator for a function "fn"
        def wrapped(*args, **kwargs):   # define a wrapper that will finally call "fn" with all arguments            
            # if cache exists -> load it and return its content
            if os.path.exists(cachefile):
                    with open(cachefile, 'rb') as cachehandle:
                        print("using cached result from '%s'" % cachefile)
                        return pickle.load(cachehandle)

            # execute the function with all arguments passed
            res = fn(*args, **kwargs)

            # write to cache file
            with open(cachefile, 'wb') as cachehandle:
                print("saving result to cache '%s'" % cachefile)
                pickle.dump(res, cachehandle)

            return res

        return wrapped

    return decorator   # return this "customized" decorator that uses "cachefile"

In [38]:
@cached('split_data_train_valis.cache')
def split_data_train_valid(data_path):
    train_data = pd.read_csv(data_path_train_orig)
    print(f'train_data size: {len(train_data)}')

    train_device_id_list = train_data.device_id.unique()
    print(f'train_device_id_list: {len(train_device_id_list)}')

    train_train_device_id_list, train_valid_device_id_list = train_test_split(train_device_id_list, train_size=0.7, random_state=17)
    train_train_data_path = data_path + '.train_train_0.7.csv'
    train_valid_data_path = data_path + '.train_valid_0.3.csv'    

    print(f'train_train_device_id_list: {len(train_train_device_id_list)}')
    train_train = train_data.loc[train_data.device_id.isin(train_train_device_id_list)]
    print(f'train_train: {len(train_train)}')
    train_train.to_csv(train_train_data_path, index=False)

    print(f'train_valid_device_id_list: {len(train_valid_device_id_list)}')
    train_valid = train_data.loc[train_data.device_id.isin(train_valid_device_id_list)]
    print(f'train_valid: {len(train_valid)}')
    train_valid.to_csv(train_valid_data_path, index=False)
    
    return train_train_data_path, train_valid_data_path

train_train_data_path, train_valid_data_path = split_data_train_valid(data_path_train_orig)

train_data size: 13040643
train_device_id_list: 200000
train_train_device_id_list: 140000
train_train: 9095130
train_valid_device_id_list: 60000
train_valid: 3945513
saving result to cache 'split_data_train_valis.cache'


In [24]:
@cached('get_stat_dicts.cache')
def get_stat_dicts(data_path, sample_frac=1.):
    print(f'loading data from {data_path}')
    train_data = pd.read_csv(data_path_train_orig)
    train_data = train_data[~train_data.referrer_domain.isna()]

    print(f'sampling: {sample_frac}')
    train_data = train_data.sample(frac=sample_frac)

    # domain stats
    domain_count_ref = train_data.groupby('domain').size()
    print(domain_count_ref.describe())
    domain_count_ref = domain_count_ref.to_dict()

    domain_count_device = train_data.groupby('domain').device_id.nunique()
    print(domain_count_device.describe())
    domain_count_device = domain_count_device.to_dict()

    domain_count_referrer_domain = train_data.groupby('domain').referrer_domain.nunique()
    print(domain_count_referrer_domain.describe())
    domain_count_referrer_domain = domain_count_referrer_domain.to_dict()

    # referrer_domain stats
    referrer_domain_count_ref = train_data.groupby('referrer_domain').size()
    print(referrer_domain_count_ref.describe())
    referrer_domain_count_ref = referrer_domain_count_ref.to_dict()

    referrer_domain_count_device = train_data.groupby('referrer_domain').device_id.nunique()
    print(referrer_domain_count_device.describe())
    referrer_domain_count_device = referrer_domain_count_device.to_dict()

    referrer_domain_count_domain = train_data.groupby('referrer_domain').domain.nunique()
    print(referrer_domain_count_domain.describe())
    referrer_domain_count_domain = referrer_domain_count_domain.to_dict()

    # domain_referrer vs domain stats
    domain_referrer_domain_count_ref = train_data.groupby(['domain', 'referrer_domain']).size()
    print(domain_referrer_domain_count_ref.describe())
    domain_referrer_domain_count_ref = domain_referrer_domain_count_ref.to_dict()

    domain_referrer_domain_count_device = train_data.groupby(['domain', 'referrer_domain']).device_id.nunique()
    print(domain_referrer_domain_count_device.describe())
    domain_referrer_domain_count_device = domain_referrer_domain_count_device.to_dict()
    
    return (domain_count_ref, domain_count_device, domain_count_referrer_domain,
            referrer_domain_count_ref, referrer_domain_count_device, referrer_domain_count_domain,
            domain_referrer_domain_count_ref, domain_referrer_domain_count_device)
            
(domain_count_ref, domain_count_device, domain_count_referrer_domain, 
 referrer_domain_count_ref, referrer_domain_count_device, referrer_domain_count_domain,
 domain_referrer_domain_count_ref, domain_referrer_domain_count_device) = get_stat_dicts(train_train_data_path, sample_frac=0.7)

using cached result from 'get_stat_dicts.cache'


# Build features

In [39]:
def process(data):
    data_check = data[data.referrer_num!='target'].copy().set_index('event_group_id')
    data_check = data_check[['domain', 'timestamp', 'is_referrer', 'referrer_num']]
    # data_check['timestamp_dif2'] = data_check.groupby(level=0)['timestamp'].transform(max) - data_check['timestamp']
    # data_check['timestamp_dif3'] = data_check.groupby(level=0)['timestamp'].transform(min) - data_check['timestamp']
    # data_check['timestamp_dif4'] = data_check.groupby(level=0)['timestamp'].transform('mean') - data_check['timestamp']
    # data_check['timestamp_dif5'] = data_check.groupby(level=0)['timestamp'].transform(max) - data_check.groupby(level=0)['timestamp'].transform(min)
    # data_check['timestamp_dif6'] = data_check.groupby(level=0)['timestamp'].transform(max) - data_check.groupby(level=0)['timestamp'].transform('mean')

    data_check = data_check.rename(columns={'domain':'referrer_domain_to_check', 'timestamp':'timestamp_check'})
    data_check.referrer_num = data_check.referrer_num.astype('int')

    data_fact = data[data.referrer_num=='target'].copy().set_index('event_group_id')
    data_fact = data_fact[['domain', 'timestamp']]
    data_fact = data_fact.rename(columns={'timestamp':'timestamp_fact'})

    data_new = data_fact.join(data_check)
    data_new['timestamp_dif'] = data_new['timestamp_fact'] - data_new['timestamp_check'] 
    data_new = data_new.drop(columns=['timestamp_fact', 'timestamp_check'])    
    return data_new

def process_features(data):
    data['domain_count_ref'] = data.domain.map(lambda x: domain_count_ref.get(x, 0))
    data['domain_count_device'] = data.domain.map(lambda x: domain_count_device.get(x, 0))
    data['domain_count_referrer_domain'] = data.domain.map(lambda x: domain_count_referrer_domain.get(x, 0))
    
    data['referrer_domain_count_ref'] = data.referrer_domain_to_check.map(lambda x: referrer_domain_count_ref.get(x, 0))
    data['referrer_domain_count_device'] = data.referrer_domain_to_check.map(lambda x: referrer_domain_count_device.get(x, 0))
    data['referrer_domain_count_domain'] = data.referrer_domain_to_check.map(lambda x: referrer_domain_count_domain.get(x, 0))

    data['domain_referrer_domain_count_ref'] = list(map(lambda x, y: domain_referrer_domain_count_ref.get((x,y), 0), data['domain'].values, data['referrer_domain_to_check'].values))
    data['domain_referrer_domain_count_device'] = list(map(lambda x, y: domain_referrer_domain_count_device.get((x,y), 0), data['domain'].values, data['referrer_domain_to_check'].values))
    
    data['domain_referrer_domain_count_ref_vs_domain_count_ref'] = data['domain_referrer_domain_count_ref'] / data['domain_count_ref'] + 0.00001
    data['domain_referrer_domain_count_ref_vs_referrer_domain_count_ref'] = data['domain_referrer_domain_count_ref'] / data['referrer_domain_count_ref'] + 0.00001

    data['domain_referrer_domain_count_device_ref_vs_domain_count_device'] = data['domain_referrer_domain_count_device'] / data['domain_count_device'] + 0.00001
    data['domain_referrer_domain_count_device_ref_vs_referrer_domain_count_device_id'] = data['domain_referrer_domain_count_device'] / data['referrer_domain_count_device'] + 0.00001

    return data

# Load data

In [27]:
train_train = pd.read_csv(train_train_data_path)

In [28]:
train_train_features = process(train_train)
train_train_features = process_features(train_train_features)

In [30]:
train_train_features[:11]

Unnamed: 0_level_0,domain,referrer_domain_to_check,is_referrer,referrer_num,timestamp_dif,domain_count_ref,domain_count_device,domain_count_referrer_domain,referrer_domain_count_ref,referrer_domain_count_device,referrer_domain_count_domain,domain_referrer_domain_count_ref,domain_referrer_domain_count_device,domain_referrer_domain_count_ref_vs_domain_count_ref,domain_referrer_domain_count_ref_vs_referrer_domain_count_ref,domain_referrer_domain_count_device_ref_vs_domain_count_device,domain_referrer_domain_count_device_ref_vs_referrer_domain_count_device_id
event_group_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,ttTLHs.jg,tWtXRJ.jl,False,0,591,585,327,73,30,24,1,0,0,1e-05,1e-05,1e-05,1e-05
0,ttTLHs.jg,ttdXfB.jg,False,1,259,585,327,73,35,13,19,0,0,1e-05,1e-05,1e-05,1e-05
0,ttTLHs.jg,ttnqBJ.jg,False,2,227,585,327,73,196,90,45,0,0,1e-05,1e-05,1e-05,1e-05
0,ttTLHs.jg,ttdXfB.jg,False,3,193,585,327,73,35,13,19,0,0,1e-05,1e-05,1e-05,1e-05
0,ttTLHs.jg,tWVsHB.HB,False,4,159,585,327,73,1864,776,153,0,0,1e-05,1e-05,1e-05,1e-05
0,ttTLHs.jg,tttRWL.jg,False,5,111,585,327,73,6694,4407,616,0,0,1e-05,1e-05,1e-05,1e-05
0,ttTLHs.jg,ttGfsg.jg,False,6,105,585,327,73,538,301,88,0,0,1e-05,1e-05,1e-05,1e-05
0,ttTLHs.jg,ttXfgB.jg,False,7,66,585,327,73,21,15,4,0,0,1e-05,1e-05,1e-05,1e-05
0,ttTLHs.jg,ttUHGL.AQ,True,8,63,585,327,73,4878,1069,315,6,5,0.010266,0.00124,0.015301,0.004687
0,ttTLHs.jg,ttfLTH.jg,False,9,6,585,327,73,197,103,43,0,0,1e-05,1e-05,1e-05,1e-05


In [31]:
train_train_features.dtypes

domain                                                                         object
referrer_domain_to_check                                                       object
is_referrer                                                                      bool
referrer_num                                                                    int64
timestamp_dif                                                                   int64
domain_count_ref                                                                int64
domain_count_device                                                             int64
domain_count_referrer_domain                                                    int64
referrer_domain_count_ref                                                       int64
referrer_domain_count_device                                                    int64
referrer_domain_count_domain                                                    int64
domain_referrer_domain_count_ref                      

In [40]:
train_valid = pd.read_csv(train_valid_data_path)

In [41]:
train_valid_features = process(train_valid)
train_valid_features = process_features(train_valid_features)

# Train model

In [None]:
# min_size = train_train_features.groupby('is_referrer').size().min()
# train_train_features_v2 = train_train_features.groupby('is_referrer').apply(lambda x: x.sample(min_size))

In [42]:
def get_x(data):
    data = data.iloc[:,4:]
    # data = data.drop(columns=['domain_count_ref', 'domain_count_device', 'domain_count_referrer_domain'])
    # data = data.iloc[:,:2]
    return data#.values.astype('float32')

def get_y(data):
    return data.is_referrer
    
def get_lgb_dataset(data):
    return lgb.Dataset(get_x(data), label=get_y(data))

In [117]:
model_list = []
train_valid_pred_list = []

for target in range(10):
    print(f'processing target: {target}')
    
    train_train_features_sample = train_train_features[train_train_features.referrer_num == target]
    train_train_dataset = get_lgb_dataset(train_train_features_sample)
    
    train_valid_features_sample = train_valid_features[train_valid_features.referrer_num == target]
    train_valid_dataset = get_lgb_dataset(train_valid_features_sample)

    # https://lightgbm.readthedocs.io/en/latest/Parameters.html
    param = {'objective': 'binary'}
    param['learning_rate'] = 0.05
    # param['num_leaves'] = 64
    param['verbose'] = 0
    param['metric'] = 'auc'

    evals_result = {} 

    model = lgb.train(param, train_train_dataset, num_boost_round=1000, 
                      valid_sets=(train_train_dataset, train_valid_dataset), 
                      # feval=lgb_f1_score,
                      callbacks=[lgb.early_stopping(stopping_rounds=50),
                                 lgb.log_evaluation(50),
                                 lgb.record_evaluation(evals_result)])
    
    train_valid_pred = train_valid_features_sample[['referrer_num', 'is_referrer']].copy()
    train_valid_pred['is_referrer_pred'] = model.predict(get_x(train_valid_features_sample))
    
    model_list.append(model)
    train_valid_pred_list.append(train_valid_pred)    

processing target: 0
You can set `force_col_wise=true` to remove the overhead.
Training until validation scores don't improve for 50 rounds
[50]	training's auc: 0.974172	valid_1's auc: 0.970202
[100]	training's auc: 0.98076	valid_1's auc: 0.974274
[150]	training's auc: 0.983194	valid_1's auc: 0.975589
[200]	training's auc: 0.984429	valid_1's auc: 0.976238
[250]	training's auc: 0.985568	valid_1's auc: 0.976519
[300]	training's auc: 0.986709	valid_1's auc: 0.976764
[350]	training's auc: 0.987282	valid_1's auc: 0.977137
[400]	training's auc: 0.988153	valid_1's auc: 0.977188
Early stopping, best iteration is:
[372]	training's auc: 0.987547	valid_1's auc: 0.9773
processing target: 1
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 50 rounds
[50]	training's auc: 0.972418	valid_1's auc: 0.970551
[100]	training's auc: 0.97887	valid_1's auc: 0.975384
[150]	training's auc: 

In [118]:
train_valid_pred = pd.concat(train_valid_pred_list)
train_valid_pred = train_valid_pred.pivot(columns='referrer_num', values=['is_referrer', 'is_referrer_pred'])
train_valid_pred.head()

Unnamed: 0_level_0,is_referrer,is_referrer,is_referrer,is_referrer,is_referrer,is_referrer,is_referrer,is_referrer,is_referrer,is_referrer,is_referrer_pred,is_referrer_pred,is_referrer_pred,is_referrer_pred,is_referrer_pred,is_referrer_pred,is_referrer_pred,is_referrer_pred,is_referrer_pred,is_referrer_pred
referrer_num,0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9
event_group_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
7,False,False,False,False,False,False,True,False,False,False,0.000972,0.00141,0.000263,0.001282,0.000426,0.001446,0.565994,0.001882,0.748844,0.027848
9,False,False,False,False,False,False,False,False,False,True,0.000517,0.001036,0.000426,0.000913,0.000575,0.000485,0.000506,0.121743,0.002443,0.957497
13,False,False,False,False,False,False,False,False,False,True,0.001419,0.000746,0.002752,0.000291,0.000455,0.007867,0.004841,0.009304,0.024466,0.941382
16,False,False,False,True,False,False,False,False,False,False,5.4e-05,0.000101,0.00077,0.414153,0.000146,0.000226,0.000428,0.002849,0.293961,0.002434
31,False,False,False,False,False,False,False,False,True,False,0.000698,0.006754,0.00155,0.001883,0.001485,0.001147,0.092824,0.004741,0.669954,0.03571


In [119]:
train_valid_pred['referrer_num'] = np.argmax(train_valid_pred.loc[:,('is_referrer', slice(None))].values, axis=1)
train_valid_pred['referrer_num_pred'] = np.argmax(train_valid_pred.loc[:,('is_referrer_pred', slice(None))].values, axis=1)
f1_score(train_valid_pred.referrer_num, train_valid_pred.referrer_num_pred, average='macro')

0.8352092113671384

In [120]:
f1_score(train_valid_pred.referrer_num, train_valid_pred.referrer_num_pred, average=None)

array([0.78530743, 0.79781146, 0.81432732, 0.8072534 , 0.8157244 ,
       0.82670741, 0.83855017, 0.85484846, 0.87462801, 0.93693406])

In [129]:
def get_f1_max(y_fact, y_pred):
    data = pd.DataFrame({'y_fact':y_fact, 'y_pred':y_pred})
    
    data = data.sort_values('y_pred', ascending=False)
    data['tp'] = data.y_fact.cumsum().astype('int')
    data['tp'] = data.groupby('y_pred').tp.transform(max)

    data['fp'] = (data.y_fact==False).cumsum().astype('int')
    data['fp'] = data.groupby('y_pred').fp.transform(max)
    
    data = data.sort_values('y_pred', ascending=True)
    data['fn'] = data.y_fact.cumsum().astype('int')
    data['fn'] = data.groupby('y_pred').fn.transform(max)
    
    data['precision'] = data.tp / (data.tp + data.fp)
    data['recall'] = data.tp / (data.tp + data.fn)    
    
    data['f1'] = 2 * ( data['precision'] * data['recall'] ) / ( data['precision'] + data['recall'] )
    # data = data.sort_values('f1', ascending=False)
    
    f1_max = data.f1.max()
    f1_max_pos = data.f1.argmax()
    f1_max_th = data.iloc[f1_max_pos]['y_pred']
    
    print(f'f1_max={f1_max} f1_max_th={f1_max_th}')
    # print(f'f1_val={f1_score(y_fact, y_pred>=f1_max_th, average="binary")}')
    return f1_max, f1_max_th

In [130]:
f1_max_list = []
f1_max_th_list = []
for target in range(10):    
    f1_max, f1_max_th = get_f1_max(train_valid_pred.loc[:,('is_referrer', target)].astype('bool'),
                                   train_valid_pred.loc[:,('is_referrer_pred', target)])
    
    f1_max_list.append(f1_max)
    f1_max_th_list.append(f1_max_th)

np.average(f1_max_list)

f1_max=0.6385042435071151 f1_max_th=0.27649032642002536
f1_max=0.6595680683390482 f1_max_th=0.35274728623333607
f1_max=0.6594063634422379 f1_max_th=0.3225965596165658
f1_max=0.6679933665008293 f1_max_th=0.33435131132345264
f1_max=0.6903316920493149 f1_max_th=0.35024106925231596
f1_max=0.7101969365426696 f1_max_th=0.3425351623010106
f1_max=0.7377268798617114 f1_max_th=0.36364928184189743
f1_max=0.7702209330851264 f1_max_th=0.379027759924218
f1_max=0.8166709328262698 f1_max_th=0.4514386025711837
f1_max=0.926784628055575 f1_max_th=0.49829193656719


0.7277404044209898

In [131]:
# target_stat = train_valid_features[train_valid_features.is_referrer==True].referrer_num.value_counts(normalize=True).sort_index()
# target_stat

# train_valid_pred['referrer_num'] = np.argmax(train_valid_pred.loc[:,('is_referrer', slice(None))].values, axis=1)
# train_valid_pred['referrer_num_pred'] = 9

# for target, target_rate in target_stat.iloc[:-1].iteritems():
#     print(target, target_rate)
    
#     pred_curr = train_valid_pred.loc[train_valid_pred.referrer_num_pred > target]
#     # print(f'items={len(pred_curr)}')
    
#     target_th_pos = int(len(train_valid_pred) * target_rate)
#     # print(f'target_th_pos={target_th_pos}')
    
#     # print(pred_curr.loc[:,('is_referrer_pred', target)].sort_values(ascending=False))
#     # print(pred_curr.loc[:,('is_referrer_pred', target)].describe())

#     target_th = pred_curr.loc[:,('is_referrer_pred', target)].sort_values(ascending=False).iloc[target_th_pos]
#     # print(f'target_th={target_th}')
    
#     # print(sum(pred_curr.loc[:,('is_referrer_pred', target)] >= target_th))
#     # print(sum(pred_curr.loc[:,('is_referrer_pred', target)] <= target_th))
    
#     train_valid_pred.loc[(train_valid_pred.loc[:,('is_referrer_pred', target)] >= target_th) &
#                         (train_valid_pred.referrer_num_pred > target),
#                         'referrer_num_pred'] =target

# f1_score(train_valid_pred.referrer_num, train_valid_pred.referrer_num_pred, average='macro')

0 0.02071745803397429
1 0.023984967227328867
2 0.025791576405907166
3 0.028950354491291755
4 0.03571398700245063
5 0.04559736591921
6 0.06147210768282857
7 0.09150698527669279
8 0.1494718177332073


0.7164490506533157

In [57]:
# train_valid_pred = train_valid_features[['referrer_num', 'is_referrer']].copy()
# train_valid_pred['is_referrer_pred'] = model.predict(get_x(train_valid_features))
# train_valid_pred = train_valid_pred.pivot(columns='referrer_num', values=['is_referrer', 'is_referrer_pred'])
# train_valid_pred.head()

In [58]:
# train_train_data = get_lgb_dataset(train_train_features_v2)
# train_test_data = get_lgb_dataset(train_test_features)

In [59]:
# # https://lightgbm.readthedocs.io/en/latest/Parameters.html

# param = {'objective': 'binary'}
# param['learning_rate'] = 0.1#0.01
# # param['num_leaves'] = 64
# param['verbose'] = 1
# param['metric'] = 'auc'

# evals_result = {} 

# model = lgb.train(param, train_train_data, num_boost_round=100, valid_sets=train_test_data, 
#                   callbacks=[lgb.early_stopping(stopping_rounds=50),
#                              lgb.log_evaluation(1),
#                              lgb.record_evaluation(evals_result)])

In [60]:
# [41]	valid_0's auc: 0.845916

In [61]:
# print('Plotting metrics recorded during training...')
# ax = lgb.plot_metric(evals_result, metric='auc')
# # plt.show()

In [62]:
# [50]	valid_0's auc: 0.838309

In [63]:
# lgb.plot_importance(model)

In [64]:
# train_test_features.dtypes[3:]

In [66]:
# train_test_pred = train_test_features[['referrer_num', 'is_referrer']].copy()
# train_test_pred['is_referrer_pred'] = model.predict(get_x(train_test_features))
# train_test_pred = train_test_pred.pivot(columns='referrer_num', values=['is_referrer', 'is_referrer_pred'])
# train_test_pred.head()

# Validation

## Pred by target rate

In [None]:
# target_stat = train_train_features[train_train_features.is_referrer==True].referrer_num.value_counts(normalize=True).sort_index()
# target_stat

In [67]:
# train_test_pred['referrer_num'] = np.argmax(train_test_pred.loc[:,('is_referrer', slice(None))].values, axis=1)
# train_test_pred['referrer_num_pred'] = 9

# for target, target_rate in target_stat.iloc[:-1].iteritems():
#     print(target, target_rate)
    
#     pred_curr = train_test_pred.loc[train_test_pred.referrer_num_pred > target]
#     # print(f'items={len(pred_curr)}')
    
#     target_th_pos = int(len(train_test_pred) * target_rate)
#     # print(f'target_th_pos={target_th_pos}')
    
#     # print(pred_curr.loc[:,('is_referrer_pred', target)].sort_values(ascending=False))
#     # print(pred_curr.loc[:,('is_referrer_pred', target)].describe())

#     target_th = pred_curr.loc[:,('is_referrer_pred', target)].sort_values(ascending=False).iloc[target_th_pos]
#     # print(f'target_th={target_th}')
    
#     # print(sum(pred_curr.loc[:,('is_referrer_pred', target)] >= target_th))
#     # print(sum(pred_curr.loc[:,('is_referrer_pred', target)] <= target_th))
    
#     train_test_pred.loc[(train_test_pred.loc[:,('is_referrer_pred', target)] >= target_th) &
#                         (train_test_pred.referrer_num_pred > target),
#                         'referrer_num_pred'] =target

# f1_score(train_test_pred.referrer_num, train_test_pred.referrer_num_pred, average='macro')

## Simple

In [68]:
# train_test_pred['referrer_num'] = np.argmax(train_test_pred.loc[:,('is_referrer', slice(None))].values, axis=1)
# train_test_pred['referrer_num_pred'] = np.argmax(train_test_pred.loc[:,('is_referrer_pred', slice(None))].values, axis=1)
# f1_score(train_test_pred.referrer_num, train_test_pred.referrer_num_pred, average='macro')

In [None]:
# auc: 0.842891 private: 0.58151635040473 public: 0.577699280352

# Prediction

In [123]:
def save_submission(data, path):
    data['referrer_num_pred'] = np.argmax(data.loc[:,('is_referrer_pred', slice(None))].values, axis=1)    
    pred = data[['referrer_num_pred']].reset_index().rename(columns={'referrer_num_pred':'referrer_num'})
    
    sample = pd.read_csv(data_path_sample_orig)
    assert all(pred.event_group_id == sample.event_group_id)
    assert all(pred.referrer_num.isin(list(range(10))))
    
    print(pred.referrer_num.value_counts().sort_index())
    
    pred.to_csv(path, index=False)
    return pred

# save_submission(test_pred, 'results/result_v2.csv')

## Load test data

In [71]:
test_data = pd.read_csv(data_path_test_orig)

test_data.head(11)

test_data['is_referrer'] = False
test_data.head(11)

test_features = process(test_data)
test_features = process_features(test_features)

## Pred by position specific model

In [126]:
test_pred_list = []

for target, model in zip(range(10), model_list):
    print(f'processing target: {target}')
    
    test_features_sample = test_features[test_features.referrer_num == target]    
    test_pred = test_features_sample[['referrer_num']].copy()
    test_pred['is_referrer_pred'] = model.predict(get_x(test_features_sample))
    
    test_pred_list.append(test_pred)    

processing target: 0
processing target: 1
processing target: 2
processing target: 3
processing target: 4
processing target: 5
processing target: 6
processing target: 7
processing target: 8
processing target: 9


In [127]:
test_pred = pd.concat(test_pred_list)
test_pred = test_pred.pivot(columns='referrer_num', values=['is_referrer_pred'])
test_pred.head()

Unnamed: 0_level_0,is_referrer_pred,is_referrer_pred,is_referrer_pred,is_referrer_pred,is_referrer_pred,is_referrer_pred,is_referrer_pred,is_referrer_pred,is_referrer_pred,is_referrer_pred
referrer_num,0,1,2,3,4,5,6,7,8,9
event_group_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
1,0.000474,9.7e-05,0.000121,0.564639,0.115744,0.000467,0.000399,0.000282,0.000238,0.013579
4,0.000588,4.5e-05,0.114396,0.138396,0.000258,0.000405,0.000315,0.004006,0.015174,0.00922
5,2.6e-05,1.6e-05,4e-06,0.729386,0.000612,0.002097,0.000139,0.001492,0.00051,0.001477
8,0.00182,0.147017,0.015969,0.003063,0.015649,0.045753,0.04121,0.052939,0.287367,0.151054
12,0.001196,0.001472,0.002336,0.000929,0.001774,0.000352,9.1e-05,0.012248,0.11393,0.001499


In [128]:
test_pred_sub = save_submission(test_pred, 'results/result_v3.csv')
test_pred_sub

0      8909
1     10314
2     10707
3     12009
4     14892
5     19171
6     26308
7     40865
8     71180
9    338144
Name: referrer_num, dtype: int64


Unnamed: 0_level_0,event_group_id,referrer_num
referrer_num,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1,3
1,4,3
2,5,3
3,8,8
4,12,8
...,...,...
552494,1737999,9
552495,1738003,2
552496,1738006,8
552497,1738008,9


## Pred by single model

In [None]:
# test_pred = test_features[['referrer_num', 'is_referrer']].copy()
# test_pred['is_referrer_pred'] = model.predict(get_x(test_features))
# test_pred = test_pred.pivot(columns='referrer_num', values=['is_referrer', 'is_referrer_pred'])
# test_pred.head()

# print(train_train_features[train_train_features.is_referrer==True].referrer_num.value_counts().sort_index())
# print(train_train_features[train_train_features.is_referrer==True].referrer_num.value_counts(normalize=True).sort_index())
# print(train_test_features[train_test_features.is_referrer==True].referrer_num.value_counts().sort_index())
# print(train_test_features[train_test_features.is_referrer==True].referrer_num.value_counts(normalize=True).sort_index())

In [None]:
# from sklearn.metrics import f1_score

# def lgb_f1_score(y_hat, data):
#     y_true = data.get_label()
#     y_hat = np.round(y_hat) # scikits f1 doesn't like probabilities
#     return 'f1', f1_score(y_true, y_hat), True

# Results

In [None]:
# model per target: private: 0.7403175951534714 public: 0.604222330379 
# result_v3.csv private: 0.8352092113671384 private f1 best mean: 0.7277404044209898 pubic: 0.688577267262