In [1]:
%pip install lightgbm
%pip install fastparquet

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import numpy as np
import pandas as pd
import dask.dataframe as dd
import os
import pickle
from pathlib import Path
import lightgbm as lgb

from sklearn.metrics import f1_score  
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# Helper functions

## DataReader class

In [3]:
class DataReader:
    def __init__(self):
        self.data_path_train_orig = 'data/semrush_cup_train_data.csv'
        self.data_path_test_orig = 'data/semrush_cup_test_data.csv'
        self.data_path_sample_orig = 'data/semrush_cup_sub.csv'
        self.data_path_dict_orig = 'data/semrush_cup_categories_and_demo.csv'
        
        self.data_path_train_fold_list = []
        self.data_path_test_parquet = 'parquet/test.parquet'
        
    def read_raw_csv(csv_path):
        data = pd.read_csv(csv_path)
        print(data.dtypes)
        data.loc[data.referrer_num=='target', 'referrer_num']=10
        data.referrer_num = data.referrer_num.astype('int8')
        data = data.set_index('event_group_id')
        return data

    def save_to_parquet(data, parquet_path):
        data.to_parquet(parquet_path, index=True)
        
    # def read_parquet(self, parquet_path):
    #     data = dd.read_parquet(parquet_path, columns=['device_id', 'timestamp', 'domain', 'referrer_domain', 'referrer_num', 'is_referrer'])
    #     return data

    def process_train_csv(self):
        self.data_path_train_fold_list = [self.get_train_fold_name(i) for i in range(self.get_train_cv_count())]
        
        if any([not(os.path.exists(i)) for i in self.data_path_train_fold_list]):            
            kfold = KFold(n_splits = self.get_train_cv_count(), shuffle=True, random_state=32)
            
            train_data = DataReader.read_raw_csv(self.data_path_train_orig)
            print(f'train_data size: {len(train_data)}')

            train_device_id_list = train_data.device_id.unique()
            print(f'train_device_id_list: {len(train_device_id_list)}')
            
            for (train_index, valid_index), train_cv_fold_path in zip(kfold.split(train_device_id_list), self.data_path_train_fold_list):
                print(f'process: {train_cv_fold_path}')
                train_device_cv_fold =train_device_id_list[valid_index]
                print(f'train_device_cv_fold: {len(train_device_cv_fold)}')                
                train_cv_fold = train_data.loc[train_data.device_id.isin(train_device_cv_fold)]
                print(f'train_cv_fold: {len(train_cv_fold)}')                
                
                DataReader.save_to_parquet(train_cv_fold, train_cv_fold_path)
                
    def process_test_csv(self):
        if not(os.path.exists(self.data_path_test_parquet)):            
            test_data = DataReader.read_raw_csv(self.data_path_test_orig)
            print(f'test_data size: {len(test_data)}')
            DataReader.save_to_parquet(test_data, self.data_path_test_parquet)

    def get_train_cv_count(self):
        return 5
    
    def get_train_fold_name(self, fold_id):
        return f'parquet/test_fold_{fold_id}_{self.get_train_cv_count()}.parquet'
        
    # def get_train_cv_fold(self, fold_id):
    #     self.process_train_csv()
    #     return self.get_train_fold_name(fold_id)
    
    def get_train_cv_train_path_list(self, fold_id):
        self.process_train_csv()
        data_path_list = [self.get_train_fold_name(i) for i in range(self.get_train_cv_count()) if i!=fold_id]  
        print(f'get_train_cv_train_path_list: {", ".join(data_path_list)}')        
        return data_path_list
               
    def get_train_cv_valid_path(self, fold_id):
        self.process_train_csv()
        data_path = self.get_train_fold_name(fold_id)
        print(f'get_train_cv_valid_path: {data_path}')
        return data_path    
               
    def get_test_path(self):
        self.process_test_csv()
        data_path = self.data_path_test_parquet
        print(f'get_test_path: {data_path}')
        return data_path                   

#     def get_train_cv_train(self, fold_id, columns=None):
#         data_path_list = self.get_train_cv_train_path_list(fold_id)
#         return dd.read_parquet(data_path_list, columns)
    

    
#     def get_train_cv_valid(self, fold_id, columns=None):
#         self.process_train_csv()
#         data_path = self.get_train_cv_valid_path(self, fold_id)
#         return dd.read_parquet(data_path, columns)
        
#     def get_test(self):
#         if os.path.exists(self.data_path_test_parquet):
#             test = pd.read_parquet(self.data_path_test_parquet)

#         else:
#             test = DataReader.read_raw_csv(self.data_path_test_orig)
#             DataReader.save_to_parquet(test, self.data_path_test_parquet)
#         return test

# print(data_reader.get_train_cv_train(2))
# print(data_reader.get_train_cv_valid(2))

In [4]:
# from joblib import Memory
# memory = Memory('cache', verbose=0)
# @memory.cache

## FeatureGenerator class

In [5]:
class device_data_counter:
    def __init__(self):
        self.device_id_prev =-1
        self.data_dict = {}
        self.counter_list = []

    def fill(self, device_id, key):
        counter_list = []

        if self.device_id_prev==device_id:
            value = self.data_dict.get(key, 0)
            value = value + 1
            self.data_dict[key] = value
            self.counter_list.append(value)    
        else:
            self.device_id_prev = device_id

            self.data_dict = {}
            self.counter_list.append(0)

    def get_list(self):
        return self.counter_list
        
class FeatureGenerator:
    def __init__(self, data_path_list, sample_frac=0.7):
        print(f'FeatureGenerator:init: {", ".join(data_path_list)}')        
        train_data = dd.read_parquet(data_path_list).compute()
        train_data = train_data[~train_data.referrer_domain.isna()]

        print(f'sampling: {sample_frac}')
        train_data = train_data.sample(frac=sample_frac)

        print('domain_count_ref')
        domain_count_ref = train_data.groupby('domain').size()
        # print(domain_count_ref.describe())
        self.domain_count_ref = domain_count_ref.to_dict()

        print('domain_count_device')
        domain_count_device = train_data.groupby('domain').device_id.nunique()
        # print(domain_count_device.describe())
        self.domain_count_device = domain_count_device.to_dict()

        print('domain_count_referrer_domain')
        domain_count_referrer_domain = train_data.groupby('domain').referrer_domain.nunique()
        # print(domain_count_referrer_domain.describe())
        self.domain_count_referrer_domain = domain_count_referrer_domain.to_dict()

        print('referrer_domain_count_ref')
        referrer_domain_count_ref = train_data.groupby('referrer_domain').size()
        # print(referrer_domain_count_ref.describe())
        self.referrer_domain_count_ref = referrer_domain_count_ref.to_dict()

        print('referrer_domain_count_device')
        referrer_domain_count_device = train_data.groupby('referrer_domain').device_id.nunique()
        # print(referrer_domain_count_device.describe())
        self.referrer_domain_count_device = referrer_domain_count_device.to_dict()

        print('referrer_domain_count_domain')              
        referrer_domain_count_domain = train_data.groupby('referrer_domain').domain.nunique()
        # print(referrer_domain_count_domain.describe())
        self.referrer_domain_count_domain = referrer_domain_count_domain.to_dict()

        print('domain_referrer_domain_count_ref')   
        domain_referrer_domain_count_ref = train_data.groupby(['domain', 'referrer_domain']).size()
        # print(domain_referrer_domain_count_ref.describe())
        self.domain_referrer_domain_count_ref = domain_referrer_domain_count_ref.to_dict()

        print('domain_referrer_domain_count_device')  
        domain_referrer_domain_count_device = train_data.groupby(['domain', 'referrer_domain']).device_id.nunique()
        # print(domain_referrer_domain_count_device.describe())
        self.domain_referrer_domain_count_device = domain_referrer_domain_count_device.to_dict()
        
    def _process_stage1(self, data):
        print('processing_stage1')
        
        print('processing_stage1:counters: begin')
        domain_counter = device_data_counter()              # Сколько раз этот device_id заходил на этот домен
        for index, device_id, domain in data[['device_id', 'domain']].itertuples():
            domain_counter.fill(device_id, domain)
        data['domain_counter'] = domain_counter.get_list()
        print('processing_stage1:counters: end')

        data_check = data[data.referrer_num!=10].copy()
        data_check = data_check[['domain', 'timestamp', 'domain_counter', 'is_referrer', 'referrer_num']]
        data_check = data_check.rename(columns={'domain':'referrer_domain_to_check',
                                                'timestamp':'timestamp_check',
                                                'domain_counter':'domain_counter_to_check',
                                               })        

        data_fact = data[data.referrer_num==10].copy()
        data_fact = data_fact[['domain', 'timestamp', 'domain_counter']]
        data_fact = data_fact.rename(columns={'timestamp':'timestamp_fact'})

        data_new = data_fact.join(data_check)
        data_new['timestamp_dif'] = data_new['timestamp_fact'] - data_new['timestamp_check'] 
        data_new = data_new.drop(columns=['timestamp_fact', 'timestamp_check'])    
        return data_new

    def _process_stage2(self, data):
        print('processing_stage2')
        
        data['domain_count_ref'] = data.domain.map(lambda x: self.domain_count_ref.get(x, 0))
        data['domain_count_device'] = data.domain.map(lambda x: self.domain_count_device.get(x, 0))
        data['domain_count_referrer_domain'] = data.domain.map(lambda x: self.domain_count_referrer_domain.get(x, 0))

        data['referrer_domain_count_ref'] = data.referrer_domain_to_check.map(lambda x: self.referrer_domain_count_ref.get(x, 0))
        data['referrer_domain_count_device'] = data.referrer_domain_to_check.map(lambda x: self.referrer_domain_count_device.get(x, 0))
        data['referrer_domain_count_domain'] = data.referrer_domain_to_check.map(lambda x: self.referrer_domain_count_domain.get(x, 0))

        data['domain_referrer_domain_count_ref'] = list(map(lambda x, y: self.domain_referrer_domain_count_ref.get((x,y), 0), data['domain'].values, data['referrer_domain_to_check'].values))
        data['domain_referrer_domain_count_device'] = list(map(lambda x, y: self.domain_referrer_domain_count_device.get((x,y), 0), data['domain'].values, data['referrer_domain_to_check'].values))

        data['domain_referrer_domain_count_ref_vs_domain_count_ref'] = data['domain_referrer_domain_count_ref'] / data['domain_count_ref'] + 0.00001
        data['domain_referrer_domain_count_ref_vs_referrer_domain_count_ref'] = data['domain_referrer_domain_count_ref'] / data['referrer_domain_count_ref'] + 0.00001

        data['domain_referrer_domain_count_device_ref_vs_domain_count_device'] = data['domain_referrer_domain_count_device'] / data['domain_count_device'] + 0.00001
        data['domain_referrer_domain_count_device_ref_vs_referrer_domain_count_device_id'] = data['domain_referrer_domain_count_device'] / data['referrer_domain_count_device'] + 0.00001                                 

        data['domain_counter_to_check_vs_domain_counter_to_check'] = data['domain_counter'] / data['domain_counter_to_check'] + 0.00001
        
        data['domain_levels'] = data.domain.str.count('\\.')
        data['referrer_domain_to_check_levels'] = data.referrer_domain_to_check.str.count('\\.')
        

##
        # domain_dict = pd.read_csv('data/semrush_cup_categories_and_demo.csv').drop(columns=['category_1', 'category_2'])
        # domain_dict
#         data = data.merge(domain_dict, left_on='domain', right_on='domain', suffixes=(None,'_domain_dict'), how='left')
#         data = data.merge(domain_dict, left_on='referrer_domain_to_check', right_on='domain', suffixes=(None,'_referrer_domain_to_check'), how='left')        
#         data = data.drop(columns=['domain_referrer_domain_to_check'])
       
#         for i in range(8):
#             column = data.columns[-1-i] + '_vs_' + data.columns[-9-i]
#             print(column)
#             data[column] = data.iloc[:,-1-i] / data.iloc[:,-9-i]
#         data = data.fillna(0)
##                
        return data
    
    def process_data(self, data):
        data = self._process_stage1(data)
        data = self._process_stage2(data)        
        return data

In [6]:
# feature_generator = FeatureGenerator(train_train_path_list[0])

# train_train_features = dd.read_parquet(
#     train_train_path_list[0],
#     columns=['device_id', 'timestamp', 'domain', 'referrer_domain', 'referrer_num', 'is_referrer']).compute()[:100000]

# train_train_features = feature_generator.process_data(train_train_features)

In [7]:
# train_train_features.head()

In [8]:
# def get_domain_counter(data):    
#     counter = []
#     domain_dict = {}
#     device_id_old =-1

#     for index, device_id, domain in data.itertuples():
#         if device_id_old==device_id:
#             counter_cur = domain_dict.get(domain, 0)
#             counter_cur = counter_cur + 1
#             domain_dict[domain] = counter_cur
#             counter.append(counter_cur)
#         else:
#             domain_dict = {}
#             device_id_old = device_id
            
#     return counter

In [9]:
# a, b = get_domain_counter(train_train_features.iloc[:10])

In [10]:
# a

In [11]:
# b

In [12]:
# domain_dict

In [13]:
# domain_dict = pd.read_csv('data/semrush_cup_categories_and_demo.csv')itertuplescolumns=['category_1', 'category_2'])
# domain_dict

In [14]:
# data.merge(domain_dict, on='domain', suffixes=(None,'_domain_dict'), how='left').merge(domain_dict, on='domain', suffixes=(None,'_domain_dict'), how='left')

## LGB helpers

In [15]:
def get_lgb_x(data):
    data = data.drop(columns=['domain', 'referrer_domain_to_check', 'is_referrer'])
    # data = data.iloc[:,:2]
    return data#.values.astype('float32')

def get_lgb_y(data):
    return data.is_referrer
    
def get_lgb_dataset(data):
    return lgb.Dataset(get_lgb_x(data), label=get_lgb_y(data))

# Process Data

In [49]:
data_reader = DataReader()
train_train_path_list = data_reader.get_train_cv_train_path_list(0)
train_valid_path = data_reader.get_train_cv_valid_path(0)

get_train_cv_train_path_list: parquet/test_fold_1_5.parquet, parquet/test_fold_2_5.parquet, parquet/test_fold_3_5.parquet, parquet/test_fold_4_5.parquet
get_train_cv_valid_path: parquet/test_fold_0_5.parquet


In [50]:
feature_generator = FeatureGenerator(train_train_path_list)

FeatureGenerator:init: parquet/test_fold_1_5.parquet, parquet/test_fold_2_5.parquet, parquet/test_fold_3_5.parquet, parquet/test_fold_4_5.parquet
sampling: 0.7
domain_count_ref
domain_count_device
domain_count_referrer_domain
referrer_domain_count_ref
referrer_domain_count_device
referrer_domain_count_domain
domain_referrer_domain_count_ref
domain_referrer_domain_count_device


In [51]:
train_train_features = dd.read_parquet(
    train_train_path_list,
    columns=['device_id', 'timestamp', 'domain', 'referrer_domain', 'referrer_num', 'is_referrer']).compute()
train_train_features = feature_generator.process_data(train_train_features)

processing_stage1
processing_stage1:counters: begin
processing_stage1:counters: end
processing_stage2


In [52]:
train_valid_features = dd.read_parquet(
    train_valid_path,
    columns=['device_id', 'timestamp', 'domain', 'referrer_domain', 'referrer_num', 'is_referrer']).compute()
train_valid_features = feature_generator.process_data(train_valid_features)

processing_stage1
processing_stage1:counters: begin
processing_stage1:counters: end
processing_stage2


In [53]:
train_train_features.dtypes

domain                                                                         object
domain_counter                                                                  int64
referrer_domain_to_check                                                       object
domain_counter_to_check                                                         int64
is_referrer                                                                      bool
referrer_num                                                                     int8
timestamp_dif                                                                   int64
domain_count_ref                                                                int64
domain_count_device                                                             int64
domain_count_referrer_domain                                                    int64
referrer_domain_count_ref                                                       int64
referrer_domain_count_device                          

In [54]:
train_train_features

Unnamed: 0_level_0,domain,domain_counter,referrer_domain_to_check,domain_counter_to_check,is_referrer,referrer_num,timestamp_dif,domain_count_ref,domain_count_device,domain_count_referrer_domain,...,referrer_domain_count_domain,domain_referrer_domain_count_ref,domain_referrer_domain_count_device,domain_referrer_domain_count_ref_vs_domain_count_ref,domain_referrer_domain_count_ref_vs_referrer_domain_count_ref,domain_referrer_domain_count_device_ref_vs_domain_count_device,domain_referrer_domain_count_device_ref_vs_referrer_domain_count_device_id,domain_counter_to_check_vs_domain_counter_to_check,domain_levels,referrer_domain_to_check_levels
event_group_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,ttTLHs.jg,1,tWtXRJ.jl,0,False,0,591,441,263,61,...,1,0,0,0.000010,0.000010,0.000010,0.000010,inf,1,1.0
0,ttTLHs.jg,1,ttdXfB.jg,1,False,1,259,441,263,61,...,12,0,0,0.000010,0.000010,0.000010,0.000010,1.000010,1,1.0
0,ttTLHs.jg,1,ttnqBJ.jg,1,False,2,227,441,263,61,...,41,0,0,0.000010,0.000010,0.000010,0.000010,1.000010,1,1.0
0,ttTLHs.jg,1,ttdXfB.jg,2,False,3,193,441,263,61,...,12,0,0,0.000010,0.000010,0.000010,0.000010,0.500010,1,1.0
0,ttTLHs.jg,1,tWVsHB.HB,1,False,4,159,441,263,61,...,132,0,0,0.000010,0.000010,0.000010,0.000010,1.000010,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1738011,ttFJVq.mp,1,ttCGfG.jg,3,False,5,493,52,48,6,...,0,0,0,0.000010,,0.000010,,0.333343,1,1.0
1738011,ttFJVq.mp,1,tWACUB.HU,2,False,6,126,52,48,6,...,2,0,0,0.000010,0.000010,0.000010,0.000010,0.500010,1,1.0
1738011,ttFJVq.mp,1,ttkGGn.jg,1,False,7,77,52,48,6,...,0,0,0,0.000010,,0.000010,,1.000010,1,1.0
1738011,ttFJVq.mp,1,ttCGfG.jg,4,False,8,38,52,48,6,...,0,0,0,0.000010,,0.000010,,0.250010,1,1.0


# Train model

## Train model for each target

In [55]:
# model_list = []
# train_valid_pred_list = []
# evals_result_list = []

# for target in range(10):
#     print(f'processing target: {target}')
    
#     train_train_features_sample = train_train_features[train_train_features.referrer_num == target]
#     train_train_dataset = get_lgb_dataset(train_train_features_sample)
    
#     train_valid_features_sample = train_valid_features[train_valid_features.referrer_num == target]
#     train_valid_dataset = get_lgb_dataset(train_valid_features_sample)

#     # https://lightgbm.readthedocs.io/en/latest/Parameters.html
#     param = {'objective': 'binary'}
#     param['learning_rate'] = 0.05
#     # param['num_leaves'] = 64
#     param['verbose'] = 1
#     param['metric'] = 'auc'

#     evals_result = {} 

#     model = lgb.train(param, train_train_dataset, num_boost_round=1000, 
#                       valid_sets=(train_train_dataset, train_valid_dataset), 
#                       callbacks=[lgb.early_stopping(stopping_rounds=50),
#                                  lgb.log_evaluation(50),
#                                  lgb.record_evaluation(evals_result)])
    
#     train_valid_pred = train_valid_features_sample[['referrer_num', 'is_referrer']].copy()
#     train_valid_pred['is_referrer_pred'] = model.predict(get_lgb_x(train_valid_features_sample))
    
#     model_list.append(model)
#     train_valid_pred_list.append(train_valid_pred)    
#     evals_result_list.append(evals_result)

In [56]:
# train_valid_pred = pd.concat(train_valid_pred_list)
# train_valid_pred = train_valid_pred.pivot(columns='referrer_num', values=['is_referrer', 'is_referrer_pred'])
# train_valid_pred.head()

In [57]:
# train_valid_pred['referrer_num'] = np.argmax(train_valid_pred.loc[:,('is_referrer', slice(None))].values, axis=1)
# train_valid_pred['referrer_num_pred'] = np.argmax(train_valid_pred.loc[:,('is_referrer_pred', slice(None))].values, axis=1)
# f1_score(train_valid_pred.referrer_num, train_valid_pred.referrer_num_pred, average='macro')

In [58]:
#0.6843094490101649

In [59]:
# f1_score(train_valid_pred.referrer_num, train_valid_pred.referrer_num_pred, average=None)

In [60]:
# array([0.61589708, 0.63361428, 0.64362202, 0.64605263, 0.64929778,
#       0.66171917, 0.68258622, 0.70391753, 0.74135913, 0.86502866])

In [61]:
train_valid_features

Unnamed: 0_level_0,domain,domain_counter,referrer_domain_to_check,domain_counter_to_check,is_referrer,referrer_num,timestamp_dif,domain_count_ref,domain_count_device,domain_count_referrer_domain,...,referrer_domain_count_domain,domain_referrer_domain_count_ref,domain_referrer_domain_count_device,domain_referrer_domain_count_ref_vs_domain_count_ref,domain_referrer_domain_count_ref_vs_referrer_domain_count_ref,domain_referrer_domain_count_device_ref_vs_domain_count_device,domain_referrer_domain_count_device_ref_vs_referrer_domain_count_device_id,domain_counter_to_check_vs_domain_counter_to_check,domain_levels,referrer_domain_to_check_levels
event_group_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,ttvtHn.jg,1,ttXCXB.AV,0,False,0,667,17542,7963,2973,...,9,0,0,0.000010,0.000010,0.000010,0.000010,inf,1,1
6,ttvtHn.jg,1,ttCVLL.jg,1,False,1,642,17542,7963,2973,...,11117,0,0,0.000010,0.000010,0.000010,0.000010,1.00001,1,1
6,ttvtHn.jg,1,tWtkoT.AV,1,False,2,636,17542,7963,2973,...,17128,0,0,0.000010,0.000010,0.000010,0.000010,1.00001,1,1
6,ttvtHn.jg,1,ttmoHU.jg,1,False,3,628,17542,7963,2973,...,35,0,0,0.000010,0.000010,0.000010,0.000010,1.00001,1,1
6,ttvtHn.jg,1,ttFZvU.jg,1,False,4,311,17542,7963,2973,...,539,2,2,0.000124,0.000224,0.000261,0.000458,1.00001,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1738007,ttvTjG.jg,3,ttnNnJ.jg,1,False,5,44,150,83,66,...,7,0,0,0.000010,0.000010,0.000010,0.000010,3.00001,1,1
1738007,ttvTjG.jg,3,ttCFtX.mT,1,False,6,43,150,83,66,...,18,0,0,0.000010,0.000010,0.000010,0.000010,3.00001,1,1
1738007,ttvTjG.jg,3,ttNggB.Hm,3,False,7,40,150,83,66,...,11050,1,1,0.006677,0.000055,0.012058,0.000524,1.00001,1,1
1738007,ttvTjG.jg,3,ttfjxW.jg,3,True,8,3,150,83,66,...,10,1,1,0.006677,0.083343,0.012058,0.500010,1.00001,1,1


## Train single model

In [62]:
train_train_features_sample = train_train_features.copy()#[train_train_features.referrer_num == target]
train_train_dataset = get_lgb_dataset(train_train_features_sample)

train_valid_features_sample = train_valid_features.copy()#[train_valid_features.referrer_num == target]
train_valid_dataset = get_lgb_dataset(train_valid_features_sample)

In [None]:
# https://lightgbm.readthedocs.io/en/latest/Parameters.html
param = {'objective': 'binary'}
param['learning_rate'] = 0.5#0.1
# param['num_leaves'] = 64
param['verbose'] = 1
param['metric'] = 'auc'

evals_result = {} 

np.random.seed(30)
model = lgb.train(param, train_train_dataset, num_boost_round=100, 
                  valid_sets=(train_train_dataset, train_valid_dataset), 
                  callbacks=[lgb.early_stopping(stopping_rounds=50),
                             lgb.log_evaluation(10),
                             lgb.record_evaluation(evals_result)])

[LightGBM] [Info] Number of positive: 949336, number of negative: 8544024
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3978
[LightGBM] [Info] Number of data points in the train set: 9493360, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.100000 -> initscore=-2.197225
[LightGBM] [Info] Start training from score -2.197225
Training until validation scores don't improve for 50 rounds
[10]	training's auc: 0.980519	valid_1's auc: 0.941981
[20]	training's auc: 0.983394	valid_1's auc: 0.950515
[30]	training's auc: 0.984267	valid_1's auc: 0.95274
[40]	training's auc: 0.984753	valid_1's auc: 0.953997
[50]	training's auc: 0.985124	valid_1's auc: 0.954802


In [None]:
# Early stopping, best iteration is:
# [551]	training's auc: 0.986841	valid_1's auc: 0.958268

In [None]:
lgb.plot_metric(evals_result, xlim=(0,700), ylim=(0.935,0.959))

In [None]:
train_valid_pred = train_valid_features_sample[['referrer_num', 'is_referrer']].copy()
train_valid_pred['is_referrer_pred'] = model.predict(get_lgb_x(train_valid_features_sample))

In [None]:
train_valid_pred.shape

In [None]:
train_valid_pred

In [None]:
train_valid_pred = train_valid_pred.pivot(columns='referrer_num', values=['is_referrer', 'is_referrer_pred'])
train_valid_pred.head()

In [None]:
train_valid_pred['referrer_num'] = np.argmax(train_valid_pred.loc[:,('is_referrer', slice(None))].values, axis=1)
train_valid_pred['referrer_num_pred'] = np.argmax(train_valid_pred.loc[:,('is_referrer_pred', slice(None))].values, axis=1)
f1_score(train_valid_pred.referrer_num, train_valid_pred.referrer_num_pred, average='macro')

In [None]:
# 0.6912703643368734

In [None]:
f1_score(train_valid_pred.referrer_num, train_valid_pred.referrer_num_pred, average=None)

In [None]:
lgb.plot_importance(model)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [None]:
import matplotlib.pyplot as plt

In [None]:
cmp = ConfusionMatrixDisplay(confusion_matrix(train_valid_pred.referrer_num, train_valid_pred.referrer_num_pred))
fig, ax = plt.subplots(figsize=(10,10))
cmp.plot(ax=ax)

In [None]:
def get_f1_max(y_fact, y_pred):
    data = pd.DataFrame({'y_fact':y_fact, 'y_pred':y_pred})
    
    data = data.sort_values('y_pred', ascending=False)
    data['tp'] = data.y_fact.cumsum().astype('int')
    data['tp'] = data.groupby('y_pred').tp.transform(max)

    data['fp'] = (data.y_fact==False).cumsum().astype('int')
    data['fp'] = data.groupby('y_pred').fp.transform(max)
    
    data = data.sort_values('y_pred', ascending=True)
    data['fn'] = data.y_fact.cumsum().astype('int')
    data['fn'] = data.groupby('y_pred').fn.transform(max)
    
    data['precision'] = data.tp / (data.tp + data.fp)
    data['recall'] = data.tp / (data.tp + data.fn)    
    
    data['f1'] = 2 * ( data['precision'] * data['recall'] ) / ( data['precision'] + data['recall'] )
    # data = data.sort_values('f1', ascending=False)
    
    f1_max = data.f1.max()
    f1_max_pos = data.f1.argmax()
    f1_max_th = data.iloc[f1_max_pos]['y_pred']
    
    print(f'f1_max={f1_max} f1_max_th={f1_max_th}')
    # print(f'f1_val={f1_score(y_fact, y_pred>=f1_max_th, average="binary")}')
    return f1_max, f1_max_th

In [None]:
f1_max_list = []
f1_max_th_list = []
for target in range(10):    
    f1_max, f1_max_th = get_f1_max(train_valid_pred.loc[:,('is_referrer', target)].astype('bool'),
                                   train_valid_pred.loc[:,('is_referrer_pred', target)])
    
    f1_max_list.append(f1_max)
    f1_max_th_list.append(f1_max_th)

np.average(f1_max_list)

In [None]:
# target_stat = train_valid_features[train_valid_features.is_referrer==True].referrer_num.value_counts(normalize=True).sort_index()
# target_stat

# train_valid_pred['referrer_num'] = np.argmax(train_valid_pred.loc[:,('is_referrer', slice(None))].values, axis=1)
# train_valid_pred['referrer_num_pred'] = 9

# for target, target_rate in target_stat.iloc[:-1].iteritems():
#     print(target, target_rate)
    
#     pred_curr = train_valid_pred.loc[train_valid_pred.referrer_num_pred > target]
#     # print(f'items={len(pred_curr)}')
    
#     target_th_pos = int(len(train_valid_pred) * target_rate)
#     # print(f'target_th_pos={target_th_pos}')
    
#     # print(pred_curr.loc[:,('is_referrer_pred', target)].sort_values(ascending=False))
#     # print(pred_curr.loc[:,('is_referrer_pred', target)].describe())

#     target_th = pred_curr.loc[:,('is_referrer_pred', target)].sort_values(ascending=False).iloc[target_th_pos]
#     # print(f'target_th={target_th}')
    
#     # print(sum(pred_curr.loc[:,('is_referrer_pred', target)] >= target_th))
#     # print(sum(pred_curr.loc[:,('is_referrer_pred', target)] <= target_th))
    
#     train_valid_pred.loc[(train_valid_pred.loc[:,('is_referrer_pred', target)] >= target_th) &
#                         (train_valid_pred.referrer_num_pred > target),
#                         'referrer_num_pred'] =target

# f1_score(train_valid_pred.referrer_num, train_valid_pred.referrer_num_pred, average='macro')

In [None]:
# train_valid_pred = train_valid_features[['referrer_num', 'is_referrer']].copy()
# train_valid_pred['is_referrer_pred'] = model.predict(get_x(train_valid_features))
# train_valid_pred = train_valid_pred.pivot(columns='referrer_num', values=['is_referrer', 'is_referrer_pred'])
# train_valid_pred.head()

In [None]:
# train_train_data = get_lgb_dataset(train_train_features_v2)
# train_test_data = get_lgb_dataset(train_test_features)

In [None]:
# # https://lightgbm.readthedocs.io/en/latest/Parameters.html

# param = {'objective': 'binary'}
# param['learning_rate'] = 0.1#0.01
# # param['num_leaves'] = 64
# param['verbose'] = 1
# param['metric'] = 'auc'

# evals_result = {} 

# model = lgb.train(param, train_train_data, num_boost_round=100, valid_sets=train_test_data, 
#                   callbacks=[lgb.early_stopping(stopping_rounds=50),
#                              lgb.log_evaluation(1),
#                              lgb.record_evaluation(evals_result)])

In [None]:
# [41]	valid_0's auc: 0.845916

In [None]:
# print('Plotting metrics recorded during training...')
# ax = lgb.plot_metric(evals_result, metric='auc')
# # plt.show()

In [None]:
# [50]	valid_0's auc: 0.838309

In [None]:
# lgb.plot_importance(model)

In [None]:
# train_test_features.dtypes[3:]

In [None]:
# train_test_pred = train_test_features[['referrer_num', 'is_referrer']].copy()
# train_test_pred['is_referrer_pred'] = model.predict(get_x(train_test_features))
# train_test_pred = train_test_pred.pivot(columns='referrer_num', values=['is_referrer', 'is_referrer_pred'])
# train_test_pred.head()

# Validation

## Pred by target rate

In [None]:
# target_stat = train_train_features[train_train_features.is_referrer==True].referrer_num.value_counts(normalize=True).sort_index()
# target_stat

In [None]:
# train_test_pred['referrer_num'] = np.argmax(train_test_pred.loc[:,('is_referrer', slice(None))].values, axis=1)
# train_test_pred['referrer_num_pred'] = 9

# for target, target_rate in target_stat.iloc[:-1].iteritems():
#     print(target, target_rate)
    
#     pred_curr = train_test_pred.loc[train_test_pred.referrer_num_pred > target]
#     # print(f'items={len(pred_curr)}')
    
#     target_th_pos = int(len(train_test_pred) * target_rate)
#     # print(f'target_th_pos={target_th_pos}')
    
#     # print(pred_curr.loc[:,('is_referrer_pred', target)].sort_values(ascending=False))
#     # print(pred_curr.loc[:,('is_referrer_pred', target)].describe())

#     target_th = pred_curr.loc[:,('is_referrer_pred', target)].sort_values(ascending=False).iloc[target_th_pos]
#     # print(f'target_th={target_th}')
    
#     # print(sum(pred_curr.loc[:,('is_referrer_pred', target)] >= target_th))
#     # print(sum(pred_curr.loc[:,('is_referrer_pred', target)] <= target_th))
    
#     train_test_pred.loc[(train_test_pred.loc[:,('is_referrer_pred', target)] >= target_th) &
#                         (train_test_pred.referrer_num_pred > target),
#                         'referrer_num_pred'] =target

# f1_score(train_test_pred.referrer_num, train_test_pred.referrer_num_pred, average='macro')

## Simple

In [None]:
# train_test_pred['referrer_num'] = np.argmax(train_test_pred.loc[:,('is_referrer', slice(None))].values, axis=1)
# train_test_pred['referrer_num_pred'] = np.argmax(train_test_pred.loc[:,('is_referrer_pred', slice(None))].values, axis=1)
# f1_score(train_test_pred.referrer_num, train_test_pred.referrer_num_pred, average='macro')

In [None]:
# auc: 0.842891 private: 0.58151635040473 public: 0.577699280352

# Prediction

In [None]:
def save_submission(data, path):
    data['referrer_num_pred'] = np.argmax(data.loc[:,('is_referrer_pred', slice(None))].values, axis=1)    
    pred = data[['referrer_num_pred']].reset_index().rename(columns={'referrer_num_pred':'referrer_num'})
    
    sample = pd.read_csv(data_path_sample_orig)
    assert all(pred.event_group_id == sample.event_group_id)
    assert all(pred.referrer_num.isin(list(range(10))))
    
    print(pred.referrer_num.value_counts().sort_index())
    
    pred.to_csv(path, index=False)
    return pred

# save_submission(test_pred, 'results/result_v2.csv')

## Load test data

In [None]:
test_data = pd.read_csv(data_path_test_orig)

test_data.head(11)

test_data['is_referrer'] = False
test_data.head(11)

test_features = process(test_data)
test_features = process_features(test_features)

## Pred by position specific model

In [None]:
test_pred_list = []

for target, model in zip(range(10), model_list):
    print(f'processing target: {target}')
    
    test_features_sample = test_features[test_features.referrer_num == target]    
    test_pred = test_features_sample[['referrer_num']].copy()
    test_pred['is_referrer_pred'] = model.predict(get_x(test_features_sample))
    
    test_pred_list.append(test_pred)    

In [None]:
test_pred = pd.concat(test_pred_list)
test_pred = test_pred.pivot(columns='referrer_num', values=['is_referrer_pred'])
test_pred.head()

In [None]:
test_pred_sub = save_submission(test_pred, 'results/result_v3.csv')
test_pred_sub

## Pred by single model

In [None]:
# test_pred = test_features[['referrer_num', 'is_referrer']].copy()
# test_pred['is_referrer_pred'] = model.predict(get_x(test_features))
# test_pred = test_pred.pivot(columns='referrer_num', values=['is_referrer', 'is_referrer_pred'])
# test_pred.head()

# print(train_train_features[train_train_features.is_referrer==True].referrer_num.value_counts().sort_index())
# print(train_train_features[train_train_features.is_referrer==True].referrer_num.value_counts(normalize=True).sort_index())
# print(train_test_features[train_test_features.is_referrer==True].referrer_num.value_counts().sort_index())
# print(train_test_features[train_test_features.is_referrer==True].referrer_num.value_counts(normalize=True).sort_index())

In [None]:
# from sklearn.metrics import f1_score

# def lgb_f1_score(y_hat, data):
#     y_true = data.get_label()
#     y_hat = np.round(y_hat) # scikits f1 doesn't like probabilities
#     return 'f1', f1_score(y_true, y_hat), True

# Results

In [None]:
# model per target: private: 0.7403175951534714 public: 0.604222330379 
# result_v3.csv private: 0.8352092113671384 private f1 best mean: 0.7277404044209898 pubic: 0.688577267262