In [1]:
import sys
sys.path.append('/home/sergak/.local/lib/python3.8/site-packages')

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import os
import json
import plotly.express as px
import collections
import time

## Read data

In [3]:
PAYMENTS_TRAIN_PATH = 'data/payments_train.csv'
TARGET_TRAIN_PATH = 'data/target_train.csv'
PAYMENTS_TEST_PATH = 'data/payments_test.csv'
CLIENT_ID_TEST_PATH = 'data/client_id_test.csv'
SEED = 42

In [4]:
payments_dtypes = {
    'client_id': str,
    'contractor_id': str,
    'is_outgoing': bool,
    'amount': 'uint64',
    'dt_day': 'uint16',
    'dt_hour': 'uint8',
    'channel': pd.CategoricalDtype()
}
for i in range(12):
    payments_dtypes[f'flag_{i}'] = bool

In [5]:
data = pd.read_csv(PAYMENTS_TRAIN_PATH, dtype=payments_dtypes)

In [6]:
target_dtypes = {
    'client_id': str
}
for i in range(35):
    target_dtypes[f'type_{i}'] = int

In [7]:
target = pd.read_csv(TARGET_TRAIN_PATH, dtype=target_dtypes).set_index('client_id')

In [8]:
data['contractor_id'] = data['contractor_id'].fillna('-1')

## Feature Engineering

In [9]:
data.head()

Unnamed: 0,client_id,contractor_id,is_outgoing,amount,dt_day,dt_hour,channel,flag_0,flag_1,flag_2,flag_3,flag_4,flag_5,flag_6,flag_7,flag_8,flag_9,flag_10,flag_11
0,569703,-1,True,8674442,56,12,app,False,False,False,False,False,False,False,False,False,False,False,False
1,696595,3920,True,5714350,311,19,web,False,False,False,False,False,False,False,False,False,False,False,False
2,368467,-1,True,3720501,175,13,,False,False,False,True,False,False,False,False,False,False,False,False
3,421133,-1,True,311542,68,14,,False,False,False,True,False,False,False,False,False,False,False,False
4,365044,24686,True,705918747,171,15,app,False,False,False,False,False,False,False,False,False,False,False,False


In [10]:
good_two_flags = []
for i in tqdm(range(12)):
    for j in range(12):
        if i == j:
            continue
        if (data['flag_{}'.format(i)] & data['flag_{}'.format(j)]).sum() > 1000:
            good_two_flags.append('flag_{}_{}'.format(i, j))
len(good_two_flags)

100%|███████████████████████████████████████████| 12/12 [00:04<00:00,  2.67it/s]


12

In [11]:
def add_double_flags(res, x):
    double_flags = pd.DataFrame({'client_id': x['client_id']})

    for i in range(12):
        for j in range(12):
            f_name = 'flag_{}_{}'.format(i, j)
            if f_name not in good_two_flags:
                continue
            double_flags[f_name] = x['flag_{}'.format(i)] & x['flag_{}'.format(j)]
    res[[f'{el}_persent' for el in good_two_flags]] = double_flags.groupby('client_id')[good_two_flags].sum()
    mask = res[[f'{el}_persent' for el in good_two_flags]].sum(axis=1) > 0
    res.loc[mask, [f'flag_{i}_persent' for i in range(12)]] = 0
    return res

In [12]:
def build_days_features(res, x):
    grouped = x.groupby('client_id')['dt_day']
    add = collections.defaultdict(list)
    for name, group in grouped:
        cnt = np.array(collections.Counter(group).most_common())[:, 1]
        extra = {}
#         extra['quantile_25_trans_per_day'] = np.quantile(cnt, 0.25)
#         extra['quantile_50_trans_per_day'] = np.quantile(cnt, 0.50)
#         extra['quantile_75_trans_per_day'] = np.quantile(cnt, 0.75)
        extra['mean_trans_per_day'] = cnt.mean()
#         extra['max_trans_per_day'] = cnt.max()
        for k, v in extra.items():
            add[k].append(v)
            
    for k, v in add.items():
        res[k] = v
        
    return res

In [13]:
def build_hours_features(res, x):
    grouped = x.groupby('client_id')['dt_hour']
    add = collections.defaultdict(list)
    for name, group in grouped:
        counter = collections.Counter(group)
        cnt = np.array([counter[i] if i in counter else 0 for i in range(24)])
        
        extra = {}
        for i in range(24):
            extra['persent_trans_per_hour_{}'.format(i)] = cnt[i] / cnt.sum()
        
#         extra['persent_trans_night'] = cnt[0:6].sum() / cnt.sum()
#         extra['persent_trans_morning'] = cnt[6:12].sum() / cnt.sum()
#         extra['persent_trans_midday'] = cnt[12:18].sum() / cnt.sum()
#         extra['persent_trans_evening'] = cnt[18:].sum() / cnt.sum()
        
        for k, v in extra.items():
            add[k].append(v)
            
    for k, v in add.items():
        res[k] = v
        
    return res

In [14]:
def build_channels_features(res, x):
    for channel_name in x['channel'].unique():
        if 'nan' == str(channel_name):
            continue
        now = x[x['channel'] == channel_name].groupby('client_id')['amount'].agg(['median', 'count'])
        res = res.merge(now, on='client_id', how='left', suffixes=['', '_channel_{}'.format(channel_name)])
    
    for col in res.columns:
        if 'sum_channel' in col or 'count_channel' in col:
            res[col] = res[col] / res['count']
    return res

In [None]:
def build_top_ids_features(res, x):
    grouped = x.groupby('client_id')['contractor_id']
    top_ids = []
    cnt_not_nan_contactor_id = []
    top_k = 10
    for name, group in grouped:
        now = 0
        top_now = []
        cnt = collections.Counter(group).most_common()
        cnt_not_nan_contactor_id.append(0)
        for k, v in cnt:
            cnt_not_nan_contactor_id[-1] += (k != '-1')
            
        for k, v in cnt:
            if k != '-1':
                top_now.append(k)
                
            if len(top_now) == top_k:
                break
                
        top_now += ['-1'] * (top_k - len(top_now))
        top_ids.append(top_now)
    
    res['cnt_not_nan_contactor_id'] = cnt_not_nan_contactor_id
    top_ids = np.array(top_ids)
    for i in range(top_k):
        res['top_{}_id'.format(i)] = top_ids[:, i]
    return res

In [15]:
def get_base_features(x):
    res = x.groupby(['client_id'])['amount'].agg(['sum', 'median', 'count', 'std'])
    tmp = x.groupby(['client_id'])['dt_day'].agg(['min', 'max'])                
    res['frequency'] = res['count'] / (tmp['max'] - tmp['min'] + 1)
    res = build_days_features(res, x)
#     res = build_hours_features(res, x)
    res = build_channels_features(res, x)
#     res = add_double_flags(res, x)
    res = build_top_ids_features(res, x)
    
    res[[f'flag_{i}_persent' for i in range(12)]] = x.groupby('client_id')[[f'flag_{i}' for i in range(12)]].sum()
    
    for el in res.columns:
        if 'persent' in el:
            res[el] = res[el] / res['count']
    
    return res

In [16]:
def get_features(x):
    res = pd.DataFrame()
    res = get_base_features(x)
    for c in res.columns:
        if c != 'client_id':
            del res[c]
#     res['random'] = np.random.random((len(res)))

    for val in [False, True]:
        now = x[x['is_outgoing'] == val]
        res = res.merge(get_base_features(now), how='left', on='client_id', suffixes=['', '_out_{}'.format(val)])
        
    top_cols = [el for el in res.columns if 'top' in el]
    res[top_cols] = res[top_cols].fillna('-1')
    
    flag_cols = [el for el in res.columns if 'flag' in el]
    res[flag_cols] = res[flag_cols].fillna(0)
    
    channel_cols = [el for el in res.columns if 'channel' in el]
    res[channel_cols] = res[channel_cols].fillna(0)
    
    return res

In [17]:
def use_features_instead_id(features):
    res = features.copy()
    res['client_id'] = features.index
    for el in res.columns.copy():
        if 'top' in el:
            k = '_'.join(el.split('_')[1:])
            res = res.merge(features, left_on=el, how='left', right_on='client_id', suffixes=['', '_k_{}'.format(k)])
            
    res.drop([el for el in res.columns if 'top' in el], axis=1, inplace=True)
    res.set_index("client_id", inplace = True)
    return res

In [18]:
%%time
features = get_features(data)
X = features
features.head()

CPU times: user 2min 2s, sys: 6.39 s, total: 2min 9s
Wall time: 2min 9s


Unnamed: 0_level_0,sum,median,count,std,frequency,mean_trans_per_day,median_channel_pos,count_channel_pos,median_channel_atm,count_channel_atm,...,top_0_id_out_True,top_1_id_out_True,top_2_id_out_True,top_3_id_out_True,top_4_id_out_True,top_5_id_out_True,top_6_id_out_True,top_7_id_out_True,top_8_id_out_True,top_9_id_out_True
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100045,2038264000.0,6667310.0,101.0,31349970.0,0.286932,1.485294,0.0,0.0,0.0,0.0,...,100045,77142,979723,393717,15624,-1,-1,-1,-1,-1
100055,7629424000.0,23651070.0,136.0,111851200.0,0.373626,1.271028,0.0,0.0,0.0,0.0,...,963007,869701,913518,805244,4230,996871,906906,708612,747613,803706
100068,9885211000.0,98949985.0,62.0,162013900.0,0.362573,1.319149,139311.5,0.032258,0.0,0.0,...,922296,502405,100068,348567,634507,615615,308292,-1,-1,-1
100076,474854200.0,15459270.0,9.0,82677730.0,0.091837,1.0,0.0,0.0,0.0,0.0,...,28547,673163,567238,509530,-1,-1,-1,-1,-1,-1
100089,24685370000.0,26916187.0,925.0,15119590.0,2.527322,2.65043,0.0,0.0,0.0,0.0,...,174173,14980,926929,863770,344098,299311,960130,555885,589103,257769


In [19]:
# concated = use_features_instead_id(features)
# X = concated
# concated.head()

In [20]:
X.shape

(67005, 70)

In [21]:
X.columns

Index(['sum', 'median', 'count', 'std', 'frequency', 'mean_trans_per_day',
       'median_channel_pos', 'count_channel_pos', 'median_channel_atm',
       'count_channel_atm', 'flag_0_persent', 'flag_1_persent',
       'flag_2_persent', 'flag_3_persent', 'flag_4_persent', 'flag_5_persent',
       'flag_6_persent', 'flag_7_persent', 'flag_8_persent', 'flag_9_persent',
       'flag_10_persent', 'flag_11_persent', 'cnt_not_nan_contactor_id',
       'top_0_id', 'top_1_id', 'top_2_id', 'top_3_id', 'top_4_id', 'top_5_id',
       'top_6_id', 'top_7_id', 'top_8_id', 'top_9_id', 'sum_out_True',
       'median_out_True', 'count_out_True', 'std_out_True',
       'frequency_out_True', 'mean_trans_per_day_out_True',
       'median_channel_app', 'count_channel_app', 'median_channel_web',
       'count_channel_web', 'median_channel_pos_out_True',
       'count_channel_pos_out_True', 'median_channel_atm_out_True',
       'count_channel_atm_out_True', 'flag_0_persent_out_True',
       'flag_1_persent_ou

### Split to train & val data

In [26]:
import sklearn.utils as sku
from skmultilearn.model_selection import iterative_train_test_split

def stratified_split_cached(X, y, split_idx_file):
    if os.path.isfile(split_idx_file):
        with open(split_idx_file, 'r') as f:
            split_json = json.load(f)
        train_idx, val_idx = split_json['train'], split_json['val']
    else:
        y_shuffle = sku.shuffle(y, random_state=SEED)  # https://cpb-us-e1.wpmucdn.com/journeys.dartmouth.edu/dist/8/830/files/2020/06/EIqwWwsX0AAeh-o.jpeg
        train_idx, _, val_idx, _ = iterative_train_test_split(np.expand_dims(y_shuffle.index, 1), np.array(y_shuffle), test_size=0.01)
        train_idx, val_idx = train_idx.squeeze(1), val_idx.squeeze(1)
        with open(split_idx_file, 'w') as f:
            json.dump({'train': list(train_idx), 'val': list(val_idx)}, f)
    return X.loc[train_idx], y.loc[train_idx], X.loc[val_idx], y.loc[val_idx]

In [27]:
X_train, y_train, X_val, y_val = stratified_split_cached(X, target, 'split_cache_1.json')

In [28]:
len(X_train), len(y_train), len(X_val), len(y_val)

(66194, 66194, 811, 811)

## Train model

In [29]:
from catboost import CatBoostClassifier

class MultiLabelModel:
    def __init__(self, n_classes):
        self.model = [self.get_base_model() for i in range(n_classes)]
        self.n_classes = n_classes
        
    def get_base_model(self):
        model = CatBoostClassifier(iterations=1000,
                               random_state=42,
                               learning_rate=0.03,
                               max_depth=7,
                               task_type='CPU',
                               verbose=False,
                               cat_features=[el for el in X.columns if 'top' in el])
        return model
    
    def fit(self, X_train, y_train, X_val, y_val):
        for i in tqdm(range(self.n_classes)):
            self.model[i].fit(X_train, y_train['type_{}'.format(i)],
                               eval_set=(X_val, y_val['type_{}'.format(i)]),
                               use_best_model=False,
                               plot=False
                             )
        
    def predict(self, X):
        preds = None
        for i in range(self.n_classes):
            now = self.model[i].predict(X).reshape(-1, 1)
            preds = np.concatenate([preds, now], axis=-1) if preds is not None else now
        return preds

In [30]:
from sklearn.multiclass import OneVsRestClassifier

model = MultiLabelModel(35) #depth = 7 + lr = 0.03
model.fit(X_train, y_train, X_val, y_val)

100%|████████████████████████████████████████| 35/35 [1:59:07<00:00, 204.20s/it]


In [31]:
preds = model.predict(X_val)
preds

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [32]:
from sklearn.metrics import fbeta_score, classification_report

fbeta_score(y_val, preds, beta=0.5, average='micro', zero_division=0) #test_size=0.01

0.40780141843971635

In [35]:
from sklearn.metrics import fbeta_score, classification_report

fbeta_score(y_val, preds, beta=0.5, average='micro', zero_division=0) #test_size=0.15

0.42752315390185125

In [90]:
pd.options.mode.chained_assignment = None  # default='warn'
columns = X_train.columns
clf = model.estimators_[0]
import numpy as np
if hasattr(clf, 'coef_'):
    df_importances = sorted(list(zip(columns, clf.coef_.ravel())), key=lambda tpl: tpl[1], reverse=True)
else:
    df_importances= sorted(list(zip(columns, np.array(clf.feature_importances_).ravel())), key=lambda tpl: tpl[1], reverse=True)
    
df_importances = pd.DataFrame(df_importances, columns=['feature', 'importance'])
df_importances = df_importances.set_index('feature')
# df_importances.plot(kind='bar', figsize=(15, 3))
# plt.show()
df_importances.head(50)

Unnamed: 0_level_0,importance
feature,Unnamed: 1_level_1
top_8_id_out_True,15.945695
top_4_id_out_True,9.753824
top_0_id,6.723945
top_2_id_out_True,4.737697
flag_6_persent,4.420022
top_1_id,3.496123
std,3.280087
top_1_id_out_True,2.725503
flag_1_persent_out_True,2.583324
count_channel_web,2.388702


## Make submit

In [33]:
payments_test = pd.read_csv(PAYMENTS_TEST_PATH, dtype=payments_dtypes)

In [34]:
payments_test['contractor_id'] = payments_test['contractor_id'].fillna('-1')
payments_test['contractor_id'] = payments_test['contractor_id'].astype(str)

In [35]:
X_test = get_features(payments_test)

In [36]:
preds_test = model.predict(X_test)
preds_test = pd.DataFrame(preds_test, columns = ['type_{}'.format(i) for i in range(35)])
preds_test['client_id'] = X_test.index
preds_test = preds_test.set_index('client_id')
preds_test

Unnamed: 0_level_0,type_0,type_1,type_2,type_3,type_4,type_5,type_6,type_7,type_8,type_9,...,type_25,type_26,type_27,type_28,type_29,type_30,type_31,type_32,type_33,type_34
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100050,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
100128,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
100159,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10018,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
100237,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999572,1,0,0,0,0,0,0,0,0,1,...,1,0,0,0,0,1,0,0,0,0
99966,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
999662,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
999674,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [37]:
preds_test.to_csv('submission2.csv')