In [1]:
# Minimalistic version of the CatBoost model using several "magic" frequency features. The idea can be developed further.

In [2]:
import numpy as np
import gc, os
import pandas as pd
import numpy as np
import catboost as ctb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm

from scipy.stats import gmean
import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Let pandas render up to 200 rows and 200 columns when a frame is displayed.
for _display_opt in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_opt, 200)

In [3]:
# Run configuration.
# SEED is passed to CatBoost and used to derive the fold shuffling seeds;
# VERSION (together with SEED) is embedded in the names of the output files.
SEED = 42
VERSION = 3

In [4]:
# Raw competition tables.
train_df_src = pd.read_csv('../input/santander-customer-transaction-prediction/train.csv')
test_df_src = pd.read_csv('../input/santander-customer-transaction-prediction/test.csv') 

# Row labels of the test table, split into three groups by the "data-separation" dataset:
# synthetic samples, public-leaderboard rows and private-leaderboard rows.
with open('../input/data-separation/synthetic_samples_indexes.npy', 'rb') as f:
    test_synth = list(np.load(f))

# The public/private files are flattened and their first element is expanded into a list.
# NOTE(review): this looks like a single pickled container per file - confirm the file layout.
with open('../input/data-separation/public_LB.npy', 'rb') as f:
    test_pub = list(np.load(f).reshape(-1)[0])
with open('../input/data-separation/private_LB.npy', 'rb') as f:
    test_priv = list(np.load(f).reshape(-1)[0])



In [5]:
# Names of the raw feature columns (every training column whose name contains 'var_').
var_cols = [name for name in train_df_src.columns if 'var_' in name]

In [6]:
def _stack_on_train(test_rows):
    """Append ``test_rows`` below the training table, index by ID_code and cast to float32."""
    stacked = pd.concat([train_df_src, test_rows], axis=0, copy=False, sort=False)
    return stacked.set_index('ID_code').astype('float32')


# Train + public/private leaderboard test rows.
all_df_real = _stack_on_train(test_df_src.loc[test_pub + test_priv])
# Train + synthetic test rows.
all_df_synth = _stack_on_train(test_df_src.loc[test_synth])

In [7]:
def get_all_freq(df, columns, frqs_series=None):
    """Build value-frequency features for every column in ``columns``.

    For each column ``col`` four features are produced:

    * ``{col}_freq_N``   - occurrence count of the row's value, looked up in the
      count table for ``col`` (NaN when the value is absent from the table);
    * ``{col}_freq_1``   - categorical flag, True when that count is greater than 1;
    * ``{col}_mul_freq`` - the value multiplied by its count;
    * ``{col}_div_freq`` - the value divided by its count.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to encode.
    columns : iterable of str
        Column names to process.
    frqs_series : dict or None
        Optional mapping ``column -> value_counts() Series`` computed earlier
        (e.g. on another frame). Columns missing from the mapping are counted
        on ``df`` and added to it, so a partial mapping no longer fails.

    Returns
    -------
    (pandas.DataFrame, dict)
        The feature frame (indexed like ``df``) and the count-table mapping,
        which can be passed back in as ``frqs_series``.
    """
    f_s = frqs_series if frqs_series is not None else {}
    # Collect the columns first and assemble the frame once: inserting hundreds
    # of columns one by one into a DataFrame is slow and fragments its memory.
    feature_cols = {}
    for col in tqdm(columns):
        if col not in f_s:
            f_s[col] = df[col].value_counts()
        freq_n = df[col].map(f_s[col])
        feature_cols[f'{col}_freq_N'] = freq_n
        feature_cols[f'{col}_freq_1'] = (freq_n > 1).astype('category')
        feature_cols[f'{col}_mul_freq'] = df[col] * freq_n
        feature_cols[f'{col}_div_freq'] = df[col] / freq_n
    return pd.DataFrame(feature_cols, index=df.index), f_s

In [8]:
# Count tables are built on the real rows (train + public/private test) and then
# reused for the synthetic frame, so both share the same value frequencies.
all_df_real_uflag, f_s = get_all_freq(df=all_df_real, columns=var_cols)
all_df_synth_uflag, f_s = get_all_freq(df=all_df_synth, columns=var_cols, frqs_series=f_s)

# Attach the frequency features next to the raw columns.
all_df_real = pd.concat(
    [all_df_real, all_df_real_uflag], axis=1, copy=False, sort=False)
all_df_synth = pd.concat(
    [all_df_synth, all_df_synth_uflag], axis=1, copy=False, sort=False)

# Keep the rows whose ID contains 'train' or 'pred', then restore the row order
# of the original training file.
is_train_row = (all_df_real.index.str.contains('train')
                | all_df_real.index.str.contains('pred'))
train_df = all_df_real[is_train_row].copy().loc[train_df_src['ID_code']]



100%|██████████| 200/200 [02:41<00:00,  1.89s/it]
100%|██████████| 200/200 [02:35<00:00,  1.87s/it]


In [9]:
# The standalone frequency frames and the raw training table are no longer
# needed; drop the references and force a collection to release memory.
del all_df_real_uflag
del all_df_synth_uflag
del train_df_src
gc.collect()

63

In [10]:
# Test rows are those whose ID contains 'test'; they live in two frames
# (real and synthetic) that are stacked back into a single test table.
is_real_test = all_df_real.index.str.contains('test')
is_synth_test = all_df_synth.index.str.contains('test')
test_df_real = all_df_real[is_real_test]
test_df_synth = all_df_synth[is_synth_test]

# Restore the row order of the original test file.
test_df = pd.concat(
    [test_df_real, test_df_synth], axis=0, copy=True, sort=False
).loc[test_df_src['ID_code']]

In [11]:
# train_df / test_df now hold everything needed; release the combined frames
# and the raw test table.
del all_df_real
del all_df_synth
del test_df_src
gc.collect()

39

In [12]:
# Label vector and the list of every non-label column of the training table.
target = train_df['target']
features = [name for name in train_df.columns if name not in ('ID_code', 'target')]

In [13]:
def add_freq(df):
    """Compute per-frame statistics of every raw column, split by its ``_freq_1`` flag.

    For each column ``c`` in the module-level ``var_cols`` list, ``df`` must
    contain ``c`` and the flag column ``{c}_freq_1``. Three features are built:

    * ``{c}_freq_mean``     - mean of ``c`` within the row's flag group (float32);
    * ``{c}_freq_mean_unq`` - the value of ``c`` itself, except for rows whose
      flag is False, which all receive the mean of ``c`` over those rows;
    * ``{c}_freq_std``      - sample standard deviation (ddof=1) of ``c`` within
      the row's flag group (float32).

    The statistics are computed on the rows of ``df`` only, so the result
    depends on which rows are passed in.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        Frame indexed like ``df`` with three columns per entry of ``var_cols``.
    """
    stats = {}
    for c in tqdm(var_cols):
        flag_col = f'{c}_freq_1'
        by_flag = df.groupby(flag_col)[c]

        # String aliases pin the pandas group-wise implementations; in
        # particular 'std' keeps ddof=1, whereas a raw np.std callable would
        # use ddof=0 once pandas stops aliasing numpy functions.
        stats[f'{c}_freq_mean'] = by_flag.transform('mean').astype('float32')

        # Boolean mask instead of df[df[flag] == False].index: the latter copies
        # the entire (very wide) frame once per column just to read its index.
        is_unique = (df[flag_col] == False).values
        mean_unq = df[c].copy()
        if is_unique.any():
            mean_unq[is_unique] = mean_unq[is_unique].mean()
        stats[f'{c}_freq_mean_unq'] = mean_unq

        stats[f'{c}_freq_std'] = by_flag.transform('std').astype('float32')
    # Assemble once instead of inserting hundreds of columns one at a time.
    return pd.DataFrame(stats, index=df.index)

    

    
def add_features(df):
    """Return the model input frame for ``df``.

    The result holds, for every column in ``var_cols``, the precomputed
    ``_freq_N``, ``_freq_1`` and ``_mul_freq`` columns of ``df`` (grouped by
    suffix, in that order), followed by ``target`` and then by the statistics
    returned by ``add_freq(df)``.
    """
    freq_stats = add_freq(df)

    selected = []
    selected.extend(f'{x}_freq_N' for x in var_cols)
    selected.extend(f'{x}_freq_1' for x in var_cols)
    selected.extend(f'{x}_mul_freq' for x in var_cols)
    selected.append('target')

    return pd.concat([df[selected], freq_stats], axis=1, copy=False, sort=False)



In [14]:
# CatBoost settings, grouped by purpose. The dict is unpacked into
# ctb.CatBoostClassifier(**param).
param = {
    # Hardware / reproducibility
    'task_type': 'GPU',
    'gpu_ram_part': 0.95,
    'thread_count': 4,
    'random_seed': SEED,

    # Model capacity: depth-1 trees with a very large iteration budget
    'depth': 1,
    'iterations': 200000,
    'learning_rate': 0.04,
    'l2_leaf_reg': 5,

    # Metrics
    'eval_metric': 'AUC',
    'custom_metric': ['Logloss', 'AUC:hints=skip_train~false'],

    # Overfitting detector and best-iteration selection
    'od_type': 'Iter',
    'od_wait': 500,
    'use_best_model': True,
}



In [15]:
%%time

num_folds = 5
features = [c for c in train_df.columns if c not in ['ID_code', 'target']]
print('Training the Model:')
val_list = []
predictions = []
clf = ctb.CatBoostClassifier(**param)
for i in range(2):
    folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=SEED*i)
    oof = np.zeros(len(train_df))
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df.values, target.values)):
        gc.collect()
        print("Fold idx:{}".format(fold_ + 1))
        trn_data = add_features(train_df.iloc[trn_idx])
        y_trn = trn_data['target']
        features_ext = [c for c in trn_data.columns if c not in ['ID_code', 'target']]
        trn_data = trn_data[features_ext]
        val_data  = add_features(train_df.iloc[val_idx])
        y_val = val_data['target']
        val_data = val_data[features_ext]         
        
        clf.fit(X=trn_data, y=y_trn, eval_set=[(val_data, y_val)], verbose=5000, early_stopping_rounds = 1001) 
        del trn_data, val_data
        gc.collect() 
        
        train_data = add_features(train_df.iloc[val_idx])[features_ext]        
        oof[val_idx] = clf.predict_proba(train_data)[:,1]
        del train_data
        gc.collect()
        test_data = add_features(test_df)[features_ext]
        predictions.append(clf.predict_proba(test_data)[:,1])  
        del test_data
        
    val_list.append(oof)
    
oof = gmean(val_list, 0)
predictions_gmean = gmean(predictions, 0)

Training the Model:
Fold idx:1


100%|██████████| 200/200 [01:25<00:00,  1.34it/s]
100%|██████████| 200/200 [00:21<00:00,  5.84it/s]


0:	learn: 0.5003349	test: 0.5012853	best: 0.5012853 (0)	total: 14.3ms	remaining: 47m 34s
5000:	learn: 0.9113664	test: 0.9028375	best: 0.9028375 (5000)	total: 48.6s	remaining: 31m 36s
10000:	learn: 0.9240053	test: 0.9148424	best: 0.9148461 (9995)	total: 1m 37s	remaining: 30m 49s
15000:	learn: 0.9284670	test: 0.9183331	best: 0.9183356 (14999)	total: 2m 27s	remaining: 30m 13s
20000:	learn: 0.9308599	test: 0.9196522	best: 0.9196570 (19969)	total: 3m 14s	remaining: 29m 9s
25000:	learn: 0.9324611	test: 0.9201638	best: 0.9201714 (24788)	total: 4m 1s	remaining: 28m 9s
30000:	learn: 0.9336808	test: 0.9203852	best: 0.9203852 (30000)	total: 4m 47s	remaining: 27m 7s
35000:	learn: 0.9346292	test: 0.9205505	best: 0.9205627 (34340)	total: 5m 35s	remaining: 26m 22s
bestTest = 0.9205631614
bestIteration = 35182
Shrink model to first 35183 iterations.


100%|██████████| 200/200 [00:17<00:00,  8.45it/s]
100%|██████████| 200/200 [01:26<00:00,  1.48it/s]


Fold idx:2


100%|██████████| 200/200 [01:23<00:00,  1.38it/s]
100%|██████████| 200/200 [00:20<00:00,  6.06it/s]


0:	learn: 0.5004244	test: 0.5009275	best: 0.5009275 (0)	total: 11.1ms	remaining: 37m 8s
5000:	learn: 0.9121085	test: 0.8972465	best: 0.8972465 (5000)	total: 48.3s	remaining: 31m 21s
10000:	learn: 0.9251766	test: 0.9100738	best: 0.9100738 (10000)	total: 1m 35s	remaining: 30m 7s
15000:	learn: 0.9295444	test: 0.9132279	best: 0.9132279 (15000)	total: 2m 24s	remaining: 29m 37s
20000:	learn: 0.9319051	test: 0.9145735	best: 0.9145735 (20000)	total: 3m 11s	remaining: 28m 43s
25000:	learn: 0.9335058	test: 0.9151518	best: 0.9151553 (24993)	total: 3m 56s	remaining: 27m 33s
bestTest = 0.915412426
bestIteration = 27738
Shrink model to first 27739 iterations.


100%|██████████| 200/200 [00:17<00:00,  8.44it/s]
100%|██████████| 200/200 [01:24<00:00,  1.50it/s]


Fold idx:3


100%|██████████| 200/200 [01:23<00:00,  1.40it/s]
100%|██████████| 200/200 [00:19<00:00,  6.22it/s]


0:	learn: 0.5006047	test: 0.5002063	best: 0.5002063 (0)	total: 11.6ms	remaining: 38m 39s
5000:	learn: 0.9110608	test: 0.9036845	best: 0.9036867 (4999)	total: 46s	remaining: 29m 52s
10000:	learn: 0.9242120	test: 0.9146181	best: 0.9146181 (10000)	total: 1m 32s	remaining: 29m 26s
15000:	learn: 0.9286343	test: 0.9174887	best: 0.9174925 (14969)	total: 2m 18s	remaining: 28m 29s
20000:	learn: 0.9309280	test: 0.9185212	best: 0.9185338 (19672)	total: 3m 6s	remaining: 27m 56s
25000:	learn: 0.9325321	test: 0.9189886	best: 0.9189886 (25000)	total: 3m 52s	remaining: 27m 9s
30000:	learn: 0.9337587	test: 0.9192309	best: 0.9192346 (29986)	total: 4m 38s	remaining: 26m 19s
35000:	learn: 0.9347235	test: 0.9193633	best: 0.9193927 (34615)	total: 5m 25s	remaining: 25m 33s
bestTest = 0.9194000959
bestIteration = 35501
Shrink model to first 35502 iterations.


100%|██████████| 200/200 [00:17<00:00,  8.47it/s]
100%|██████████| 200/200 [01:24<00:00,  1.50it/s]


Fold idx:4


100%|██████████| 200/200 [01:23<00:00,  1.26it/s]
100%|██████████| 200/200 [00:20<00:00,  6.14it/s]


0:	learn: 0.5006255	test: 0.5001230	best: 0.5001230 (0)	total: 12.5ms	remaining: 41m 34s
5000:	learn: 0.9104437	test: 0.9062429	best: 0.9062529 (4997)	total: 46.6s	remaining: 30m 15s
10000:	learn: 0.9240581	test: 0.9166297	best: 0.9166297 (10000)	total: 1m 33s	remaining: 29m 33s
15000:	learn: 0.9284055	test: 0.9189826	best: 0.9189907 (14910)	total: 2m 19s	remaining: 28m 40s
20000:	learn: 0.9308148	test: 0.9200698	best: 0.9200705 (19995)	total: 3m 5s	remaining: 27m 50s
25000:	learn: 0.9323762	test: 0.9205175	best: 0.9205199 (24989)	total: 3m 52s	remaining: 27m 10s
30000:	learn: 0.9335913	test: 0.9207216	best: 0.9207250 (29956)	total: 4m 39s	remaining: 26m 26s
35000:	learn: 0.9345304	test: 0.9208861	best: 0.9208926 (34803)	total: 5m 26s	remaining: 25m 36s
bestTest = 0.9209322929
bestIteration = 35993
Shrink model to first 35994 iterations.


100%|██████████| 200/200 [00:17<00:00,  7.71it/s]
100%|██████████| 200/200 [01:24<00:00,  1.48it/s]


Fold idx:5


100%|██████████| 200/200 [01:22<00:00,  1.39it/s]
100%|██████████| 200/200 [00:19<00:00,  5.66it/s]


0:	learn: 0.5006356	test: 0.5000827	best: 0.5000827 (0)	total: 10.9ms	remaining: 36m 16s
5000:	learn: 0.9115497	test: 0.9035952	best: 0.9035952 (5000)	total: 48.2s	remaining: 31m 19s
10000:	learn: 0.9244413	test: 0.9147803	best: 0.9147803 (9999)	total: 1m 36s	remaining: 30m 29s
15000:	learn: 0.9287002	test: 0.9177686	best: 0.9177710 (14999)	total: 2m 22s	remaining: 29m 17s
20000:	learn: 0.9309794	test: 0.9190052	best: 0.9190144 (19972)	total: 3m 10s	remaining: 28m 30s
25000:	learn: 0.9325782	test: 0.9195784	best: 0.9195784 (25000)	total: 3m 55s	remaining: 27m 31s
30000:	learn: 0.9337433	test: 0.9199827	best: 0.9199885 (29968)	total: 4m 41s	remaining: 26m 36s
35000:	learn: 0.9346794	test: 0.9200759	best: 0.9201120 (34603)	total: 5m 28s	remaining: 25m 50s
bestTest = 0.9201120436
bestIteration = 34603
Shrink model to first 34604 iterations.


100%|██████████| 200/200 [00:17<00:00,  8.44it/s]
100%|██████████| 200/200 [01:24<00:00,  1.50it/s]


Fold idx:1


100%|██████████| 200/200 [01:23<00:00,  1.32it/s]
100%|██████████| 200/200 [00:20<00:00,  5.57it/s]


0:	learn: 0.5006251	test: 0.5001249	best: 0.5001249 (0)	total: 10.2ms	remaining: 34m 7s
5000:	learn: 0.9116111	test: 0.9018823	best: 0.9018823 (5000)	total: 47.5s	remaining: 30m 52s
10000:	learn: 0.9245886	test: 0.9130987	best: 0.9130987 (9999)	total: 1m 34s	remaining: 30m 4s
15000:	learn: 0.9289383	test: 0.9162770	best: 0.9162789 (14988)	total: 2m 21s	remaining: 29m 3s
20000:	learn: 0.9313692	test: 0.9175245	best: 0.9175249 (19996)	total: 3m 8s	remaining: 28m 16s
25000:	learn: 0.9329447	test: 0.9180964	best: 0.9180974 (24999)	total: 3m 54s	remaining: 27m 24s
30000:	learn: 0.9341401	test: 0.9184160	best: 0.9184163 (29997)	total: 4m 40s	remaining: 26m 26s
bestTest = 0.9185055494
bestIteration = 33095
Shrink model to first 33096 iterations.


100%|██████████| 200/200 [00:17<00:00,  8.43it/s]
100%|██████████| 200/200 [01:24<00:00,  1.50it/s]


Fold idx:2


100%|██████████| 200/200 [01:23<00:00,  1.36it/s]
100%|██████████| 200/200 [00:20<00:00,  5.51it/s]


0:	learn: 0.5005045	test: 0.5006072	best: 0.5006072 (0)	total: 11.8ms	remaining: 39m 15s
5000:	learn: 0.9114090	test: 0.9046431	best: 0.9046438 (4999)	total: 46.8s	remaining: 30m 26s
10000:	learn: 0.9247279	test: 0.9147528	best: 0.9147529 (9992)	total: 1m 34s	remaining: 30m
15000:	learn: 0.9291270	test: 0.9171062	best: 0.9171078 (14996)	total: 2m 21s	remaining: 29m 8s
20000:	learn: 0.9315041	test: 0.9179859	best: 0.9179923 (19893)	total: 3m 7s	remaining: 28m 11s
25000:	learn: 0.9331130	test: 0.9182504	best: 0.9182715 (24934)	total: 3m 54s	remaining: 27m 22s
30000:	learn: 0.9343130	test: 0.9184184	best: 0.9184467 (29307)	total: 4m 41s	remaining: 26m 32s
bestTest = 0.9184467494
bestIteration = 29307
Shrink model to first 29308 iterations.


100%|██████████| 200/200 [00:18<00:00,  7.84it/s]
100%|██████████| 200/200 [01:25<00:00,  1.50it/s]


Fold idx:3


100%|██████████| 200/200 [01:23<00:00,  1.37it/s]
100%|██████████| 200/200 [00:20<00:00,  6.06it/s]


0:	learn: 0.5004213	test: 0.5009400	best: 0.5009400 (0)	total: 11ms	remaining: 36m 44s
5000:	learn: 0.9098891	test: 0.9065774	best: 0.9065780 (4998)	total: 47.2s	remaining: 30m 38s
10000:	learn: 0.9231024	test: 0.9182640	best: 0.9182640 (10000)	total: 1m 34s	remaining: 29m 56s
15000:	learn: 0.9275668	test: 0.9215872	best: 0.9215875 (14999)	total: 2m 20s	remaining: 28m 56s
20000:	learn: 0.9298936	test: 0.9226838	best: 0.9226866 (19977)	total: 3m 8s	remaining: 28m 13s
25000:	learn: 0.9314995	test: 0.9231888	best: 0.9231973 (24933)	total: 3m 53s	remaining: 27m 12s
30000:	learn: 0.9326745	test: 0.9234711	best: 0.9234721 (29988)	total: 4m 38s	remaining: 26m 19s
35000:	learn: 0.9336115	test: 0.9236298	best: 0.9236490 (34183)	total: 5m 22s	remaining: 25m 22s
bestTest = 0.9236490428
bestIteration = 34183
Shrink model to first 34184 iterations.


100%|██████████| 200/200 [00:18<00:00,  8.37it/s]
100%|██████████| 200/200 [01:25<00:00,  1.49it/s]


Fold idx:4


100%|██████████| 200/200 [01:24<00:00,  1.35it/s]
100%|██████████| 200/200 [00:20<00:00,  5.98it/s]


0:	learn: 0.5004800	test: 0.5007054	best: 0.5007054 (0)	total: 10.6ms	remaining: 35m 15s
5000:	learn: 0.9114280	test: 0.9021452	best: 0.9021497 (4998)	total: 46.1s	remaining: 29m 57s
10000:	learn: 0.9245293	test: 0.9134278	best: 0.9134319 (9999)	total: 1m 32s	remaining: 29m 14s
15000:	learn: 0.9288689	test: 0.9164436	best: 0.9164476 (14992)	total: 2m 19s	remaining: 28m 45s
20000:	learn: 0.9312744	test: 0.9177918	best: 0.9177955 (19994)	total: 3m 7s	remaining: 28m 6s
25000:	learn: 0.9328682	test: 0.9182472	best: 0.9182472 (25000)	total: 3m 53s	remaining: 27m 16s
bestTest = 0.9183042049
bestIteration = 25901
Shrink model to first 25902 iterations.


100%|██████████| 200/200 [00:17<00:00,  8.42it/s]
100%|██████████| 200/200 [01:24<00:00,  1.49it/s]


Fold idx:5


100%|██████████| 200/200 [01:24<00:00,  1.35it/s]
100%|██████████| 200/200 [00:20<00:00,  6.23it/s]


0:	learn: 0.5005944	test: 0.5002475	best: 0.5002475 (0)	total: 16.9ms	remaining: 56m 17s
5000:	learn: 0.9121132	test: 0.8993902	best: 0.8993942 (4997)	total: 48.1s	remaining: 31m 14s
10000:	learn: 0.9250720	test: 0.9116746	best: 0.9116762 (9992)	total: 1m 35s	remaining: 30m 20s
15000:	learn: 0.9293404	test: 0.9150252	best: 0.9150259 (14998)	total: 2m 22s	remaining: 29m 16s
20000:	learn: 0.9316664	test: 0.9162859	best: 0.9162939 (19985)	total: 3m 8s	remaining: 28m 17s
25000:	learn: 0.9331907	test: 0.9169751	best: 0.9169786 (24991)	total: 3m 56s	remaining: 27m 32s
30000:	learn: 0.9343565	test: 0.9173765	best: 0.9173810 (29917)	total: 4m 42s	remaining: 26m 42s
35000:	learn: 0.9353094	test: 0.9176589	best: 0.9176615 (34962)	total: 5m 31s	remaining: 26m 1s
40000:	learn: 0.9360949	test: 0.9177826	best: 0.9177861 (39589)	total: 6m 17s	remaining: 25m 11s
bestTest = 0.9178408384
bestIteration = 42405
Shrink model to first 42406 iterations.


100%|██████████| 200/200 [00:17<00:00,  8.46it/s]
100%|██████████| 200/200 [01:24<00:00,  1.48it/s]


CPU times: user 1h 17min 26s, sys: 1h 8min, total: 2h 25min 27s
Wall time: 1h 36min 48s


In [16]:
# Submission file: one row per test ID with the blended probability.
submission_path = 'submission_cb_{}_seed_v{}.csv'.format(SEED, VERSION)
sub = pd.DataFrame({"ID_code": test_df.index.values, "target": predictions_gmean})
sub.to_csv(submission_path, index=False)

In [17]:
# AUC of the blended out-of-fold predictions against the training labels.
roc = roc_auc_score(y_true=target, y_score=oof)
cv_report = "CV score: {:<8.5f}".format(roc)
print(cv_report)

CV score: 0.91992 


In [18]:
# Persist the out-of-fold and test predictions under a shared column name;
# the CV score is embedded in both file names.
oof_col = f'oof_cb_{num_folds}_{SEED}_{VERSION}'
oof_train = pd.DataFrame({oof_col: oof})
oof_test = pd.DataFrame({oof_col: predictions_gmean})

train_oof_path = 'train_cb_roc_{}_seed_{}_v{}.csv'.format(roc, SEED, VERSION)
test_oof_path = 'test_cb_roc_{}_seed_{}_v{}.csv'.format(roc, SEED, VERSION)
oof_train.to_csv(train_oof_path, index=False)
oof_test.to_csv(test_oof_path, index=False)