# Task 2

In [54]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import imblearn
import matplotlib.pyplot as plt
from itertools import combinations
from collections import Counter
%matplotlib osx

## Preprocessing

In [55]:
# All three files are decoded with the Windows-1251 code page.
csv_options = dict(encoding='windows-1251')

x_train = pd.read_csv("Task2/X_train.csv", **csv_options)
# The target file is read without a header row, so its column(s) are labelled 0, 1, ...
y_train = pd.read_csv("Task2/y_train.csv", header=None, **csv_options)
x_test = pd.read_csv("Task2/X_test.csv", **csv_options)

In [56]:
# Short names applied to both frames (they are assumed to share one column layout).
column_names = ['id', 'sex', 'local', 'histol', 'rdate', 'v', 'doze', 'idoze',
                'mv12', 'mv10', 'nv12', 'nv10']
for frame in (x_train, x_test):
    frame.columns = column_names

In [57]:
# Positions of the columns that preprocess() label-encodes (the first three
# columns left after 'rdate' and 'id' are dropped).
cf = list(range(3))

In [58]:
def preprocess(df):
    """Clean a raw feature frame and return a new, fully numeric frame.

    Steps, in order:
      1. drop the 'rdate' and 'id' columns;
      2. replace the 'н/д' placeholder with NaN in every column;
      3. parse the measurement columns (written with a decimal comma) to float;
      4. put missing 'local' values back as their own 'н/д' category and fill
         missing 'sex' values with 'Ж';
      5. label-encode the columns at the positions listed in the module-level
         ``cf``;
      6. fill the NaNs left in every column except 'sex' with the column mean;
      7. append a squared copy, named '<column>^2', of every column from
         position 3 onwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing 'id', 'sex', 'local', 'histol', 'rdate', 'v',
        'doze', 'idoze', 'mv12', 'mv10', 'nv12' and 'nv10'. The positional
        steps (5 and 7) rely on 'sex', 'local', 'histol' being the first three
        columns once 'rdate' and 'id' are removed.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index. The input frame is not modified.

    Notes
    -----
    NOTE(review): a separate LabelEncoder is fitted on every call, so the
    integer codes produced by two calls agree only when both frames contain
    exactly the same set of category values -- verify this for train vs. test.
    """
    numeric_cols = ['v', 'doze', 'idoze', 'mv12', 'mv10', 'nv12', 'nv10']

    # Build a new frame instead of dropping in place, so the caller's object
    # stays intact; the index is reset because later cells address rows by
    # position.
    df = df.drop(columns=['rdate', 'id']).reset_index(drop=True)
    df = df.replace('н/д', np.nan)

    # str(nan) is 'nan', which float() parses back to NaN, so missing values
    # survive the decimal-comma conversion.
    for col in numeric_cols:
        df[col] = df[col].map(lambda x: float(str(x).replace(',', '.')))

    # Assign the filled columns back explicitly: fillna(..., inplace=True) on
    # an attribute/column selection acts on a temporary and may not reach the
    # frame (pandas Copy-on-Write).
    df['local'] = df['local'].fillna('н/д')
    df['sex'] = df['sex'].fillna('Ж')

    for position in cf:
        name = df.columns[position]
        df[name] = LabelEncoder().fit_transform(df[name])

    for col in df.columns:
        if col != 'sex':
            df[col] = df[col].fillna(df[col].mean())

    df['local'] = df['local'].astype('int64')

    # df.columns[3:] is evaluated once, so only the original columns are
    # squared, not the '^2' columns added inside the loop.
    for col in df.columns[3:]:
        df[col + '^2'] = df[col] ** 2

    # Interaction features that were tried and left disabled:
    # nv10 * mv10, mv12 - nv12, doze / v, idoze / v.
    return df

In [59]:
# Replace the raw frames with their preprocessed versions.
# NOTE(review): this cell is not re-runnable on its own -- preprocess() drops
# 'rdate' and 'id', which no longer exist after the first run; reload the CSVs first.
x_train = preprocess(x_train)
x_test = preprocess(x_test)

## Smoothed likelihood

### Train

In [60]:
# Copy of the training features with the target attached as column 'y';
# x_train itself is left without the target.
df = x_train.assign(y=y_train)

In [61]:
# Weight of the global mean in the smoothed estimate
# (group_mean * n + alpha * global_mean) / (n + alpha).
alpha = 40
# Mean of the target over all training rows; also the fallback for unseen categories.
global_mean = df['y'].mean()

In [62]:
# Names of the columns at positions 1 and 2, which receive a smoothed-likelihood feature.
cols = df.columns[1:3].values

In [63]:
from collections import defaultdict
# Out-of-fold smoothed likelihood: every row is encoded with statistics computed
# on the other folds only, so a row's own target never enters its feature.
# NOTE(review): global_mean is still computed on all rows, including the held-out fold.
for col in cols:
    skf = StratifiedKFold(shuffle=True, random_state=42)
    # Collected positionally so the result does not depend on df's index labels.
    oof = pd.Series(np.nan, index=df.index, dtype=float)
    for train_index, test_index in skf.split(x_train, y_train):
        fit_part = df.iloc[train_index]
        stats = fit_part.groupby(col)['y'].agg(['sum', 'count'])
        # (sum of y + alpha * prior) / (n + alpha), one value per category
        smoothed = (stats['sum'] + alpha * global_mean) / (stats['count'] + alpha)
        held_out = df[col].iloc[test_index]
        # Categories missing from the fitting folds fall back to the global mean.
        oof.iloc[test_index] = held_out.map(smoothed).fillna(global_mean).values
    df['sl_' + col] = oof

In [64]:
# Training features = everything in df except the target column.
x_train = df.drop(columns=['y'])

### Test

In [65]:
# Encode the test set with statistics computed on the full training frame.
for col in cols:
    stats = df.groupby(col)['y'].agg(['sum', 'count'])
    smoothed = (stats['sum'] + alpha * global_mean) / (stats['count'] + alpha)
    # Look up each *test* row's own category (the training column must not be
    # used here); categories unseen in training get the global mean.
    x_test['sl_' + col] = x_test[col].map(smoothed).fillna(global_mean)

## ADASYN

In [66]:
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import EditedNearestNeighbours

In [67]:
# ADASYN over-sampler followed by an Edited-Nearest-Neighbours under-sampler.
# The over-sampler is seeded so the resampled set is the same on every run.
ads = ADASYN(random_state=42)
enn = EditedNearestNeighbours()

In [68]:
# fit_resample replaces the fit_sample alias, which newer imbalanced-learn
# releases no longer provide.
x_over, y_over = ads.fit_resample(x_train, np.ravel(y_train))
x_rs, y_rs = enn.fit_resample(x_over, y_over)

In [69]:
# Shapes of the resampled features and target, one per line.
print(x_rs.shape, y_rs.shape, sep='\n')

(5716, 19)
(5716,)


In [70]:
# Hold out 30% of the resampled data for early stopping. The split is seeded and
# stratified so the best iteration found below can be reproduced.
# NOTE(review): the split is taken after resampling, so the held-out part also
# contains resampled rows; the validation score may be optimistic.
x_rs_tr, x_rs_t, y_rs_tr, y_rs_t = train_test_split(
    x_rs, y_rs, test_size=0.3, random_state=42, stratify=y_rs
)

In [71]:
# Up to 10000 boosting iterations, scored with F1; the overfitting detector
# ('Iter' type) waits 100 iterations, and the best model on the eval set is kept.
cbc = CatBoostClassifier(
    iterations=10000,
    eval_metric='F1',
    use_best_model=True,
    od_type='Iter',
    od_wait=100,
)

In [72]:
# Train on the 70% part and monitor F1 on the held-out 30%.
validation_pair = (x_rs_t, y_rs_t)
cbc.fit(x_rs_tr, y_rs_tr, eval_set=validation_pair)

0:	learn: 0.7490401	test: 0.7251656	best: 0.7251656 (0)	total: 21.5ms	remaining: 3m 34s
1:	learn: 0.8349655	test: 0.8076256	best: 0.8076256 (1)	total: 31ms	remaining: 2m 35s
2:	learn: 0.8316679	test: 0.8059150	best: 0.8076256 (1)	total: 41.8ms	remaining: 2m 19s
3:	learn: 0.8342125	test: 0.8057814	best: 0.8076256 (1)	total: 52.8ms	remaining: 2m 11s
4:	learn: 0.8248848	test: 0.8014842	best: 0.8076256 (1)	total: 62.1ms	remaining: 2m 4s
5:	learn: 0.8319046	test: 0.8082808	best: 0.8082808 (5)	total: 73.2ms	remaining: 2m 1s
6:	learn: 0.8371396	test: 0.8141432	best: 0.8141432 (6)	total: 87.6ms	remaining: 2m 5s
7:	learn: 0.8400153	test: 0.8059150	best: 0.8141432 (6)	total: 101ms	remaining: 2m 5s
8:	learn: 0.8406572	test: 0.8059150	best: 0.8141432 (6)	total: 117ms	remaining: 2m 9s
9:	learn: 0.8532423	test: 0.8196115	best: 0.8196115 (9)	total: 135ms	remaining: 2m 15s
10:	learn: 0.8644386	test: 0.8301544	best: 0.8301544 (10)	total: 149ms	remaining: 2m 15s
11:	learn: 0.8598271	test: 0.8269581	best

95:	learn: 0.9528024	test: 0.9287647	best: 0.9297297 (94)	total: 1.66s	remaining: 2m 51s
96:	learn: 0.9528024	test: 0.9277978	best: 0.9297297 (94)	total: 1.7s	remaining: 2m 53s
97:	learn: 0.9520649	test: 0.9287647	best: 0.9297297 (94)	total: 1.72s	remaining: 2m 53s
98:	learn: 0.9516783	test: 0.9287647	best: 0.9297297 (94)	total: 1.73s	remaining: 2m 53s
99:	learn: 0.9524512	test: 0.9277978	best: 0.9297297 (94)	total: 1.75s	remaining: 2m 53s
100:	learn: 0.9539934	test: 0.9268293	best: 0.9297297 (94)	total: 1.77s	remaining: 2m 53s
101:	learn: 0.9543782	test: 0.9268293	best: 0.9297297 (94)	total: 1.78s	remaining: 2m 52s
102:	learn: 0.9539934	test: 0.9258590	best: 0.9297297 (94)	total: 1.79s	remaining: 2m 51s
103:	learn: 0.9543782	test: 0.9277978	best: 0.9297297 (94)	total: 1.8s	remaining: 2m 51s
104:	learn: 0.9536082	test: 0.9287647	best: 0.9297297 (94)	total: 1.81s	remaining: 2m 50s
105:	learn: 0.9543782	test: 0.9286360	best: 0.9297297 (94)	total: 1.82s	remaining: 2m 49s
106:	learn: 0.955

187:	learn: 0.9702467	test: 0.9513705	best: 0.9522968 (186)	total: 3.61s	remaining: 3m 8s
188:	learn: 0.9709724	test: 0.9513705	best: 0.9522968 (186)	total: 3.65s	remaining: 3m 9s
189:	learn: 0.9705989	test: 0.9522968	best: 0.9522968 (189)	total: 3.7s	remaining: 3m 11s
190:	learn: 0.9705989	test: 0.9522968	best: 0.9522968 (190)	total: 3.73s	remaining: 3m 11s
191:	learn: 0.9702251	test: 0.9522968	best: 0.9522968 (191)	total: 3.76s	remaining: 3m 12s
192:	learn: 0.9702251	test: 0.9522968	best: 0.9522968 (192)	total: 3.79s	remaining: 3m 12s
193:	learn: 0.9702251	test: 0.9522968	best: 0.9522968 (193)	total: 3.83s	remaining: 3m 13s
194:	learn: 0.9705989	test: 0.9522968	best: 0.9522968 (194)	total: 3.85s	remaining: 3m 13s
195:	learn: 0.9713457	test: 0.9532215	best: 0.9532215 (195)	total: 3.89s	remaining: 3m 14s
196:	learn: 0.9717186	test: 0.9532215	best: 0.9532215 (196)	total: 3.93s	remaining: 3m 15s
197:	learn: 0.9717186	test: 0.9532215	best: 0.9532215 (197)	total: 3.95s	remaining: 3m 15s
19

279:	learn: 0.9835007	test: 0.9651568	best: 0.9651568 (279)	total: 6.16s	remaining: 3m 33s
280:	learn: 0.9831360	test: 0.9651568	best: 0.9651568 (280)	total: 6.19s	remaining: 3m 34s
281:	learn: 0.9831360	test: 0.9651568	best: 0.9651568 (281)	total: 6.22s	remaining: 3m 34s
282:	learn: 0.9831360	test: 0.9651568	best: 0.9651568 (282)	total: 6.24s	remaining: 3m 34s
283:	learn: 0.9831360	test: 0.9651568	best: 0.9651568 (283)	total: 6.26s	remaining: 3m 34s
284:	learn: 0.9831360	test: 0.9651568	best: 0.9651568 (284)	total: 6.27s	remaining: 3m 33s
285:	learn: 0.9831360	test: 0.9651568	best: 0.9651568 (285)	total: 6.29s	remaining: 3m 33s
286:	learn: 0.9831360	test: 0.9651568	best: 0.9651568 (286)	total: 6.29s	remaining: 3m 33s
287:	learn: 0.9831360	test: 0.9651568	best: 0.9651568 (287)	total: 6.3s	remaining: 3m 32s
288:	learn: 0.9831360	test: 0.9651568	best: 0.9651568 (288)	total: 6.32s	remaining: 3m 32s
289:	learn: 0.9831360	test: 0.9651568	best: 0.9651568 (289)	total: 6.33s	remaining: 3m 31s


372:	learn: 0.9903743	test: 0.9687500	best: 0.9687500 (372)	total: 8.27s	remaining: 3m 33s
373:	learn: 0.9903743	test: 0.9687500	best: 0.9687500 (373)	total: 8.3s	remaining: 3m 33s
374:	learn: 0.9903743	test: 0.9687500	best: 0.9687500 (374)	total: 8.34s	remaining: 3m 33s
375:	learn: 0.9903743	test: 0.9687500	best: 0.9687500 (375)	total: 8.35s	remaining: 3m 33s
376:	learn: 0.9903743	test: 0.9687500	best: 0.9687500 (376)	total: 8.36s	remaining: 3m 33s
377:	learn: 0.9903743	test: 0.9687500	best: 0.9687500 (377)	total: 8.38s	remaining: 3m 33s
378:	learn: 0.9903743	test: 0.9687500	best: 0.9687500 (378)	total: 8.39s	remaining: 3m 32s
379:	learn: 0.9903743	test: 0.9687500	best: 0.9687500 (379)	total: 8.4s	remaining: 3m 32s
380:	learn: 0.9903743	test: 0.9687500	best: 0.9687500 (380)	total: 8.41s	remaining: 3m 32s
381:	learn: 0.9903743	test: 0.9687500	best: 0.9687500 (381)	total: 8.42s	remaining: 3m 32s
382:	learn: 0.9903743	test: 0.9687500	best: 0.9687500 (382)	total: 8.43s	remaining: 3m 31s
3

469:	learn: 0.9918120	test: 0.9696444	best: 0.9696444 (469)	total: 9.75s	remaining: 3m 17s
470:	learn: 0.9921708	test: 0.9696444	best: 0.9696444 (470)	total: 9.76s	remaining: 3m 17s
471:	learn: 0.9921708	test: 0.9696444	best: 0.9696444 (471)	total: 9.77s	remaining: 3m 17s
472:	learn: 0.9921708	test: 0.9696444	best: 0.9696444 (472)	total: 9.79s	remaining: 3m 17s
473:	learn: 0.9921708	test: 0.9696444	best: 0.9696444 (473)	total: 9.8s	remaining: 3m 16s
474:	learn: 0.9921708	test: 0.9696444	best: 0.9696444 (474)	total: 9.81s	remaining: 3m 16s
475:	learn: 0.9921708	test: 0.9696444	best: 0.9696444 (475)	total: 9.83s	remaining: 3m 16s
476:	learn: 0.9921708	test: 0.9696444	best: 0.9696444 (476)	total: 9.86s	remaining: 3m 16s
477:	learn: 0.9921708	test: 0.9705373	best: 0.9705373 (477)	total: 9.89s	remaining: 3m 17s
478:	learn: 0.9921708	test: 0.9705373	best: 0.9705373 (478)	total: 9.91s	remaining: 3m 16s
479:	learn: 0.9921708	test: 0.9705373	best: 0.9705373 (479)	total: 9.93s	remaining: 3m 16s


563:	learn: 0.9936034	test: 0.9714286	best: 0.9714286 (563)	total: 11.6s	remaining: 3m 14s
564:	learn: 0.9936034	test: 0.9714286	best: 0.9714286 (564)	total: 11.7s	remaining: 3m 14s
565:	learn: 0.9936034	test: 0.9714286	best: 0.9714286 (565)	total: 11.7s	remaining: 3m 14s
566:	learn: 0.9936034	test: 0.9714286	best: 0.9714286 (566)	total: 11.7s	remaining: 3m 14s
567:	learn: 0.9936034	test: 0.9714286	best: 0.9714286 (567)	total: 11.7s	remaining: 3m 14s
568:	learn: 0.9936034	test: 0.9714286	best: 0.9714286 (568)	total: 11.8s	remaining: 3m 14s
569:	learn: 0.9936034	test: 0.9714286	best: 0.9714286 (569)	total: 11.8s	remaining: 3m 14s
570:	learn: 0.9936034	test: 0.9714286	best: 0.9714286 (570)	total: 11.8s	remaining: 3m 14s
571:	learn: 0.9936034	test: 0.9714286	best: 0.9714286 (571)	total: 11.8s	remaining: 3m 14s
572:	learn: 0.9936034	test: 0.9714286	best: 0.9714286 (572)	total: 11.9s	remaining: 3m 15s
573:	learn: 0.9936034	test: 0.9714286	best: 0.9714286 (573)	total: 11.9s	remaining: 3m 15s

<catboost.core.CatBoostClassifier at 0x1a099396a0>

In [73]:
# (importance, feature name) pairs, most important first.
importance_pairs = zip(cbc.feature_importances_, x_train.columns)
sorted(importance_pairs, reverse=True)

[(21.02229029987883, 'sex'),
 (19.396208956554187, 'histol'),
 (11.415970932432895, 'local'),
 (10.326582572376239, 'sl_histol'),
 (7.459233887173678, 'doze^2'),
 (5.890710749141173, 'doze'),
 (5.281106299527042, 'sl_local'),
 (3.903063073173154, 'v^2'),
 (2.461958762519707, 'v'),
 (2.2632856563004187, 'idoze'),
 (1.9404423392711854, 'mv12^2'),
 (1.6830525909792928, 'idoze^2'),
 (1.1781675856017264, 'mv10'),
 (1.1306814203739441, 'nv12'),
 (1.0486700422426043, 'nv12^2'),
 (0.9891905950503405, 'mv10^2'),
 (0.9140453636794508, 'nv10'),
 (0.8544958726556796, 'mv12'),
 (0.8408430010684561, 'nv10^2')]

In [74]:
# Refit on the whole resampled set with a fixed number of iterations.
# NOTE(review): 624 is presumably the best iteration count from the
# early-stopped run above -- confirm whenever that run changes.
final_iterations = 624
cbc = CatBoostClassifier(iterations=final_iterations, eval_metric='F1')
cbc.fit(x_rs, y_rs)

0:	learn: 0.7553509	total: 17.3ms	remaining: 10.8s
1:	learn: 0.7926728	total: 29.8ms	remaining: 9.26s
2:	learn: 0.8351133	total: 41.6ms	remaining: 8.61s
3:	learn: 0.8332894	total: 52.4ms	remaining: 8.13s
4:	learn: 0.8413323	total: 63.9ms	remaining: 7.91s
5:	learn: 0.8501688	total: 76.8ms	remaining: 7.91s
6:	learn: 0.8433289	total: 95.2ms	remaining: 8.39s
7:	learn: 0.8443726	total: 105ms	remaining: 8.07s
8:	learn: 0.8421053	total: 115ms	remaining: 7.88s
9:	learn: 0.8410201	total: 127ms	remaining: 7.81s
10:	learn: 0.8451996	total: 144ms	remaining: 8.01s
11:	learn: 0.8441454	total: 154ms	remaining: 7.84s
12:	learn: 0.8403042	total: 164ms	remaining: 7.71s
13:	learn: 0.8423338	total: 175ms	remaining: 7.61s
14:	learn: 0.8440567	total: 185ms	remaining: 7.49s
15:	learn: 0.8423645	total: 196ms	remaining: 7.45s
16:	learn: 0.8389390	total: 218ms	remaining: 7.77s
17:	learn: 0.8579161	total: 243ms	remaining: 8.18s
18:	learn: 0.8613139	total: 256ms	remaining: 8.15s
19:	learn: 0.8641509	total: 271ms	

167:	learn: 0.9689823	total: 2.75s	remaining: 7.47s
168:	learn: 0.9695106	total: 2.77s	remaining: 7.45s
169:	learn: 0.9689823	total: 2.78s	remaining: 7.44s
170:	learn: 0.9692465	total: 2.8s	remaining: 7.41s
171:	learn: 0.9689823	total: 2.81s	remaining: 7.4s
172:	learn: 0.9692465	total: 2.83s	remaining: 7.37s
173:	learn: 0.9697746	total: 2.84s	remaining: 7.35s
174:	learn: 0.9700384	total: 2.85s	remaining: 7.32s
175:	learn: 0.9697746	total: 2.86s	remaining: 7.29s
176:	learn: 0.9697746	total: 2.88s	remaining: 7.27s
177:	learn: 0.9700384	total: 2.89s	remaining: 7.25s
178:	learn: 0.9705657	total: 2.9s	remaining: 7.22s
179:	learn: 0.9708291	total: 2.93s	remaining: 7.22s
180:	learn: 0.9713555	total: 2.95s	remaining: 7.22s
181:	learn: 0.9713555	total: 2.97s	remaining: 7.21s
182:	learn: 0.9713555	total: 2.98s	remaining: 7.18s
183:	learn: 0.9713555	total: 2.99s	remaining: 7.15s
184:	learn: 0.9713555	total: 3s	remaining: 7.12s
185:	learn: 0.9713555	total: 3.01s	remaining: 7.09s
186:	learn: 0.9716

335:	learn: 0.9853683	total: 5.03s	remaining: 4.31s
336:	learn: 0.9853683	total: 5.04s	remaining: 4.29s
337:	learn: 0.9851123	total: 5.05s	remaining: 4.28s
338:	learn: 0.9851123	total: 5.07s	remaining: 4.26s
339:	learn: 0.9851123	total: 5.08s	remaining: 4.24s
340:	learn: 0.9851123	total: 5.09s	remaining: 4.23s
341:	learn: 0.9851123	total: 5.11s	remaining: 4.21s
342:	learn: 0.9851123	total: 5.12s	remaining: 4.19s
343:	learn: 0.9851123	total: 5.13s	remaining: 4.17s
344:	learn: 0.9851123	total: 5.14s	remaining: 4.15s
345:	learn: 0.9853683	total: 5.15s	remaining: 4.13s
346:	learn: 0.9856242	total: 5.17s	remaining: 4.13s
347:	learn: 0.9856242	total: 5.18s	remaining: 4.11s
348:	learn: 0.9856242	total: 5.2s	remaining: 4.09s
349:	learn: 0.9856242	total: 5.21s	remaining: 4.08s
350:	learn: 0.9856242	total: 5.22s	remaining: 4.06s
351:	learn: 0.9856242	total: 5.23s	remaining: 4.04s
352:	learn: 0.9856242	total: 5.25s	remaining: 4.03s
353:	learn: 0.9856242	total: 5.26s	remaining: 4.01s
354:	learn: 0

504:	learn: 0.9901985	total: 7.77s	remaining: 1.83s
505:	learn: 0.9904523	total: 7.79s	remaining: 1.82s
506:	learn: 0.9904523	total: 7.8s	remaining: 1.8s
507:	learn: 0.9904523	total: 7.81s	remaining: 1.78s
508:	learn: 0.9904523	total: 7.83s	remaining: 1.77s
509:	learn: 0.9904523	total: 7.85s	remaining: 1.75s
510:	learn: 0.9904523	total: 7.86s	remaining: 1.74s
511:	learn: 0.9904523	total: 7.87s	remaining: 1.72s
512:	learn: 0.9904523	total: 7.89s	remaining: 1.71s
513:	learn: 0.9904523	total: 7.9s	remaining: 1.69s
514:	learn: 0.9904523	total: 7.92s	remaining: 1.68s
515:	learn: 0.9907059	total: 7.93s	remaining: 1.66s
516:	learn: 0.9907059	total: 7.95s	remaining: 1.64s
517:	learn: 0.9907059	total: 7.96s	remaining: 1.63s
518:	learn: 0.9907059	total: 7.97s	remaining: 1.61s
519:	learn: 0.9907059	total: 7.99s	remaining: 1.6s
520:	learn: 0.9907059	total: 8.01s	remaining: 1.58s
521:	learn: 0.9907059	total: 8.02s	remaining: 1.57s
522:	learn: 0.9907059	total: 8.04s	remaining: 1.55s
523:	learn: 0.99

<catboost.core.CatBoostClassifier at 0x1a09939208>

In [75]:
# Write the test-set predictions as a single column, without index or header.
test_predictions = pd.DataFrame(cbc.predict(x_test))
test_predictions.to_csv('adasyn+enn_sl=40+sq_cb.csv', index=False, header=False)

In [76]:
# Pairwise correlations between the features of the resampled training set.
resampled_frame = pd.DataFrame(x_rs, columns=x_train.columns)
corr = resampled_frame.corr()

In [77]:
import seaborn as sns

In [78]:
# Heatmap of the correlation matrix, both axes labelled with the feature names.
axis_labels = corr.columns.values
sns.heatmap(corr, xticklabels=axis_labels, yticklabels=axis_labels)

<matplotlib.axes._subplots.AxesSubplot at 0x1a0988f940>

In [53]:
# Preview only the first rows instead of rendering the entire training frame.
# NOTE(review): the saved output of this cell has execution count 53, lower than
# every other cell, and lacks the '^2' columns -- it is stale; re-run to refresh.
x_train.head(10)

Unnamed: 0,sex,local,histol,v,doze,idoze,mv12,mv10,nv12,nv10,sl_local,sl_histol
0,0,3,3,1.688530,24.000000,50.800000,5.758600,7.178132,4.070070,5.489602,0.061200,0.078760
1,0,7,3,11.647100,15.000000,39.900000,19.326548,25.466207,7.679448,13.819107,0.042790,0.075969
2,0,3,3,0.787996,24.000000,42.100000,4.360778,5.793180,3.572782,5.005184,0.057726,0.082774
3,0,13,3,0.348741,24.000000,53.000000,1.959479,2.689443,1.610738,2.340702,0.070119,0.075969
4,0,13,3,1.200860,24.000000,53.000000,6.306214,8.506877,5.105354,7.306017,0.070119,0.075969
5,0,3,3,0.175569,24.000000,74.100000,0.913051,1.200339,0.737482,1.024770,0.051862,0.075969
6,0,3,3,0.136242,24.000000,57.100000,0.855290,1.108709,0.719048,0.972467,0.057726,0.082774
7,0,13,3,2.705600,24.000000,55.500000,11.203863,15.177466,8.498263,12.471866,0.047797,0.078760
8,0,9,3,0.031273,24.000000,69.800000,10.471858,14.099976,10.440585,14.068702,0.066553,0.082774
9,0,9,3,2.282560,24.000000,49.200000,10.471858,14.099976,8.189298,11.817416,0.067672,0.075969
