#### Некрасова Татьяна Валерьевна

Команда: Polosataya

Соревнование: Digital Reputation Challenge 

10 сентября - 10 октября 2019 года

Платформа: https://boosters.pro/championship/digital_reputation_challenge/data

3 место

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import catboost
from lightgbm import LGBMClassifier
#from xgboost import XGBClassifier
from catboost import CatBoostClassifier, Pool,cv
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.sparse as sp
from scipy.sparse import csr_matrix, hstack

In [3]:
from sklearn.model_selection import StratifiedKFold, ShuffleSplit, KFold, train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.calibration import CalibratedClassifierCV
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import TfidfVectorizer

### Основная идея
Данные анонимные, предсказать надо человеческое поведение. При выборе моделей я вспомнила "Мудрость толпы" Фрэнсиса Гальтона. Число наблюдений можно считать небольшим. Число фич тоже (вся Х2 по сути одна фича). Поэтому для хорошего результата нужны несколько принципиально отличающихся моделей, которые будут замечать разные особенности в данных. 

Были выбраны:
- LogisticRegression - хорошо отлавливает линейные зависимости между переменными.
- catboost - строит симметричные неглубокие деревья.
- lightgbm - строит глубокие деревья.

### Чтение и обработка Х1 и Х2

Можно загружать, сразу указав индекс, но он может понадобиться, чтобы смержить таблицы. Поэтому индекс установим уже для готового датасета.

In [4]:
# If a dense (non-sparse) matrix is used, merge X1 and X2 on the id column
#X_train = X1_train.merge(X2_train, on='id', how='inner')
#X_train = X_train.set_index('id')

In [5]:
# Load the X1 and X2 feature tables for train and test, plus the training targets Y.
X1_train, X1_test = pd.read_csv('train/X1.csv'), pd.read_csv('test/X1.csv')
X2_train, X2_test = pd.read_csv('train/X2.csv'), pd.read_csv('test/X2.csv')
Y = pd.read_csv('train/Y.csv')

In [6]:
# Show the first rows of X1_train.
X1_train.head()

Unnamed: 0,id,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,3,1,-1.0,-1.0,107.0,255.0,537.0,10.0,41.0,0.0,...,0,0,0,0,0,0,1,0,1,0
1,5,0,0.0,0.0,20.0,0.0,188.0,1.0,25.0,2.0,...,0,0,0,0,0,0,0,0,0,0
2,6,1,0.0,0.0,158.0,155.0,3092.0,3.0,218.0,29.0,...,0,0,0,0,0,0,0,1,0,0
3,8,1,0.0,0.0,102.0,343.0,341.0,0.0,24.0,2.0,...,0,0,0,0,0,0,1,0,0,0
4,10,1,0.0,0.0,1.0,1.0,33.0,0.0,41.0,1.0,...,0,0,0,0,0,0,1,0,1,0


В таблицах Х1 и Х3 колонки нумеруются по порядку. Чтобы не путаться, переименуем колонки Х1, добавив префикс 'Х1_'.

In [7]:
# Rename the 26 X1 columns: the first one is the user id, the remaining ones
# get an explicit 'X1_' prefix (X1_1 .. X1_25).
x1_column_names = ['id'] + ['X1_' + str(i) for i in range(1, 26)]
X1_train.columns = list(x1_column_names)
X1_test.columns = list(x1_column_names)

In [8]:
# In this notebook 'id' is also used as a feature, because some features
# correlate strongly with it. The id becomes the index and is appended again
# as the last column.
X1_train = X1_train.set_index('id').assign(id=lambda frame: frame.index)
X1_test = X1_test.set_index('id').assign(id=lambda frame: frame.index)

Анализ показал, что распределение некоторых количественных переменных в Х1 асимметрично. Исправим это.

In [9]:
# Reduce the skew of selected quantitative X1 columns with a shifted logarithm.
# The same transform is applied to train and test.

# Columns transformed with an offset of 2.
skewed_col = ['X1_4', 'X1_5', 'X1_6', 'X1_7', 'X1_9']
for c in skewed_col:
    X1_train[c] = np.log(X1_train[c] + 2)
    X1_test[c] = np.log(X1_test[c] + 2)

# X1_8 uses a smaller offset of 0.5. This reassignment used to sit inside the
# loop body above and only worked because the running `for` kept iterating
# over the original list object.
skewed_col = ['X1_8']
for c in skewed_col:
    X1_train[c] = np.log(X1_train[c] + 0.5)
    X1_test[c] = np.log(X1_test[c] + 0.5)

Сразу пометим категориальные переменные, чтобы использовать их в бустингах. В последних решениях все данные были собраны в разреженную матрицу и необходимость в выделении категориальных фич отпала.

In [10]:
# X1 columns treated as categorical, and their positional indices in X1_train
# (names missing from the frame are skipped).
cat_cols = [
    'X1_1', 'X1_2', 'X1_3',
    'X1_10', 'X1_11', 'X1_12', 'X1_13', 'X1_15', 'X1_16', 'X1_17', 'X1_18',
    'X1_20', 'X1_21', 'X1_22', 'X1_23', 'X1_24', 'X1_25',
]
cat_feat = [X1_train.columns.get_loc(name) for name in cat_cols if name in X1_train.columns]

Сначала долго работала с Х3, т.к. она уже в виде таблицы. Но по Х3 идет сильное переобучение, выделить полезный сигнал очень трудно. В телеграме упомянули, что Х3 строится по Х2. А Х2 представляет собой посещаемые пользователем сайты.

Сразу вспомнилось решение InClass соревнования с Kaggle про идентификацию пользователей сайта. Поскольку то решение до сих пор держится в топе, использовать Tfidf вместо того, чтобы работать с Х2 как с матрицей, показалось хорошим выбором.

In [11]:
# Show the first rows of X2_train (one row per id / A pair).
X2_train.head()

Unnamed: 0,id,A
0,3,5
1,3,70340
2,3,72868
3,3,73471
4,3,74998


Переводим числа в текстовую строку для каждого пользователя для дальнейшего анализа Tfidf.

In [12]:
# Collapse X2 to one row per user: all values of column A for an id are joined
# into a single space-separated string, ready for TF-IDF.
def _ids_to_text(values):
    return ' '.join(str(v) for v in values)


df_train = X2_train.groupby('id').agg({'A': _ids_to_text})
df_train.columns = ['A']
df_train.head()

Unnamed: 0_level_0,A
id,Unnamed: 1_level_1
3,5 70340 72868 73471 74998 76085 76344 77490 68...
5,52464 53049 55398 63794 63865 146853 124018 15...
6,109524 108377 108057 107971 105711 107366 1069...
8,76104 108776 82494 106467 189148 172358 130830...
10,128427 133569 132866 132460 131402 130676 1287...


In [13]:
# Same per-user text representation for the test part of X2.
df_test = X2_test.groupby('id').agg({'A': lambda ids: ' '.join(str(v) for v in ids)})
df_test.columns = ['A']
df_test.head()

Unnamed: 0_level_0,A
id,Unnamed: 1_level_1
0,3516 159588 146272 139933 139877 135954 133247...
1,81239 85770 81928 82168 83379 83461 83805 8682...
2,90856 43470 29426 16362 14919 13951 27987
4,78992 80049 83151 83345 89434 89678 90936 9312...
7,93635 89074 88919 83782 86227 82642 93714 8749...


Включаем ngram(от 1, 9) (1,5 тоже хорошо работает) и ограничиваем фичи до 50000. Для разных 'y' эти числа разные, но такая подгонка только ухудшала результат, поэтому взяла максимум (выбранный для у4).

In [14]:
# TF-IDF over the per-user id strings: word n-grams of length 1..9, vocabulary
# capped at 50000 terms, sublinear (1 + log) term-frequency scaling.
# NOTE(review): with the default token_pattern only tokens of two or more
# characters are kept, so single-digit ids are presumably dropped — confirm
# that this is intended.
tfidfn = TfidfVectorizer(
    ngram_range=(1, 9),
    max_features=50000,
    sublinear_tf=True,
)

In [15]:
# Learn the vocabulary and IDF weights on train only, then apply them to test.
X_train_sparse = tfidfn.fit_transform(df_train['A'])
X_test_sparse = tfidfn.transform(df_test['A'])

Сравнение необработанных и обработанных колонок из Х1 показало, что лучше использовать StandardScaler. Объединим обе таблицы в одну матрицу.

In [16]:
def _stack_csr(x1_block, tfidf_block):
    """Place the X1 block to the left of the TF-IDF block and return CSR."""
    return csr_matrix(hstack([x1_block, tfidf_block]))


# Standardise X1; the scaling statistics come from the training part only.
scaler = StandardScaler()
tmp_train = scaler.fit_transform(X1_train)
tmp_test = scaler.transform(X1_test)

# NOTE(review): hstack concatenates row by row without aligning on the index,
# so this assumes X1 and the grouped X2 frames list the ids in the same order
# — confirm.

# Scaled X1 + TF-IDF: the matrices the models are trained on.
X_train = _stack_csr(tmp_train, X_train_sparse)
X_test = _stack_csr(tmp_test, X_test_sparse)

# Unscaled X1 + TF-IDF variants.
X_train_ = _stack_csr(X1_train, X_train_sparse)
X_test_ = _stack_csr(X1_test, X_test_sparse)

### Валидация

У меня был опыт участия в подобном соревновании (поведение людей, ROC AUC ~ 0.6). Сложная схема валидации создавала ложную уверенность, что все под контролем. Как результат - падение на 250 мест из топа. В телеграме дискутировали о том, что надо использовать еще более сложную схему валидации. Но, по моему мнению, это похоже на "колосса на глиняных ногах".

Для кроссвалидации были проверены все валидаторы из sklearn.model_selection. Для них рассчитывались mean и std по одному набору данных и сравнивались с лидербордом. У ShuffleSplit было самое низкое стандартное отклонение. Остальные немного завышали скор, и он очень сильно варьировался по фолдам.

Для проверки качества моделей использовались три модели (logit, lgbc, catc) с зафиксированными настройками. В cross_val_score менялась проверяемая модель и менялся 'у' из Y. Если значение cross_val_score было ниже 0,59 - модель считалась плохой.

In [17]:
# Validation splitter: 5 random 70/30 splits.
ss = ShuffleSplit(n_splits=5, test_size=0.3, random_state=42)

# Three reference models with fixed settings, used only to score feature sets.
logit = LogisticRegression(
    random_state=42,
    C=0.1,
    solver='lbfgs',
    max_iter=100,
    class_weight='balanced',
    penalty='l2',
)
lgbc = LGBMClassifier(
    objective='binary',
    metric='auc',
    random_state=42,
    boosting_type='gbdt',
    learning_rate=0.01,
    max_depth=2,
    num_leaves=2,
    n_estimators=1000,
    subsample=0.9,
    subsample_freq=1,
    colsample_bytree=0.9,
)
catc = CatBoostClassifier(
    loss_function='Logloss',
    eval_metric='AUC',
    random_state=42,
    logging_level='Silent',
    task_type='GPU',
    leaf_estimation_iterations=1,
    depth=2,
    l2_leaf_reg=1.0,
    learning_rate=0.01,
    bagging_temperature=0.95,
    iterations=1000,
)

In [18]:
# During validation the target was switched between Y['1'] .. Y['5'] ...
y_train = Y['1']
# ... and the estimator between logit, lgbc and catc.
scores = cross_val_score(logit, X_train, y_train, cv=ss, scoring='roc_auc')

# Report mean, worst split and spread of the ROC AUC over the splits.
score_mean, score_min, score_std = scores.mean(), scores.min(), scores.std()
print(score_mean.round(5), score_min.round(5), score_std.round(3))

0.59464 0.57009 0.017


### Предсказание

Первоначально параметры моделей подбирались hyperopt, но в ходе экспериментирования они так много раз менялись, что от первоначального варианта не осталось ничего.

Первоначально для каждого y выбирался собственный датасет, но как оказалось - это вело к переобучению под тренировочные данные. Поэтому все предсказания делались по общему датасету, но с индивидуальными параметрами моделей. 

Для логистической регрессии ограничение размерности с TruncatedSVD давало положительный результат. Модели деревьев наоборот лучше работали с полным набором данных.

Для всех моделей предсказание делалось по 5 фолдам. В предсказании логистической регрессии использовалась валидационная выборка.

In [19]:
# The logistic-regression settings are the same for all five targets.
logit = LogisticRegression(
    random_state=42,
    C=1.0,
    solver='lbfgs',
    max_iter=100,
    penalty='l2',
)

#### Предсказание для у1

In [20]:
# Logistic regression for target y1, trained on a 200-component SVD projection
# of the stacked feature matrix.
y_train = Y['1']

svd = TruncatedSVD(n_components=200, algorithm='randomized', random_state=42)
svd.fit(X_train)
X_svd_train, X_svd_test = svd.transform(X_train), svd.transform(X_test)

num_folds = 5
folds = KFold(n_splits=num_folds, shuffle=True, random_state=22)
oof = np.zeros(X_svd_train.shape[0])   # out-of-fold predictions on train
y_1 = np.zeros(X_svd_test.shape[0])    # test prediction averaged over the folds

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_svd_train, y_train)):
    fold_no = fold_ + 1
    print("fold n°{}".format(fold_no))

    # The shared `logit` estimator is refitted on every fold.
    clf = logit.fit(X_svd_train[trn_idx], y_train[trn_idx])

    val_pred = clf.predict_proba(X_svd_train[val_idx])[:, 1]
    oof[val_idx] = val_pred
    print('Fold %2d roc_auc : %.6f' % (fold_no, roc_auc_score(y_train[val_idx], val_pred)))

    y_1 += clf.predict_proba(X_svd_test)[:, 1] / folds.n_splits

print(f"Out of folds AUC = {roc_auc_score(y_train, oof)}")

fold n°1
Fold  1 roc_auc : 0.610088
fold n°2
Fold  2 roc_auc : 0.636057
fold n°3
Fold  3 roc_auc : 0.575049
fold n°4
Fold  4 roc_auc : 0.636043
fold n°5
Fold  5 roc_auc : 0.610719
Out of folds AUC = 0.6123495208753538


In [21]:
# LightGBM for target y1: 5-fold CV with early stopping; the test prediction
# a1 is the average over the five fold models.
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'colsample_bytree': 0.3,
    'subsample_freq': 1,
    'subsample': 0.6,
    'learning_rate': 0.01,
    'num_leaves': 3,
}
y_train = Y['1']

num_folds = 5
folds = KFold(n_splits=num_folds, shuffle=True, random_state=42)
a1 = np.zeros(X_test.shape[0])

# Split the matrix that is actually trained on (X_train, not X_train_).
for fold_n, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print('Fold', fold_n + 1)
    X_trn, X_val = X_train[trn_idx], X_train[val_idx]
    # KFold yields row positions, so labels are selected by position.
    y_trn, y_val = y_train.iloc[trn_idx], y_train.iloc[val_idx]

    train_data = lgb.Dataset(X_trn, label=y_trn)
    valid_data = lgb.Dataset(X_val, label=y_val)

    # Both sets are passed so that AUC is logged for train and for the held-out fold.
    clf = lgb.train(params, train_data, num_boost_round=10000, valid_sets=[train_data, valid_data],
                    verbose_eval=500, early_stopping_rounds=100)

    a1 += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits

Fold 1
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[285]	training's auc: 0.703371	valid_1's auc: 0.62727
Fold 2
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[23]	training's auc: 0.653468	valid_1's auc: 0.587949
Fold 3
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[187]	training's auc: 0.682482	valid_1's auc: 0.582392
Fold 4
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[293]	training's auc: 0.700035	valid_1's auc: 0.640453
Fold 5
Training until validation scores don't improve for 100 rounds.
[500]	training's auc: 0.725319	valid_1's auc: 0.586836
[1000]	training's auc: 0.77259	valid_1's auc: 0.593071
Early stopping, best iteration is:
[903]	training's auc: 0.764247	valid_1's auc: 0.593451


In [22]:
%%time
# CatBoost for target y1: 5-fold CV with early stopping; the test prediction
# b1 is the average over the five fold models.
# NOTE(review): 'iterations': 400 and num_boost_round=6000 are both given; the
# logged remaining-time estimate suggests that 6000 is the value in effect —
# confirm.
params = {
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'logging_level': 'Silent',
    'task_type': 'GPU',
    'leaf_estimation_iterations': 2,
    'boosting_type': 'Ordered',
    'iterations': 400,
    'depth': 2,
    'learning_rate': 0.02,
}

y_train = Y['1']

num_folds = 5
folds = KFold(n_splits=num_folds, shuffle=True, random_state=42)
b1 = np.zeros(X_test.shape[0])

# Split the matrix that is actually trained on (X_train, not X_train_).
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("fold n°{}".format(fold_ + 1))
    # KFold yields row positions, so labels are selected by position.
    trn_data = Pool(X_train[trn_idx], label=y_train.iloc[trn_idx])
    val_data = Pool(X_train[val_idx], label=y_train.iloc[val_idx])

    clf = catboost.train(params=params, pool=trn_data, eval_set=val_data, num_boost_round=6000,
                         early_stopping_rounds=100, verbose_eval=1000)

    # Prediction: probability of the positive class, averaged over the folds.
    b1 += clf.predict(X_test, prediction_type='Probability')[:, 1] / folds.n_splits

fold n°1
0:	learn: 0.5033431	test: 0.4973183	best: 0.4973183 (0)	total: 34.2ms	remaining: 3m 25s
bestTest = 0.5882296562
bestIteration = 211
Shrink model to first 212 iterations.
fold n°2
0:	learn: 0.5027976	test: 0.4968032	best: 0.4968032 (0)	total: 27.1ms	remaining: 2m 42s
bestTest = 0.6009210944
bestIteration = 69
Shrink model to first 70 iterations.
fold n°3
0:	learn: 0.5039889	test: 0.5017366	best: 0.5017366 (0)	total: 32.7ms	remaining: 3m 15s
bestTest = 0.6455945075
bestIteration = 152
Shrink model to first 153 iterations.
fold n°4
0:	learn: 0.5058059	test: 0.5052574	best: 0.5052574 (0)	total: 32.2ms	remaining: 3m 12s
bestTest = 0.59613958
bestIteration = 160
Shrink model to first 161 iterations.
fold n°5
0:	learn: 0.5034834	test: 0.4985459	best: 0.4985459 (0)	total: 24.9ms	remaining: 2m 29s
bestTest = 0.5894202888
bestIteration = 71
Shrink model to first 72 iterations.
CPU times: user 1min 14s, sys: 17.6 s, total: 1min 31s
Wall time: 1min 10s


#### Предсказание для у2

In [23]:
# Logistic regression for target y2, trained on a 200-component SVD projection
# of the stacked feature matrix.
y_train = Y['2']

svd = TruncatedSVD(n_components=200, algorithm='randomized', random_state=42)
svd.fit(X_train)
X_svd_train, X_svd_test = svd.transform(X_train), svd.transform(X_test)

num_folds = 5
folds = KFold(n_splits=num_folds, shuffle=True, random_state=22)
oof = np.zeros(X_svd_train.shape[0])   # out-of-fold predictions on train
y_2 = np.zeros(X_svd_test.shape[0])    # test prediction averaged over the folds

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_svd_train, y_train)):
    fold_no = fold_ + 1
    print("fold n°{}".format(fold_no))

    # The shared `logit` estimator is refitted on every fold.
    clf = logit.fit(X_svd_train[trn_idx], y_train[trn_idx])

    val_pred = clf.predict_proba(X_svd_train[val_idx])[:, 1]
    oof[val_idx] = val_pred
    print('Fold %2d roc_auc : %.6f' % (fold_no, roc_auc_score(y_train[val_idx], val_pred)))

    y_2 += clf.predict_proba(X_svd_test)[:, 1] / folds.n_splits

print(f"Out of folds AUC = {roc_auc_score(y_train, oof)}")

fold n°1
Fold  1 roc_auc : 0.631425
fold n°2
Fold  2 roc_auc : 0.625044
fold n°3
Fold  3 roc_auc : 0.620593
fold n°4
Fold  4 roc_auc : 0.632084
fold n°5
Fold  5 roc_auc : 0.620413
Out of folds AUC = 0.6253983034777513


In [24]:
# LightGBM for target y2: 5-fold CV with early stopping; the test prediction
# a2 is the average over the five fold models.
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'colsample_bytree': 0.3,
    'subsample_freq': 1,
    'subsample': 0.6,
    'learning_rate': 0.01,
    'num_leaves': 3,
}
y_train = Y['2']

num_folds = 5
folds = KFold(n_splits=num_folds, shuffle=True, random_state=42)
a2 = np.zeros(X_test.shape[0])

# Split the matrix that is actually trained on (X_train, not X_train_).
for fold_n, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print('Fold', fold_n + 1)
    X_trn, X_val = X_train[trn_idx], X_train[val_idx]
    # KFold yields row positions, so labels are selected by position.
    y_trn, y_val = y_train.iloc[trn_idx], y_train.iloc[val_idx]

    train_data = lgb.Dataset(X_trn, label=y_trn)
    valid_data = lgb.Dataset(X_val, label=y_val)

    # Both sets are passed so that AUC is logged for train and for the held-out fold.
    clf = lgb.train(params, train_data, num_boost_round=10000, valid_sets=[train_data, valid_data],
                    verbose_eval=500, early_stopping_rounds=100)

    a2 += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits

Fold 1
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[107]	training's auc: 0.676414	valid_1's auc: 0.609281
Fold 2
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[23]	training's auc: 0.655692	valid_1's auc: 0.629752
Fold 3
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[27]	training's auc: 0.647806	valid_1's auc: 0.633824
Fold 4
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[31]	training's auc: 0.651137	valid_1's auc: 0.602305
Fold 5
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[161]	training's auc: 0.673461	valid_1's auc: 0.611579


In [25]:
%%time
# CatBoost for target y2: 5-fold CV with early stopping; the test prediction
# b2 is the average over the five fold models.
# NOTE(review): 'iterations': 400 and num_boost_round=6000 are both given; the
# logged remaining-time estimate suggests that 6000 is the value in effect —
# confirm.
params = {
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'logging_level': 'Silent',
    'task_type': 'GPU',
    'leaf_estimation_iterations': 1,
    'boosting_type': 'Ordered',
    'iterations': 400,
    'depth': 2,
    'learning_rate': 0.02,
    'l2_leaf_reg': 5.5,
}

y_train = Y['2']

num_folds = 5
folds = KFold(n_splits=num_folds, shuffle=True, random_state=42)
b2 = np.zeros(X_test.shape[0])

# Split the matrix that is actually trained on (X_train, not X_train_).
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("fold n°{}".format(fold_ + 1))
    # KFold yields row positions, so labels are selected by position.
    trn_data = Pool(X_train[trn_idx], label=y_train.iloc[trn_idx])
    val_data = Pool(X_train[val_idx], label=y_train.iloc[val_idx])

    clf = catboost.train(params=params, pool=trn_data, eval_set=val_data, num_boost_round=6000,
                         early_stopping_rounds=100, verbose_eval=1000)

    # Prediction: probability of the positive class, averaged over the folds.
    b2 += clf.predict(X_test, prediction_type='Probability')[:, 1] / folds.n_splits

fold n°1
0:	learn: 0.5274521	test: 0.5035906	best: 0.5035906 (0)	total: 28.9ms	remaining: 2m 53s
bestTest = 0.5785573721
bestIteration = 113
Shrink model to first 114 iterations.
fold n°2
0:	learn: 0.5269319	test: 0.5108259	best: 0.5108259 (0)	total: 26.3ms	remaining: 2m 37s
bestTest = 0.5939709842
bestIteration = 141
Shrink model to first 142 iterations.
fold n°3
0:	learn: 0.5181474	test: 0.5254061	best: 0.5254061 (0)	total: 27.5ms	remaining: 2m 45s
bestTest = 0.6072658002
bestIteration = 392
Shrink model to first 393 iterations.
fold n°4
0:	learn: 0.5091484	test: 0.4987612	best: 0.4987612 (0)	total: 26.3ms	remaining: 2m 37s
bestTest = 0.608835578
bestIteration = 168
Shrink model to first 169 iterations.
fold n°5
0:	learn: 0.5215309	test: 0.5244653	best: 0.5244653 (0)	total: 28.2ms	remaining: 2m 49s
bestTest = 0.6130445302
bestIteration = 13
Shrink model to first 14 iterations.
CPU times: user 1min 16s, sys: 18.6 s, total: 1min 34s
Wall time: 1min 10s


#### Предсказание для у3

In [26]:
# Logistic regression for target y3, trained on a 200-component SVD projection
# of the stacked feature matrix.
y_train = Y['3']

svd = TruncatedSVD(n_components=200, algorithm='randomized', random_state=42)
svd.fit(X_train)
X_svd_train, X_svd_test = svd.transform(X_train), svd.transform(X_test)

num_folds = 5
folds = KFold(n_splits=num_folds, shuffle=True, random_state=22)
oof = np.zeros(X_svd_train.shape[0])   # out-of-fold predictions on train
y_3 = np.zeros(X_svd_test.shape[0])    # test prediction averaged over the folds

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_svd_train, y_train)):
    fold_no = fold_ + 1
    print("fold n°{}".format(fold_no))

    # The shared `logit` estimator is refitted on every fold.
    clf = logit.fit(X_svd_train[trn_idx], y_train[trn_idx])

    val_pred = clf.predict_proba(X_svd_train[val_idx])[:, 1]
    oof[val_idx] = val_pred
    print('Fold %2d roc_auc : %.6f' % (fold_no, roc_auc_score(y_train[val_idx], val_pred)))

    y_3 += clf.predict_proba(X_svd_test)[:, 1] / folds.n_splits

print(f"Out of folds AUC = {roc_auc_score(y_train, oof)}")

fold n°1
Fold  1 roc_auc : 0.597058
fold n°2
Fold  2 roc_auc : 0.608889
fold n°3
Fold  3 roc_auc : 0.649703
fold n°4
Fold  4 roc_auc : 0.612179
fold n°5
Fold  5 roc_auc : 0.608556
Out of folds AUC = 0.6148429534774495


In [27]:
# LightGBM for target y3: 5-fold CV with early stopping; the test prediction
# a3 is the average over the five fold models.
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'colsample_bytree': 0.5,
    'subsample_freq': 1,
    'subsample': 0.6,
    'learning_rate': 0.005,
    'num_leaves': 3,
}
y_train = Y['3']

num_folds = 5
folds = KFold(n_splits=num_folds, shuffle=True, random_state=42)
a3 = np.zeros(X_test.shape[0])

# Split the matrix that is actually trained on (X_train, not X_train_).
for fold_n, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print('Fold', fold_n + 1)
    X_trn, X_val = X_train[trn_idx], X_train[val_idx]
    # KFold yields row positions, so labels are selected by position.
    y_trn, y_val = y_train.iloc[trn_idx], y_train.iloc[val_idx]

    train_data = lgb.Dataset(X_trn, label=y_trn)
    valid_data = lgb.Dataset(X_val, label=y_val)

    # Both sets are passed so that AUC is logged for train and for the held-out fold.
    clf = lgb.train(params, train_data, num_boost_round=10000, valid_sets=[train_data, valid_data],
                    verbose_eval=500, early_stopping_rounds=100)

    a3 += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits

Fold 1
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[13]	training's auc: 0.643919	valid_1's auc: 0.598896
Fold 2
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[153]	training's auc: 0.668561	valid_1's auc: 0.637876
Fold 3
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[49]	training's auc: 0.660949	valid_1's auc: 0.665249
Fold 4
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[7]	training's auc: 0.637407	valid_1's auc: 0.60238
Fold 5
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[7]	training's auc: 0.630428	valid_1's auc: 0.611195


In [28]:
%%time
# CatBoost for target y3: 5-fold CV with early stopping; the test prediction
# b3 is the average over the five fold models.
# NOTE(review): 'iterations': 400 and num_boost_round=6000 are both given; the
# logged remaining-time estimate suggests that 6000 is the value in effect —
# confirm.
params = {
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'logging_level': 'Silent',
    'task_type': 'GPU',
    'leaf_estimation_iterations': 2,
    'boosting_type': 'Ordered',
    'iterations': 400,
    'depth': 1,
    'learning_rate': 0.02,
}

y_train = Y['3']

num_folds = 5
folds = KFold(n_splits=num_folds, shuffle=True, random_state=42)
b3 = np.zeros(X_test.shape[0])

# Split the matrix that is actually trained on (X_train, not X_train_).
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("fold n°{}".format(fold_ + 1))
    # KFold yields row positions, so labels are selected by position.
    trn_data = Pool(X_train[trn_idx], label=y_train.iloc[trn_idx])
    val_data = Pool(X_train[val_idx], label=y_train.iloc[val_idx])

    clf = catboost.train(params=params, pool=trn_data, eval_set=val_data, num_boost_round=6000,
                         early_stopping_rounds=100, verbose_eval=1000)

    # Prediction: probability of the positive class, averaged over the folds.
    b3 += clf.predict(X_test, prediction_type='Probability')[:, 1] / folds.n_splits

fold n°1
0:	learn: 0.6071786	test: 0.5811117	best: 0.5811117 (0)	total: 19.1ms	remaining: 1m 54s
bestTest = 0.6045835614
bestIteration = 327
Shrink model to first 328 iterations.
fold n°2
0:	learn: 0.5061838	test: 0.5082209	best: 0.5082209 (0)	total: 19.4ms	remaining: 1m 56s
bestTest = 0.6266306788
bestIteration = 59
Shrink model to first 60 iterations.
fold n°3
0:	learn: 0.5035638	test: 0.5010340	best: 0.5010340 (0)	total: 18.8ms	remaining: 1m 52s
bestTest = 0.627628088
bestIteration = 136
Shrink model to first 137 iterations.
fold n°4
0:	learn: 0.5073498	test: 0.5039259	best: 0.5039259 (0)	total: 18.9ms	remaining: 1m 53s
bestTest = 0.6054557562
bestIteration = 60
Shrink model to first 61 iterations.
fold n°5
0:	learn: 0.5069071	test: 0.5051700	best: 0.5051700 (0)	total: 18.6ms	remaining: 1m 51s
bestTest = 0.6336921453
bestIteration = 167
Shrink model to first 168 iterations.
CPU times: user 1min 7s, sys: 15.2 s, total: 1min 22s
Wall time: 58.3 s


#### Предсказание для у4

In [29]:
# Logistic regression for target y4, trained on a 200-component SVD projection
# of the stacked feature matrix.
y_train = Y['4']

svd = TruncatedSVD(n_components=200, algorithm='randomized', random_state=42)
svd.fit(X_train)
X_svd_train, X_svd_test = svd.transform(X_train), svd.transform(X_test)

num_folds = 5
folds = KFold(n_splits=num_folds, shuffle=True, random_state=22)
oof = np.zeros(X_svd_train.shape[0])   # out-of-fold predictions on train
y_4 = np.zeros(X_svd_test.shape[0])    # test prediction averaged over the folds

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_svd_train, y_train)):
    fold_no = fold_ + 1
    print("fold n°{}".format(fold_no))

    # The shared `logit` estimator is refitted on every fold.
    clf = logit.fit(X_svd_train[trn_idx], y_train[trn_idx])

    val_pred = clf.predict_proba(X_svd_train[val_idx])[:, 1]
    oof[val_idx] = val_pred
    print('Fold %2d roc_auc : %.6f' % (fold_no, roc_auc_score(y_train[val_idx], val_pred)))

    y_4 += clf.predict_proba(X_svd_test)[:, 1] / folds.n_splits

print(f"Out of folds AUC = {roc_auc_score(y_train, oof)}")

fold n°1
Fold  1 roc_auc : 0.615891
fold n°2
Fold  2 roc_auc : 0.610505
fold n°3
Fold  3 roc_auc : 0.579463
fold n°4
Fold  4 roc_auc : 0.631003
fold n°5
Fold  5 roc_auc : 0.633643
Out of folds AUC = 0.6138930768424532


In [30]:
# LightGBM for target y4: 5-fold CV with early stopping; the test prediction
# a4 is the average over the five fold models.
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'colsample_bytree': 0.5,
    'subsample_freq': 1,
    'subsample': 0.6,
    'learning_rate': 0.005,
    'num_leaves': 3,
}
y_train = Y['4']

num_folds = 5
folds = KFold(n_splits=num_folds, shuffle=True, random_state=42)
a4 = np.zeros(X_test.shape[0])

# Split the matrix that is actually trained on (X_train, not X_train_).
for fold_n, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print('Fold', fold_n + 1)
    X_trn, X_val = X_train[trn_idx], X_train[val_idx]
    # KFold yields row positions, so labels are selected by position.
    y_trn, y_val = y_train.iloc[trn_idx], y_train.iloc[val_idx]

    train_data = lgb.Dataset(X_trn, label=y_trn)
    valid_data = lgb.Dataset(X_val, label=y_val)

    # Both sets are passed so that AUC is logged for train and for the held-out fold.
    clf = lgb.train(params, train_data, num_boost_round=10000, valid_sets=[train_data, valid_data],
                    verbose_eval=500, early_stopping_rounds=100)

    a4 += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits

Fold 1
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[19]	training's auc: 0.639623	valid_1's auc: 0.609542
Fold 2
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[203]	training's auc: 0.680526	valid_1's auc: 0.58404
Fold 3
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[122]	training's auc: 0.661725	valid_1's auc: 0.601385
Fold 4
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[20]	training's auc: 0.641033	valid_1's auc: 0.625136
Fold 5
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[20]	training's auc: 0.632939	valid_1's auc: 0.624458


In [31]:
%%time
# CatBoost for target y4: 5-fold CV with early stopping; the test prediction
# b4 is the average over the five fold models.
# NOTE(review): 'iterations': 400 and num_boost_round=6000 are both given; the
# logged remaining-time estimate suggests that 6000 is the value in effect —
# confirm.
params = {
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'logging_level': 'Silent',
    'task_type': 'GPU',
    'leaf_estimation_iterations': 2,
    'boosting_type': 'Ordered',
    'iterations': 400,
    'depth': 2,
    'learning_rate': 0.02,
}

y_train = Y['4']

num_folds = 5
folds = KFold(n_splits=num_folds, shuffle=True, random_state=42)
b4 = np.zeros(X_test.shape[0])

# Split the matrix that is actually trained on (X_train, not X_train_).
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("fold n°{}".format(fold_ + 1))
    # KFold yields row positions, so labels are selected by position.
    trn_data = Pool(X_train[trn_idx], label=y_train.iloc[trn_idx])
    val_data = Pool(X_train[val_idx], label=y_train.iloc[val_idx])

    clf = catboost.train(params=params, pool=trn_data, eval_set=val_data, num_boost_round=6000,
                         early_stopping_rounds=100, verbose_eval=1000)

    # Prediction: probability of the positive class, averaged over the folds.
    b4 += clf.predict(X_test, prediction_type='Probability')[:, 1] / folds.n_splits

fold n°1
0:	learn: 0.5081928	test: 0.5065751	best: 0.5065751 (0)	total: 27ms	remaining: 2m 41s
bestTest = 0.6127967238
bestIteration = 174
Shrink model to first 175 iterations.
fold n°2
0:	learn: 0.5868837	test: 0.5756368	best: 0.5756368 (0)	total: 23.5ms	remaining: 2m 20s
bestTest = 0.6110355258
bestIteration = 94
Shrink model to first 95 iterations.
fold n°3
0:	learn: 0.5065753	test: 0.5002002	best: 0.5002002 (0)	total: 28ms	remaining: 2m 48s
bestTest = 0.6446183026
bestIteration = 163
Shrink model to first 164 iterations.
fold n°4
0:	learn: 0.5095312	test: 0.5071666	best: 0.5071666 (0)	total: 26.2ms	remaining: 2m 37s
bestTest = 0.5655074567
bestIteration = 34
Shrink model to first 35 iterations.
fold n°5
0:	learn: 0.5063785	test: 0.4936390	best: 0.4936390 (0)	total: 24.1ms	remaining: 2m 24s
bestTest = 0.5965273678
bestIteration = 80
Shrink model to first 81 iterations.
CPU times: user 1min 8s, sys: 16.1 s, total: 1min 24s
Wall time: 1min 2s


#### Предсказание для у5

In [32]:
# Logistic regression for target y5, trained on a 200-component SVD projection
# of the stacked feature matrix.
y_train = Y['5']

svd = TruncatedSVD(n_components=200, algorithm='randomized', random_state=42)
svd.fit(X_train)
X_svd_train, X_svd_test = svd.transform(X_train), svd.transform(X_test)

num_folds = 5
folds = KFold(n_splits=num_folds, shuffle=True, random_state=22)
oof = np.zeros(X_svd_train.shape[0])   # out-of-fold predictions on train
y_5 = np.zeros(X_svd_test.shape[0])    # test prediction averaged over the folds

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_svd_train, y_train)):
    fold_no = fold_ + 1
    print("fold n°{}".format(fold_no))

    # The shared `logit` estimator is refitted on every fold.
    clf = logit.fit(X_svd_train[trn_idx], y_train[trn_idx])

    val_pred = clf.predict_proba(X_svd_train[val_idx])[:, 1]
    oof[val_idx] = val_pred
    print('Fold %2d roc_auc : %.6f' % (fold_no, roc_auc_score(y_train[val_idx], val_pred)))

    y_5 += clf.predict_proba(X_svd_test)[:, 1] / folds.n_splits

print(f"Out of folds AUC = {roc_auc_score(y_train, oof)}")

fold n°1
Fold  1 roc_auc : 0.603807
fold n°2
Fold  2 roc_auc : 0.623345
fold n°3
Fold  3 roc_auc : 0.601511
fold n°4
Fold  4 roc_auc : 0.626674
fold n°5
Fold  5 roc_auc : 0.627683
Out of folds AUC = 0.6151044034174086


In [33]:
# LightGBM for target y5: 5-fold CV with early stopping; the test prediction
# a5 is the average over the five fold models.
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'colsample_bytree': 0.3,
    'subsample_freq': 1,
    'subsample': 0.6,
    'learning_rate': 0.005,
    'num_leaves': 3,
}
y_train = Y['5']

num_folds = 5
folds = KFold(n_splits=num_folds, shuffle=True, random_state=42)
a5 = np.zeros(X_test.shape[0])

# Split the matrix that is actually trained on (X_train, not X_train_).
for fold_n, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print('Fold', fold_n + 1)
    X_trn, X_val = X_train[trn_idx], X_train[val_idx]
    # KFold yields row positions, so labels are selected by position.
    y_trn, y_val = y_train.iloc[trn_idx], y_train.iloc[val_idx]

    train_data = lgb.Dataset(X_trn, label=y_trn)
    valid_data = lgb.Dataset(X_val, label=y_val)

    # Both sets are passed so that AUC is logged for train and for the held-out fold.
    clf = lgb.train(params, train_data, num_boost_round=10000, valid_sets=[train_data, valid_data],
                    verbose_eval=500, early_stopping_rounds=100)

    a5 += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits

Fold 1
Training until validation scores don't improve for 100 rounds.
[500]	training's auc: 0.709824	valid_1's auc: 0.57637
Early stopping, best iteration is:
[558]	training's auc: 0.71475	valid_1's auc: 0.578007
Fold 2
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[112]	training's auc: 0.667469	valid_1's auc: 0.60082
Fold 3
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[67]	training's auc: 0.660459	valid_1's auc: 0.561186
Fold 4
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[33]	training's auc: 0.648601	valid_1's auc: 0.608009
Fold 5
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[45]	training's auc: 0.645858	valid_1's auc: 0.603124


In [34]:
%%time
# CatBoost for target y5: 5-fold CV with early stopping; the test prediction
# b5 is the average over the five fold models.
# NOTE(review): 'iterations': 400 and num_boost_round=6000 are both given; the
# logged remaining-time estimate suggests that 6000 is the value in effect —
# confirm.
params = {
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'logging_level': 'Silent',
    'task_type': 'GPU',
    'leaf_estimation_iterations': 2,
    'boosting_type': 'Ordered',
    'iterations': 400,
    'depth': 2,
    'learning_rate': 0.02,
}

y_train = Y['5']

num_folds = 5
folds = KFold(n_splits=num_folds, shuffle=True, random_state=42)
b5 = np.zeros(X_test.shape[0])

# Split the matrix that is actually trained on (X_train, not X_train_).
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("fold n°{}".format(fold_ + 1))
    # KFold yields row positions, so labels are selected by position.
    trn_data = Pool(X_train[trn_idx], label=y_train.iloc[trn_idx])
    val_data = Pool(X_train[val_idx], label=y_train.iloc[val_idx])

    clf = catboost.train(params=params, pool=trn_data, eval_set=val_data, num_boost_round=6000,
                         early_stopping_rounds=100, verbose_eval=1000)

    # Prediction: probability of the positive class, averaged over the folds.
    b5 += clf.predict(X_test, prediction_type='Probability')[:, 1] / folds.n_splits

fold n°1
0:	learn: 0.5071253	test: 0.4934080	best: 0.4934080 (0)	total: 31.2ms	remaining: 3m 7s
bestTest = 0.6124601364
bestIteration = 229
Shrink model to first 230 iterations.
fold n°2
0:	learn: 0.5053622	test: 0.5005680	best: 0.5005680 (0)	total: 27.9ms	remaining: 2m 47s
bestTest = 0.5744367242
bestIteration = 381
Shrink model to first 382 iterations.
fold n°3
0:	learn: 0.5077508	test: 0.5076533	best: 0.5076533 (0)	total: 24.9ms	remaining: 2m 29s
bestTest = 0.620056659
bestIteration = 114
Shrink model to first 115 iterations.
fold n°4
0:	learn: 0.5031999	test: 0.5009645	best: 0.5009645 (0)	total: 23.8ms	remaining: 2m 22s
bestTest = 0.6074842811
bestIteration = 32
Shrink model to first 33 iterations.
fold n°5
0:	learn: 0.5047716	test: 0.4988334	best: 0.4988334 (0)	total: 26.4ms	remaining: 2m 38s
bestTest = 0.5616071522
bestIteration = 102
Shrink model to first 103 iterations.
CPU times: user 1min 16s, sys: 17.5 s, total: 1min 34s
Wall time: 1min 9s


### Финальное решение 

Результатом модели является submission1. Не оставалось попыток, чтобы проверить, сколько наберет это решение. Поэтому, чтобы максимизировать пользу от последней попытки, этот сабмит было решено объединить с предыдущим лучшим решением.

In [35]:
# Blend the three models per target with a weighted geometric mean:
# exp(0.34*log(lgb) + w*log(logit) + 0.33*log(catboost)).
submission1 = pd.DataFrame(index=X1_test.index)

# target -> (LightGBM, logit, CatBoost predictions, logit weight)
# NOTE(review): for target '1' the logit weight is 0.32, so the weights sum to
# 0.99 instead of 1.00 — possibly a typo, kept as is.
blend_parts = {
    '1': (a1, y_1, b1, 0.32),
    '2': (a2, y_2, b2, 0.33),
    '3': (a3, y_3, b3, 0.33),
    '4': (a4, y_4, b4, 0.33),
    '5': (a5, y_5, b5, 0.33),
}
for target, (lgb_pred, logit_pred, cat_pred, logit_weight) in blend_parts.items():
    submission1[target] = np.exp(
        0.34*np.log(lgb_pred) + logit_weight*np.log(logit_pred) + 0.33*np.log(cat_pred)
    )
submission1.head()

Unnamed: 0_level_0,1,2,3,4,5
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.320297,0.342647,0.270872,0.304255,0.374553
1,0.338998,0.294868,0.28993,0.293493,0.422721
2,0.358539,0.401801,0.263127,0.312543,0.372728
4,0.273669,0.308732,0.296852,0.285238,0.431408
7,0.344934,0.328316,0.38612,0.364364,0.392184


In [36]:
# Write the five target columns, with the id index, to the submission file.
target_columns = ['1', '2', '3', '4', '5']
submission1[target_columns].to_csv('subm_34.csv', index=True)

Время работы менее 10 минут.