In [1]:
import pandas as pd
import numpy as np
#from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score
import pickle
from scipy.stats import ttest_rel
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from rgf.sklearn import RGFClassifier

In [5]:
# Коэффициент Gini
def GiniScore(y_actual, y_pred):
    return 2*roc_auc_score(y_actual, y_pred)-1

In [3]:
def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(len(series)))


def target_encode(trn_series=None,    # Revised to encode validation series
                  val_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
    """
    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series
    target : target data as a pd.Series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior
    """
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Compute target mean
    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Compute smoothing
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # Apply average function to all target data
    prior = target.mean()
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index
    ft_val_series = pd.merge(
        val_series.to_frame(val_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=val_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_val_series.index = val_series.index
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_tst_series.index = tst_series.index
    return add_noise(ft_trn_series, noise_level), add_noise(ft_val_series, noise_level), add_noise(ft_tst_series, noise_level)

# ===========KNN===============

In [3]:
# Считываем данные
# Сразу преобразовываем -1 в NA
df_train = pd.read_csv('/resources/data/driver/train.csv', index_col='id', na_values=-1)

# Тест сортируем по индексу
df_test = pd.read_csv('/resources/data/driver/test.csv', index_col='id', na_values=-1).sort_index()  

In [4]:
df_train['ps_car_14']=df_train[df_train['ps_car_14'].notnull()]['ps_car_14'].apply(lambda x: round(x*1000000))
df_train['ps_reg_03']=df_train[df_train['ps_reg_03'].notnull()]['ps_reg_03'].apply(lambda x: round(x*1000000))

df_test['ps_car_14']=df_test[df_test['ps_car_14'].notnull()]['ps_car_14'].apply(lambda x: round(x*1000000))
df_test['ps_reg_03']=df_test[df_test['ps_reg_03'].notnull()]['ps_reg_03'].apply(lambda x: round(x*1000000))

In [7]:
# 'ps_car_11' ps_car_12 содержит пять и одну записей с target = 0 в трейне. Удаляем их
df_train.dropna(subset=['ps_car_11'], inplace=True)
df_train.dropna(subset=['ps_car_12'], inplace=True)

# Преобразуем в год выпуска
df_train['ps_car_15'] = df_train['ps_car_15'].apply(lambda x: round(x**2))
df_train['ps_car_15'] = df_train['ps_car_15'].astype(np.int8)

df_test['ps_car_15'] = df_test['ps_car_15'].apply(lambda x: round(x**2))
df_test['ps_car_15'] = df_test['ps_car_15'].astype(np.int8)

In [8]:
# Разделение на категориальные и числовые (надо подумать)
num_cols = np.array(['ps_reg_01', 'ps_reg_02', 
                     'ps_car_12', 'ps_car_13', 
                     'ps_calc_01','ps_calc_02','ps_calc_03'])
cat_cols = np.array(list(set(df_test.columns.values.tolist()) - set(num_cols)))

In [9]:
%%time
### TRAIN ###
"""

"""
# Признаки с пропусками
lst = ['ps_ind_05_cat', 'ps_reg_03', 'ps_car_14',
       'ps_car_09_cat', 'ps_car_07_cat', 'ps_car_01_cat', 'ps_ind_02_cat',
       'ps_ind_04_cat', 'ps_car_02_cat', 'ps_car_05_cat', 'ps_car_03_cat']
# Для каждого признака
for i in lst:
    
    # Сортируем по заполняемому признаку, чтобы NA были в начале
    df_train.sort_values(by=i, na_position='first', inplace=True)

    # Значимые признаки по которым будем заполнять. Создаем отдельный датафрейм
    df_knn = df_train[['ps_car_13', 'ps_ind_03', 'ps_reg_01', 'ps_ind_15', i]]

    # Нормализуем тренировочный набор, целевую переменную не трогаем
    # Сначала обучаем скелер на всех train данных 
    scaler = StandardScaler()
    scaler.fit(df_knn[df_knn.columns[:4]])

    # Затем разбиваем на два датафрейма - с пропущенными и с заполненными значениями
    # Плюс целевую переменную делаем отдельно как У
    knn_x_train = df_knn.dropna(subset=[i]).drop([i], axis=1)
    knn_y_train = df_knn.dropna(subset=[i])[i]
    knn_x_test = df_knn[df_knn[i].isnull()].drop([i], axis=1)

    # Нормализуем данные обученным скелером
    knn_x_train = scaler.transform(knn_x_train)
    knn_x_test = scaler.transform(knn_x_test)

    # Прогнозируем пропущенные значения
    if i in num_cols:
        clf = KNeighborsRegressor(5, weights='distance')
    else:
        clf = KNeighborsClassifier(5, weights='distance')
    clf.fit(knn_x_train, knn_y_train)
    imputed_values = clf.predict(knn_x_test)

    # Объединяем спрогнозированные и остальные
    # И собираем таблицу в исходное состояние
    # Теперь у нас вместо пропущенных значений - спрогнозированные
    X2 = np.hstack((knn_x_test, imputed_values.reshape(-1,1)))
    X1 = np.hstack((knn_x_train, knn_y_train[:, np.newaxis]))
    df_knn = np.vstack((X2, X1))

    # Помещаем заполненный столбец в исходную таблицу
    # Меняем тип данных на нужный
    # И сортируем по индексу - Готово!
    df_train[i] = df_knn[:,-1]

df_train.sort_index(inplace=True)

### TEST ###
"""

"""
lst = ['ps_ind_05_cat', 'ps_reg_03', 'ps_car_14',
       'ps_car_09_cat', 'ps_car_07_cat', 'ps_car_01_cat', 'ps_ind_02_cat',
       'ps_ind_04_cat', 'ps_car_02_cat', 'ps_car_11', 'ps_car_05_cat', 'ps_car_03_cat']

for i in lst:
    # Сортируем по заполняемому признаку, чтобы NA были в верхней части таблицы
    df_test.sort_values(by=i, na_position='first', inplace=True)

    # Значимые признаки по которым будем заполнять. Создаем отдельную таблицу
    df_knn = df_test[['ps_car_13', 'ps_ind_03', 'ps_reg_01', 'ps_ind_15', i]]

    # Нормализуем тренировочный набор, целевую переменную не трогаем
    # Сначала обучаем скелер на всех данных 
    scaler = StandardScaler()
    scaler.fit(df_knn[df_knn.columns[:4]])

    # Затем разбиваем на две таблицы - с пропущенными и с заполненными значениями
    # Плюс целевую переменную делаем отдельно как У
    knn_x_train = df_knn.dropna(subset=[i]).drop([i], axis=1)
    knn_y_train = df_knn.dropna(subset=[i])[i]
    knn_x_test = df_knn[df_knn[i].isnull()].drop([i], axis=1)

    # Нормализуем данные обученным скелером
    knn_x_train = scaler.transform(knn_x_train)
    knn_x_test = scaler.transform(knn_x_test)

    # Прогнозируем пропущенные значения
    if i in num_cols:
        clf = KNeighborsRegressor(5, weights='distance')
    else:
        clf = KNeighborsClassifier(5, weights='distance')
    trained_model = clf.fit(knn_x_train, knn_y_train)
    imputed_values = trained_model.predict(knn_x_test)

    # Объединяем спрогнозированные и остальные
    # И собираем таблицу в исходное состояние
    # Теперь у нас вместо пропущенных значений - спрогнозированные
    X2 = np.hstack((knn_x_test, imputed_values.reshape(-1,1)))
    X1 = np.hstack((knn_x_train, knn_y_train[:, np.newaxis]))
    df_knn = np.vstack((X2, X1))

    # Помещаем заполненный столбец в исходную таблицу
    # Меняем тип данных на нужный
    # И сортируем по индексу - Готово!
    df_test[i] = df_knn[:,-1]

df_test.sort_index(inplace=True)

del df_knn, knn_x_train, knn_y_train, knn_x_test

CPU times: user 7min 44s, sys: 12.8 s, total: 7min 57s
Wall time: 7min 54s


In [10]:
df_train['ps_car_14']=df_train['ps_car_14'].apply(lambda x: x/1000000)
df_train['ps_reg_03']=df_train['ps_reg_03'].apply(lambda x: x/1000000)

df_test['ps_car_14']=df_test['ps_car_14'].apply(lambda x: x/1000000)
df_test['ps_reg_03']=df_test['ps_reg_03'].apply(lambda x: x/1000000)

In [12]:
# Разделение на категориальные и числовые 
num_cols = np.array(['ps_reg_01', 'ps_reg_02', 'ps_reg_03','ps_car_14', 
                     'ps_car_12', 'ps_car_13', 
                     'ps_calc_01','ps_calc_02','ps_calc_03'])
cat_cols = np.array(list(set(df_test.columns.values.tolist()) - set(num_cols)))

In [13]:
# Преобразование типов (после заполнения пропусков)
for c in num_cols:
    df_train[c] = df_train[c].astype(np.float32)
    df_test[c] = df_test[c].astype(np.float32)
    
for c in cat_cols:
    df_train[c] = df_train[c].astype(np.int8)
    df_test[c] = df_test[c].astype(np.int8)

In [14]:
# Оставляем хорошие признаки
good_col=['ps_ind_03',
'ps_car_13' ,           
'ps_ind_05_cat',        
'ps_reg_01',           
'ps_ind_17_bin',        
'ps_reg_02',           
'ps_ind_15',            
'ps_reg_03',           
'ps_car_01_cat',        
'ps_ind_06_bin',        
'ps_ind_01',            
'ps_car_14',           
'ps_car_15',           
'ps_ind_07_bin',        
'ps_car_07_cat',        
'ps_car_09_cat',       
'ps_ind_02_cat',        
'ps_car_04_cat',        
'ps_car_12',            
'ps_ind_16_bin',      
'ps_car_11',           
'ps_ind_09_bin',       
'ps_car_02_cat',       
'ps_car_06_cat',       
'ps_ind_04_cat',        
'ps_ind_08_bin',       
'ps_car_03_cat',
] 

In [15]:
df_train = df_train[good_col + ['target']]
df_test = df_test[good_col]

In [17]:
df_train.to_pickle('/resources/data/driver/train_knn_dec.pkl')
df_test.to_pickle('/resources/data/driver/test_knn_dec.pkl')

In [13]:
cat_features = [a for a in df_x_train.columns if a.endswith('cat')]

for column in cat_features:
    temp = pd.get_dummies(pd.Series(df_x_train[column]), prefix=column)
    df_x_train = pd.concat([df_x_train,temp],axis=1)
    df_x_train = df_x_train.drop([column],axis=1)
    
for column in cat_features:
    temp = pd.get_dummies(pd.Series(df_test[column]), prefix=column)
    df_test = pd.concat([df_test,temp],axis=1)
    df_test = df_test.drop([column],axis=1)

print(df_x_train.values.shape, df_test.values.shape)

(595206, 81) (892816, 81)


In [3]:
df_train = pd.read_pickle('/resources/data/driver/train_knn_dec.pkl')
df_x_train = df_train.drop('target', axis = 1)
df_y_train = df_train['target']    
del df_train

df_test = pd.read_pickle('/resources/data/driver/test_knn_dec.pkl')

In [4]:
cat_features = [a for a in df_x_train.columns if a.endswith('cat')]

for column in cat_features:
    temp = pd.get_dummies(pd.Series(df_x_train[column]), prefix=column)
    df_x_train = pd.concat([df_x_train,temp],axis=1)
    df_x_train = df_x_train.drop([column],axis=1)
    
for column in cat_features:
    temp = pd.get_dummies(pd.Series(df_test[column]), prefix=column)
    df_test = pd.concat([df_test,temp],axis=1)
    df_test = df_test.drop([column],axis=1)

print(df_x_train.values.shape, df_test.values.shape)

(595206, 81) (892816, 81)


In [5]:
# XGB_1 params
xgb_2_params = {}
xgb_2_params['max_depth'] = 4
xgb_2_params['learning_rate'] = 0.1
xgb_2_params['n_estimators'] = 400
xgb_2_params['subsample'] = 0.8
xgb_2_params['colsample_bytree'] = 0.8   
xgb_2_params['min_child_weight'] = 6
xgb_2_params['gamma']=10
xgb_2_params['reg_alpha'] = 8
xgb_2_params['reg_lambda'] = 1.3
xgb_2_params['seed'] = 1
xgb_2_params['n_jobs'] = -1

In [25]:
# XGB_2 params
xgb_2_params = {}
xgb_2_params['max_depth'] = 4
xgb_2_params['learning_rate'] = 0.1
xgb_2_params['n_estimators'] = 400
xgb_2_params['subsample'] = 0.8
xgb_2_params['colsample_bytree'] = 0.8   
xgb_2_params['min_child_weight'] = 6
xgb_2_params['gamma']=10
xgb_2_params['reg_alpha'] = 8
xgb_2_params['reg_lambda'] = 1.3
xgb_2_params['seed'] = 1
xgb_2_params['n_jobs'] = -1

In [None]:
%%time

# scores_1.bin - СКОР РАБОЧЕГО ДАТАСЕТА ПРИ max_depth=4, n_estimator=110 без Oversample
# with open('scores_1.bin', 'rb') as f:
#      scores_1 = np.load(f)
        
scores_1 = np.array([])
scores_2 = np.array([])

# estimator = xgb.XGBClassifier(**xgb_1_params)
#estimator = CatBoostClassifier(depth=6, iterations=500, random_seed=1)
estimator = xgb.XGBClassifier(max_depth=4, n_estimator=110, seed=1, n_jobs=-1)

for i in range(10):
    
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)

    for train_index, test_index in cv.split(df_x_train, df_y_train):

        X_train, X_test = df_x_train.iloc[train_index, :], df_x_train.iloc[test_index, :]
        Y_train, Y_test = df_y_train.iloc[train_index], df_y_train.iloc[test_index]
        
        pos = pd.Series(Y_train == 1)
        # Add positive examples
        number = Y_train[Y_train==0].shape[0] // Y_train[Y_train==1].shape[0]
        X_train = X_train.append([X_train.loc[pos]]*number)
        Y_train = Y_train.append([Y_train.loc[pos]]*number)

        #Shuffle data
        idx = shuffle(np.arange(len(X_train)), random_state=i)
        X_train = X_train.iloc[idx]
        Y_train = Y_train.iloc[idx]


        estimator.fit(X_train, Y_train)
        pred = estimator.predict_proba(X_test)[:, 1]

        gini_sc = GiniScore(Y_test, pred)
        scores_1 = np.append(scores_1, gini_sc)  

        
#################################################
### ИЗМЕНЕНИЯ ВО ВТОРОЙ МОДЕЛИ
df_train = pd.read_pickle('/resources/data/driver/train_knn_dec.pkl')
df_x_train = df_train.drop('target', axis = 1)
df_y_train = df_train['target']    
del df_train

df_test = pd.read_pickle('/resources/data/driver/test_knn_dec.pkl')
#################################################
#estimator = xgb.XGBClassifier(max_depth=4, n_estimator=110, seed=1, n_jobs=-1)
#estimator = xgb.XGBClassifier(**xgb_2_params)
#estimator = CatBoostClassifier(depth=7, iterations=500, random_seed=1)
epoch = 0
pred_xgb = pd.DataFrame()

for i in range(10):
    
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=i)
    for train_index, test_index in cv.split(df_x_train, df_y_train):
        
        epoch = epoch+1
        
        X_train, X_test = df_x_train.iloc[train_index, :], df_x_train.iloc[test_index, :]
        Y_train, Y_test = df_y_train.iloc[train_index], df_y_train.iloc[test_index]
        
        ### Oversampling
        pos = pd.Series(Y_train == 1)
        number = Y_train[Y_train==0].shape[0] // Y_train[Y_train==1].shape[0] // 5
        X_train = X_train.append([X_train.loc[pos]]*number)
        Y_train = Y_train.append([Y_train.loc[pos]]*number)
        idx = shuffle(np.arange(len(X_train)), random_state=i)
        X_train = X_train.iloc[idx]
        Y_train = Y_train.iloc[idx]
        ### Oversampling
          
       
        estimator.fit(X_train, Y_train)
        pred = estimator.predict_proba(X_test)[:, 1]

        gini_sc = GiniScore(Y_test, pred)
        scores_2 = np.append(scores_2, gini_sc)
        print(gini_sc)
        
        pred_xgb['p_'+str(epoch)] = estimator.predict_proba(df_test)[:, 1]

# =================TARGET ENCODING===============

In [5]:
df_train = pd.read_pickle('/resources/data/driver/train_knn_dec.pkl')
df_x_train = df_train.drop('target', axis = 1)
df_y_train = df_train['target']    
del df_train

df_test = pd.read_pickle('/resources/data/driver/test_knn_dec.pkl')

###### Комбинации признаков
combs = [
    ('ps_reg_01', 'ps_car_02_cat'),  
    ('ps_reg_01', 'ps_car_04_cat'),
]

for n_c, (f1, f2) in enumerate(combs):
    name1 = f1 + "_plus_" + f2
    df_x_train[name1] = df_x_train[f1].apply(lambda x: str(x)) + "_" + df_x_train[f2].apply(lambda x: str(x))
    df_test[name1] = df_test[f1].apply(lambda x: str(x)) + "_" + df_test[f2].apply(lambda x: str(x))
    # Label Encode
    lbl = LabelEncoder()
    lbl.fit(list(df_x_train[name1].values) + list(df_test[name1].values))
    df_x_train[name1] = lbl.transform(list(df_x_train[name1].values))
    df_test[name1] = lbl.transform(list(df_test[name1].values))

    
####### Категориальные признаки для таргет кодирования
f_cats = [f for f in df_x_train.columns if "_cat" in f]

In [6]:
xgb_params = {}
xgb_params['max_depth'] = 4
xgb_params['learning_rate'] = 0.04
# xgb_params['n_estimators'] = 400
xgb_params['subsample'] = 0.8
xgb_params['colsample_bytree'] = 0.8
xgb_params['min_child_weight'] = 6
xgb_params['gamma']=11
xgb_params['reg_alpha'] = 11
xgb_params['reg_lambda'] = 1.4
xgb_params['seed'] = 1
xgb_params['n_jobs'] = -1

In [9]:
%%time
# OUTLIERS = False
# scores_out = np.array([])

SEEDS = 1
SPLITS = 5

scores_2 = np.array([])

for param in [450, 500]:
    
    S_train = np.zeros((len(df_x_train), SEEDS))
    S_test = np.zeros((len(df_test), SEEDS))   
    #estimator = LGBMClassifier(**lgb_params)
    #estimator = RGFClassifier(**rgf_params, learning_rate=param)
    estimator = xgb.XGBClassifier(**xgb_params, n_estimators=param)
 
    # сколько раз по фолдам
    for seed in range(SEEDS):
        
        S_test_i = np.zeros((len(df_test), SPLITS))
        cv = StratifiedKFold(n_splits=SPLITS, shuffle=True, random_state=seed)
        for i, (train_index, test_index) in enumerate(cv.split(df_x_train, df_y_train)):
           
            X_train, X_test = df_x_train.iloc[train_index, :].copy(), df_x_train.iloc[test_index, :].copy()
            Y_train, Y_test = df_y_train.iloc[train_index].copy(), df_y_train.iloc[test_index]
            # Для таргет кодирования
            X_sub = df_test.copy()

            ### OVERSAMPLING
            pos = pd.Series(Y_train == 1)
            number = Y_train[Y_train==0].shape[0] // Y_train[Y_train==1].shape[0] // 5
            X_train = X_train.append([X_train.loc[pos]]*number)
            Y_train = Y_train.append([Y_train.loc[pos]]*number)
            idx = shuffle(np.arange(len(X_train)), random_state=seed)
            X_train = X_train.iloc[idx]
            Y_train = Y_train.iloc[idx]
            ### OVERSAMPLING

            ### TARGET ENCODING
            for f in f_cats:
                X_train[f + "_avg"], X_test[f + "_avg"], X_sub[f + "_avg"] = target_encode(
                                                                trn_series=X_train[f],
                                                                val_series=X_test[f],
                                                                tst_series=X_sub[f],
                                                                target=Y_train,
                                                                min_samples_leaf=200,
                                                                smoothing=15,
                                                                noise_level=0)
            ### TARGET ENCODING    
            
            estimator.fit(X_train, Y_train)
            pred = estimator.predict_proba(X_test)[:, 1]
            
#             if OUTLIERS:
                
#                 gini_sc = GiniScore(Y_test, pred)
#                 scores_2 = np.append(scores_2, gini_sc)
#                 print(gini_sc)
                
#                 ############################# OUTLIERS REMOVE
#                 pred_class = estimator.predict(X_test)
            
#                 outliers = pd.DataFrame(index=Y_test.index, columns=['act', 'class', 'pred'])
#                 outliers['act'] = Y_test
#                 outliers['class'] = pred_class
#                 outliers['pred'] = pred

#                 # Ошибочные 1 (убираем выбросы)
#                 outliers = outliers.drop(outliers[(outliers['act'] < outliers['class']) & (outliers['pred'] > 
#                                 outliers['pred'][outliers['act'] < outliers['class']].quantile(0.95))].index)

#                 # Ошибочные 0 (убираем выбросы)
#                 outliers = outliers.drop(outliers[(outliers['act'] > outliers['class']) & (outliers['pred'] < 
#                                 outliers['pred'][outliers['act'] > outliers['class']].quantile(0.05))].index)

#                 gini_sc = GiniScore(outliers['act'], outliers['pred'])
#                 scores_out = np.append(scores_out, gini_sc)
#                 print(gini_sc)
#                 del outliers
#                 ############################### OUTLIERS REMOVE          


            gini_sc = GiniScore(Y_test, pred)
            scores_2 = np.append(scores_2, gini_sc)
            print(gini_sc)

#             S_train[test_index, seed] = pred
#             S_test_i[:, i] = estimator.predict_proba(X_sub)[:, 1]

            del X_test, X_train, Y_train, X_sub
        
#         S_test[:, seed] = np.mean(S_test_i, axis=1)

#     S_train = np.mean(S_train, axis=1)

0.292496783736
0.28771924788
0.283662199193
0.285594139112
0.282534472093
0.292711302101
0.288531441956
0.283141192201
0.28500435866
0.282611957544
CPU times: user 1h 32min 47s, sys: 57.2 s, total: 1h 33min 44s
Wall time: 15min 41s


In [10]:
np.mean(scores_2)

0.28369792499976493

In [10]:
# Сравниваем
# with open('scores_xgb_trees.bin', 'rb') as f:
#     scores_1 = np.load(f)

for i in range(0, len(scores_2), 5):
    print(np.mean(scores_2[i:i+5]))

#     print(ttest_rel(scores_2[i:i+30], scores_2[150:180]))
#     print(np.mean(scores_2[120:150]), np.mean(scores_2[150:180]))

0.286401368403
0.286400050492


In [15]:
# Сохраняем
with open('stack_xgb_trees_300_600.bin', 'wb') as f:
    np.save(f, scores_2)

In [62]:
# Среднее теста
pred_xgb['target'] = np.exp(np.mean(pred_xgb.applymap(lambda x: np.log(x)), axis=1))

# SUBMIT

In [12]:
df_submission = pd.read_csv('/resources/data/driver/sample_submission.csv').sort_values('id') 
df_submission['target'] = final
df_submission.to_csv('/resources/data/driver/submission.csv', index=False)
print(df_submission.shape)

(892816, 2)


In [None]:
# Сохраняем стек
with open('stack_xgb_trees_300_600.bin', 'wb') as f:
    np.save(f, scores_2)

# STACK

In [4]:
df_train = pd.read_pickle('/resources/data/driver/train_knn_dec.pkl')
df_x_train = df_train.drop('target', axis = 1)
df_y_train = df_train['target']    
del df_train

df_test = pd.read_pickle('/resources/data/driver/test_knn_dec.pkl')

###### Комбинации признаков
combs = [
    ('ps_reg_01', 'ps_car_02_cat'),  
    ('ps_reg_01', 'ps_car_04_cat'),
]

for n_c, (f1, f2) in enumerate(combs):
    name1 = f1 + "_plus_" + f2
    df_x_train[name1] = df_x_train[f1].apply(lambda x: str(x)) + "_" + df_x_train[f2].apply(lambda x: str(x))
    df_test[name1] = df_test[f1].apply(lambda x: str(x)) + "_" + df_test[f2].apply(lambda x: str(x))
    # Label Encode
    lbl = LabelEncoder()
    lbl.fit(list(df_x_train[name1].values) + list(df_test[name1].values))
    df_x_train[name1] = lbl.transform(list(df_x_train[name1].values))
    df_test[name1] = lbl.transform(list(df_test[name1].values))

    
####### Категориальные признаки для таргет кодирования
f_cats = [f for f in df_x_train.columns if "_cat" in f]

In [None]:
# 0.286263
cat_params1 = {}
cat_params1['depth'] = 7
cat_params1['learning_rate'] = 0.02
cat_params1['rsm'] = 0.8
cat_params1['iterations'] = 700
cat_params1['l2_leaf_reg'] = 18
cat_params1['random_seed'] = 1

# 0.286199
cat_params2 = {}
cat_params2['depth'] = 7
cat_params2['learning_rate'] = 0.02
cat_params2['rsm'] = 0.8
cat_params2['iterations'] = 750
cat_params2['l2_leaf_reg'] = 22
cat_params2['random_seed'] = 1

# 0.287700
lgb_params1 = {}
lgb_params1['n_estimators'] = 1000
lgb_params1['learning_rate'] = 0.02
lgb_params1['num_leaves'] = 16
lgb_params1['subsample'] = 0.7
lgb_params1['subsample_freq'] = 1
lgb_params1['colsample_bytree'] = 0.7
lgb_params1['reg_alpha'] = 18
lgb_params1['reg_lambda'] = 1.6
lgb_params1['random_state'] = 1

# 0.287167
lgb_params3 = {}
lgb_params3['n_estimators'] = 1150
lgb_params3['learning_rate'] = 0.02
lgb_params3['num_leaves'] = 20
lgb_params3['subsample'] = 0.7
lgb_params3['subsample_freq'] = 1
lgb_params3['colsample_bytree'] = 0.7
lgb_params3['reg_alpha'] = 18
lgb_params3['reg_lambda'] = 1.8
lgb_params3['random_state'] = 1

xgb_params1 = {}
xgb_params1['max_depth'] = 4
xgb_params1['learning_rate'] = 0.04
xgb_params1['n_estimators'] = 450
xgb_params1['subsample'] = 0.8
xgb_params1['colsample_bytree'] = 0.8
xgb_params1['min_child_weight'] = 6
xgb_params1['gamma']=11
xgb_params1['reg_alpha'] = 11
xgb_params1['reg_lambda'] = 1.4
xgb_params1['seed'] = 1
xgb_params1['n_jobs'] = -1

xgb_params2 = {}
xgb_params2['max_depth'] = 4
xgb_params2['learning_rate'] = 0.07
xgb_params2['n_estimators'] = 250
xgb_params2['subsample'] = 0.8
xgb_params2['colsample_bytree'] = 0.8
xgb_params2['min_child_weight'] = 6
xgb_params2['gamma']=10
xgb_params2['reg_alpha'] = 8
xgb_params2['reg_lambda'] = 1.3
xgb_params2['seed'] = 1
xgb_params2['n_jobs'] = -1

In [5]:
xgb_params1 = {}
xgb_params1['max_depth'] = 4
xgb_params1['learning_rate'] = 0.04
xgb_params1['n_estimators'] = 450
xgb_params1['subsample'] = 0.8
xgb_params1['colsample_bytree'] = 0.8
xgb_params1['min_child_weight'] = 6
xgb_params1['gamma']=11
xgb_params1['reg_alpha'] = 11
xgb_params1['reg_lambda'] = 1.4
xgb_params1['seed'] = 1
xgb_params1['n_jobs'] = -1

xgb_params2 = {}
xgb_params2['max_depth'] = 4
xgb_params2['learning_rate'] = 0.07
xgb_params2['n_estimators'] = 250
xgb_params2['subsample'] = 0.8
xgb_params2['colsample_bytree'] = 0.8
xgb_params2['min_child_weight'] = 6
xgb_params2['gamma']=10
xgb_params2['reg_alpha'] = 8
xgb_params2['reg_lambda'] = 1.3
xgb_params2['seed'] = 1
xgb_params2['n_jobs'] = -1

xgb_params3 = {}
xgb_params3['max_depth'] = 4
xgb_params3['learning_rate'] = 0.03
xgb_params3['n_estimators'] = 550
xgb_params3['subsample'] = 0.8
xgb_params3['colsample_bytree'] = 0.8
xgb_params3['min_child_weight'] = 6
xgb_params3['gamma'] = 13
xgb_params3['reg_alpha'] = 11
xgb_params3['reg_lambda'] = 1.5
xgb_params3['seed'] = 1
xgb_params3['n_jobs'] = -1

In [12]:
# 0.287700
lgb_params1 = {}
lgb_params1['n_estimators'] = 1000
lgb_params1['learning_rate'] = 0.02
lgb_params1['num_leaves'] = 16
lgb_params1['subsample'] = 0.7
lgb_params1['subsample_freq'] = 1
lgb_params1['colsample_bytree'] = 0.7
lgb_params1['reg_alpha'] = 18
lgb_params1['reg_lambda'] = 1.6
lgb_params1['random_state'] = 1

# 0.286942
lgb_params2 = {}
lgb_params2['n_estimators'] = 1100
lgb_params2['learning_rate'] = 0.02
lgb_params2['num_leaves'] = 20
lgb_params2['subsample'] = 0.7
lgb_params2['subsample_freq'] = 1
lgb_params2['colsample_bytree'] = 0.7
lgb_params2['reg_alpha'] = 19
lgb_params2['reg_lambda'] = 1.7
lgb_params2['random_state'] = 1

# 0.287167
lgb_params3 = {}
lgb_params3['n_estimators'] = 1150
lgb_params3['learning_rate'] = 0.02
lgb_params3['num_leaves'] = 20
lgb_params3['subsample'] = 0.7
lgb_params3['subsample_freq'] = 1
lgb_params3['colsample_bytree'] = 0.7
lgb_params3['reg_alpha'] = 18
lgb_params3['reg_lambda'] = 1.8
lgb_params3['random_state'] = 1

In [5]:
# 0.286263
cat_params1 = {}
cat_params1['depth'] = 7
cat_params1['learning_rate'] = 0.02
cat_params1['rsm'] = 0.8
cat_params1['iterations'] = 700
cat_params1['l2_leaf_reg'] = 18
cat_params1['random_seed'] = 1

# 0.286199
cat_params2 = {}
cat_params2['depth'] = 7
cat_params2['learning_rate'] = 0.02
cat_params2['rsm'] = 0.8
cat_params2['iterations'] = 750
cat_params2['l2_leaf_reg'] = 22
cat_params2['random_seed'] = 1

# 0.285511
cat_params3 = {}
cat_params3['depth'] = 7
cat_params3['learning_rate'] = 0.03
cat_params3['rsm'] = 0.7
cat_params3['iterations'] = 500
cat_params3['bagging_temperature'] = 1
cat_params3['l2_leaf_reg'] = 15
cat_params3['random_seed'] = 1

In [6]:
%%time
# OUTLIERS = False
# scores_out = np.array([])

# model1 = xgb.XGBClassifier(**xgb_params1)
# model2 = xgb.XGBClassifier(**xgb_params2)
# model3 = xgb.XGBClassifier(**xgb_params3)

# model1 = CatBoostClassifier(**cat_params1)
# model2 = CatBoostClassifier(**cat_params2)
# model3 = CatBoostClassifier(**cat_params3)

model1 = LGBMClassifier(**lgb_params1)
model2 = LGBMClassifier(**lgb_params3)
model3 = xgb.XGBClassifier(**xgb_params1)
model4 = xgb.XGBClassifier(**xgb_params2)
model5 = CatBoostClassifier(**cat_params1)
model6 = CatBoostClassifier(**cat_params2)

models = (model1, model2, model3, model4, model5, model6)

SPLITS = 5

S_train = np.zeros((len(df_x_train), len(models)))
S_test = np.zeros((len(df_test), len(models)))   
 
# сколько раз по фолдам
for j, clf in enumerate(models):

    estimator = clf
    S_test_i = np.zeros((len(df_test), SPLITS))
    cv = StratifiedKFold(n_splits=SPLITS, shuffle=True, random_state=0)
    scores_2 = np.array([])
    
    # Для каждой модели
    for i, (train_index, test_index) in enumerate(cv.split(df_x_train, df_y_train)):
           
        X_train, X_test = df_x_train.iloc[train_index, :].copy(), df_x_train.iloc[test_index, :].copy()
        Y_train, Y_test = df_y_train.iloc[train_index].copy(), df_y_train.iloc[test_index]
        # Для таргет кодирования
        X_sub = df_test.copy()

        ### OVERSAMPLING
        pos = pd.Series(Y_train == 1)
        number = Y_train[Y_train==0].shape[0] // Y_train[Y_train==1].shape[0] // 5
        X_train = X_train.append([X_train.loc[pos]]*number)
        Y_train = Y_train.append([Y_train.loc[pos]]*number)
        idx = shuffle(np.arange(len(X_train)), random_state=0)
        X_train = X_train.iloc[idx]
        Y_train = Y_train.iloc[idx]
        ### OVERSAMPLING

        ### TARGET ENCODING
        for f in f_cats:
            X_train[f + "_avg"], X_test[f + "_avg"], X_sub[f + "_avg"] = target_encode(
                                                                trn_series=X_train[f],
                                                                val_series=X_test[f],
                                                                tst_series=X_sub[f],
                                                                target=Y_train,
                                                                min_samples_leaf=200,
                                                                smoothing=15,
                                                                noise_level=0)
        ### TARGET ENCODING    
            
        estimator.fit(X_train, Y_train)
        pred = estimator.predict_proba(X_test)[:, 1]
     


        gini_sc = GiniScore(Y_test, pred)
        scores_2 = np.append(scores_2, gini_sc)
        print(gini_sc)

        S_train[test_index, j] = pred
        S_test_i[:, i] = estimator.predict_proba(X_sub)[:, 1]

        del X_test, X_train, Y_train, X_sub
        
    S_test[:, j] = np.exp(np.mean(np.log(S_test_i), axis=1))
    print(np.mean(scores_2))
# Мы получили три столбца предикта на трэйн и тест

# Предикт (не надо для второго уровня)
stacker = LogisticRegression()
stacker.fit(S_train, df_y_train)
final = stacker.predict_proba(S_test)[:,1]

0.292496783736
0.28771924788
0.283662199193
0.285594139112
0.282534472093
0.286401368403
0.291430328185
0.286239361502
0.280513404216
0.285975330709
0.282835860454
0.285398857014
0.292182987965
0.287575204218
0.283047680548
0.2865257133
0.28318801078
0.286503919362
CPU times: user 2h 20min 55s, sys: 1min 55s, total: 2h 22min 50s
Wall time: 34min 57s


In [7]:
S_test[:5,:]

array([[ 0.13729029,  0.13452692,  0.13357566],
       [ 0.14481544,  0.14039931,  0.14026862],
       [ 0.13018696,  0.13070774,  0.13242098],
       [ 0.0806234 ,  0.08367767,  0.08156432],
       [ 0.17964633,  0.18165595,  0.17904538]])

In [8]:
# три столбца полученные на стеке моделей 
val_train = pd.DataFrame(index=df_x_train.index, columns=['1', '2', '3'])
val_train['1'] = S_train[:, 0]
val_train['2'] = S_train[:, 1]
val_train['3'] = S_train[:, 2]

val_test = pd.DataFrame(index=df_test.index, columns=['1', '2', '3'])
val_test['1'] = S_test[:, 0]
val_test['2'] = S_test[:, 1]
val_test['3'] = S_test[:, 2]

val_train.to_csv('/resources/data/driver/res_cat_train.csv', index=False)
val_test.to_csv('/resources/data/driver/res_cat_test.csv', index=False)

In [9]:
df_x_train.shape

(595206, 29)

In [1]:
import operator

In [10]:
val_test.head()

Unnamed: 0_level_0,1,2,3
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.13729,0.134527,0.133576
1,0.144815,0.140399,0.140269
2,0.130187,0.130708,0.132421
3,0.080623,0.083678,0.081564
4,0.179646,0.181656,0.179045


In [1]:
import pandas as pd
import numpy as np
#from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score
import pickle
from scipy.stats import ttest_rel
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from rgf.sklearn import RGFClassifier

In [2]:
# Коэффициент Gini
def GiniScore(y_actual, y_pred):
    return 2*roc_auc_score(y_actual, y_pred)-1

In [3]:
val_train = pd.read_csv('/resources/data/driver/res_lgb_train.csv')
val_test = pd.read_csv('/resources/data/driver/res_lgb_test.csv')

In [3]:
val_train = pd.read_csv('/resources/data/driver/res_xgb_train.csv')
val_test = pd.read_csv('/resources/data/driver/res_xgb_test.csv')

In [3]:
val_train = pd.read_csv('/resources/data/driver/res_cat_train.csv')
val_test = pd.read_csv('/resources/data/driver/res_cat_test.csv')

In [4]:
df_train = pd.read_pickle('/resources/data/driver/train_knn_dec.pkl')
df_y_train = df_train['target']    
del df_train
df_y_train.reset_index(drop=True, inplace=True)

In [5]:
# Собираем предикт для каждого
SPLITS = 5
scores_2 = np.array([])

S_train = np.zeros((len(val_train), 1))
S_test = np.zeros((len(val_test), 1)) 
S_test_i = np.zeros((len(val_test), SPLITS))

estimator = LogisticRegression()

cv = StratifiedKFold(n_splits=SPLITS, shuffle=True, random_state=1)


for i, (train_index, test_index) in enumerate(cv.split(val_train, df_y_train)):
           
    X_train, X_test = val_train.iloc[train_index, :], val_train.iloc[test_index, :]
    Y_train, Y_test = df_y_train.iloc[train_index], df_y_train.iloc[test_index]


    ### OVERSAMPLING
    pos = pd.Series(Y_train == 1)
    number = Y_train[Y_train==0].shape[0] // Y_train[Y_train==1].shape[0] // 5
    X_train = X_train.append([X_train.loc[pos]]*number)
    Y_train = Y_train.append([Y_train.loc[pos]]*number)
    idx = shuffle(np.arange(len(X_train)), random_state=1)
    X_train = X_train.iloc[idx]
    Y_train = Y_train.iloc[idx]
    ### OVERSAMPLING
 
            
    estimator.fit(X_train, Y_train)
    pred = estimator.predict_proba(X_test)[:, 1]
    

    gini_sc = GiniScore(Y_test, pred)
    scores_2 = np.append(scores_2, gini_sc)
    print(gini_sc)

    S_train[test_index, 0] = pred
    S_test_i[:, i] = estimator.predict_proba(val_test)[:, 1]

    del X_test, X_train, Y_train
        
S_test[:,0] = np.exp(np.mean(np.log(S_test_i), axis=1))

# Сохраняем
with open('stack_train_3.bin', 'wb') as f:
    np.save(f, S_train[:,0])
with open('stack_test_3.bin', 'wb') as f:
    np.save(f, S_test[:,0])

0.292814099727
0.284917599661
0.276468064751
0.281757271965
0.297751463232


In [6]:
S_train[:,0]

array([ 0.21853407,  0.16603845,  0.13581187, ...,  0.1083093 ,
        0.14366838,  0.1130128 ])

In [7]:
df_train = pd.read_pickle('/resources/data/driver/train_knn_dec.pkl')
df_x_train = df_train.drop('target', axis = 1)
df_y_train = df_train['target']    
del df_train
df_y_train.reset_index
df_test = pd.read_pickle('/resources/data/driver/test_knn_dec.pkl')

## Для трех моделей##############
S_train_f = np.zeros((len(df_x_train), 3))
S_test_f = np.zeros((len(df_test), 3)) 

for i in range(3):
    # train
    with open('stack_train_'+str(i+1)+'.bin', 'rb') as f:
        S_train_f[:,i] = np.load(f)
    # test
    with open('stack_test_'+str(i+1)+'.bin', 'rb') as f:
        S_test_f[:,i] = np.load(f)

In [9]:
S_test_f

array([[ 0.13698623,  0.13428062,  0.13738251],
       [ 0.1326837 ,  0.13403583,  0.14237263],
       [ 0.12606727,  0.1311661 ,  0.13450562],
       ..., 
       [ 0.17128715,  0.17324402,  0.17677975],
       [ 0.12396437,  0.1256496 ,  0.12768861],
       [ 0.15664125,  0.15590674,  0.15544927]])

In [10]:
stacker = LogisticRegression()
stacker.fit(S_train_f, df_y_train.values)
final = stacker.predict_proba(S_test_f)[:,1] 

# ==========================================

In [11]:
final

array([ 0.02710455,  0.02641555,  0.02584769, ...,  0.03225679,
        0.02554802,  0.0301071 ])

In [3]:
class Ensemble(object):
    def __init__(self, n_splits, stacker, base_models):
        self.n_splits = n_splits
        self.stacker = stacker
        self.base_models = base_models

    def fit_predict(self, X, y, T):
        X = np.array(X)
        y = np.array(y)
        T = np.array(T)

        folds = list(StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=1).split(X, y))

        S_train = np.zeros((X.shape[0], len(self.base_models))) # Заготовка для предиктов на трейне
        S_test = np.zeros((T.shape[0], len(self.base_models))) # Заготовка для предиктов на тесте
        for i, clf in enumerate(self.base_models):

            S_test_i = np.zeros((T.shape[0], self.n_splits))

            for j, (train_idx, test_idx) in enumerate(folds):
                X_train = X[train_idx]
                y_train = y[train_idx]
                X_holdout = X[test_idx]


                print ("Fit %s fold %d" % (str(clf).split('(')[0], j+1))
                clf.fit(X_train, y_train)

                y_pred = clf.predict_proba(X_holdout)[:,1]                

                S_train[test_idx, i] = y_pred
                S_test_i[:, j] = clf.predict_proba(T)[:,1]
            S_test[:, i] = S_test_i.mean(axis=1)

        results = cross_val_score(self.stacker, S_train, y, cv=3, scoring='roc_auc')
        print("Stacker score: %.5f" % (results.mean()))

        self.stacker.fit(S_train, y)
        res = self.stacker.predict_proba(S_test)[:,1]
        return res

In [4]:
df_train = pd.read_pickle('/resources/data/driver/train_knn_dec.pkl')
df_x_train = df_train.drop('target', axis = 1)
df_y_train = df_train['target']    
del df_train

df_test = pd.read_pickle('/resources/data/driver/test_knn_dec.pkl')

###### Комбинации признаков
combs = [
    ('ps_reg_01', 'ps_car_02_cat'),  
    ('ps_reg_01', 'ps_car_04_cat'),
]

for n_c, (f1, f2) in enumerate(combs):
    name1 = f1 + "_plus_" + f2
    df_x_train[name1] = df_x_train[f1].apply(lambda x: str(x)) + "_" + df_x_train[f2].apply(lambda x: str(x))
    df_test[name1] = df_test[f1].apply(lambda x: str(x)) + "_" + df_test[f2].apply(lambda x: str(x))
    # Label Encode
    lbl = LabelEncoder()
    lbl.fit(list(df_x_train[name1].values) + list(df_test[name1].values))
    df_x_train[name1] = lbl.transform(list(df_x_train[name1].values))
    df_test[name1] = lbl.transform(list(df_test[name1].values))

    
####### Категориальные признаки для таргет кодирования
f_cats = [f for f in df_x_train.columns if "_cat" in f]

In [None]:
# 0.286263
cat_params1 = {}
cat_params1['depth'] = 7
cat_params1['learning_rate'] = 0.02
cat_params1['rsm'] = 0.8
cat_params1['iterations'] = 700
cat_params1['l2_leaf_reg'] = 18
cat_params1['random_seed'] = 1

# 0.286199
cat_params2 = {}
cat_params2['depth'] = 7
cat_params2['learning_rate'] = 0.02
cat_params2['rsm'] = 0.8
cat_params2['iterations'] = 750
cat_params2['l2_leaf_reg'] = 22
cat_params2['random_seed'] = 1

# 0.287700
lgb_params1 = {}
lgb_params1['n_estimators'] = 1000
lgb_params1['learning_rate'] = 0.02
lgb_params1['num_leaves'] = 16
lgb_params1['subsample'] = 0.7
lgb_params1['subsample_freq'] = 1
lgb_params1['colsample_bytree'] = 0.7
lgb_params1['reg_alpha'] = 18
lgb_params1['reg_lambda'] = 1.6
lgb_params1['random_state'] = 1

# 0.287167
lgb_params3 = {}
lgb_params3['n_estimators'] = 1150
lgb_params3['learning_rate'] = 0.02
lgb_params3['num_leaves'] = 20
lgb_params3['subsample'] = 0.7
lgb_params3['subsample_freq'] = 1
lgb_params3['colsample_bytree'] = 0.7
lgb_params3['reg_alpha'] = 18
lgb_params3['reg_lambda'] = 1.8
lgb_params3['random_state'] = 1

xgb_params1 = {}
xgb_params1['max_depth'] = 4
xgb_params1['learning_rate'] = 0.04
xgb_params1['n_estimators'] = 450
xgb_params1['subsample'] = 0.8
xgb_params1['colsample_bytree'] = 0.8
xgb_params1['min_child_weight'] = 6
xgb_params1['gamma']=11
xgb_params1['reg_alpha'] = 11
xgb_params1['reg_lambda'] = 1.4
xgb_params1['seed'] = 1
xgb_params1['n_jobs'] = -1

xgb_params2 = {}
xgb_params2['max_depth'] = 4
xgb_params2['learning_rate'] = 0.07
xgb_params2['n_estimators'] = 250
xgb_params2['subsample'] = 0.8
xgb_params2['colsample_bytree'] = 0.8
xgb_params2['min_child_weight'] = 6
xgb_params2['gamma']=10
xgb_params2['reg_alpha'] = 8
xgb_params2['reg_lambda'] = 1.3
xgb_params2['seed'] = 1
xgb_params2['n_jobs'] = -1

In [None]:
model1 = LGBMClassifier(**lgb_params1)
model2 = LGBMClassifier(**lgb_params3)
model3 = xgb.XGBClassifier(**xgb_params1)
model4 = xgb.XGBClassifier(**xgb_params2)
model5 = CatBoostClassifier(**cat_params1)
model6 = CatBoostClassifier(**cat_params2)

log_model = LogisticRegression()
       
stack = Ensemble(n_splits=5,
        stacker = log_model,
        base_models = (model1, model2, model3, model4, model5, model6))        
        
y_pred = stack.fit_predict(df_x_train, df_y_train, df_test)        

sub_1 = pd.DataFrame()
sub_1['id'] = id_test
sub_1['target'] = y_pred