In [1]:
# базовые библиотеки
from sklearn.model_selection import cross_validate
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
import warnings
warnings.filterwarnings("ignore") 
from tqdm import tqdm_notebook
from scipy.sparse import hstack, vstack, csc_matrix
import os, re, sys, gc, pickle, time
from collections import defaultdict



# валидация, оптимизация гиперпараметров
from sklearn.model_selection import GridSearchCV, cross_val_score,\
                                    KFold, train_test_split, cross_validate, ParameterGrid

# пайплайн
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin,  clone
from sklearn.metrics import make_scorer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, OneHotEncoder

# дамми-регрессор
from sklearn.dummy import DummyRegressor

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
import time
import shutil

In [2]:
def convert_types(df):
    '''конвертирует типы (по возможности)'''
    df_c = df.copy()
    for col in df.columns:
        ser = df[col]
        try:
            ser2 = ser.astype('datetime64')
            df_c[col] = ser2
        except:
            try:
                ser2 =ser.astype(int)
                if (ser != ser2).any():
                    try:
                        df_c[col] =ser.astype(float)
                    except:
                        pass
                    
            except:
                pass
        
    try:
        return pd.concat([df_c.select_dtypes('datetime64'), df_c.select_dtypes(exclude = ['datetime64'])], 1)
    except:
        return df_c
    
def profile_df(df):
    '''отчет по датафрейму'''
    df_c = df.copy()
    _row_duplicates = df.duplicated().mean()
    _types = df.dtypes
    _nan_mean = df.isna().mean()
    _nunique = df.nunique() / df.shape[0]
    
    report_df = pd.concat([_types, _nan_mean, _nunique], 1)
    report_df.columns = ['тип','доля_nan', 'доля_уник_зн']
    report_df.index = df.columns
    print('% дубликатов строк равен {:.5%}'.format(_row_duplicates))
    
    # удаляем дубликаты строк
    df_c = df_c[~df.duplicated()].reset_index(drop = True)
    # удаляем колонки с 1 уникальным значением
    df_c = df_c.loc[:, df_c.fillna('nan').nunique()!=1]
    return (df_c, report_df)

def train_hold_test_split(features, target, tr_size, ho_size, shuffle, random_state, stratify, use_test):
    if use_test:
        # делим данные на тренировочную, отложенную, тестовую части
        features_train, features_te, target_train, target_te = train_test_split(\
                                                                       features, target,\
                                                                       train_size = tr_size,\
                                                                       shuffle = shuffle, random_state = random_state,\
                                                                       stratify = stratify)
        features_tr, features_ho, target_tr, target_ho = train_test_split(\
                                                                       features_train, target_train,\
                                                                       train_size = 1-ho_size,\
                                                                       shuffle = shuffle, random_state = random_state,\
                                                                       stratify = stratify)
        print('train size = {}, hold size ={}, test size = {}'\
              .format(features_tr.shape[0], features_ho.shape[0], features_te.shape[0]))
        return (features_tr, features_ho, features_te, target_tr, target_ho, target_te)
    else:
        # делим данные на тренировочную, отложенную, тестовую части
        features_tr, features_ho, target_tr, target_ho = train_test_split(\
                                                             features, target,\
                                                             train_size = tr_size,\
                                                             shuffle = shuffle, random_state = random_state,\
                                                             stratify = stratify)
        print('train size = {}, hold size ={}'\
              .format(features_tr.shape[0], features_ho.shape[0]))
        return (features_tr, features_ho, target_tr, target_ho)
    
class SklearnHelperColumnSelector(BaseEstimator, TransformerMixin):
    '''выбирает колонки, отпавляемые в пайплайн'''
    def __init__(self, columns):
        self.columns = columns
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.columns]

class SklearnHelperLabelEncoder(TransformerMixin, BaseEstimator):
    ''' Факторизация категорий '''
    def __init__(self):
        pass
    def fit(self, X, y=None):
        X_c = X.astype(str)
        self.d1 = {}
        for col in X_c.columns:
            uniques = X_c[col].dropna().unique() 
            self.d1[col] =  dict(zip(uniques, range(len(uniques))))              
        return self
    def transform(self, X): 
        X_c = X.astype(str)
        for key, value in self.d1.items():
            X_c[key] = X_c[key].map(value)
        return X_c

class SklearnHelperTargetEncoder(BaseEstimator, TransformerMixin):
    ''' Кодирование категорий с помощью целевой переменной '''
    def __init__(self, n_iter, n_folds, min_samples_leaf, seed):
        self.n_iter = n_iter
        self.n_folds = n_folds
        self.min_samples_leaf = min_samples_leaf
        self.seed = seed
    def fit(self, X, y=None):
        self.y_mean = y.mean()
        _df_tr = pd.concat([X, y], 1)
        target_col = _df_tr.columns[-1]
        to_encode = _df_tr.columns[:-1]
        
        L_tr = []        
        self.L_d_encs = []
        for i in tqdm_notebook(range(self.n_iter)): 
            enc_tr = pd.DataFrame(index = _df_tr.index, columns = to_encode).fillna(0.0)
            for col in to_encode:
                for tr_idx, val_idx in KFold(self.n_folds, shuffle = True,random_state = self.seed+i)\
                                       .split(_df_tr):                    
                    grp = _df_tr.iloc[tr_idx].groupby(col)[target_col].agg({'mean', 'count'})                    
                    d_enc = grp[grp['count']>=self.min_samples_leaf]['mean'].to_dict()
                    self.L_d_encs.append((col, d_enc))
                    to_enc_tr =_df_tr.iloc[val_idx]                    
                    enc_tr.loc[to_enc_tr.index, col] = to_enc_tr[col].map(d_enc).fillna(_df_tr.iloc[tr_idx][target_col].mean())                   
            L_tr.append(enc_tr)    
            
        self.enc_tr =  pd.concat(L_tr, 1)
        self._df_tr = _df_tr
        return self    
    def transform(self, X):
        if np.all(X.values == self._df_tr.values):
            return self.enc_tr
        else:
            df_enc = pd.DataFrame(index = X.index, columns=X.columns).fillna(0.0)
            for feat, d in tqdm_notebook(self.L_d_encs):
                df_enc.loc[:, feat] += X[feat].map(d) / self.n_iter
            return df_enc

class SklearnHelperFeatureSelector(BaseEstimator, TransformerMixin):
    ''' Отбор признаков '''
    def __init__(self, model, cv, scoring, show_progress):
        self.model = model
        self.cv = cv
        self.scoring = scoring
        self.show_progress = show_progress
    def fit(self, X, y=None):
        try:
            _X = X.todense()
        except:
            _X =X.copy()            
        cv_scores = []
        for i in tqdm_notebook(range(_X.shape[1])):
            _X_curr = _X[:, i].reshape(-1,1)
            mean_cv_score = cross_val_score(self.model, _X_curr, y, cv =self.cv, scoring = self.scoring, n_jobs=-1).mean()
            
            cv_scores.append(mean_cv_score)
        order = np.argsort(cv_scores)[::-1]
        to_drop_before, best_features, best_cv_score = [], [], -np.inf
        for i in tqdm_notebook(order):
            curr_features = best_features+[i]
            _X_curr = _X[:, curr_features]
            mean_cv_score = cross_val_score(self.model, _X_curr, y, cv =self.cv, scoring = self.scoring, n_jobs=-1).mean()
            if mean_cv_score>best_cv_score:
                best_cv_score = mean_cv_score
                best_features = curr_features
                if self.show_progress:
                    print('new best score = {:.5f}'.format(best_cv_score))
            else:
                to_drop_before.append(i)
        while True:
            to_drop_after = []
            for i in tqdm_notebook(to_drop_before):
                curr_features = best_features+[i]
                _X_curr = _X[:, curr_features]
                mean_cv_score = cross_val_score(self.model, _X_curr, y, cv =self.cv, scoring = self.scoring, n_jobs=-1).mean()
                if mean_cv_score>best_cv_score:
                    best_cv_score = mean_cv_score
                    best_features = curr_features
                    if self.show_progress:
                        print('new best score = {:.5f}'.format(best_cv_score))
                else:
                    to_drop_after.append(i)
            if to_drop_before == to_drop_after:
                break
            else:
                to_drop_before = to_drop_after  
        self.best_features = best_features
        self.best_cv_score = best_cv_score
    def transform(self, X):
        if isinstance(X, csc_matrix):
            _X = X.copy()
        else:            
            _X = csc_matrix(X) 
        return _X[:, self.best_features]
    
def get_classic_stacked_metafeatures(base_models,\
                                     base_features_train, base_features_hold, base_features_test,\
                                     y_train, y_hold, y_test,\
                                     n_folds,\
                                     seed):
    ''' 
    Каждая базовая модель обучается на всем пространстве признаков с помощью к-фолд валидации
    - ооф предсказания формируют метапризнаки
    - метапризнаки для отложенной и тестовой частей получаются усреднением предсказаний обученных на валидации моделей
    '''
    # списки с метапризнаками
    L_meta_tr, L_meta_hold, L_meta_te = [], [], []
    
    # фиксируем валидацию
    stack_kf = KFold(n_folds, random_state = seed, shuffle = True)
    
    # проходим по моделям
    for i in tqdm_notebook(range(len(base_models))):
        
        # выбираем модель, признаки
        _model = base_models[i]
        _X_train = base_features_train[i]
        _X_hold = base_features_hold[i]
        _X_test = base_features_test[i]
            
        Z_train = np.zeros((_X_train.shape[0], 1))
        Z_hold = np.zeros((_X_hold.shape[0], n_folds))
        Z_test = np.zeros((_X_test.shape[0], n_folds))
        
        # делаем ооф предсказания
        for j, (tr_idx, val_idx) in enumerate(stack_kf.split(y_train)):
            _model.fit(_X_train[tr_idx], y_train[tr_idx])
            Z_train[val_idx, 0] = _model.predict(_X_train[val_idx])
            Z_hold[:, j] = _model.predict(_X_hold)
            Z_test[:, j] = _model.predict(_X_test)
        
        # для отложенной и теста усредняем ооф предсказания, сохраняем
        L_meta_tr.append(Z_train)
        L_meta_hold.append(np.mean(Z_hold, 1))
        L_meta_te.append(np.mean(Z_test, 1))

    X_meta_tr = np.column_stack(L_meta_tr)
    X_meta_ho = np.column_stack(L_meta_hold)
    X_meta_te = np.column_stack(L_meta_te)
    
    return (X_meta_tr, X_meta_ho, X_meta_te)

def get_stacked_random_subsample_metafeatures(base_models,\
                                              base_features_train, base_features_hold, base_features_test,\
                                              y_train, y_hold, y_test,\
                                              n_iterations,\
                                              n_folds,\
                                              seed):
    '''     
    Каждая базовая модель обучается на всем пространстве признаков с помощью к-фолд валидации
    - ооф предсказания формируют метапризнаки
    - метапризнаки для отложенной и тестовой частей получаются усреднением предсказаний обученных на валидации моделей
    
    upd: n_iterations выбирается модель, для модели выбирается доля исходных признаков (subsample) (от .5 до .9),\
    которые участвуют в обучении, на каждой итерации используется новая схема валидации
    '''
    # размеры подвыборок
    subsamples = [.5, .6, .7, .8, .9]
    # индексы моделей
    indexes = np.arange(len(base_models)) 
    # метапризнаки
    L_X_meta_tr, L_X_meta_ho, L_X_meta_te = [], [], []
    
    # итерируемся
    for i in tqdm_notebook(range(n_iterations)):
        
        # фиксируем сид
        _seed = i+seed
        np.random.seed(_seed)
        # фиксируем валидацию
        kf = KFold(n_folds, shuffle=True, random_state = _seed)
        # выбираем индекс
        current_idx = np.random.choice(indexes)
        # выбираем размер подвыборки
        subsample = np.random.choice(subsamples)
        
        # выбираем модель
        current_model = base_models[current_idx]
        # выбираем признаки
        X_tr = base_features_train[current_idx]
        X_ho = base_features_hold[current_idx]
        X_te = base_features_test[current_idx]
        # выбираем подпространсво признаков
        feat_subspace = np.random.choice(np.arange(X_tr.shape[1]),\
                                         np.int32(np.around(subsample*X_tr.shape[1])),\
                                         replace = False)
        _X_tr = X_tr[:,feat_subspace]
        _X_ho = X_ho[:,feat_subspace]
        _X_te = X_te[:,feat_subspace]        
        del X_tr, X_ho, X_te
        gc.collect()
        
        Z_tr = np.zeros((_X_tr.shape[0], 1))
        Z_ho = np.zeros((_X_ho.shape[0], n_folds))
        Z_te = np.zeros((_X_te.shape[0], n_folds))
        
        # делаем ооф предсказания, предсказания для отложенной, для тестоой
        for i, (tr_idx, val_idx) in enumerate(kf.split(_X_tr)):
            current_model.fit(_X_tr[tr_idx], y_train[tr_idx])
            Z_tr[val_idx, 0] = current_model.predict(_X_tr[val_idx])
            Z_ho[:, i] = current_model.predict(_X_ho)
            Z_te[:, i] = current_model.predict(_X_te)
            
        # сохраняем ооф предсказания
        L_X_meta_tr.append(Z_tr)
        # усредняем предсказания по фолдам для отложенной и тестовой частей
        # сохраняем
        L_X_meta_ho.append(np.mean(Z_ho, 1))
        L_X_meta_te.append(np.mean(Z_te, 1))
    
    # собираем сохраненные предсказания     
    X_meta_tr = np.column_stack(L_X_meta_tr)
    X_meta_ho = np.column_stack(L_X_meta_ho)
    X_meta_te = np.column_stack(L_X_meta_te)
    
    return (X_meta_tr, X_meta_ho, X_meta_te)    

def hp_tune_v1(model, grid, X, y, cv, scoring):    
    gs = GridSearchCV(model,param_grid=grid,cv = cv, scoring = scoring, n_jobs=-1, verbose = 1)
    gs.fit(X, y)
    best_estimator_ = clone(gs.best_estimator_)
    del gs
    gc.collect()    
    return best_estimator_


def hp_tune_v2(model, grid1, grid2, grid3, X_tr, y_tr, X_ho, y_ho, cv, scoring): 
    fit_params={'early_stopping_rounds':10,\
                'eval_set':[(X_ho, y_ho)],\
                'verbose':0}
    gs = GridSearchCV(model, param_grid=grid1, cv = cv, scoring=scoring, n_jobs=-1, verbose=1)
    gs.fit(X_tr, y_tr, **fit_params)    
    bp = gs.best_params_
    model = model.set_params(**bp)
    del gs
    gc.collect()

    gs = GridSearchCV(model,param_grid = grid2, cv = cv, scoring = scoring, n_jobs=-1, verbose = 1)
    gs.fit(X_tr, y_tr, **fit_params)    
    bp.update(gs.best_params_)
    model = model.set_params(**bp)
    del gs
    gc.collect()
    bp_c = bp.copy()
    
    best_score = -np.inf
    for params in tqdm_notebook(list(ParameterGrid(grid3))):
        bp_c.update(params)
        model = model.set_params(**bp_c)
        mean_cv_score = cross_val_score(model, X_tr, y_tr, cv=cv, scoring =scoring, n_jobs=-1).mean()
        if mean_cv_score>best_score:
            best_score = mean_cv_score            
            best_estimator_ = model
        else:
            break
    return clone(best_estimator_)

In [3]:
# константы
SEED = 13
# валидация
KF = KFold(3, random_state = SEED, shuffle = True)
# метрика качества
def neg_rmse_func(y_true, y_pred):
    return -np.sqrt(np.mean((y_true-y_pred)**2))
NEG_RMSE_SCORER = make_scorer(neg_rmse_func)

In [4]:
# загружаем данные, определяем типы 
df = convert_types(pd.read_csv('datasets/autos.csv'))

In [5]:
for col in ['DateCrawled', 'LastSeen']:
    df[col+'.year'] = df[col].dt.year
    df[col+'.month'] =df[col].dt.month
    df[col+'.day'] =df[col].dt.day
    df[col+'.dayofweek'] =df[col].dt.dayofweek
    df[col+'.hour'] =df[col].dt.hour
    df[col+'.minute'] =(df[col].dt.minute / 60).round()
    df[col+'.dayofyear'] =df[col].dt.dayofyear
    df[col+'.weekofyear'] =df[col].dt.weekofyear
    df[col+'.quarter'] =df[col].dt.quarter    
df['DateCreated.year'] = df['DateCreated'].dt.year
df['DateCreated.month'] = df['DateCreated'].dt.month
df['DateCreated.day'] = df['DateCreated'].dt.day
df.drop(['DateCrawled', 'DateCreated', 'LastSeen'], axis = 1, inplace = True)

In [6]:
# признаки, целевой признак
features, target = df.drop('Price', 1),df['Price'] 

features_tr, features_ho, features_te, target_tr, target_ho, target_te = \
    train_hold_test_split(features, target, tr_size=.9, ho_size=.1,\
                          shuffle=True, random_state=SEED,\
                          stratify=None,  use_test=True)

train size = 287038, hold size =31894, test size = 35437


In [7]:
nuniques =features_tr.nunique()
nuniques_nonone = nuniques[nuniques!=1].sort_values()

In [8]:
mask_nuniques = nuniques_nonone<100

In [9]:
df_obj = features_tr.select_dtypes(object)
df_num = features_tr.drop(df_obj.columns,1)
num_nuniques = df_num.nunique()
num_nuniques_nonones = num_nuniques[num_nuniques!=1].sort_values()
mask_nuniques = num_nuniques_nonones<100

obj_features = df_obj.columns.tolist()
num_features_small = num_nuniques_nonones[mask_nuniques].index.tolist()
num_features_large = num_nuniques_nonones[~mask_nuniques].index.tolist()

In [10]:
print('num features1 ...')
for i, val in enumerate(num_features_large):
    print('{}) {}'.format(i+1, val))
print('---------------------------------')  

print('num features2 ...')
for i, val in enumerate(num_features_small):
    print('{}) {}'.format(i+1, val))
print('---------------------------------')  

print('obj features ...')
for i, val in enumerate(obj_features):
    print('{}) {}'.format(i+1, val))
print('---------------------------------') 

num features1 ...
1) RegistrationYear
2) Power
3) PostalCode
---------------------------------
num features2 ...
1) LastSeen.quarter
2) DateCrawled.month
3) DateCrawled.minute
4) LastSeen.minute
5) DateCrawled.quarter
6) LastSeen.month
7) DateCreated.year
8) DateCrawled.weekofyear
9) LastSeen.weekofyear
10) DateCrawled.dayofweek
11) LastSeen.dayofweek
12) DateCreated.month
13) Kilometer
14) RegistrationMonth
15) DateCrawled.hour
16) LastSeen.hour
17) DateCreated.day
18) DateCrawled.day
19) LastSeen.day
20) DateCrawled.dayofyear
21) LastSeen.dayofyear
---------------------------------
obj features ...
1) VehicleType
2) Gearbox
3) Model
4) FuelType
5) Brand
6) NotRepaired
---------------------------------


In [12]:
# строки
ppl_obj1 = Pipeline([('obj_features', SklearnHelperColumnSelector(obj_features)),\
                     ('label encoder', SklearnHelperLabelEncoder())])

ppl_obj2 = Pipeline([('obj_features', SklearnHelperColumnSelector(obj_features)),\
                     ('target encoder', SklearnHelperTargetEncoder(n_iter = 5,\
                                                                   n_folds = 20,\
                                                                   min_samples_leaf = 5,\
                                                                   seed = SEED))])
ppl_obj3 = Pipeline([('obj_features', SklearnHelperColumnSelector(obj_features)),\
                     ('impute', SimpleImputer(strategy = 'constant', fill_value = 'other')),\
                     ('one hot encoder', OneHotEncoder(handle_unknown = 'ignore'))])
# даты
ppl_num_small1 = Pipeline([('num_features_small', SklearnHelperColumnSelector(num_features_small))])
ppl_num_small2 = Pipeline([('num_features_small', SklearnHelperColumnSelector(num_features_small)),\
                           ('target encoder', SklearnHelperTargetEncoder(n_iter = 5,\
                                                                         n_folds = 20,\
                                                                         min_samples_leaf = 5,\
                                                                         seed = SEED))])
ppl_num_small3 = Pipeline([('num_features_small', SklearnHelperColumnSelector(num_features_small)),\
                           ('one hot encoder', OneHotEncoder(handle_unknown = 'ignore'))])
# остальные 
ppl_num_large1 = Pipeline([('num_features_large', SklearnHelperColumnSelector(num_features_large))])
ppl_num_large2 = Pipeline([('num_features_large', SklearnHelperColumnSelector(num_features_large)),\
                           ('target encoder', SklearnHelperTargetEncoder(n_iter = 5,\
                                                                         n_folds = 20,\
                                                                         min_samples_leaf = 5,\
                                                                         seed = SEED))])

In [13]:
ppl_tree = Pipeline([('prepare data', 
                        FeatureUnion([\
                            ('f1',ppl_obj1), ('f2',ppl_obj2),\
                            ('f4',ppl_num_small1), ('f5', ppl_num_small2),\
                            ('f7', ppl_num_large1), ('f8', ppl_num_large2)])),\
                     ('impute', SimpleImputer(strategy = 'constant', fill_value = -1)),\
                     ('scale', MaxAbsScaler()),\
                     ('select features', 
                        SklearnHelperFeatureSelector(\
                            model = LinearRegression(),\
                            cv = KF,  scoring = NEG_RMSE_SCORER,\
                            show_progress = True))])

In [14]:
ppl_tree.fit(features_tr, target_tr)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2100.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=60.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=60.0), HTML(value='')))

new best score = -3149.93470
new best score = -2667.30648
new best score = -2546.60667
new best score = -2502.41416
new best score = -2502.22558
new best score = -2489.90953
new best score = -2469.12021
new best score = -2467.57538
new best score = -2458.75114
new best score = -2438.45070
new best score = -2430.84819
new best score = -2424.41812
new best score = -2424.27196
new best score = -2403.51065
new best score = -2401.63784
new best score = -2401.62751
new best score = -2401.62463
new best score = -2398.77523
new best score = -2397.14781
new best score = -2397.01701
new best score = -2397.01701
new best score = -2392.17466
new best score = -2392.08914
new best score = -2390.25013
new best score = -2390.20928
new best score = -2390.04600
new best score = -2389.85133
new best score = -2389.51439
new best score = -2389.30152
new best score = -2389.18689
new best score = -2389.15101
new best score = -2388.61163
new best score = -2388.61163
new best score = -2388.60649
new best score

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=22.0), HTML(value='')))

new best score = -2387.04347
new best score = -2387.03545



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))

new best score = -2386.96741



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=19.0), HTML(value='')))




Pipeline(steps=[('prepare data',
                 FeatureUnion(transformer_list=[('f1',
                                                 Pipeline(steps=[('obj_features',
                                                                  SklearnHelperColumnSelector(columns=['VehicleType',
                                                                                                       'Gearbox',
                                                                                                       'Model',
                                                                                                       'FuelType',
                                                                                                       'Brand',
                                                                                                       'NotRepaired'])),
                                                                 ('label '
                                                               

In [86]:
ppl_lin = Pipeline([('prepare data', 
                        FeatureUnion([\
                            ('f1',ppl_obj1), ('f2',ppl_obj2),('f3', ppl_obj3),\
                            ('f4',ppl_num_small1), ('f5', ppl_num_small2),('f6', ppl_num_small3),\
                            ('f7', ppl_num_large1), ('f8', ppl_num_large2)])),\
                     ('impute', SimpleImputer(strategy = 'constant', fill_value = -1)),\
                     ('scale', MaxAbsScaler())])                     

In [15]:
# ppl_lin.fit(features_tr, target_tr)
X_lin_tr = ppl_tree.transform(features_tr)
X_lin_ho = ppl_tree.transform(features_ho)
X_lin_te = ppl_tree.transform(features_te)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2100.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2100.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2100.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=300.0), HTML(value='')))




In [18]:
y_tr, y_ho, y_te = target_tr.values, target_ho.values, target_te.values

In [None]:
%%time 

for alpha in tqdm_notebook(np.linspace(.001, .01, 10)):
    
    ols = LinearRegression()
    lasso = Lasso(alpha=alpha, normalize=True)
    lasso.fit(X_lin_tr,y_tr)
    selected_idxs = lasso.coef_ != 0
    X_sel_lin_tr = X_lin_tr[:, selected_idxs]
    X_sel_lin_ho = X_lin_ho[:, selected_idxs]
    X_sel_lin_te = X_lin_te[:, selected_idxs]
    mean_cv_score = cross_val_score(ols,X_sel_lin_tr, y_tr,\
                                     cv = KF, scoring = NEG_RMSE_SCORER).mean()
    ols.fit(X_sel_lin_tr, y_tr)
    ho_score = neg_rmse_func(y_ho, ols.predict(X_sel_lin_ho))
    print(mean_cv_score, ho_score)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))

-2227.167073089147 -2249.1033496402915


In [19]:
ols = LinearRegression()
ols.fit(X_lin_tr, y_tr)
ho_score = neg_rmse_func(y_ho, ols.predict(X_lin_ho))

In [21]:
cross_val_score(ols,X_lin_tr, y_tr,\
                                     cv = KF, scoring = NEG_RMSE_SCORER).mean()

-2386.980568974011

In [22]:
ho_score

-2412.256430577081

In [9]:
to_oh_enc = nuniques_c[nuniques_c<100].index.tolist()
to_target_enc = nuniques_c[nuniques_c>=100].index.tolist()

In [10]:
ppl_oh_enc = Pipeline([('to_oh_enc', SklearnHelperColumnSelector(to_oh_enc)),\
                       ('one hot encoder', OneHotEncoder(handle_unknown = 'ignore'))])
ppl_target_enc = Pipeline([('to_target_enc', SklearnHelperColumnSelector(to_target_enc)),\
                           ('target encoder', SklearnHelperTargetEncoder(n_iter = 10,\
                                                                         n_folds = 20,\
                                                                         min_samples_leaf = 5,\
                                                                         seed = SEED))])

In [None]:
ppl_lin = Pipeline([('prepare data', 
                        FeatureUnion([('f1',ppl_oh_enc), ('f2',ppl_target_enc)])),\
                     ('scale', MaxAbsScaler())])
ppl_lin.fit(features_tr, target_tr)   
X_lin_tr = ppl_lin.transform(features_tr)
X_lin_ho = ppl_lin.transform(features_ho)
X_lin_te = ppl_lin.transform(features_te)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=800.0), HTML(value='')))

In [28]:
%%time 

ols = LinearRegression()
lasso = Lasso(alpha=.005, normalize=True)
lasso.fit(X_lin_tr,y_tr)
selected_idxs = lasso.coef_ != 0
X_sel_lin_tr = X_lin_tr[:, selected_idxs]
X_sel_lin_ho = X_lin_ho[:, selected_idxs]
X_sel_lin_te = X_lin_te[:, selected_idxs]
mean_cv_score = cross_val_score(ols,X_sel_lin_tr, y_tr,\
                                 cv = KF, scoring = NEG_RMSE_SCORER).mean()
ols.fit(X_sel_lin_tr, y_tr)
ho_score = neg_rmse_func(y_ho, ols.predict(X_sel_lin_ho))

Wall time: 2min 15s


In [29]:
(mean_cv_score, ho_score)

(-2290.2669597174904, -2315.4838254836673)

In [9]:
# строки
str_features = ['VehicleType', 'Gearbox', 'Model', 'FuelType', 'Brand', 'NotRepaired']
# даты
date_features = ['DateCrawled.year', 'DateCrawled.month', 'DateCrawled.day', 'DateCrawled.dayofweek', 'DateCrawled.hour',\
                 'DateCrawled.minute', 'DateCrawled.dayofyear', 'DateCrawled.weekofyear','DateCrawled.quarter',\
                 'LastSeen.year', 'LastSeen.month','LastSeen.day', 'LastSeen.dayofweek', 'LastSeen.hour','LastSeen.minute',\
                 'LastSeen.dayofyear', 'LastSeen.weekofyear','LastSeen.quarter', 'DateCreated.year', 'DateCreated.month',
                 'DateCreated.day', 'RegistrationMonth', 'RegistrationYear']
# остальные
other_features = ['Power', 'Kilometer', 'PostalCode']

In [12]:
# строки
ppl_str1 = Pipeline([('str', SklearnHelperColumnSelector(str_features)),\
                     ('label encoder', SklearnHelperLabelEncoder())])
ppl_str2 = Pipeline([('str', SklearnHelperColumnSelector(str_features)),\
                     ('target encoder', SklearnHelperTargetEncoder(n_iter = 10,\
                                                                   n_folds = 20,\
                                                                   min_samples_leaf = 5,\
                                                                   seed = SEED))])
ppl_str3 = Pipeline([('str', SklearnHelperColumnSelector(str_features)),\
                     ('one hot encoder', OneHotEncoder(handle_unknown = 'ignore'))])
# даты
ppl_date1 = Pipeline([('date', SklearnHelperColumnSelector(date_features))])
ppl_date2 = Pipeline([('date', SklearnHelperColumnSelector(date_features)),\
                      ('target encoder', SklearnHelperTargetEncoder(n_iter = 10,\
                                                                   n_folds = 20,\
                                                                   min_samples_leaf = 5,\
                                                                   seed = SEED))])
ppl_date3 = Pipeline([('date', SklearnHelperColumnSelector(date_features)),\
                      ('one hot encoder', OneHotEncoder(handle_unknown = 'ignore'))])
# остальные 
ppl_other1 = Pipeline([('other', SklearnHelperColumnSelector(other_features))])
ppl_other2 = Pipeline([('other', SklearnHelperColumnSelector(other_features)),\
                       ('target encoder', SklearnHelperTargetEncoder(n_iter = 10,\
                                                                   n_folds = 20,\
                                                                   min_samples_leaf = 5,\
                                                                   seed = SEED))])
ppl_other3 = Pipeline([('other', SklearnHelperColumnSelector(other_features)),\
                       ('one hot encoder', OneHotEncoder(handle_unknown = 'ignore'))])

In [13]:
# бейзлайн
baseline_score = np.abs(cross_val_score(DummyRegressor('mean'),\
                                        features_tr, target_tr,\
                                        cv = KF, scoring = NEG_RMSE_SCORER).mean())

In [14]:
ppl_lin = Pipeline([('prepare data', 
                        FeatureUnion([('f1',ppl_str3), ('f2',ppl_date3), ('f3', ppl_other3)])),\
                     ('scale', MaxAbsScaler())])
ppl_lin.fit(features_tr, target_tr)   
X_lin_tr = ppl_lin.transform(features_tr)
X_lin_ho = ppl_lin.transform(features_ho)
X_lin_te = ppl_lin.transform(features_te)

In [None]:
%%time 

ols = LinearRegression()
lasso = Lasso(alpha=.005, normalize=True)
lasso.fit(X_lin_tr,y_tr)
selected_idxs = lasso.coef_ != 0
X_sel_lin_tr = X_lin_tr[:, selected_idxs]
X_sel_lin_ho = X_lin_ho[:, selected_idxs]
X_sel_lin_te = X_lin_te[:, selected_idxs]
mean_cv_score = cross_val_score(ols,X_sel_lin_tr, y_tr,\
                                 cv = KF, scoring = NEG_RMSE_SCORER).mean()
ols.fit(X_sel_lin_tr, y_tr)
ho_score = neg_rmse_func(y_ho, ols.predict(X_sel_lin_ho))


In [None]:
mean_cv_score, ho_score

[0.005,
 0.015555555555555555,
 0.026111111111111113,
 0.03666666666666667,
 0.04722222222222222,
 0.057777777777777775,
 0.06833333333333334,
 0.0788888888888889,
 0.08944444444444445,
 0.1,
 0.1,
 0.14444444444444446,
 0.18888888888888888,
 0.23333333333333334,
 0.2777777777777778,
 0.32222222222222224,
 0.3666666666666667,
 0.4111111111111111,
 0.4555555555555556,
 0.5]

In [None]:
ppl_tree = Pipeline([('prepare data', 
                        FeatureUnion([\
                            ('f1',ppl_str1), ('f2',ppl_str2),\
                            ('f3',ppl_date1), ('f4',ppl_date2),\
                            ('f5', ppl_other1), ('f6', ppl_other2)])),\
                     ('impute', SimpleImputer(strategy = 'constant', fill_value = -1)),\
                     ('scale', MaxAbsScaler()),\
                     ('select features', 
                        SklearnHelperFeatureSelector(\
                            model = LGBMRegressor(n_jobs=-1, random_state = SEED),\
                            cv = KF,  scoring = NEG_RMSE_SCORER,\
                            show_progress = True))])

In [None]:
ppl_tree.fit(features_tr, target_tr)
X_tree_tr = ppl_tree.transform(features_tr).toarray()
X_tree_ho = ppl_tree.transform(features_ho).toarray()
X_tree_te = ppl_tree.transform(features_te).toarray()

In [None]:
D_XXX_lin = {'tr':X_lin_tr, 'ho':X_lin_ho, 'te':X_lin_te}
with open('D_XXX_lin.pickle', 'wb') as f:
    pickle.dump(D_XXX_lin, f)
D_XXX_tree = {'tr':X_tree_tr, 'ho':X_tree_ho, 'te':X_tree_te}
with open('D_XXX_tree.pickle', 'wb') as f:
    pickle.dump(D_XXX_tree, f)

In [None]:
# with open('D_XXX_lin.pickle', 'rb') as f:
#     D_XXX_lin = pickle.load(f)
# X_lin_tr, X_lin_ho, X_lin_te = D_XXX_lin['tr'], D_XXX_lin['ho'], D_XXX_lin['te']

# with open('D_XXX_tree.pickle', 'rb') as f:
#     D_XXX_tree = pickle.load(f)
# X_tree_tr, X_tree_ho, X_tree_te = D_XXX_tree['tr'], D_XXX_tree['ho'], D_XXX_tree['te']

In [None]:
# результаты валидации
L_cvAB = []
L_best_estimators_names = ['Ridge', 'LinearSVR',\
                           'DecisionTree', 'ExtraTree',\
                           'RandomForest', 'ExtraTrees',\
                           'Lightgbm', 'XGBoost']
L_best_estimators = []
y_tr, y_ho, y_te = target_tr.values, target_ho.values, target_te.values

In [None]:
ridge_best_model = hp_tune_v1(Ridge(), {'alpha':[0, .1, .3, .5, .7, 1, 3, 5, 7, 10, 30, 50, 70]},\
                              X_lin_tr, y_tr, cv=KF, scoring=NEG_RMSE_SCORER)
L_best_estimators.append(ridge_best_model)

In [None]:
svr_best_model = hp_tune_v1(LinearSVR(), {'C':[1, 3, 5, 7, 10, 30, 50, 70]},\
                              X_lin_tr, y_tr, cv=KF, scoring=NEG_RMSE_SCORER)
L_best_estimators.append(svr_best_model)

In [None]:
tree_pg = {'max_depth':np.arange(7, 41), 'min_samples_leaf':[2, 20, 200]}
dt_best_model = hp_tune_v1(DecisionTreeRegressor(), tree_pg, X_tree_tr, y_tr, cv=KF, scoring=NEG_RMSE_SCORER)
L_best_estimators.append(dt_best_model)

exdt_best_model = hp_tune_v1(ExtraTreeRegressor(), tree_pg, X_tree_tr, y_tr, cv=KF, scoring=NEG_RMSE_SCORER)
L_best_estimators.append(exdt_best_model)

In [None]:
trees_pg = {'max_depth':np.arange(5, 21),'min_samples_leaf':[2, 20],'n_estimators':[10], 'n_jobs':[-1], 'random_state':[SEED]}

rf_best_model = hp_tune_v1(RandomForestRegressor(),trees_pg, X_tree_tr, y_tr, cv=KF, scoring=NEG_RMSE_SCORER)
bp = {'n_estimators':100}
bp.update(**rf_best_model.get_params())
rf_best_model.set_params(**bp)
L_best_estimators.append(rf_best_model)

exts_best_model = hp_tune_v1(ExtraTreesRegressor(),trees_pg, X_tree_tr, y_tr, cv=KF, scoring=NEG_RMSE_SCORER)
bp = {'n_estimators':100}
bp.update(**exts_best_model.get_params())
exts_best_model.set_params(**bp)
L_best_estimators.append(exts_best_model)

In [None]:
lgb_grid1 = {'n_estimators':[10], 'n_jobs':[-1], 'random_state':[SEED],\
             'max_depth':np.arange(2, 21).tolist(),\
             'num_leaves':[4, 8, 16, 32, 64, 128, 256, 512, 1024, 1200, 1500],\
             'min_child_samples':[20, 50]}
lgb_grid2 = {'subsample':np.linspace(.1, 1, 10),\
             'colsample_bytree':np.linspace(.1, 1, 10)}
lgb_grid3 = {'learning_rate':np.linspace(.01, .1, 50), 'n_estimators':[100]}

lgb_best_model = hp_tune_v2(LGBMRegressor(),\
                            lgb_grid1, lgb_grid2, lgb_grid3,\
                            X_tree_tr, y_tr, X_tree_ho, y_ho,\
                            cv=KF, scoring=NEG_RMSE_SCORER)
L_best_estimators.append(lgb_best_model)

In [None]:
xgb_grid1 = {'n_estimators':[10], 'n_jobs':[-1], 'random_state':[SEED],\
             'max_depth':[14, 15, 16, 17, 18],\
             'min_child_weight':[20]}
xgb_grid2 = {'subsample':[.5, .6, .7, .8, .9, 1],\
             'colsample_bytree':[.5, .6, .7, .8, .9, 1]}
xgb_grid3 = {'learning_rate':[.04, .045, .05,.055, .06, .065], 'n_estimators':[100]}

xgb_best_model = hp_tune_v2(XGBRegressor(),\
                            xgb_grid1, xgb_grid2, xgb_grid3,\
                            X_tree_tr, y_tr, X_tree_ho, y_ho,\
                            cv=KF, scoring=NEG_RMSE_SCORER)
L_best_estimators.append(xgb_best_model)

In [None]:
# with open('L_best_estimators.pickle', 'wb') as f:
#     pickle.dump(L_best_estimators, f)
# with open('L_best_estimators.pickle', 'rb') as f:
#     L_best_estimators = pickle.load(f)

In [None]:
L_base_X_tr = (X_lin_tr, X_lin_tr, X_tree_tr, X_tree_tr, X_tree_tr, X_tree_tr, X_tree_tr, X_tree_tr)
L_base_X_ho = (X_lin_ho, X_lin_ho, X_tree_ho, X_tree_ho, X_tree_ho, X_tree_ho, X_tree_ho, X_tree_ho)
L_base_X_te = (X_lin_te, X_lin_te, X_tree_te, X_tree_te, X_tree_te, X_tree_te, X_tree_te, X_tree_te)

In [None]:
L_cvAB = []
for name, estimator, X_tr, X_ho in tqdm_notebook(zip(L_best_estimators_names, L_best_estimators, L_base_X_tr, L_base_X_ho),\
                                                 total = len(L_base_X_tr)):
    start = time.time()
    mean_cv_score = cross_val_score(estimator, X_tr, y_tr, cv = KF, scoring = NEG_RMSE_SCORER, n_jobs=-1).mean()
    end = time.time()
    duration = round(end-start)
    
    estimator.fit(X_tr, y_tr)
    ho_score = neg_rmse_func(y_ho, estimator.predict(X_ho))
    
    L_cvAB.append((name, mean_cv_score, ho_score, duration))
    
cvAB = pd.DataFrame(L_cvAB, columns = ['model', 'cv', 'ho', 'duration']).set_index('model').astype(float).abs()
cvAB = cvAB.loc[cvAB[['cv', 'ho']].mean(1).sort_values().index]
cvAB.to_pickle('cvAB.pkl')

In [None]:
# cvAB = pd.read_pickle('cvAB.pkl')

In [None]:
cvAB

In [None]:
class SklearnHelperMetaFeaturesRegressor(BaseEstimator, TransformerMixin):    
    def __init__(self, model, cv, path_to_folder):
        self.model = model
        self.cv =cv
        self.path_to_folder = path_to_folder
    def fit(self, X, y=None):
        model = self.model
        Z = np.zeros((X.shape[0], 1)
        for i, (tr_idx, val_idx) in enumerate(self.cv.split(y)):            
            model.fit(X[tr_idx], y[tr_idx])
            Z[val_idx, 0] = model.predict(X[val_idx]))
        
            if not os.path.exists(self.path_to_folder):
                os.makedirs(self.path_to_folder)
            else:
                shutil.rmtree(self.path_to_folder, ignore_errors=True)
                os.makedirs(self.path_to_folder)
            path_fitted = os.path.join([self.path_to_folder, f'_fitted{i}.pickle'])
            with open(path_fitted, 'wb') as f:
                pickle.dump(model, f)   
        self.Z = Z
        self.X = X
        return self
    def transform(self, X):
        if np.all(self.X == np.array(X)):
            return self.Z
        else:
            L = []
            for filename in os.listdir(self.path_to_folder):
                with open(filename, 'rb') as f:
                    model = pickle.load(f)
                    L.append(model.predict(X).flatten().reshape(-1,1))
            return np.mean(np.c_[L], 1)

In [None]:
class SklearnHelperMetaFeaturesRegressor_v2(BaseEstimator, TransformerMixin):    
    def __init__(self, model, cv, n_iterations, path_to_folder1, path_to_folder2):
        self.model = model
        self.cv =cv
        self.n_iterations = n_iterations
        self.path_to_folder1 = path_to_folder1
        self.path_to_folder2 = path_to_folder2
    def fit(self, X, y=None):
        model = self.model
        subsamples = [.1, .2, .3, .4, .5, .6, .7, .8, .9]
        for j in tqdm_notebook(range(self.n_iterations)):
            np.random.seed(j)
            subsample = np.random.choice(subsamples)
            feat_use = np.random.choice(np.arange(X.shape[1]), np.int32(np.around(subsample*X.shape[1])), replace = False) 
            Z = np.zeros((X.shape[0], 1)
            for i, (tr_idx, val_idx) in enumerate(self.cv.split(y)):            
                model.fit(X[tr_idx, feat_use], y[tr_idx])
                Z[val_idx, 0] = model.predict(X[val_idx, feat_use]))

                if not os.path.exists(self.path_to_folder1):
                    os.makedirs(self.path_to_folder1)                    
                else:
                    shutil.rmtree(self.path_to_folder1, ignore_errors=True)
                    os.makedirs(self.path_to_folder1)
                
                if not os.path.exists(self.path_to_folder2):
                    os.makedirs(self.path_to_folder2)                    
                else:
                    shutil.rmtree(self.path_to_folder2, ignore_errors=True)
                    os.makedirs(self.path_to_folder2)
                
                path_fitted = os.path.join([self.path_to_folder1, f'_fitted{j}.{i}.pickle'])
                path_feat_idxs = os.path.join([self.path_to_folder2, f'_fitted{j}.{i}.pickle'])                
                
                with open(path_fitted, 'wb') as f:
                    pickle.dump(model, f)
                with open(path_feat_idxs, 'wb') as f:
                    pickle.dump(feat_use, f)                    
        self.Z = Z
        self.X = X
        return self
    def transform(self, X):
        if np.all(self.X == np.array(X)):
            return self.Z
        else:
            Ls=[]
            i = 0
            for filename1, filename2 in zip(os.listdir(self.path_to_folder1), os.listdir(self.path_to_folder2)):
                L = []
                with open(filename1, 'rb') as f:
                    model = pickle.load(f)
                with open(filename2, 'rb') as f:
                    use_idxs = pickle.load(f)
                    L.append(model.predict(X[:, use_idxs]).flatten().reshape(-1,1))
                i+=1
                if (i % self.cv.get_n_splits())==0:
                    Ls.append(np.mean(np.c_[L], 1)) 
            return np.c_[Ls]