Устроим стекинг ТОП-алгоритмов из блиц-проверки

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.base import clone

from catboost import Pool, CatBoostRegressor

from tqdm import tqdm


In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

# Загрузка тренировочного датасета

In [3]:
# Read the training table and split the target column off the features.
X = pd.read_csv('EDAv2_Train.zip')
y = X.pop('price')  # removes 'price' from X and returns it as the target Series
X.iloc[:1].T  # preview the first row, transposed so every column is visible

Unnamed: 0,0
body_type,седан
brand,AUDI
color,чёрный
fuel_type,бензин
model_year,1990
n_doors,4
production_year,1991
vehicle_transmission,механика
engine_power,174.0
description,"Машина в приличном состоянии ,не гнилая не р..."


In [4]:
# Preview the first target value.
y.iloc[:1]

0    200000.0
Name: price, dtype: float64

In [5]:
# Read the test features and preview the first row (transposed).
X_test = pd.read_csv('EDAv2_Test.zip')
X_test.iloc[:1].T

Unnamed: 0,0
body_type,лифтбек
brand,SKODA
color,синий
description,"Все автомобили, представленные в продаже, прох..."
engine_displacement,1.2
engine_power,105.0
fuel_type,бензин
mileage,74000
model_year,2013
n_doors,5


# Проверим готовность данных

In [6]:
# Keep only the training rows whose brand also occurs in the test set.
brand_list = list(X_test['brand'].unique())
# Vectorized membership test instead of a per-row Python lambda over a list.
brand_mask = X['brand'].isin(brand_list)
X = X[brand_mask]
y = y[brand_mask]
# Sanity check: show the number of distinct brands on each side.
display('Train brands:', X['brand'].nunique())
display('Test brands:', X_test['brand'].nunique())

'Train brands:'

12

'Test brands:'

12

In [7]:
# Keep only used cars in the training data: rows whose n_owners is not the string '0'.
used_cars_mask = list(X['n_owners'].ne('0'))
X, y = X[used_cars_mask], y[used_cars_mask]
display(X.shape, y.shape)

(37841, 17)

(37841,)

In [8]:
# Tag each frame with its origin and stack them so both get the same encoding.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; assign() avoids writing a new column into the
# filtered X slice in place.
X_ = pd.concat([X.assign(is_train=1), X_test.assign(is_train=0)])
# Drop the columns that are not used as features, then apply pd.get_dummies
# to train and test together so both end up with identical columns.
columns_to_drop = ['engine_displacement', 'model_year', 'full_model_name', 'description']
X_ = X_.drop(columns=columns_to_drop)
X_ = pd.get_dummies(X_)
# Split back into train and test by the origin flag and discard the flag.
X = X_[X_['is_train'] == 1].drop(columns=['is_train'])
X_test = X_[X_['is_train'] == 0].drop(columns=['is_train'])

# Стекинг

## Готовим данные

In [9]:
# Random seed; pass it as random_state to any stochastic estimator or splitter
# so that repeated runs give the same result.
seed = 73

In [10]:
# Convert the pandas objects to plain NumPy arrays for the stacking helpers.
X_train, y_train = X.to_numpy(), y.to_numpy()
X_test = X_test.to_numpy()

## Функции стекинга

In [11]:
def compute_meta_feature(clf, X_train, X_test, y_train, cv):
    """Build one stacking meta-feature from a single base model.

    For every split produced by ``cv.split(X_train)`` a fresh clone of ``clf``
    is fitted on the training part and used to predict the held-out part, so
    each training row receives a prediction from a model that did not see it.
    A further clone is then fitted on the whole training set to predict
    ``X_test``.

    Parameters
    ----------
    clf : estimator accepted by ``clone`` and exposing ``fit`` / ``predict``.
    X_train, y_train : arrays indexable by the integer index arrays from ``cv``.
    X_test : array passed to ``predict`` of the model fitted on all of X_train.
    cv : splitter exposing ``split(X)`` that yields (train_idx, predict_idx).

    Returns
    -------
    (X_meta_train, X_meta_test) : out-of-fold predictions for the training
        rows (float64, same shape as ``y_train``) and predictions for
        ``X_test``.
    """
    # Rows never selected as a held-out part by `cv` keep the initial 0.0.
    oof_predictions = np.zeros_like(y_train, dtype=np.float64)

    for fit_idx, holdout_idx in cv.split(X_train):
        fold_model = clone(clf)
        fold_model.fit(X_train[fit_idx], y_train[fit_idx])
        oof_predictions[holdout_idx] = fold_model.predict(X_train[holdout_idx])

    # Test-set feature comes from a model refitted on the full training data.
    full_model = clone(clf)
    full_model.fit(X_train, y_train)

    return oof_predictions, full_model.predict(X_test)

def generate_meta_features(classifiers, X_train, X_test, y_train, cv):
    """Compute meta-features for several base models and stack them column-wise.

    ``compute_meta_feature`` is called once per entry of ``classifiers``
    (wrapped in a tqdm progress bar). Column ``j`` of each returned matrix
    holds the predictions of ``classifiers[j]``.

    Returns
    -------
    (stacked_features_train, stacked_features_test) : two arrays, one column
        per model, for the training rows and the test rows respectively.
    """
    train_columns, test_columns = [], []
    for model in tqdm(classifiers):
        meta_train, meta_test = compute_meta_feature(model, X_train, X_test, y_train, cv)
        train_columns.append(meta_train)
        test_columns.append(meta_test)

    # vstack yields one row per model; transpose to get one column per model.
    stacked_features_train = np.vstack(train_columns).T
    stacked_features_test = np.vstack(test_columns).T

    return stacked_features_train, stacked_features_test

In [12]:
# Base models for stacking.
# random_state=seed makes the bootstrap / feature sampling of every ensemble
# repeatable; without it each run of the notebook yields different
# meta-features and a different submission.
models_list = [
    BaggingRegressor(n_jobs=-1,
                     n_estimators=300,
                     max_samples=0.9,
                     random_state=seed),
    RandomForestRegressor(n_jobs=-1,
                          n_estimators=400,
                          max_features=0.5,
                          random_state=seed),
    # KNeighborsRegressor(n_jobs=-1, n_neighbors=3) is currently disabled.
    ExtraTreesRegressor(n_jobs=-1,
                        n_estimators=300,
                        max_features=0.9,
                        random_state=seed),
]

In [13]:
# 5-fold shuffled cross-validation for the out-of-fold predictions.
# random_state=seed fixes the fold assignment; with shuffle=True and no
# random_state the folds change on every run.
cv = KFold(n_splits=5, shuffle=True, random_state=seed)

stacked_features_train, stacked_features_test = generate_meta_features(
    models_list, X_train, X_test, y_train, cv)

  0%|                                                                                            | 0/3 [00:00<?, ?it/s]

array([      0.        ,       0.        ,       0.        , ...,
       6299655.39666667,       0.        ,       0.        ])

array([      0.        ,   86071.66333333,       0.        , ...,
       6299655.39666667,       0.        ,       0.        ])

array([ 144083.33333333,   86071.66333333,       0.        , ...,
       6299655.39666667, 2896426.63      ,       0.        ])

array([ 144083.33333333,   86071.66333333,       0.        , ...,
       6299655.39666667, 2896426.63      ,       0.        ])

array([ 144083.33333333,   86071.66333333,  253989.97333333, ...,
       6299655.39666667, 2896426.63      , 4868011.74      ])

 33%|███████████████████████████▋                                                       | 1/3 [02:12<04:24, 132.39s/it]

array([ 141627.5   ,       0.    ,  220389.46  , ..., 6583871.5325,
       3441201.4825,       0.    ])

array([ 141627.5       ,   84132.4975    ,  220389.46      , ...,
       6583871.5325    , 3441201.4825    , 4712463.00583333])

array([ 141627.5       ,   84132.4975    ,  220389.46      , ...,
       6583871.5325    , 3441201.4825    , 4712463.00583333])

array([ 141627.5       ,   84132.4975    ,  220389.46      , ...,
       6583871.5325    , 3441201.4825    , 4712463.00583333])

array([ 141627.5       ,   84132.4975    ,  220389.46      , ...,
       6583871.5325    , 3441201.4825    , 4712463.00583333])

 67%|████████████████████████████████████████████████████████                            | 2/3 [03:20<01:34, 94.47s/it]

array([0., 0., 0., ..., 0., 0., 0.])

array([      0.        ,       0.        ,       0.        , ...,
             0.        , 3206374.50666667,       0.        ])

array([      0.        ,   62246.66666667,       0.        , ...,
             0.        , 3206374.50666667, 4382677.19666667])

array([ 127673.33333333,   62246.66666667,  194530.        , ...,
             0.        , 3206374.50666667, 4382677.19666667])

array([ 127673.33333333,   62246.66666667,  194530.        , ...,
       6491795.79333333, 3206374.50666667, 4382677.19666667])

100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [04:59<00:00, 99.71s/it]


In [14]:
# Fit the linear meta-model on the stacked base-model predictions for the
# training rows, then predict the test rows from their stacked predictions.
meta_model = LinearRegression().fit(stacked_features_train, y_train)
y_test_predict = meta_model.predict(stacked_features_test)

# Результат для Kaggle

In [15]:
# Fill the submission template with the predictions rounded to the nearest
# thousand and write it out without the index column.
submission = pd.read_csv('sample_submission_empty.csv').assign(
    price=np.round(y_test_predict, -3))
submission.to_csv('sample_submission_stacking.csv', index=False)