In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_squared_error as MSE, mean_absolute_error as MAE
from joblib import dump, load
import joblib
import random
from transformers import *


In [2]:

# Фиксируем датчики случайных чисел
random.seed(42)
np.random.seed(42)

# Загружаем данные
df = pd.read_csv('data.csv')

# Разделяем ИСХОДНЫЕ ДАННЫЕ на тренировочную и тестовую выборки
df_train, df_test = train_test_split(df, train_size=0.75, random_state=42)

  df = pd.read_csv('data.csv')


In [3]:
df_kladr = pd.read_csv('kladr-2.csv')
df_kladr = df_kladr.sort_values("TYPE_NP")
df_kladr = df_kladr.drop_duplicates(keep='first', subset='NAME').sort_values('NAME')

In [4]:
alphabet = {}
for i, j in enumerate(["A", "B", "C", "D", "E", "F", "G", "H", "I", "K"]):
    alphabet[i] = j

In [5]:
df_kladr["OKRUG_CODE"] = df_kladr["OKRUG_CODE"].replace(alphabet)

In [6]:
df_kladr.loc[39998, "SOCR"] = 'г'

In [7]:
df_train = df_train.merge(df_kladr[['NAME', 'REGION', 'TYPE_NP', 'OKRUG_OCATD', 'OKRUG_CODE']], left_on='ann_city', right_on='NAME', how='left').drop(columns=['NAME'])
df_train.rename(columns={'TYPE_NP': 'type_city',
                   'REGION': 'region',
                   'OKRUG_OCATD': 'okr_ocatd',
                   'OKRUG_CODE': 'okr_code'
                   }, inplace=True)

In [8]:
df_test = df_test.merge(df_kladr[['NAME', 'REGION', 'TYPE_NP', 'OKRUG_OCATD', 'OKRUG_CODE']], left_on='ann_city', right_on='NAME', how='left').drop(columns=['NAME'])
df_test.rename(columns={'TYPE_NP': 'type_city',
                   'REGION': 'region',
                   'OKRUG_OCATD': 'okr_ocatd',
                   'OKRUG_CODE': 'okr_code'
                   }, inplace=True)

In [9]:
df_train["type_city"] = df_train["type_city"].fillna("Nan")
df_test["type_city"] = df_test["type_city"].fillna("Nan")

In [10]:
df_town = pd.read_excel('town.xlsx')

In [11]:
import pandas as pd

# Добавляем данные из df_3 по численности
df_train = df_train.merge(df_town[['Town', 'Count']], left_on='ann_city', right_on='Town', how='left').drop(columns=['Town'])

# Заменяем значения в type_city по условиям Count
df_train.loc[(df_train['Count'] >= 50) & (df_train['Count'] < 100), 'type_city'] = 'gor_100'
df_train.loc[(df_train['Count'] >= 100) & (df_train['Count'] < 250), 'type_city'] = 'gor_250'
df_train.loc[(df_train['Count'] >= 250) & (df_train['Count'] < 500), 'type_city'] = 'gor_500'
df_train.loc[(df_train['Count'] >= 500) & (df_train['Count'] < 1000), 'type_city'] = 'gor_1000'
df_train.loc[(df_train['Count'] >= 1000) & (df_train['Count'] < 2500), 'type_city'] = 'gor_2500'
df_train.loc[(df_train['Count'] >= 2500) & (df_train['Count'] < 5000), 'type_city'] = 'gor_5000'
df_train.loc[df_train['Count'] > 5000, 'type_city'] = 'gor_10000'
# df_train = df_train.drop(columns=['Count'])

In [12]:
import pandas as pd

# Добавляем данные из df_3 по численности
df_test = df_test.merge(df_town[['Town', 'Count']], left_on='ann_city', right_on='Town', how='left').drop(columns=['Town'])

# Заменяем значения в type_city по условиям Count
df_test.loc[(df_test['Count'] >= 50) & (df_test['Count'] < 100), 'type_city'] = 'gor_100'
df_test.loc[(df_test['Count'] >= 100) & (df_test['Count'] < 250), 'type_city'] = 'gor_250'
df_test.loc[(df_test['Count'] >= 250) & (df_test['Count'] < 500), 'type_city'] = 'gor_500'
df_test.loc[(df_test['Count'] >= 500) & (df_test['Count'] < 1000), 'type_city'] = 'gor_1000'
df_test.loc[(df_test['Count'] >= 1000) & (df_test['Count'] < 2500), 'type_city'] = 'gor_2500'
df_test.loc[(df_test['Count'] >= 2500) & (df_test['Count'] < 5000), 'type_city'] = 'gor_5000'
df_test.loc[df_test['Count'] > 5000, 'type_city'] = 'gor_10000'
# df_test = df_test.drop(columns=['Count'])

In [13]:
# Загрузка (если нужно)
rf_pipeline = load("rf_pipeline.joblib")

# Обучение Pipeline
transformed_train = rf_pipeline.fit_transform(df_train)

# Преобразование тестовых данных
transformed_test = rf_pipeline.transform(df_test)

In [14]:
# Преобразуем и разделяем признаки и таргет
X_train = transformed_train.copy()
y_train = transformed_train['car_price']
X_test = transformed_test.copy()
y_test = transformed_test['car_price']
X_train = X_train.drop(['car_price'], axis=1)
X_test = X_test.drop(['car_price'], axis=1)

In [15]:
rf_model = joblib.load('random_forest_pipeline_without_log.pkl')

In [16]:
y_pred_rf = rf_model.predict(X_test)
y_pred_train_rf = rf_model.predict(X_train)



In [17]:
# Функция вывода основных характеристик модели (ориентируемся прежде всего на R^2)
def print_r2_mse(y_train, y_pred_train, y_test, y_pred):
    '''
    Функция принимает на вход признаки и таргет для тренировочных
    и тестовых данных, модель и выводит на экран 
    значения MSE, R^2 для трейна и теста для указанных параметров
    '''
    print('*'*20)
    print(f'Значение MAE для трейна:  {MAE(y_train, y_pred_train)}')
    print(f'Значение MAE для теста:   {MAE(y_test, y_pred)}')
    print('*'*20)
    print(f'Значение MSE для трейна:  {MSE(y_train, y_pred_train)}')
    print(f'Значение MSE для теста:   {MSE(y_test, y_pred)}')
    print('*'*20)
    print(f'Значение R^2 для трейна:  {r2_score(y_train, y_pred_train)}')
    print(f'Значение R^2 для теста:   {r2_score(y_test, y_pred)}')

In [18]:
print_r2_mse(y_train, y_pred_train_rf, y_test, y_pred_rf)

********************
Значение MAE для трейна:  318444.4522253497
Значение MAE для теста:   342453.1960682919
********************
Значение MSE для трейна:  3754516993926.684
Значение MSE для теста:   1590771116211.9438
********************
Значение R^2 для трейна:  0.9069304227277237
Значение R^2 для теста:   0.957044645738207


In [19]:
# Загрузка (если нужно)
loaded_pipeline = load("pipeline.joblib")

# Обучение Pipeline
transformed_train = loaded_pipeline.fit_transform(df_train)

# Преобразование тестовых данных
transformed_test = loaded_pipeline.transform(df_test)

In [20]:
# Преобразуем и разделяем признаки и таргет
X_train = transformed_train.copy()
y_train = transformed_train['car_price']
X_test = transformed_test.copy()
y_test = transformed_test['car_price']
X_train = X_train.drop(['car_price'], axis=1)
X_test = X_test.drop(['car_price'], axis=1)

In [21]:
# Преобразование категориальных признаков в Pandas category
for col in X_train.select_dtypes(include=['object']).columns:
    X_train[col] = X_train[col].astype('category')
    X_test[col] = X_test[col].astype('category')

In [22]:


# Загрузка модели
lgbm_model = joblib.load('lgbm_pipeline_.pkl')
catboost_model = joblib.load('catboost_pipeline_.pkl')
ridge_model = joblib.load('ridge_pipeline_.pkl')


In [23]:
y_pred_lgbm = lgbm_model.predict(X_test)
y_pred_train_lgbm = lgbm_model.predict(X_train)

y_pred_cb = catboost_model.predict(X_test)
y_pred_train_cb = catboost_model.predict(X_train)

y_pred_ridge = ridge_model.predict(X_test)
y_pred_train_ridge = ridge_model.predict(X_train)



In [24]:
y_pred = (y_pred_lgbm + y_pred_cb + np.log(y_pred_rf + 2900000) + y_pred_ridge) / 4
y_pred_train = (y_pred_train_lgbm + y_pred_train_cb + np.log(y_pred_train_rf + 2900000) + y_pred_train_ridge) / 4

In [25]:
# Функция вывода основных характеристик модели (ориентируемся прежде всего на R^2)
def print_r2_mse_exp(y_train, y_pred_train, y_test, y_pred):
    '''
    Функция принимает на вход признаки и таргет для тренировочных
    и тестовых данных, модель и выводит на экран 
    значения MSE, R^2 для трейна и теста для указанных параметров
    '''

    delta_y = 2900000
    
    print('*'*20)
    print(f'Значение MAE для трейна:  {MAE(np.exp(y_train) - delta_y, np.exp(y_pred_train) - delta_y)}')
    print(f'Значение MAE для теста:   {MAE(np.exp(y_test) - delta_y, np.exp(y_pred) - delta_y)}')
    print('*'*20)
    print(f'Значение MSE для трейна:  {MSE(np.exp(y_train) - delta_y, np.exp(y_pred_train) - delta_y)}')
    print(f'Значение MSE для теста:   {MSE(np.exp(y_test) - delta_y, np.exp(y_pred) - delta_y)}')
    print('*'*20)
    print(f'Значение R^2 для трейна:  {r2_score(np.exp(y_train) - delta_y, np.exp(y_pred_train) - delta_y)}')
    print(f'Значение R^2 для теста:   {r2_score(np.exp(y_test) - delta_y, np.exp(y_pred) - delta_y)}')

In [26]:
print_r2_mse_exp(y_train, y_pred_train_lgbm, y_test, y_pred_lgbm) # Выводим результаты, довольны собой

********************
Значение MAE для трейна:  217884.09450490336
Значение MAE для теста:   315298.71649147646
********************
Значение MSE для трейна:  522908576983.93
Значение MSE для теста:   1875556231847.2312
********************
Значение R^2 для трейна:  0.9870377786834723
Значение R^2 для теста:   0.9493546359021411


In [27]:
print_r2_mse_exp(y_train, y_pred_train_cb, y_test, y_pred_cb)

********************
Значение MAE для трейна:  284415.01216352795
Значение MAE для теста:   329634.497481051
********************
Значение MSE для трейна:  986582424747.463
Значение MSE для теста:   1488185306143.2114
********************
Значение R^2 для трейна:  0.9755439090130547
Значение R^2 для теста:   0.9598147550071185


In [28]:
print_r2_mse_exp(y_train, y_pred_train_ridge, y_test, y_pred_ridge)

********************
Значение MAE для трейна:  320394.2988547338
Значение MAE для теста:   349860.04969869717
********************
Значение MSE для трейна:  1522473707741.2415
Значение MSE для теста:   1606172021165.1626
********************
Значение R^2 для трейна:  0.9622598633547697
Значение R^2 для теста:   0.956628777407763


In [29]:
print_r2_mse_exp(y_train, y_pred_train, y_test, y_pred) # Выводим результаты, довольны собой

********************
Значение MAE для трейна:  260350.32601655737
Значение MAE для теста:   304979.69557411055
********************
Значение MSE для трейна:  1289575400244.5518
Значение MSE для теста:   1384662548767.2693
********************
Значение R^2 для трейна:  0.9680331085048671
Значение R^2 для теста:   0.9626101645238757


In [57]:
import numpy as np
from sklearn.linear_model import LinearRegression

# Загружаем предсказания моделей
y_pred_lgbm = lgbm_model.predict(X_test)
y_pred_train_lgbm = lgbm_model.predict(X_train)

y_pred_cb = catboost_model.predict(X_test)
y_pred_train_cb = catboost_model.predict(X_train)

y_pred_ridge = ridge_model.predict(X_test)
y_pred_train_ridge = ridge_model.predict(X_train)

y_pred_rf = np.log(rf_model.predict(X_test) + 2900000)
y_pred_train_rf = np.log(rf_model.predict(X_train) + 2900000)

# Объединение массивов в DataFrame
test_preds = pd.DataFrame(
    np.column_stack((y_pred_lgbm, y_pred_cb, y_pred_ridge, y_pred_rf)), 
    columns=['y_pred_lgbm', 'y_pred_cb', 'y_pred_ridge', 'y_pred_rf']
)

train_preds = pd.DataFrame(
    np.column_stack((y_pred_train_lgbm, y_pred_train_cb, y_pred_train_ridge, y_pred_train_rf)), 
    columns=['y_pred_lgbm', 'y_pred_cb', 'y_pred_ridge', 'y_pred_rf']
)

# Обучение метамодели
meta_model = Ridge(alpha=100)
meta_model.fit(train_preds, y_train)

# Предсказания финальной модели
final_test_preds = meta_model.predict(test_preds)
final_train_preds = meta_model.predict(train_preds)






In [91]:
import optuna
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold

def objective(trial):
    """Функция для оптимизации гиперпараметров с помощью optuna."""
    
    # Пространство поиска гиперпараметров
    params = {
        'objective': 'regression',
        'boosting_type': 'gbdt',
        'metric': 'l2',  # LightGBM не поддерживает 'r2', используем MSE
        'n_jobs': -1,
        'seed': 13,
        'verbosity': -1,
        'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
        'early_stopping_rounds': trial.suggest_int('early_stopping_rounds', 50, 200),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.2, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-5, 10.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.4, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-5, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 10.0, log=True),
        'max_bin': trial.suggest_int('max_bin', 50, 500),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1.0, 10.0)
    }
    
    scores = []
    kf = KFold(n_splits=5, shuffle=True, random_state=13)
    
    for train_idx, val_idx in kf.split(train_preds):
        X_tr, X_val = train_preds.iloc[train_idx], train_preds.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
        
        # Если есть категориальные признаки
        cat_features = [i for i, col in enumerate(train_preds.columns) if train_preds[col].dtype == "object"]
        
        lgb_train = lgb.Dataset(X_tr, y_tr, categorical_feature=cat_features)
        lgb_eval = lgb.Dataset(X_val, y_val, categorical_feature=cat_features)

        lgb_model = lgb.train(
            params,
            lgb_train,
            valid_sets=[lgb_eval],  # Используем только валидацию
            valid_names=['val'],
        )
        
        y_pred = lgb_model.predict(X_val)
        score_fold = r2_score(y_val, y_pred)  # R² на валидации
        scores.append(score_fold)
    
    return np.mean(scores)

# Запуск optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Вывод лучших параметров
print("Best params:", study.best_params)
print("Best R2:", study.best_value)

[I 2025-03-14 14:32:32,066] A new study created in memory with name: no-name-841836a7-3add-47c9-bf0a-78b2f5e2cddf
[I 2025-03-14 14:33:25,697] Trial 0 finished with value: 0.9826831310419578 and parameters: {'n_estimators': 980, 'early_stopping_rounds': 150, 'learning_rate': 0.03315210482142908, 'num_leaves': 102, 'max_depth': 13, 'min_child_samples': 57, 'min_child_weight': 0.01905349968923528, 'subsample': 0.5985683975508317, 'colsample_bytree': 0.9648649957020118, 'reg_alpha': 0.00021503955096240378, 'reg_lambda': 2.7936081015958973, 'max_bin': 117, 'scale_pos_weight': 4.077878201095752}. Best is trial 0 with value: 0.9826831310419578.
[I 2025-03-14 14:33:42,841] Trial 1 finished with value: 0.987165781820381 and parameters: {'n_estimators': 1650, 'early_stopping_rounds': 55, 'learning_rate': 0.1620517391166068, 'num_leaves': 70, 'max_depth': 6, 'min_child_samples': 79, 'min_child_weight': 0.0003540761037659691, 'subsample': 0.4029367119155587, 'colsample_bytree': 0.4724937400085249,

Best params: {'n_estimators': 1196, 'early_stopping_rounds': 75, 'learning_rate': 0.12147267835306508, 'num_leaves': 44, 'max_depth': 13, 'min_child_samples': 18, 'min_child_weight': 0.000337834055040744, 'subsample': 0.9414391616635034, 'colsample_bytree': 0.9136725240745678, 'reg_alpha': 0.3768834527613332, 'reg_lambda': 0.0036623735722240806, 'max_bin': 493, 'scale_pos_weight': 3.5601720577072458}
Best R2: 0.9882359212134644


In [93]:
from sklearn.pipeline import Pipeline
from lightgbm import LGBMRegressor
import numpy as np


params = {
    'objective': 'regression',
    'boosting_type': 'gbdt',
    'metric': 'l2',  # LightGBM не поддерживает 'r2', используем MSE
    'n_jobs': -1,
    'seed': 13,
    'verbosity': -1,
    'n_estimators': 1196,
    'learning_rate': 0.12147267835306508, 
    'num_leaves': 44, 
    'max_depth': 13, 
    'min_child_samples': 18, 
    'min_child_weight': 0.000337834055040744, 
    'subsample': 0.9414391616635034, 
    'colsample_bytree': 0.9136725240745678, 
    'reg_alpha': 0.3768834527613332, 
    'reg_lambda': 0.0036623735722240806, 
    'max_bin': 493, 
    'scale_pos_weight': 3.5601720577072458
}

# Создаём модель отдельно и передаём категории через set_params
lgbm = LGBMRegressor(**params, random_state=42)
#lgbm.set_params(categorical_feature=cat_indices)

# Создаём пайплайн
pipeline_lgbm = Pipeline(steps=[
    ('regressor', lgbm)
])

model_lgbm = pipeline_lgbm.fit(train_preds, y_train)

final_test_preds = model_lgbm.predict(test_preds)
final_train_preds = model_lgbm.predict(train_preds)


In [94]:
print_r2_mse_exp(y_train, final_train_preds, y_test, final_test_preds)

********************
Значение MAE для трейна:  214131.20844941327
Значение MAE для теста:   343500.9718644237
********************
Значение MSE для трейна:  3655660952965.716
Значение MSE для теста:   2624321495134.245
********************
Значение R^2 для трейна:  0.9093809349927984
Значение R^2 для теста:   0.9291358396116928


In [97]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split


# Обучаем базовые модели
models = {
    "catboost": catboost_model,
    "ridge": ridge_model,
    "lgbm_model": lgbm_model,
}

train_preds = []
test_preds = []

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    train_preds.append(model.predict(X_train))
    test_preds.append(model.predict(X_test))

# Создаем DataFrame с предсказаниями базовых моделей
train_meta = pd.DataFrame(np.column_stack(train_preds), columns=models.keys())
test_meta = pd.DataFrame(np.column_stack(test_preds), columns=models.keys())

# Обучаем метамодель
meta_model = Ridge(alpha=1.0)
meta_model.fit(train_meta, y_train)

# Делаем предсказания
final_test_preds = meta_model.predict(test_meta)
final_train_preds = meta_model.predict(train_meta)


Training catboost...
0:	learn: 0.3644968	total: 52.8ms	remaining: 52.3s
100:	learn: 0.0836258	total: 5.36s	remaining: 47.3s
200:	learn: 0.0744177	total: 10.2s	remaining: 40.1s
300:	learn: 0.0698724	total: 14.8s	remaining: 34s
400:	learn: 0.0669752	total: 19.1s	remaining: 28.2s
500:	learn: 0.0647338	total: 23.8s	remaining: 23.3s
600:	learn: 0.0632176	total: 28.1s	remaining: 18.3s
700:	learn: 0.0618008	total: 32.6s	remaining: 13.5s
800:	learn: 0.0605206	total: 37s	remaining: 8.83s
900:	learn: 0.0595955	total: 41.4s	remaining: 4.18s
991:	learn: 0.0587815	total: 45.3s	remaining: 0us
Training ridge...




Training lgbm_model...


In [98]:
print_r2_mse_exp(y_train, final_train_preds, y_test, final_test_preds)

********************
Значение MAE для трейна:  208207.16878796386
Значение MAE для теста:   333044.025449103
********************
Значение MSE для трейна:  471626461294.5043
Значение MSE для теста:   2448967495706.067
********************
Значение R^2 для трейна:  0.9883089954169598
Значение R^2 для теста:   0.9338708974021538


In [100]:
print(train_preds.shape, y_train.shape)

AttributeError: 'list' object has no attribute 'shape'

In [101]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor

# Объединение массивов в DataFrame
test_preds = pd.DataFrame(
    np.column_stack((y_pred_lgbm, y_pred_cb, y_pred_ridge, y_pred_rf)), 
    columns=['y_pred_lgbm', 'y_pred_cb', 'y_pred_ridge', 'y_pred_rf']
)

train_preds = pd.DataFrame(
    np.column_stack((y_pred_train_lgbm, y_pred_train_cb, y_pred_train_ridge, y_pred_train_rf)), 
    columns=['y_pred_lgbm', 'y_pred_cb', 'y_pred_ridge', 'y_pred_rf']
)

rf_regressor = RandomForestRegressor(n_estimators=300, max_depth=30, min_samples_leaf=2, n_jobs=-1, random_state=42)

model = rf_regressor.fit(train_preds, y_train)

y_pred = model.predict(test_preds)
y_pred_train = model.predict(train_preds)

In [102]:
print_r2_mse_exp(y_train, y_pred_train, y_test, y_pred) # Выводим результаты, довольны собой

********************
Значение MAE для трейна:  105062.15626134322
Значение MAE для теста:   332789.8865361891
********************
Значение MSE для трейна:  223366244900.3686
Значение MSE для теста:   1970435912522.6619
********************
Значение R^2 для трейна:  0.9944630422439422
Значение R^2 для теста:   0.946792614091383


In [103]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_diabetes
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.svm import SVC
from sklearn.linear_model import RidgeCV, LogisticRegression
from sklearn.ensemble import StackingClassifier, StackingRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from joblib import Parallel, delayed
from mlxtend.plotting import plot_decision_regions

In [132]:
from sklearn.model_selection import KFold
import time
from joblib import Parallel, delayed

class Stacking:
    def __init__(self, estimators, final_estimator, blending=False, cv=5, n_jobs=-1):
        self.estimators = estimators
        self.final_estimator = final_estimator
        self.blending = blending
        self.cv = cv
        self.n_jobs = n_jobs

    def _train_model(self, estimator, X_train, y_train, X_val, fold=None):
        start_time = time.time()
        if fold is not None:
            print(f"\n{'='*50}\nTraining {estimator.__class__.__name__} on fold {fold+1}/{self.cv}\n{'='*50}")
        else:
            print(f"\n{'='*50}\nTraining {estimator.__class__.__name__} on hold-out set\n{'='*50}")
            
        estimator.fit(X_train, y_train)
        print(f"Model {estimator.__class__.__name__} trained in {time.time()-start_time:.1f}s")
        return estimator.predict(X_val)

    def _X_pred(self, estimator, data):
        if self.blending:
            X_train_v, y_train_v, X_val = data
            return self._train_model(estimator, X_train_v, y_train_v, X_val)
        else:
            X_train, y_train = data
            kf = KFold(n_splits=self.cv)
            predictions = []
            
            for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
                X_tr, X_v = X_train.iloc[train_idx], X_train.iloc[val_idx]
                y_tr, y_v = y_train.iloc[train_idx], y_train.iloc[val_idx]
                fold_pred = self._train_model(estimator, X_tr, y_tr, X_v, fold)
                predictions.extend(fold_pred)
                
            return predictions

    def _X_test_pred(self, estimator, data):
        X_train, y_train, X_test = data
        print(f"\n{'='*50}\nTraining {estimator.__class__.__name__} for test prediction\n{'='*50}")
        start_time = time.time()
        estimator.fit(X_train, y_train)
        print(f"Model {estimator.__class__.__name__} trained in {time.time()-start_time:.1f}s")
        return estimator.predict(X_test)

    def _meta_data(self, X_train, y_train, X_test):
        if self.blending:
            X_train_v, X_val, y_train_v, y_val = train_test_split(X_train, y_train, random_state=0)
            train_data = [X_train_v, y_train_v, X_val]
            test_data = [X_train_v, y_train_v, X_test]
            meta_y_train = y_val
        else:
            train_data = [X_train, y_train]
            test_data = [X_train, y_train, X_test]
            meta_y_train = y_train

        cv_X_train_preds = (delayed(self._X_pred)(est, train_data) for est in self.estimators)
        X_test_preds = (delayed(self._X_test_pred)(est, test_data) for est in self.estimators)

        meta_X_train = pd.DataFrame(Parallel(n_jobs=self.n_jobs)(cv_X_train_preds))
        meta_X_test = pd.DataFrame(Parallel(n_jobs=self.n_jobs)(X_test_preds))

        return meta_X_train.T, meta_y_train, meta_X_test.T

    def fit_predict(self, X_train, y_train, X_test):
        print("\n\n=== STARTING META-MODEL TRAINING ===")
        meta_X_train, meta_y_train, meta_X_test = self._meta_data(X_train, y_train, X_test)
        
        print("\n\n=== TRAINING FINAL ESTIMATOR ===")
        start_time = time.time()
        self.final_estimator.fit(meta_X_train, meta_y_train)
        print(f"\nFinal estimator {self.final_estimator.__class__.__name__} trained in {time.time()-start_time:.1f}s")
        
        return self.final_estimator.predict(meta_X_test)

In [135]:
reg_estimators = [
    ridge_model,
    catboost_model,
    #lgbm_model
    ]

stacking_reg = Stacking(estimators=reg_estimators, final_estimator=RidgeCV(alphas=(0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0)), n_jobs=1)
stacking_reg_pred_res = stacking_reg.fit_predict(X_train, y_train, X_test)
stacking_mape = r2_score(y_test, stacking_reg_pred_res)
print(f'stacking_regressor_mape {stacking_mape}')
print(stacking_reg_pred_res, '', sep='\n')



=== STARTING META-MODEL TRAINING ===

Training Pipeline on fold 1/5
Model Pipeline trained in 60.1s





Training Pipeline on fold 2/5
Model Pipeline trained in 53.6s





Training Pipeline on fold 3/5
Model Pipeline trained in 50.2s





Training Pipeline on fold 4/5
Model Pipeline trained in 49.4s





Training Pipeline on fold 5/5
Model Pipeline trained in 58.7s





Training CatBoostRegressor on fold 1/5
0:	learn: 0.3669680	total: 217ms	remaining: 3m 34s
100:	learn: 0.0832404	total: 6.49s	remaining: 57.2s
200:	learn: 0.0741335	total: 11.4s	remaining: 45.1s
300:	learn: 0.0696847	total: 15.4s	remaining: 35.4s
400:	learn: 0.0666731	total: 19.3s	remaining: 28.5s
500:	learn: 0.0643724	total: 23.3s	remaining: 22.8s
600:	learn: 0.0626493	total: 27.2s	remaining: 17.7s
700:	learn: 0.0610676	total: 31.3s	remaining: 13s
800:	learn: 0.0596421	total: 35s	remaining: 8.34s
900:	learn: 0.0585319	total: 38.9s	remaining: 3.93s
991:	learn: 0.0575593	total: 42.7s	remaining: 0us
Model CatBoostRegressor trained in 43.0s

Training CatBoostRegressor on fold 2/5
0:	learn: 0.3641668	total: 46.9ms	remaining: 46.5s
100:	learn: 0.0803711	total: 4.44s	remaining: 39.2s
200:	learn: 0.0723928	total: 8.24s	remaining: 32.4s
300:	learn: 0.0682537	total: 12.1s	remaining: 27.7s
400:	learn: 0.0656321	total: 16.1s	remaining: 23.7s
500:	learn: 0.0634817	total: 24.4s	remaining: 23.9s
600




Training CatBoostRegressor for test prediction
0:	learn: 0.3644968	total: 59.9ms	remaining: 59.4s
100:	learn: 0.0836258	total: 7.1s	remaining: 1m 2s
200:	learn: 0.0744177	total: 12.8s	remaining: 50.3s
300:	learn: 0.0698724	total: 19.5s	remaining: 44.9s
400:	learn: 0.0669752	total: 26.1s	remaining: 38.5s
500:	learn: 0.0647338	total: 31.4s	remaining: 30.8s
600:	learn: 0.0632176	total: 36.9s	remaining: 24s
700:	learn: 0.0618008	total: 42.5s	remaining: 17.7s
800:	learn: 0.0605206	total: 48.4s	remaining: 11.5s
900:	learn: 0.0595955	total: 53.8s	remaining: 5.43s
991:	learn: 0.0587815	total: 58.9s	remaining: 0us
Model CatBoostRegressor trained in 59.3s


=== TRAINING FINAL ESTIMATOR ===

Final estimator RidgeCV trained in 0.1s
stacking_regressor_mape 0.9716294684552493
[14.99489919 14.97955305 15.09370214 ... 14.95044351 18.07990191
 15.31068056]



In [137]:
r2_score(np.exp(y_test) - 2900000, np.exp(stacking_reg_pred_res) - 2900000)

0.9607624423657934