# Подготовка данных для обучения

## Либы

In [974]:
# Necessary libs
import pandas as pd
import numpy as np

#Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#EDA
import ydata_profiling as yp # to fast data observe

# Models
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from catboost import CatBoostRegressor
import lightgbm as lgb

# Metrics
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

# Preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
import shap

# Pipelines
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer

# Seasons
from statsmodels.tsa.seasonal import seasonal_decompose

# D-Fuller
from statsmodels.tsa.stattools import adfuller

# ACF, PACF
from statsmodels.graphics.tsaplots import plot_pacf, plot_acf
from statsmodels.graphics.tsaplots import plot_acf
# AST
import ast

In [976]:
import catboost as cb
import optuna
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split

## Необходимые функции

In [979]:
#Functions
def drop_train_test(df_train, df_test, cols):# function for clean the data
    for col in cols:
        if col in df_train.columns.tolist():
            df_train.drop(col, axis = 1, inplace = True)
        if col in df_test.columns.tolist():
            df_test.drop(col, axis = 1, inplace = True)

In [981]:
def add_from_origin(df_origin, train, col_name):# function for add dropped column 
    new_df = train.copy()
    if col_name in df_origin.columns.tolist():
        new_df[col_name] = df_origin[col_name]
    else:
        print(f"Origin hasn't got {col_name}")
    return new_df

In [983]:
# Заполнение пропусков для числовых признаков средним значением по часу в дне в месяце
def impute_numeric_feature(train, feature):
    df = train.copy()
    return df.groupby([df.index.month, df.index.day, df.index.hour])[feature].transform(lambda x: x.fillna(x.mean()))

In [985]:
# Заполнение пропусков для категориальных признаков самым популярным значением (модой)
def impute_categorical_feature(train, feature):
    df = train.copy()
    return df.groupby([df.index.month, df.index.day, df.index.hour])[feature].transform(lambda x: x.fillna(x.mode()[0] if not x.mode().empty else x))



In [987]:
def fill_missing(train):
    df = train.copy()
    num_features = df.select_dtypes(exclude = 'object').columns.tolist()
    for feature in num_features:
        df[feature] = impute_numeric_feature(df, feature)
    categorical_features = df.select_dtypes(include = 'object').columns.tolist()
    for feature in categorical_features:
        df[feature] = impute_categorical_feature(df, feature)
    return df

# Подготовка данных

## Загружаем данные

In [1055]:
#import datasets
train = pd.read_csv('data/train_upd.csv', index_col='date')
test = pd.read_csv('data/test_upd.csv', index_col ='date')

# Set datetime index

train.index = pd.to_datetime(train.index)
test.index = pd.to_datetime(test.index)

# take only from 2000
#train = train.loc['2000-01-01 00:00:00':]
train['event_dt'] = train.index
test['event_dt'] = test.index

  train = pd.read_csv('data/train_upd.csv', index_col='date')


In [1056]:
weather = pd.read_csv('London Weather Kaggle.csv', index_col = 'date')

In [1057]:
weather.index = pd.to_datetime(weather.index)

In [1058]:
test.shape

(35064, 45)

In [1063]:
test = test.join(weather, how='inner')
train = train.join(weather, how = 'inner')

## Найдем признаки с большим количеством пропусков, заполним пропуски

In [1066]:
# Определение признаков с большим количеством пропусков
def miss_value(train):
    miss_value_features = []
    df = train.copy()
    threshold = len(df) * 0.8
    for column in df.columns:
        if df[column].isna().sum() > threshold:
            miss_value_features.append(column)
    return miss_value_features

In [1068]:
# Привести датасеты в соответствующий попрядок
def make_normal_order(df_train, df_test):
    # make union order
    column_order = df_train.columns.tolist()
    df_test = df_test.reindex(columns = column_order)
    df_test = df_test.dropna(axis = 1, how = 'all')
    return df_test

In [1070]:
#Определение признаков с большим количеством пропусков
miss_value_features = miss_value(train)

train = fill_missing(train)
test = fill_missing(test)
# Вывод списков признаков с большим количеством пропусков
print("Признаки с большим количеством пропусков:", miss_value_features)
print(f'Test nan value count {test.isna().sum().sum()}')
print(f'Train nan value count {train.isna().sum().sum()}')

  return df.groupby([df.index.month, df.index.day, df.index.hour])[feature].transform(lambda x: x.fillna(x.mode()[0] if not x.mode().empty else x))


Признаки с большим количеством пропусков: []
Test nan value count 0
Train nan value count 0


In [1071]:
test = make_normal_order(train, test)

# Feature Engineering 

## New Feats

In [1074]:
# Mean 'broom_rentals', 'taxi_rides', 'bike_rentals'
def add_mean_rentals(df_train, df_test):
    # Computing mean for 'broom_rentals', 'taxi_rides', 'bike_rentals' on train
    hourly_broom_mean = df_train.groupby('hour')['broom_rentals'].mean().rename('hourly_broom_rentals')
    hourly_taxi_mean = df_train.groupby('hour')['taxi_rides'].mean().rename('hourly_taxi_rides')
    hourly_bike_mean = df_train.groupby('hour')['bike_rentals'].mean().rename('hourly_bike_rentals')

    hourly_daily_broom_mean = df_train.groupby(['hour', 'day'])['broom_rentals'].mean().rename('hourly_daily_broom_rentals')
    hourly_daily_taxi_mean = df_train.groupby(['hour', 'day'])['taxi_rides'].mean().rename('hourly_daily_taxi_rides')
    hourly_daily_bike_mean = df_train.groupby(['hour', 'day'])['bike_rentals'].mean().rename('hourly_daily_bike_rentals')

    hourly_daily_monthly_broom_mean = df_train.groupby(['hour', 'day', 'month'])['broom_rentals'].mean().rename('hourly_daily_monthly_broom_rentals')
    hourly_daily_monthly_taxi_mean = df_train.groupby(['hour', 'day', 'month'])['taxi_rides'].mean().rename('hourly_daily_monthly_taxi_rides')
    hourly_daily_monthly_bike_mean = df_train.groupby(['hour', 'day', 'month'])['bike_rentals'].mean().rename('hourly_daily_monthly_bike_rentals')

    daily_broom_mean = df_train.groupby('day')['broom_rentals'].mean().rename('daily_broom_rentals')
    daily_taxi_mean = df_train.groupby('day')['taxi_rides'].mean().rename('daily_taxi_rides')
    daily_bike_mean = df_train.groupby('day')['bike_rentals'].mean().rename('daily_bike_rentals')

    monthly_broom_mean = df_train.groupby('month')['broom_rentals'].mean().rename('monthly_broom_rentals')
    monthly_taxi_mean = df_train.groupby('month')['taxi_rides'].mean().rename('monthly_taxi_rides')
    monthly_bike_mean = df_train.groupby('month')['bike_rentals'].mean().rename('monthly_bike_rentals')

    # Adding new features to dataset
    mean_features = [
        hourly_broom_mean, hourly_taxi_mean, hourly_bike_mean,
        hourly_daily_broom_mean, hourly_daily_taxi_mean, hourly_daily_bike_mean,
        hourly_daily_monthly_broom_mean, hourly_daily_monthly_taxi_mean, hourly_daily_monthly_bike_mean,
        daily_broom_mean, daily_taxi_mean, daily_bike_mean,
        monthly_broom_mean, monthly_taxi_mean, monthly_bike_mean
    ]

    for mean_feature in mean_features:
        df_train = df_train.merge(mean_feature, on=mean_feature.index.names, how='left')
        df_test = df_test.merge(mean_feature, on=mean_feature.index.names, how='left')
    
    return df_train, df_test

In [1075]:
def get_temp_dif(df):
    # get absolute diff
    df['temp_diff'] = np.abs(df["max_temp"] - df["min_temp"])
    return df

## Encoding

In [1077]:
def encode_categorical_feature(train_df, test_df, column_name):
    encoder = LabelEncoder()
    
    train_df[column_name] = encoder.fit_transform(train_df[column_name])
    test_df[column_name] = encoder.transform(test_df[column_name])
    
    return train_df, test_df


# def encode_tarot_pairs(df_train, df_test, column_name):
#     unique_tarot_cards = set()
#     for pair in pd.concat([df_train[column_name], df_test[column_name]]):
#         unique_tarot_cards.update(pair)
        
#     for card in unique_tarot_cards:
#         card_name = card.replace("'", "")  # Убираем символ ' из названия
#         df_train[f"card_{card_name}"] = df_train[column_name].apply(lambda x: 1 if card in x else 0)
#         df_test[f"card_{card_name}"] = df_test[column_name].apply(lambda x: 1 if card in x else 0)

#     df_train = df_train.drop(columns=[column_name])
#     df_test = df_test.drop(columns=[column_name])
    
#     return df_train, df_test


def new_encode_tarot_pairs(df_train, df_test, column_name):
    # разбиваем строки на две отдельные колонки
    df_train[['left_card', 'right_card']] = df_train[column_name].apply(lambda x: ast.literal_eval(x)).apply(pd.Series)
    df_test[['left_card', 'right_card']] = df_test[column_name].apply(lambda x: ast.literal_eval(x)).apply(pd.Series)

    # словарь с пропрочеством
    tarot_mapping = {
        'The Fool': 1,
        'The Devil': 1,
        'Wheel of Fortune': 1,
        'The High Priestess': 2,
        'The Hierophant': 2,
        'The Emperor': 3,
        'The Empress': 3,
        'Strength': 4,
        'The Lovers': 4,
        'The Chariot': 4
    }

    # все уникальные карты остальные пускай 0
    all_tarot_cards = set(df_train['left_card'].unique())
    for card in all_tarot_cards:
        if card not in tarot_mapping:
            tarot_mapping[card] = 0

    # кодирование карт в колонках 'left_card' и 'right_card'
    for df in [df_train, df_test]:
        df['left_card_code'] = df['left_card'].map(tarot_mapping)
        df['right_card_code'] = df['right_card'].map(tarot_mapping)
        df['tarot_code_sum'] = df['left_card_code'] + df['right_card_code']

    # Удаление временных колонок
    df_train = df_train.drop(columns=[column_name, 'left_card', 'right_card'])
    df_test = df_test.drop(columns=[column_name, 'left_card', 'right_card'])
    
    return df_train, df_test


def quantile_encode(train_df, test_df, column_name, num_quantiles=4):
    quantile_bins = pd.qcut(train_df[column_name], q=num_quantiles, retbins=True)[1]
    
    train_df[f'{column_name}_quantile'] = pd.cut(train_df[column_name], bins=quantile_bins, labels=False, include_lowest=True)
    test_df[f'{column_name}_quantile'] = pd.cut(test_df[column_name], bins=quantile_bins, labels=False, include_lowest=True)
    
    return train_df, test_df

def get_season(n):
    if n in [12,1,2]:
        return 0 # Winter
    elif n in [3,4,5]:
        return 1 # Spring
    elif n in [6,7,8]:
        return 2 # Summer
    else:
        return 3 # Autumn

### Применяем преобразования

In [1079]:
def preproc(train_df, test_df):
    train_df, test_df = add_mean_rentals(train_df, test_df)
    
    # combine preproc features
    train_df['season'] = train_df['month'].apply(get_season)
    test_df['season'] = test_df['month'].apply(get_season)
    
    train_df  = get_temp_dif(train_df)
    test_df  = get_temp_dif(test_df)

    train_df, test_df = encode_categorical_feature(train_df, test_df, 'astrologies_week_proclaim')
    train_df, test_df = encode_categorical_feature(train_df, test_df, 'moon_phase')

    train_df, test_df = quantile_encode(train_df, test_df, '?')

    train_df, test_df = new_encode_tarot_pairs(train_df, test_df, 'tarot')

    return train_df, test_df

In [1080]:
#Preprocessing
train_df = train.copy()
test_df = test.copy()
train, test = preproc(train_df, test_df)

## Небольшие добавки к FE

In [1082]:
# evgeniy_gennadievich_beer: бинаризация, все, что меньше 0.1 - 0, 1 для остальных
train['evgeniy_gennadievich_beer_binary'] = (train['evgeniy_gennadievich_beer'] >= 0.1).astype(int)
test['evgeniy_gennadievich_beer_binary'] = (test['evgeniy_gennadievich_beer'] >= 0.1).astype(int)

# snow_depth: если больше 0 то 3,4, если нет, то 0
train['snow_depth_binary'] = (train['snow_depth'].between(3, 4)| (train['unemployment_rate'].between(0, 0))).astype(int)
test['snow_depth_binary'] = (test['snow_depth'].between(3, 4)| (test['unemployment_rate'].between(0, 0))).astype(int)

# halley_comet_tail_meters: замена на логарифм
train['log_halley_comet_tail_meters'] = np.log(train['halley_comet_tail_meters'])
test['log_halley_comet_tail_meters'] = np.log(test['halley_comet_tail_meters'])

# Всеобщий отбор фичей

In [1084]:
train.columns

Index(['cloud_cover', 'sunshine', 'global_radiation', 'max_temp', 'mean_temp',
       'min_temp', 'pressure', 'snow_depth', 'evgeniy_gennadievich_beer', '?',
       'halley_comet_tail_meters', 'astrologies_week_proclaim', 'bike_rentals',
       'taxi_rides', 'broom_rentals', 'year', 'month', 'hour', 'day', 'Q',
       'woring_days', 'public_holidays', 'weekend_days', 'day_of_week',
       'licensed_taxis', 'collision_cycle_count', 'collision_all_count',
       'claimant_count_percentage', 'motorcycles_and_bicycles', 'gdp',
       'inflation', 'economic_activity_rate', 'in_employment_thousands',
       'Thames_level_capacity', 'population',
       'percentage_change_alcohol_and_tobacco', 'unemployment_rate', 'temp',
       'dwpt', 'rhum', 'wdir', 'wspd', 'pres', 'darkness', 'moon_phase_value',
       'moon_phase', 'event_dt', 'precipitation', 'hourly_broom_rentals',
       'hourly_taxi_rides', 'hourly_bike_rentals',
       'hourly_daily_broom_rentals', 'hourly_daily_taxi_rides',
       

In [1124]:
X = train.drop(columns = ['broom_rentals', 'taxi_rides', 'bike_rentals', 'event_dt'])
labels = train[['broom_rentals', 'taxi_rides', 'bike_rentals', 'event_dt']]

## Отбор по коррелляции

In [1127]:
def corr_df(x, corr_val):
    drop_cols = set()
    corr_matrix = x.corr()

    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) >= corr_val:
                colname_i = corr_matrix.columns[i]
                colname_j = corr_matrix.columns[j]
                print(f"{colname_i} | {colname_j} | {round(corr_matrix.iloc[i, j], 2)}")

                # Добавляем колонку с большим индексом в список на удаление
                drop_cols.add(colname_i)

    print(f"Columns to drop: {drop_cols}")

    return drop_cols


In [1129]:
drop_columns_corr = corr_df(X.select_dtypes(include=np.number), 0.8)

global_radiation | sunshine | 0.86
mean_temp | max_temp | 0.92
min_temp | max_temp | 0.82
min_temp | mean_temp | 0.96
Q | month | 0.97
weekend_days | woring_days | -0.95
licensed_taxis | year | 0.8
population | in_employment_thousands | 0.83
temp | max_temp | 0.84
temp | mean_temp | 0.9
temp | min_temp | 0.85
dwpt | mean_temp | 0.86
dwpt | min_temp | 0.86
dwpt | temp | 0.82
pres | pressure | 0.96
hourly_taxi_rides | hourly_broom_rentals | -0.81
hourly_bike_rentals | hour | 0.84
hourly_bike_rentals | hourly_broom_rentals | -0.93
hourly_bike_rentals | hourly_taxi_rides | 0.92
hourly_daily_taxi_rides | hourly_broom_rentals | -0.81
hourly_daily_taxi_rides | hourly_taxi_rides | 1.0
hourly_daily_taxi_rides | hourly_bike_rentals | 0.91
hourly_daily_bike_rentals | hour | 0.84
hourly_daily_bike_rentals | hourly_broom_rentals | -0.93
hourly_daily_bike_rentals | hourly_taxi_rides | 0.91
hourly_daily_bike_rentals | hourly_bike_rentals | 1.0
hourly_daily_bike_rentals | hourly_daily_taxi_rides | 0.9

In [1130]:
X.drop(columns = drop_columns_corr, inplace = True)

## Отбор по шапам

In [1132]:
def feature_selection_with_shap(X, y, threshold=0):
    iteration = 0
    columns_to_remove = []
    params = {
        'iterations': 900,
        'depth': 4,
        'learning_rate': 0.12,
        'l2_leaf_reg': 3,
        'random_strength': 1,
        'border_count': 32,
        'loss_function': 'RMSE',  # Set for regression
        'eval_metric': 'RMSE',     # Evaluation metric for regression
        'verbose': False,
        'random_seed': 42,
    }
    model = cb.CatBoostRegressor(**params)
    
    while True:
        iteration += 1
        print(f"Iteration {iteration}, number of features in data_set: {X.shape[1]}")

        # Train model
        model.fit(X, y)

        # Calculate SHAP values
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X)

        # Mean importance across all samples
        feature_importances = np.mean(np.abs(shap_values), axis=0)

        # Identify features with importance below the threshold
        low_importance_features = [(col, imp) for col, imp in zip(X.columns, feature_importances) if imp <= threshold]

        # Print features with low importance and their values
        if low_importance_features:
            print("Features with importance below threshold:")
            for feature, importance in low_importance_features:
                print(f"  {feature}: {importance}")
            
            # Add features to removal list
            columns_to_remove.extend([feature for feature, _ in low_importance_features])

            # Drop low importance features
            X = X.drop(columns=[feature for feature, _ in low_importance_features])
        else:
            print('There are no low shap features on this step')
            break

    return columns_to_remove


In [1133]:
y.columns

Index(['broom_rentals', 'taxi_rides', 'bike_rentals'], dtype='object')

In [1134]:
y_bikes = labels['bike_rentals']
y_brooms = labels['broom_rentals']
y_taxi = labels['taxi_rides']

In [1137]:
X.columns[24]

'Thames_level_capacity'

### bikes

In [1143]:
X[X['Thames_level_capacity'] == '---'] = X[X['Thames_level_capacity'] != '---'].median()

  X[X['Thames_level_capacity'] == '---'] = X[X['Thames_level_capacity'] != '---'].median()


In [1145]:
drop_by_shaps_bikes = feature_selection_with_shap(X, y_bikes, 0.1)

Iteration 1, number of features in data_set: 49
Features with importance below threshold:
  snow_depth: 0.0
  evgeniy_gennadievich_beer: 0.001005264747494097
  ?: 0.0
  halley_comet_tail_meters: 0.0008116713484566261
  astrologies_week_proclaim: 0.0784569325734682
  public_holidays: 0.03166631806221939
  collision_all_count: 0.05668057489042104
  unemployment_rate: 0.07602375749178131
  wdir: 0.07331819331199754
  wspd: 0.05682205868607424
  darkness: 0.05006912839677082
  moon_phase_value: 0.020234709380733965
  moon_phase: 0.0842954690668214
  hourly_daily_broom_rentals: 0.07002143278883467
  hourly_daily_monthly_broom_rentals: 0.029244355186368077
  left_card_code: 0.0
  right_card_code: 0.0
  tarot_code_sum: 0.0
  snow_depth_binary: 0.0
Iteration 2, number of features in data_set: 30
Features with importance below threshold:
  cloud_cover: 0.09238788154318531
Iteration 3, number of features in data_set: 29
There are no low shap features on this step


In [1146]:
X_bikes = X.drop(columns = drop_by_shaps_bikes)

### brooms

In [1148]:
drop_by_shaps_brooms = feature_selection_with_shap(X, y_brooms, 0.1)

Iteration 1, number of features in data_set: 49
Features with importance below threshold:
  pressure: 0.09566058895742949
  snow_depth: 0.009046926017879294
  evgeniy_gennadievich_beer: 0.006050780375160718
  astrologies_week_proclaim: 0.086446165104057
  year: 0.031078375624632285
  month: 0.031088381013637898
  woring_days: 0.009337351387119499
  public_holidays: 0.004783582653653575
  day_of_week: 0.053637459382957435
  collision_cycle_count: 0.07908535669091957
  claimant_count_percentage: 0.06200836762107927
  motorcycles_and_bicycles: 0.06801128643320024
  gdp: 0.06932159819571361
  inflation: 0.09817313633759144
  economic_activity_rate: 0.046581698685972006
  Thames_level_capacity: 0.06980055774563834
  percentage_change_alcohol_and_tobacco: 0.061084600545341844
  unemployment_rate: 0.0852645312811811
  wspd: 0.06146391116276281
  darkness: 0.009282987352538167
  moon_phase_value: 0.09167540018336476
  moon_phase: 0.016675275697376035
  precipitation: 0.08276254048237733
  dail

In [1149]:
X_brooms = X.drop(columns = drop_by_shaps_brooms)

### taxi

In [1151]:
drop_by_shaps_taxi = feature_selection_with_shap(X, y_taxi, 0.1)

Iteration 1, number of features in data_set: 49
Features with importance below threshold:
  snow_depth: 0.02455001876498638
  darkness: 0.0
  left_card_code: 0.0
  right_card_code: 0.06392780036824662
  snow_depth_binary: 0.07303638390182536
Iteration 2, number of features in data_set: 44
There are no low shap features on this step


In [1152]:
X_taxi = X.drop(columns = drop_by_shaps_taxi)

# Обучение

## Подготовка к обучению. Линрег

In [585]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import train_test_split

### bikes

In [587]:
# Разделите данные на обучающую и тестовую выборки
X_train_bikes, X_test_bikes, y_train_bikes, y_test_bikes = train_test_split(X_bikes, y_bikes, test_size=0.2, random_state=42)

# Нормализуем данные с помощью StandardScaler
scaler = StandardScaler()
X_train_scaled_bikes = scaler.fit_transform(X_train_bikes)
X_test_scaled_bikes = scaler.transform(X_test_bikes)

### brooms

In [589]:
# Разделите данные на обучающую и тестовую выборки
X_train_brooms, X_test_brooms, y_train_brooms, y_test_brooms = train_test_split(X_brooms, y_brooms, test_size=0.2, random_state=42)

# Нормализуем данные с помощью StandardScaler
scaler = StandardScaler()
X_train_scaled_brooms = scaler.fit_transform(X_train_brooms)
X_test_scaled_brooms = scaler.transform(X_test_brooms)

### taxi

In [591]:
# Разделите данные на обучающую и тестовую выборки
X_train_taxi, X_test_taxi, y_train_taxi, y_test_taxi = train_test_split(X_taxi, y_taxi, test_size=0.2, random_state=42)

# Нормализуем данные с помощью StandardScaler
scaler = StandardScaler()
X_train_scaled_taxi = scaler.fit_transform(X_train_taxi)
X_test_scaled_taxi = scaler.transform(X_test_taxi)

## LinReg

### bikes

In [594]:
# Обучение модели линейной регрессии
model_linreg_bikes= LinearRegression()
model_linreg_bikes.fit(X_train_scaled_bikes, y_train_bikes)

# Предсказание и оценка модели
y_pred_bikes = model_linreg_bikes.predict(X_test_scaled_bikes)
mape_linreg_bikes = mean_absolute_percentage_error(y_test_bikes, y_pred_bikes)

print("MAPE на тестовой выборке:", mape_linreg_bikes)

MAPE на тестовой выборке: 1.8759022924878224


### brooms

In [596]:
# Обучение модели линейной регрессии
model_linreg_brooms = LinearRegression()
model_linreg_brooms.fit(X_train_scaled_brooms, y_train_brooms)

# Предсказание и оценка модели
y_pred_brooms = model_linreg_brooms.predict(X_test_scaled_brooms)
mape_linreg_brooms = mean_absolute_percentage_error(y_test_brooms, y_pred_brooms)

print("MAPE на тестовой выборке:", mape_linreg_brooms)

MAPE на тестовой выборке: 0.3992528061460627


### taxi

In [598]:
# Обучение модели линейной регрессии
model_linreg_taxi = LinearRegression()
model_linreg_taxi.fit(X_train_scaled_taxi, y_train_taxi)

# Предсказание и оценка модели
y_pred_taxi = model_linreg_taxi.predict(X_test_scaled_taxi)
mape_linreg_taxi = mean_absolute_percentage_error(y_test_taxi, y_pred_taxi)

print("Mape на тестовой выборке:", mape_linreg_taxi)

Mape на тестовой выборке: 0.33543119855036024


## Подготовка к Бустингам

### bikes

In [1153]:
columns_unique_val_bikes = X_train_bikes.apply(lambda r: r.nunique(), axis = 0).sort_values()

object_columns_bikes = columns_unique_val_bikes[(columns_unique_val_bikes > 2) & (columns_unique_val_bikes < 30)].index.tolist()

X_train_bikes[object_columns_bikes] = X_train_bikes[object_columns_bikes].astype('str')
X_test_bikes[object_columns_bikes] = X_test_bikes[object_columns_bikes].astype('str')

X_train_bikes[object_columns_bikes] = X_train_bikes[object_columns_bikes].astype('category')
X_train_bikes[object_columns_bikes] = X_train_bikes[object_columns_bikes].astype('category')

cat_feats_bikes = X_train_bikes.select_dtypes(include = ['category']).columns.tolist()

### brooms

In [1155]:
columns_unique_val_brooms = X_train_brooms.apply(lambda r: r.nunique(), axis = 0).sort_values()

object_columns_brooms = columns_unique_val_brooms[(columns_unique_val_brooms > 2) & (columns_unique_val_brooms < 30)].index.tolist()

X_train_brooms[object_columns_brooms] = X_train_brooms[object_columns_brooms].astype('str')
X_test_brooms[object_columns_brooms] = X_test_brooms[object_columns_brooms].astype('str')

X_train_brooms[object_columns_brooms] = X_train_brooms[object_columns_brooms].astype('category')
X_train_brooms[object_columns_brooms] = X_train_brooms[object_columns_brooms].astype('category')

cat_feats_brooms = X_train_brooms.select_dtypes(include = ['category']).columns.tolist()

### taxi

In [1157]:
columns_unique_val_taxi = X_train_taxi.apply(lambda r: r.nunique(), axis = 0).sort_values()

object_columns_taxi = columns_unique_val_taxi[(columns_unique_val_taxi > 2) & (columns_unique_val_taxi < 30)].index.tolist()

X_train_taxi[object_columns_taxi] = X_train_taxi[object_columns_taxi].astype('str')
X_test_taxi[object_columns_taxi] = X_test_taxi[object_columns_taxi].astype('str')

X_train_taxi[object_columns_taxi] = X_train_taxi[object_columns_taxi].astype('category')
X_train_taxi[object_columns_taxi] = X_train_taxi[object_columns_taxi].astype('category')

cat_feats_taxi = X_train_taxi.select_dtypes(include = ['category']).columns.tolist()

### test

In [1159]:
test_cat_cols = set(object_columns_taxi).union(set(object_columns_bikes)).union(set(object_columns_brooms))

In [1160]:
test_cat_cols = list(test_cat_cols)

In [1161]:
test[test_cat_cols] = test[test_cat_cols].astype('str')
test[test_cat_cols] = test[test_cat_cols].astype('category')

## Catboost

### bikes

In [1164]:
def objective(trial):
    # Hyperparameters for tuning
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-5, 10),
        'random_strength': trial.suggest_float('random_strength', 1e-5, 10),
        'border_count': trial.suggest_int('border_count', 1, 255),
        'loss_function': 'MAPE',  # Set for regression with MAPE
        'eval_metric': 'MAPE',    # Evaluation metric for regression with MAPE
        'verbose': 0,
        'use_best_model': True
    }

    # Create and train the model
    model = CatBoostRegressor(**params)

    # Split data into train and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X_train_bikes, y_train_bikes, test_size=0.2, random_state=42)

    # Train the model
    model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=100, cat_features = cat_feats_bikes)

    # Predictions on the validation set
    y_pred = model.predict(X_val)

    # MAPE metric calculation
    mape = mean_absolute_percentage_error(y_val, y_pred)

    return mape

In [1165]:
use_optuna = True

In [1166]:
if use_optuna:
    # Optimize the objective function
    study = optuna.create_study(direction='minimize')  # Minimize MAPE
    study.optimize(objective, n_trials=50)

    # Display the best parameters and MAPE score
    print("Best hyperparameters: ", study.best_params)
    print("Best MAPE: ", study.best_value)

    # Train the final model using the best parameters
    model_catboost_bikes = CatBoostRegressor(**study.best_params)
else:
    print("Training without hyperparameter optimization")

    # Default parameters if not using Optuna
    best_params = {
        'iterations': 900,
        'depth': 4,
        'learning_rate': 0.03,
        'loss_function': 'MAPE',
        'eval_metric': 'MAPE',
        'verbose': True,
        'random_seed': 42,
        'early_stopping_rounds': 20
    }
    # Train the final model using default parameters
    model_catboost_bikes = CatBoostRegressor(**best_params)

[I 2024-11-08 19:53:49,193] A new study created in memory with name: no-name-605b326a-843b-4aa8-9a82-fdadb5708243
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-5, 10),
[I 2024-11-08 19:54:23,118] Trial 0 finished with value: 0.7357550397370723 and parameters: {'iterations': 614, 'depth': 10, 'learning_rate': 0.0001671099186423481, 'l2_leaf_reg': 0.00022741988655419407, 'random_strength': 8.552172767796367, 'border_count': 55}. Best is trial 0 with value: 0.7357550397370723.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-5, 10),
[I 2024-11-08 19:54:45,209] Trial 1 finished with value: 0.37383500996202623 and parameters: {'iterations': 674, 'depth': 5, 'learning_rate': 0.010124532518100285, 'l2_leaf_reg': 0.04122017015479988, 'random_strength': 5.5510971392286494, 'border_count': 136}. Best is trial 1 with value: 0.

Best hyperparameters:  {'iterations': 714, 'depth': 10, 'learning_rate': 0.07772336751033257, 'l2_leaf_reg': 0.411586674568185, 'random_strength': 5.298218246721909, 'border_count': 37}
Best MAPE:  0.15976125751488224


In [1174]:
# Fit the model on the training set
model_catboost_bikes.fit(X_train_bikes, y_train_bikes, cat_features = cat_feats_bikes)

0:	learn: 77.6637446	total: 174ms	remaining: 2m 3s
1:	learn: 72.6743090	total: 342ms	remaining: 2m 1s
2:	learn: 68.0212801	total: 509ms	remaining: 2m
3:	learn: 63.4446913	total: 627ms	remaining: 1m 51s
4:	learn: 59.2603039	total: 758ms	remaining: 1m 47s
5:	learn: 55.7305248	total: 791ms	remaining: 1m 33s
6:	learn: 52.1485586	total: 994ms	remaining: 1m 40s
7:	learn: 49.0028619	total: 1.11s	remaining: 1m 38s
8:	learn: 46.0496633	total: 1.22s	remaining: 1m 35s
9:	learn: 43.5497996	total: 1.25s	remaining: 1m 28s
10:	learn: 41.1785496	total: 1.29s	remaining: 1m 22s
11:	learn: 38.8501930	total: 1.38s	remaining: 1m 20s
12:	learn: 36.7861840	total: 1.63s	remaining: 1m 27s
13:	learn: 34.9516375	total: 1.74s	remaining: 1m 27s
14:	learn: 33.3303661	total: 1.88s	remaining: 1m 27s
15:	learn: 31.7541048	total: 1.99s	remaining: 1m 26s
16:	learn: 30.3595089	total: 2.15s	remaining: 1m 28s
17:	learn: 29.1539225	total: 2.31s	remaining: 1m 29s
18:	learn: 28.0006466	total: 2.47s	remaining: 1m 30s
19:	learn

<catboost.core.CatBoostRegressor at 0x32de742d0>

In [1175]:
y_pred = model_catboost_bikes.predict(X_test_bikes)
mape_catboost_taxi = mean_absolute_percentage_error(y_test_bikes, y_pred)

print(f"MAPE: {mape_catboost_taxi:.4f}")

MAPE: 0.2469


In [1271]:
model_catboost_bikes.fit(pd.concat([X_train_bikes, X_test_bikes], axis = 0), pd.concat([y_train_bikes, y_test_bikes], axis = 0), cat_features = cat_feats_bikes)

0:	learn: 77.4214444	total: 215ms	remaining: 2m 33s
1:	learn: 72.1229431	total: 342ms	remaining: 2m 1s
2:	learn: 67.3227748	total: 409ms	remaining: 1m 36s
3:	learn: 62.7974329	total: 540ms	remaining: 1m 35s
4:	learn: 58.8791814	total: 680ms	remaining: 1m 36s
5:	learn: 55.2251860	total: 723ms	remaining: 1m 25s
6:	learn: 51.7755022	total: 862ms	remaining: 1m 27s
7:	learn: 48.6283840	total: 941ms	remaining: 1m 23s
8:	learn: 45.7760120	total: 1.07s	remaining: 1m 24s
9:	learn: 43.3370609	total: 1.11s	remaining: 1m 18s
10:	learn: 41.0171370	total: 1.24s	remaining: 1m 19s
11:	learn: 38.7986364	total: 1.35s	remaining: 1m 18s
12:	learn: 36.7559072	total: 1.45s	remaining: 1m 18s
13:	learn: 34.9440077	total: 1.53s	remaining: 1m 16s
14:	learn: 33.4926480	total: 1.56s	remaining: 1m 12s
15:	learn: 31.8121804	total: 1.65s	remaining: 1m 11s
16:	learn: 30.4012744	total: 1.76s	remaining: 1m 12s
17:	learn: 29.1154528	total: 1.86s	remaining: 1m 12s
18:	learn: 28.0962807	total: 1.93s	remaining: 1m 10s
19:	

<catboost.core.CatBoostRegressor at 0x32de742d0>

### brooms

In [1177]:
def objective(trial):
    # Hyperparameters for tuning
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-5, 10),
        'random_strength': trial.suggest_float('random_strength', 1e-5, 10),
        'border_count': trial.suggest_int('border_count', 1, 255),
        'loss_function': 'MAPE',  # Set for regression with MAPE
        'eval_metric': 'MAPE',    # Evaluation metric for regression with MAPE
        'verbose': 0,
        'use_best_model': True
    }

    # Create and train the model
    model = CatBoostRegressor(**params)

    # Split data into train and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X_train_brooms, y_train_brooms, test_size=0.2, random_state=42)

    # Train the model
    model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=100, cat_features = cat_feats_brooms)

    # Predictions on the validation set
    y_pred = model.predict(X_val)

    # MAPE metric calculation
    mape = mean_absolute_percentage_error(y_val, y_pred)

    return mape

In [1178]:
use_optuna = True

In [1179]:
if use_optuna:
    # Optimize the objective function
    study = optuna.create_study(direction='minimize')  # Minimize MAPE
    study.optimize(objective, n_trials=50)

    # Display the best parameters and MAPE score
    print("Best hyperparameters: ", study.best_params)
    print("Best MAPE: ", study.best_value)

    # Train the final model using the best parameters
    model_catboost_brooms = CatBoostRegressor(**study.best_params)
else:
    print("Training without hyperparameter optimization")

    # Default parameters if not using Optuna
    best_params = {
        'iterations': 900,
        'depth': 4,
        'learning_rate': 0.03,
        'loss_function': 'MAPE',
        'eval_metric': 'MAPE',
        'verbose': True,
        'random_seed': 42,
        'early_stopping_rounds': 20
    }
    # Train the final model using default parameters
    model_catboost_brooms = CatBoostRegressor(**best_params)

[I 2024-11-08 20:46:19,923] A new study created in memory with name: no-name-e13ff31c-3ef0-4207-986d-857e2a60a45d
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-5, 10),
[I 2024-11-08 20:46:42,721] Trial 0 finished with value: 0.24775883294741924 and parameters: {'iterations': 586, 'depth': 6, 'learning_rate': 0.03131796659879032, 'l2_leaf_reg': 0.6985433638258146, 'random_strength': 9.23038176409964, 'border_count': 150}. Best is trial 0 with value: 0.24775883294741924.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-5, 10),
[I 2024-11-08 20:47:05,764] Trial 1 finished with value: 0.2678752587233469 and parameters: {'iterations': 334, 'depth': 10, 'learning_rate': 0.01846158395428869, 'l2_leaf_reg': 0.0001346676990409456, 'random_strength': 8.125981854285607, 'border_count': 155}. Best is trial 0 with value: 0.24775

Best hyperparameters:  {'iterations': 944, 'depth': 7, 'learning_rate': 0.09979032275593576, 'l2_leaf_reg': 4.195116890336925, 'random_strength': 5.19008290314254, 'border_count': 91}
Best MAPE:  0.2372881132906737


In [1180]:
# Fit the model on the training set
model_catboost_brooms.fit(X_train_brooms, y_train_brooms, cat_features = cat_feats_brooms)

0:	learn: 195.6038253	total: 75.6ms	remaining: 1m 11s
1:	learn: 180.6005440	total: 120ms	remaining: 56.7s
2:	learn: 165.3427366	total: 169ms	remaining: 52.9s
3:	learn: 151.5467860	total: 212ms	remaining: 49.8s
4:	learn: 139.9218439	total: 243ms	remaining: 45.6s
5:	learn: 129.5778971	total: 284ms	remaining: 44.5s
6:	learn: 118.4493358	total: 323ms	remaining: 43.3s
7:	learn: 109.5384424	total: 359ms	remaining: 42s
8:	learn: 101.4330163	total: 406ms	remaining: 42.2s
9:	learn: 93.7253040	total: 431ms	remaining: 40.3s
10:	learn: 86.9849406	total: 462ms	remaining: 39.2s
11:	learn: 81.0722663	total: 510ms	remaining: 39.6s
12:	learn: 75.1903939	total: 554ms	remaining: 39.7s
13:	learn: 70.5759928	total: 576ms	remaining: 38.2s
14:	learn: 66.7803102	total: 610ms	remaining: 37.8s
15:	learn: 62.7037646	total: 642ms	remaining: 37.2s
16:	learn: 59.3123711	total: 691ms	remaining: 37.7s
17:	learn: 56.0739933	total: 732ms	remaining: 37.7s
18:	learn: 53.4891227	total: 777ms	remaining: 37.8s
19:	learn: 50

<catboost.core.CatBoostRegressor at 0x34d7cec10>

In [1181]:
y_pred = model_catboost_brooms.predict(X_test_brooms)
mape_catboost_brooms = mean_absolute_percentage_error(y_test_brooms, y_pred)

print(f"MAPE: {mape_catboost_brooms:.4f}")

MAPE: 0.3005


In [1272]:
model_catboost_brooms.fit(pd.concat([X_train_brooms, X_test_brooms], axis = 0), pd.concat([y_train_brooms, y_test_brooms], axis = 0), cat_features = cat_feats_brooms)

0:	learn: 194.7249690	total: 75.3ms	remaining: 1m 11s
1:	learn: 179.3376175	total: 145ms	remaining: 1m 8s
2:	learn: 165.5630352	total: 210ms	remaining: 1m 5s
3:	learn: 152.8463823	total: 287ms	remaining: 1m 7s
4:	learn: 141.2195508	total: 368ms	remaining: 1m 9s
5:	learn: 129.9864856	total: 411ms	remaining: 1m 4s
6:	learn: 120.6495493	total: 471ms	remaining: 1m 3s
7:	learn: 112.6813527	total: 559ms	remaining: 1m 5s
8:	learn: 104.9564446	total: 615ms	remaining: 1m 3s
9:	learn: 98.4710894	total: 670ms	remaining: 1m 2s
10:	learn: 92.0158609	total: 732ms	remaining: 1m 2s
11:	learn: 86.4277148	total: 794ms	remaining: 1m 1s
12:	learn: 81.6921541	total: 875ms	remaining: 1m 2s
13:	learn: 77.4231133	total: 951ms	remaining: 1m 3s
14:	learn: 74.4694836	total: 1.02s	remaining: 1m 3s
15:	learn: 70.8742797	total: 1.15s	remaining: 1m 6s
16:	learn: 67.6817541	total: 1.2s	remaining: 1m 5s
17:	learn: 65.2247998	total: 1.27s	remaining: 1m 5s
18:	learn: 63.0846654	total: 1.34s	remaining: 1m 5s
19:	learn: 6

<catboost.core.CatBoostRegressor at 0x34d7cec10>

### taxi

In [1183]:
def objective(trial):
    # Hyperparameters for tuning
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-5, 10),
        'random_strength': trial.suggest_float('random_strength', 1e-5, 10),
        'border_count': trial.suggest_int('border_count', 1, 255),
        'loss_function': 'MAPE',  # Set for regression with MAPE
        'eval_metric': 'MAPE',    # Evaluation metric for regression with MAPE
        'verbose': 0,
        'use_best_model': True
    }

    # Create and train the model
    model = CatBoostRegressor(**params)

    # Split data into train and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X_train_taxi, y_train_taxi, test_size=0.2, random_state=42)

    # Train the model
    model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=100, cat_features = cat_feats_taxi)

    # Predictions on the validation set
    y_pred = model.predict(X_val)

    # MAPE metric calculation
    mape = mean_absolute_percentage_error(y_val, y_pred)

    return mape

In [1184]:
use_optuna = True

In [1185]:
if use_optuna:
    # Optimize the objective function
    study = optuna.create_study(direction='minimize')  # Minimize MAPE
    study.optimize(objective, n_trials=50)

    # Display the best parameters and MAPE score
    print("Best hyperparameters: ", study.best_params)
    print("Best MAPE: ", study.best_value)

    # Train the final model using the best parameters
    model_catboost_taxi = CatBoostRegressor(**study.best_params)
else:
    print("Training without hyperparameter optimization")

    # Default parameters if not using Optuna
    best_params = {
        'iterations': 900,
        'depth': 4,
        'learning_rate': 0.03,
        'loss_function': 'MAPE',
        'eval_metric': 'MAPE',
        'verbose': True,
        'random_seed': 42,
        'early_stopping_rounds': 20
    }
    # Train the final model using default parameters
    model_catboost_taxi = CatBoostRegressor(**best_params)

[I 2024-11-08 21:11:21,027] A new study created in memory with name: no-name-895ce541-a85f-4d93-b65b-cde8ad30f023
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-5, 10),
[I 2024-11-08 21:11:54,268] Trial 0 finished with value: 0.42683151617080206 and parameters: {'iterations': 599, 'depth': 6, 'learning_rate': 0.0010780068787459227, 'l2_leaf_reg': 0.0056571836194291335, 'random_strength': 8.262322351260204, 'border_count': 17}. Best is trial 0 with value: 0.42683151617080206.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-5, 10),
[I 2024-11-08 21:12:50,992] Trial 1 finished with value: 0.6111694484414893 and parameters: {'iterations': 692, 'depth': 9, 'learning_rate': 0.00010064691294113809, 'l2_leaf_reg': 0.2726521590919978, 'random_strength': 3.022610677923362, 'border_count': 251}. Best is trial 0 with value: 0.4

Best hyperparameters:  {'iterations': 869, 'depth': 10, 'learning_rate': 0.09417649461788927, 'l2_leaf_reg': 0.00016771403935439449, 'random_strength': 9.362217823569788, 'border_count': 119}
Best MAPE:  0.1434660973588027


In [1186]:
# Fit the model on the training set
model_catboost_taxi.fit(X_train_taxi, y_train_taxi, cat_features = cat_feats_taxi)

0:	learn: 2531.8377913	total: 184ms	remaining: 2m 39s
1:	learn: 2381.3393767	total: 297ms	remaining: 2m 8s
2:	learn: 2245.0076100	total: 341ms	remaining: 1m 38s
3:	learn: 2123.0964403	total: 516ms	remaining: 1m 51s
4:	learn: 2024.8530750	total: 770ms	remaining: 2m 13s
5:	learn: 1921.1178498	total: 961ms	remaining: 2m 18s
6:	learn: 1841.3700766	total: 1.2s	remaining: 2m 27s
7:	learn: 1765.8832980	total: 1.34s	remaining: 2m 24s
8:	learn: 1709.7120780	total: 1.49s	remaining: 2m 22s
9:	learn: 1665.3454929	total: 1.56s	remaining: 2m 13s
10:	learn: 1610.6415246	total: 1.75s	remaining: 2m 16s
11:	learn: 1562.8702073	total: 1.93s	remaining: 2m 17s
12:	learn: 1530.4635376	total: 2s	remaining: 2m 12s
13:	learn: 1498.7046748	total: 2.15s	remaining: 2m 11s
14:	learn: 1465.3626332	total: 2.31s	remaining: 2m 11s
15:	learn: 1438.7554379	total: 2.46s	remaining: 2m 10s
16:	learn: 1405.1976199	total: 2.62s	remaining: 2m 11s
17:	learn: 1386.4113198	total: 2.77s	remaining: 2m 11s
18:	learn: 1365.5977227	t

<catboost.core.CatBoostRegressor at 0x342017f90>

In [1187]:
y_pred = model_catboost_taxi.predict(X_test_taxi)
mape_catboost_taxi = mean_absolute_percentage_error(y_test_taxi, y_pred)

print(f"MAPE: {mape_catboost_taxi:.4f}")

MAPE: 0.0436


In [1274]:
model_catboost_taxi.fit(pd.concat([X_train_taxi, X_test_taxi], axis = 0), pd.concat([y_train_taxi, y_test_taxi], axis = 0), cat_features = cat_feats_taxi)

0:	learn: 2542.1594308	total: 66ms	remaining: 57.3s
1:	learn: 2390.7053214	total: 203ms	remaining: 1m 28s
2:	learn: 2257.5235975	total: 329ms	remaining: 1m 34s
3:	learn: 2136.2905109	total: 419ms	remaining: 1m 30s
4:	learn: 2029.1393599	total: 591ms	remaining: 1m 42s
5:	learn: 1943.5926470	total: 673ms	remaining: 1m 36s
6:	learn: 1856.6141138	total: 924ms	remaining: 1m 53s
7:	learn: 1791.2873904	total: 1.03s	remaining: 1m 50s
8:	learn: 1738.8955258	total: 1.09s	remaining: 1m 44s
9:	learn: 1684.8347973	total: 1.16s	remaining: 1m 39s
10:	learn: 1637.7753416	total: 1.29s	remaining: 1m 40s
11:	learn: 1585.0566603	total: 1.47s	remaining: 1m 44s
12:	learn: 1540.2853779	total: 1.66s	remaining: 1m 49s
13:	learn: 1505.0938459	total: 1.85s	remaining: 1m 53s
14:	learn: 1475.1672048	total: 2.07s	remaining: 1m 57s
15:	learn: 1444.8880367	total: 2.34s	remaining: 2m 4s
16:	learn: 1424.4283591	total: 2.43s	remaining: 2m 1s
17:	learn: 1401.7401640	total: 2.51s	remaining: 1m 58s
18:	learn: 1378.6994155	

<catboost.core.CatBoostRegressor at 0x342017f90>

## Stacking

### bikes

In [500]:
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
import catboost as ctb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold


In [513]:
# Импорт необходимых библиотек
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, KFold, cross_val_predict
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

from sklearn.linear_model import LinearRegression

# Инициализация базовых моделей
model_xgb = XGBRegressor(random_state=42)
model_lgb = LGBMRegressor(random_state=42)
model_cat = CatBoostRegressor(random_state=42, verbose=0)

# Обучение базовых моделей
model_xgb.fit(X_train_bikes, y_train_bikes)
model_lgb.fit(X_train_bikes, y_train_bikes)
model_cat.fit(X_train_bikes, y_train_bikes)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001658 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2898
[LightGBM] [Info] Number of data points in the train set: 119232, number of used features: 32
[LightGBM] [Info] Start training from score 77.343079


<catboost.core.CatBoostRegressor at 0x3069b4250>

In [516]:
# Получение предсказаний базовых моделей на тестовой выборке
pred_xgb = model_xgb.predict(X_test_bikes)
pred_lgb = model_lgb.predict(X_test_bikes)
pred_cat = model_cat.predict(X_test_bikes)

# Оценка производительности базовых моделей
mape_catboost_taxi = mean_absolute_percentage_error(y_test_bikes, pred_xgb)
mape_catboost_taxi = mean_absolute_percentage_error(y_test_bikes, pred_lgb)
mape_catboost_taxi = mean_absolute_percentage_error(y_test_bikes, pred_cat)


print(f"MAPE: {mape_catboost_taxi:.4f}")
print(f"MAPE: {mape_catboost_taxi:.4f}")
print(f"MAPE: {mape_catboost_taxi:.4f}")

Производительность базовых моделей:
XGBoost RMSE: 7.770236509490514
LightGBM RMSE: 10.962747007645696
CatBoost RMSE: 7.725244565443332


In [518]:
# Стекинг
# Подготовка мета-признаков для стекинга
meta_X_train = np.column_stack((
    cross_val_predict(model_xgb, X_train_bikes, y_train_bikes, cv=5),
    cross_val_predict(model_lgb, X_train_bikes, y_train_bikes, cv=5),
    cross_val_predict(model_cat, X_train_bikes, y_train_bikes, cv=5)
))

meta_X_test = np.column_stack((
    pred_xgb,
    pred_lgb,
    pred_cat
))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001610 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2886
[LightGBM] [Info] Number of data points in the train set: 95385, number of used features: 32
[LightGBM] [Info] Start training from score 77.264832
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001636 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2883
[LightGBM] [Info] Number of data points in the train set: 95385, number of used features: 32
[LightGBM] [Info] Start training from score 77.443183
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001943 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not eno

In [520]:
# Обучение мета-модели
meta_model = LinearRegression()
meta_model.fit(meta_X_train, y_train_bikes)

In [522]:
# Предсказание с помощью мета-модели
meta_pred = meta_model.predict(meta_X_test)

mape_catboost_taxi = mean_absolute_percentage_error(y_test_bikes, meta_pred)

print(f"MAPE: {mape_catboost_taxi:.4f}")

MAPE: 0.3896


# Собираем предикт

In [1236]:
predict = pd.read_csv('data/sample_submission.csv', header = None)

In [1238]:
test.shape

(35064, 69)

In [1240]:
predict.shape

(35064, 1)

In [1276]:
bikes = model_catboost_bikes.predict(test[model_catboost_bikes.feature_names_])
brooms = model_catboost_brooms.predict(test[model_catboost_brooms.feature_names_])
taxi = model_catboost_taxi.predict(test[model_catboost_taxi.feature_names_])

In [None]:
def apply_covid_impact(predictions, start_decline='2020-03-1'):
    # Adding dates to predictions
    y_pred = predictions.copy()
    y_pred['date'] = y_pred.index

    # Additional modifications based on provided logic
    mask = (y_pred['date'] >= '2020-03-1')
    y_pred.loc[mask, 'predictions'] *= 0.2

    mask = (y_pred['date'] >= '2020-03-2')
    y_pred.loc[mask, 'predictions'] *= 1.5

    return y_pred

In [1278]:
def save_submit(submit, name):
    np.savetxt(f'submit_{name}.csv', submit)

In [1279]:

bikes_df = predict.copy()
bikes_df[0] = bikes
save_submit(bikes_df, 'bikes')

In [1280]:
brooms_df = predict.copy()
brooms_df[0] = brooms
save_submit(brooms_df, 'brooms')

In [1281]:
taxi_df = predict.copy()
taxi_df[0] = taxi
save_submit(taxi_df, 'taxi')