In [130]:
!pip install catboost optuna



In [284]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import optuna

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator

# Classical ML Regression Models
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import (
    LinearRegression,
    Lasso,
    Ridge,
    ElasticNet,
    SGDRegressor,
)
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (
    BaggingRegressor,
    RandomForestRegressor,
    AdaBoostRegressor,
    GradientBoostingRegressor
)
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

# Deep Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Dense,
    Dropout,
    BatchNormalization
)
from tensorflow.keras.optimizers import Adam

import warnings
warnings.filterwarnings("ignore")

In [285]:
X = pd.read_csv("X.csv", index_col=0)
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1008 entries, 0 to 1007
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   cpm               1008 non-null   float64
 1   hour_start        1008 non-null   int64  
 2   hour_end          1008 non-null   int64  
 3   audience_size     1008 non-null   int64  
 4   duration          1008 non-null   int64  
 5   publishers_count  1008 non-null   int64  
 6   middle_hour       1008 non-null   int64  
dtypes: float64(1), int64(6)
memory usage: 63.0 KB


In [286]:
X.head()

Unnamed: 0,cpm,hour_start,hour_end,audience_size,duration,publishers_count,middle_hour
0,220.0,1058,1153,1906,95,2,1106
1,312.0,1295,1301,1380,6,2,1298
2,70.0,1229,1249,888,20,6,1239
3,240.0,1295,1377,440,82,2,1336
4,262.0,752,990,1476,238,4,871


In [287]:
y = pd.read_csv("y.csv", index_col=0)
y.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1008 entries, 0 to 1007
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   at_least_one    1008 non-null   float64
 1   at_least_two    1008 non-null   float64
 2   at_least_three  1008 non-null   float64
dtypes: float64(3)
memory usage: 31.5 KB


In [288]:
y.head()

Unnamed: 0,at_least_one,at_least_two,at_least_three
0,0.043,0.0152,0.0073
1,0.013,0.0,0.0
2,0.0878,0.0135,0.0
3,0.2295,0.1295,0.0727
4,0.3963,0.2785,0.227


In [289]:
# Fixed seed so the hold-out split (and everything derived from it) is reproducible.
SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((806, 7), (202, 7), (806, 3), (202, 3))

In [290]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [291]:
train_data = (X_train_scaled, y_train)
test_data = (X_test_scaled, y_test)

In [292]:
# Additive smoothing term applied to both numerator and denominator of the ratio.
EPS = 0.005

def log_mape_column_value(responses_column, answers_column, epsilon=EPS):
    """Return the mean absolute log-ratio between two columns.

    Computes ``mean(|log((responses + epsilon) / (answers + epsilon))|)``.
    The expression is symmetric in its two column arguments.

    :param responses_column: array-like of values (numerator of the ratio)
    :param answers_column: array-like of values (denominator of the ratio)
    :param epsilon: constant added to both columns before dividing
    :return: the mean of the element-wise absolute log-ratios

    NOTE(review): a non-positive shifted ratio (e.g. a prediction below
    ``-epsilon``) makes ``np.log`` produce NaN; with pandas Series inputs
    ``.mean()`` skips NaN by default, so such rows would silently drop out
    of the average -- confirm this is intended.
    """
    shifted_ratio = (responses_column + epsilon) / (answers_column + epsilon)
    abs_log_ratio = np.abs(np.log(shifted_ratio))
    return abs_log_ratio.mean()

def mean_log_accuracy_ratio(answers, responses, epsilon=EPS):
    """Aggregate the per-target log-ratio error into one percentage score.

    For each of the three target columns (``at_least_one``, ``at_least_two``,
    ``at_least_three``) the mean absolute log-ratio is computed with
    ``log_mape_column_value``; the three values are averaged, mapped through
    ``100 * (exp(x) - 1)`` and rounded to two decimals.

    :param answers: object exposing the three target columns as attributes
    :param responses: object exposing the same three columns as attributes
    :param epsilon: smoothing constant forwarded to ``log_mape_column_value``
    :return: percentage error rounded to 2 decimal places (lower is better)
    """
    target_names = ("at_least_one", "at_least_two", "at_least_three")

    per_target_error = np.array(
        [
            log_mape_column_value(
                getattr(responses, column), getattr(answers, column), epsilon
            )
            for column in target_names
        ]
    )
    mean_log_ratio = per_target_error.mean()

    # Convert the averaged log-ratio back to a relative error in percent.
    percentage_error = 100 * (np.exp(mean_log_ratio) - 1)

    return percentage_error.round(decimals=2)

In [293]:
def try_model_on_sample(
    model_class: type[BaseEstimator],
    train_data: tuple,
    test_data: tuple,
    **model_params
) -> float:
    """Fit one model per target column and score the predictions on the test part.

    :param model_class: estimator class; a fresh instance is created for every
        target as ``model_class(**model_params)``
    :param train_data: ``(features, targets)`` pair used for fitting; targets
        must support ``.iloc`` column selection
    :param test_data: ``(features, targets)`` pair used for evaluation
    :param model_params: keyword arguments forwarded to ``model_class``
    :return: value of ``mean_log_accuracy_ratio`` on the test part
    """
    y_true = test_data[1]

    # Copy the targets only to reuse their index/columns; every column is
    # overwritten with predictions below.
    y_pred = y_true.copy()

    # One independent model per target column (previously hard-coded to 3).
    for i in range(y_true.shape[1]):
        model = model_class(**model_params)
        model.fit(train_data[0], train_data[1].iloc[:, i])
        y_pred.iloc[:, i] = model.predict(test_data[0])

    # Pass arguments in the order the metric declares: (answers, responses).
    return mean_log_accuracy_ratio(y_true, y_pred)


def try_model(
    model_class: type[BaseEstimator],
    X: pd.DataFrame,
    y: pd.DataFrame,
    n_times: int=5,
    split_seed=None,
    **model_params
) -> float:
    """Estimate a model's score as the mean over several random 80/20 splits.

    For each repetition the data is split, a ``StandardScaler`` is fitted on
    the training part only and applied to both parts, and the model is scored
    with ``try_model_on_sample``.

    :param model_class: estimator class, instantiated as
        ``model_class(**model_params)`` inside ``try_model_on_sample``
    :param X: feature table
    :param y: target table
    :param n_times: number of random splits to average over
    :param split_seed: optional integer seed; repetition ``i`` uses
        ``random_state=split_seed + i`` so results are reproducible. With the
        default ``None`` every split is unseeded, as before.
    :param model_params: keyword arguments forwarded to the estimator
    :return: mean score over the ``n_times`` repetitions
    """
    results = np.zeros(n_times)

    for i in range(n_times):
        # A distinct but deterministic seed per repetition when seeding is on.
        random_state = None if split_seed is None else split_seed + i

        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            test_size=0.2,
            random_state=random_state
        )

        # Fit the scaler on the training part only to avoid leaking test statistics.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        results[i] = try_model_on_sample(
            model_class,
            (X_train_scaled, y_train),
            (X_test_scaled, y_test),
            **model_params
        )

    return results.mean()


In [188]:
# Explicit float dtype: an empty pd.Series() would otherwise default to object dtype.
model_results = pd.Series(dtype=float)

# Константное предсказание (baseline)

In [189]:
dummy_result = try_model(
    DummyRegressor, X, y
)

model_results["dummy"] = dummy_result
dummy_result

363.318

# Линейные модели

In [190]:
linear_regression_result = try_model(
    LinearRegression, X, y
)

model_results["linear_regression"] = linear_regression_result
linear_regression_result

186.536

In [191]:
lasso_regression_result = try_model(
    Lasso, X, y
)

model_results["lasso"] = lasso_regression_result
lasso_regression_result

359.08399999999995

In [192]:
ridge_regression_result = try_model(
    Ridge, X, y
)

model_results["ridge"] = ridge_regression_result
ridge_regression_result

193.53799999999998

In [193]:
elastic_net_regression_result = try_model(
    ElasticNet, X, y
)

model_results["elastic_net"] = elastic_net_regression_result
elastic_net_regression_result

362.266

In [194]:
sgd_regression_result = try_model(
    SGDRegressor, X, y
)

model_results["sgd"] = sgd_regression_result
sgd_regression_result

186.13

# Метод опорных векторов

In [195]:
svr_regression_result = try_model(
    SVR, X, y
)

model_results["svm"] = svr_regression_result
svr_regression_result

332.83

# Дерево решений

In [196]:
tree_regression_result = try_model(
    DecisionTreeRegressor, X, y
)

model_results["tree"] = tree_regression_result
tree_regression_result

135.55599999999998

# Ансамблевые бэггинговые методы

In [294]:
bagging_regression_result = try_model(
    BaggingRegressor, X, y
)

model_results["bagging"] = bagging_regression_result
bagging_regression_result

105.404

In [295]:
random_forest_regression_result = try_model(
    RandomForestRegressor, X, y
)

model_results["random_forest"] = random_forest_regression_result
random_forest_regression_result

95.476

# Ансамблевые бустинговые методы

In [199]:
ada_boost_regression_result = try_model(
    AdaBoostRegressor, X, y
)

model_results["ada_boost"] = ada_boost_regression_result
ada_boost_regression_result

327.892

In [200]:
gradient_boost_regression_result = try_model(
    GradientBoostingRegressor, X, y
)

model_results["gradient_boost"] = gradient_boost_regression_result
gradient_boost_regression_result

140.032

In [201]:
xgb_regression_result = try_model(
    XGBRegressor, X, y
)

model_results["xgb"] = xgb_regression_result
xgb_regression_result

131.51999999999998

In [202]:
cat_boost_regression_result = try_model(
    CatBoostRegressor, X, y, silent=True
)

model_results["cat_boost"] = cat_boost_regression_result
cat_boost_regression_result

138.722

In [203]:
# model_results.columns = ["mean_log_accuracy_ratio"]
model_results.sort_values()

Unnamed: 0,0
random_forest,100.752
bagging,101.004
xgb,131.52
tree,135.556
cat_boost,138.722
gradient_boost,140.032
sgd,186.13
linear_regression,186.536
ridge,193.538
ada_boost,327.892


In [273]:
def custom_loss(
    y_true: np.ndarray,
    y_pred: np.ndarray
) -> float:
    """Asymmetric absolute error summed over all elements.

    The residual is ``y_true - y_pred``. Non-negative residuals count as-is;
    negative residuals (prediction above the true value) are multiplied by
    ``-2``, i.e. they contribute twice their magnitude.

    :param y_true: true values
    :param y_pred: predicted values
    :return: sum of the weighted residuals
    """
    residual = y_true - y_pred

    # Flip the sign of over-predictions and double their weight.
    overshoot = residual < 0
    residual[overshoot] = residual[overshoot] * -2

    return residual.sum()

def optimize_rf_hyperparameters(X_train, y_train):
    """Search Random Forest hyperparameters for a single target with Optuna.

    Part of the supplied data is held out and used to score each trial.
    Scoring on the same rows the forest was fitted on (as was done before)
    rewards memorisation and drives the search towards the most overfit
    configuration.

    NOTE: a later cell in this notebook defines a four-argument function with
    the same name; once that cell has run, it replaces this one.

    :param X_train: feature matrix
    :param y_train: one target column aligned with ``X_train``
    :return: dict with the best hyperparameters found (``study.best_params``)
    """
    # Internal validation split used only for scoring trials.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.2
    )

    def objective(trial):

        # Hyperparameters to optimize
        n_estimators = trial.suggest_int('n_estimators', 10, 100)
        max_depth = trial.suggest_int('max_depth', 1, 50)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
        bootstrap = trial.suggest_categorical('bootstrap', [True, False])
        max_features = trial.suggest_int('max_features', 2, 7)

        # Random Forest with the sampled hyperparameters
        model = RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            bootstrap=bootstrap,
            max_features=max_features
        )

        model.fit(X_fit, y_fit)
        y_pred = model.predict(X_val)

        # Lower is better: mean absolute log-ratio on the held-out rows.
        return log_mape_column_value(y_pred, y_val)

    # The objective is an error, so the study minimizes it.
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=50, n_jobs=-1)  # 50 trials, run in parallel

    # Best hyperparameters found
    return study.best_params


def optimize_all_targets(
    X_train: pd.DataFrame,
    y_train: pd.DataFrame,
    optimize_func
) -> dict[str, dict]:
    """Run a per-target hyperparameter search for every column of ``y_train``.

    :param X_train: feature matrix passed unchanged to ``optimize_func``
    :param y_train: target table; each column is optimized separately
    :param optimize_func: callable invoked as ``optimize_func(X_train, y_column)``
        whose return value is stored as that target's result
    :return: mapping from target column name to the value returned by
        ``optimize_func`` for that column
    """
    return {
        target: optimize_func(X_train, y_train[target])
        for target in tqdm(y_train.columns)
    }

In [328]:
def optimize_rf_hyperparameters(
    X_train, y_train,
    X_val, y_val
) -> dict:
    """Search one shared set of Random Forest hyperparameters for all targets.

    Each Optuna trial samples a hyperparameter set, fits a separate forest per
    column of ``y_train`` with it, predicts ``X_val`` and scores the combined
    predictions against ``y_val`` with ``mean_log_accuracy_ratio``.

    :param X_train: training features
    :param y_train: training targets, one column per target
    :param X_val: validation features
    :param y_val: validation targets with the same columns as ``y_train``
    :return: dict with the best hyperparameters found (``study.best_params``)
    """

    def objective(trial):
        # Sampled once per trial and reused for the forest of every target.
        rf_params = {
            'n_estimators': trial.suggest_int('n_estimators', 10, 300),
            'max_depth': trial.suggest_int('max_depth', 1, 50),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
            'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
            'max_features': trial.suggest_int('max_features', 2, 7),
        }

        # Copy only to reuse the index/columns; each column is overwritten below.
        val_predictions = y_val.copy()

        for target in y_train.columns:
            forest = RandomForestRegressor(**rf_params)
            forest.fit(X_train, y_train[target])
            val_predictions[target] = forest.predict(X_val)

        return mean_log_accuracy_ratio(y_val, val_predictions)

    # The objective is an error percentage, so the study minimizes it.
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=100, n_jobs=-1)  # 100 trials, run in parallel

    return study.best_params

In [329]:
# Tune on a validation split carved out of the training data so that the
# test split is never used for hyperparameter selection.
X_tune, X_valid, y_tune, y_valid = train_test_split(
    X_train_scaled, y_train, test_size=0.2
)
best_rf_params = optimize_rf_hyperparameters(
    X_tune, y_tune, X_valid, y_valid
)

[I 2024-12-13 18:50:15,242] A new study created in memory with name: no-name-5fe1caf8-9c95-4e67-88c2-b561e4d1e4c2
[I 2024-12-13 18:50:16,179] Trial 0 finished with value: 95.37 and parameters: {'n_estimators': 79, 'max_depth': 4, 'min_samples_split': 4, 'min_samples_leaf': 6, 'bootstrap': True, 'max_features': 6}. Best is trial 0 with value: 95.37.
[I 2024-12-13 18:50:16,236] Trial 1 finished with value: 102.27 and parameters: {'n_estimators': 81, 'max_depth': 5, 'min_samples_split': 2, 'min_samples_leaf': 9, 'bootstrap': False, 'max_features': 4}. Best is trial 0 with value: 95.37.
[I 2024-12-13 18:50:17,977] Trial 2 finished with value: 85.48 and parameters: {'n_estimators': 95, 'max_depth': 8, 'min_samples_split': 2, 'min_samples_leaf': 1, 'bootstrap': True, 'max_features': 7}. Best is trial 2 with value: 85.48.
[I 2024-12-13 18:50:18,575] Trial 4 finished with value: 88.87 and parameters: {'n_estimators': 31, 'max_depth': 9, 'min_samples_split': 6, 'min_samples_leaf': 1, 'bootstrap

In [330]:
best_rf_params

{'n_estimators': 169,
 'max_depth': 24,
 'min_samples_split': 10,
 'min_samples_leaf': 2,
 'bootstrap': True,
 'max_features': 7}

In [331]:
try_model(
    RandomForestRegressor, X, y, **best_rf_params
)

96.38399999999999

In [332]:
def fit_regressors(
    X_train, y_train,
    regressor_class: type[BaseEstimator],
    params_for_all: dict[str, dict]
) -> dict[str, BaseEstimator]:
    """Fit one regressor per target using target-specific parameters.

    :param X_train: training features shared by all regressors
    :param y_train: training targets, indexable by the keys of ``params_for_all``
    :param regressor_class: estimator class, instantiated as
        ``regressor_class(**params)`` for every target
    :param params_for_all: mapping from target name to the keyword arguments
        for that target's regressor
    :return: mapping from target name to the fitted regressor
    """
    trained_regressors = {}

    for name, params in tqdm(params_for_all.items()):
        regressor = regressor_class(**params)
        regressor.fit(X_train, y_train[name])
        trained_regressors[name] = regressor

    return trained_regressors

In [322]:
class InfPipeline:
    """Inference pipeline: scale the features, then predict every target."""

    def __init__(self, scaler, regressors):
        """
        Store the fitted pipeline components.

        :param scaler: fitted scaler exposing ``transform(X)``
        :param regressors: mapping ``{target_name: fitted regressor}``; each
            regressor must expose ``predict(X)``
        """
        self.scaler = scaler
        self.regressors = regressors

    def predict(self, X):
        """
        Predict all targets for the input data ``X``.

        :param X: input features accepted by the scaler
        :return: DataFrame with one column per regressor name. When ``X``
            carries an index (e.g. a pandas DataFrame) it is reused for the
            result, so predictions stay aligned with index-labelled targets
            in pandas arithmetic; otherwise a default RangeIndex is used.
        """
        # Scale the features
        X_scaled = self.scaler.transform(X)

        # Collect predictions from every regressor
        predictions = {
            name: regressor.predict(X_scaled)
            for name, regressor in self.regressors.items()
        }

        # Keep the row labels of X; None falls back to a RangeIndex.
        predictions_df = pd.DataFrame(predictions, index=getattr(X, "index", None))

        return predictions_df