###Prepare downloads

In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from catboost import CatBoostClassifier, cv
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
import joblib

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
path_to_data = "/content/drive/MyDrive/SBER_Base/Data"

In [None]:
def get_metrics(y_gt, y_pred, average='binary'):
    metrics = {
        'accuracy': accuracy_score(y_gt, y_pred),
        'precision': precision_score(y_gt, y_pred, average=average, zero_division=0),
        'recall': recall_score(y_gt, y_pred, average=average, zero_division=0),
        'f1': f1_score(y_gt, y_pred, average=average, zero_division=0)
    }

    metrics = {k: round(v, 4) for k, v in metrics.items()}

    return metrics

In [None]:
seed = 42

###Необработанные данные

In [None]:
path_to_train = os.path.join(path_to_data, "invest_train.csv")

train_df = pd.read_csv(path_to_train, index_col='customer_id')

X_train_raw = train_df.drop(columns=["accepted"])
y_train_raw = train_df["accepted"]

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X_train_raw, y_train_raw, test_size=0.2, random_state=seed)

In [None]:
num_features = ["age", "balance", "offer_amount"]
binary_features = ["previous_investments", "responded_before"]
cat_features = ["risk_profile", "marketing_channel", "membership_tier"]

###Обработанные данные

In [None]:
path_to_train_final = os.path.join(path_to_data, "train_final.csv")

train_final_df = pd.read_csv(path_to_train_final)

In [None]:
X_train_final, X_val_final, y_train_final, y_val_final = train_test_split(train_final_df, y_train_raw, test_size=0.2, random_state=seed)

###Обучение модели Catboost для необработанных данных

In [None]:
# grid = {
#     'iterations': [100, 200, 500],
#     'learning_rate': [0.01, 0.03, 0.1, 0.3],
#     'depth': [4, 6, 8],
#     'l2_leaf_reg': [1, 3, 5]
# }

# model = CatBoostClassifier(cat_features=cat_features, verbose=False)
# grid_search_result = model.grid_search(
#     grid,
#     X=X_train,
#     y=y_train,
#     cv=3,
#     plot=False
# )

# print("Best params:", grid_search_result['params'])

# best_model = CatBoostClassifier(**grid_search_result['params'], cat_features=cat_features)
# best_model.fit(X_train, y_train, verbose=False)

# y_pred = best_model.predict(X_val)
# val_metrics = get_metrics(y_val, y_pred)
# print("Validation metrics:", val_metrics)

Best params: {'depth': 6, 'learning_rate': 0.3, 'l2_leaf_reg': 3, 'iterations': 200}


Validation metrics: {'accuracy': 0.7675, 'precision': 0.8092, 'recall': 0.8899, 'f1': 0.8476}

In [None]:
catboost_model_nondata = CatBoostClassifier(
    depth=6,
    learning_rate=0.3,
    l2_leaf_reg=3,
    iterations=200,
    cat_features=cat_features,
    random_seed=seed,
    verbose=False
)

catboost_model_nondata.fit(X_train, y_train)

y_pred_boost = catboost_model_nondata.predict(X_val)
val_metrics = get_metrics(y_val, y_pred_boost)
print("Best model metrics:", val_metrics)

Best model metrics: {'accuracy': 0.765, 'precision': 0.806, 'recall': 0.8911, 'f1': 0.8464}


###Обучение модели Catboost для обработанных данных

In [None]:
# grid = {
#     'iterations': [100, 200, 500],
#     'learning_rate': [0.01, 0.03, 0.1, 0.3],
#     'depth': [4, 6, 8],
#     'l2_leaf_reg': [1, 3, 5]
# }

# model = CatBoostClassifier(verbose=False, seed=seed)
# grid_search_result = model.grid_search(
#     grid,
#     X=train_df,
#     y=y_train_df,
#     cv=5,
#     plot=False
# )

# print("Best params:", grid_search_result['params'])

# best_model = CatBoostClassifier(**grid_search_result['params'])
# best_model.fit(train_df, y_train_df, verbose=False)

# y_pred = best_model.predict(val_df)
# val_metrics = get_metrics(y_val_df, y_pred)
# print("Validation metrics:", val_metrics)

Best params: {'depth': 6, 'learning_rate': 0.1, 'l2_leaf_reg': 5, 'iterations': 100}
Validation metrics: {'accuracy': 0.7842, 'precision': 0.8223, 'recall': 0.8968, 'f1': 0.8579}

In [None]:
catboost_model_processedData = CatBoostClassifier(
    depth=6,
    learning_rate=0.1,
    l2_leaf_reg=5,
    iterations=100,
    random_seed=seed,
    verbose=False
)

catboost_model_processedData.fit(X_train_final, y_train_final)

y_pred_boost = catboost_model_processedData.predict(X_val_final)
val_metrics = get_metrics(y_val_final, y_pred_boost)
print("Best model metrics:", val_metrics)

Best model metrics: {'accuracy': 0.7867, 'precision': 0.8263, 'recall': 0.8945, 'f1': 0.859}


###Обучение RandomForestClassifier для обработанных данных

In [None]:
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [5, 10, 20, None],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }

# rf = RandomForestClassifier(random_state=seed)
# grid_search = GridSearchCV(rf, param_grid, cv=3, scoring='f1', n_jobs=-1)
# grid_search.fit(X_train_final, y_train_final)

# print("Best params:", grid_search.best_params_)

# best_rf = grid_search.best_estimator_
# y_pred = best_rf.predict(X_val_final)
# val_metrics = get_metrics(y_val_final, y_pred)
# print("Validation metrics:", val_metrics)

Best params: {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}


Validation metrics: {'accuracy': 0.7758, 'precision': 0.7976, 'recall': 0.9266, 'f1': 0.8573}

In [None]:
best_rf1 = RandomForestClassifier(
    max_depth=5,
    min_samples_leaf=1,
    min_samples_split=5,
    n_estimators=100,
    random_state=42
)

best_rf1.fit(X_train_final, y_train_final)

y_pred = best_rf1.predict(X_val_final)
val_metrics = get_metrics(y_val_final, y_pred)
print("Best RF metrics:", val_metrics)

Best RF metrics: {'accuracy': 0.7758, 'precision': 0.7976, 'recall': 0.9266, 'f1': 0.8573}


###Обучение RandomForestClassifier для обработанных данных 2 для последующего стейкинга

In [None]:
# param_grid = {
#     'n_estimators': [25, 50, 75, 100],
#     'max_depth': [5, 10, 20, 30, None],
#     'min_samples_split': [2, 5, 7, 10],
#     'min_samples_leaf': [1, 2, 3, 4]
# }

# rf = RandomForestClassifier(random_state=seed)
# grid_search = GridSearchCV(rf, param_grid, cv=4, scoring='f1', n_jobs=-1, verbose=3)
# grid_search.fit(X_train_final, y_train_final)

# print("Best params:", grid_search.best_params_)

# best_rf2 = grid_search.best_estimator_
# y_pred = best_rf2.predict(X_val_final)
# val_metrics = get_metrics(y_val_final, y_pred)
# print("Validation metrics:", val_metrics)

Fitting 4 folds for each of 320 candidates, totalling 1280 fits


KeyboardInterrupt: 

RF2 Best params: {'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 25}

Validation metrics: {'accuracy': 0.7775, 'precision': 0.8016, 'recall': 0.922, 'f1': 0.8576}

In [None]:
best_rf2 = RandomForestClassifier(
    max_depth=5,
    min_samples_leaf=2,
    min_samples_split=2,
    n_estimators=25,
    random_state=seed
)

best_rf2.fit(X_train_final, y_train_final)

y_pred = best_rf2.predict(X_val_final)
val_metrics = get_metrics(y_val_final, y_pred)
print("Best RF metrics:", val_metrics)

Best RF metrics: {'accuracy': 0.7775, 'precision': 0.8016, 'recall': 0.922, 'f1': 0.8576}


###Обучение Лог регрессии

In [None]:
# param_grid = {
#     'C': [0.1, 1, 5, 10],
#     'penalty': ['l1', 'l2'],
#     'solver': ['liblinear', 'saga']
# }

# logreg = LogisticRegression(class_weight='balanced', random_state=seed)
# grid_search = GridSearchCV(logreg, param_grid, cv=5, scoring='f1', n_jobs=-1)
# grid_search.fit(X_train_final, y_train_final)

# print("Best params:", grid_search.best_params_)

# best_logreg = grid_search.best_estimator_
# y_pred = best_logreg.predict(X_val_final)
# val_metrics = get_metrics(y_val_final, y_pred)
# print("Validation metrics:", val_metrics)

Best params: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}


Validation metrics: {'accuracy': 0.7367, 'precision': 0.8757, 'recall': 0.7431, 'f1': 0.804}

In [None]:
best_logreg = LogisticRegression(
    C=0.1,
    penalty='l2',
    solver='liblinear',
    class_weight='balanced',
    random_state=42
)

best_logreg.fit(X_train_final, y_train_final)

y_pred = best_logreg.predict(X_val_final)
val_metrics = get_metrics(y_val_final, y_pred)
print("Best Logistic Regression metrics:", val_metrics)

Best Logistic Regression metrics: {'accuracy': 0.7367, 'precision': 0.8757, 'recall': 0.7431, 'f1': 0.804}


###Обучение SVM

In [None]:
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # Параметр регуляризации
    'kernel': ['rbf', 'linear'],  # Тип ядра
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],  # Коэффициент ядра для rbf/poly
}

# Создаем модель SVM
svm = SVC(probability=True, random_state=seed)  # probability=True для predict_proba

# Настраиваем GridSearchCV
grid_search = GridSearchCV(
    svm,
    param_grid,
    cv=4,
    scoring='f1',
    n_jobs=1,
    verbose=3
)

# Обучаем на тренировочных данных
grid_search.fit(X_train_final, y_train_final)

# Выводим лучшие параметры
print("Best params:", grid_search.best_params_)
print("Best CV score:", grid_search.best_score_)

# Получаем лучшую модель
best_svm = grid_search.best_estimator_

# Делаем предсказания на валидационной выборке
y_pred = best_svm.predict(X_val_final)

# Вычисляем метрики
val_metrics = get_metrics(y_val_final, y_pred)
print("Validation metrics:", val_metrics)

Fitting 4 folds for each of 60 candidates, totalling 240 fits
[CV 1/4] END ...C=0.01, gamma=scale, kernel=rbf;, score=0.841 total time=   3.6s
[CV 2/4] END ...C=0.01, gamma=scale, kernel=rbf;, score=0.841 total time=   2.0s
[CV 3/4] END ...C=0.01, gamma=scale, kernel=rbf;, score=0.841 total time=   2.0s
[CV 4/4] END ...C=0.01, gamma=scale, kernel=rbf;, score=0.841 total time=   2.5s
[CV 1/4] END C=0.01, gamma=scale, kernel=linear;, score=0.866 total time=   2.1s
[CV 2/4] END C=0.01, gamma=scale, kernel=linear;, score=0.864 total time=   1.5s
[CV 3/4] END C=0.01, gamma=scale, kernel=linear;, score=0.861 total time=   1.3s
[CV 4/4] END C=0.01, gamma=scale, kernel=linear;, score=0.866 total time=   1.3s
[CV 1/4] END ....C=0.01, gamma=auto, kernel=rbf;, score=0.841 total time=   2.0s
[CV 2/4] END ....C=0.01, gamma=auto, kernel=rbf;, score=0.841 total time=   2.0s
[CV 3/4] END ....C=0.01, gamma=auto, kernel=rbf;, score=0.841 total time=   2.0s
[CV 4/4] END ....C=0.01, gamma=auto, kernel=rbf

Best params: {'C': 10, 'gamma': 'scale', 'kernel': 'linear'}

Best CV score: 0.8657867357351154

Validation metrics: {'accuracy': 0.79, 'precision': 0.8229, 'recall': 0.906, 'f1': 0.8624}

###Стейкинг (meta model: LogReg)

In [None]:
estimators = [
    ('catboost', catboost_model_nondata),
    ('rf', best_rf1),
    ('logreg', best_logreg)
]

stacking_model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(C=0.1, random_state=seed),
    cv='prefit',
    passthrough=True
)

val_predictions = np.column_stack([
    catboost_model_nondata.predict_proba(X_val)[:, 1],
    best_rf1.predict_proba(X_val_final)[:, 1],
    best_logreg.predict_proba(X_val_final)[:, 1]
])

meta_model = LogisticRegression(C=0.1, random_state=seed)
meta_model.fit(val_predictions, y_val_final)

stacking_proba = meta_model.predict_proba(val_predictions)[:, 1]
y_pred_stacking = (stacking_proba > 0.5).astype(int)

stacking_metrics = get_metrics(y_val_final, y_pred_stacking)
print("Stacking metrics:", stacking_metrics)

Stacking metrics: {'accuracy': 0.7792, 'precision': 0.8002, 'recall': 0.9278, 'f1': 0.8593}


Stacking metrics: {'accuracy': 0.7792, 'precision': 0.8002, 'recall': 0.9278, 'f1': 0.8593}

###Стейкинг ((CatBoost, RF1, RF2) -> SVM)

In [None]:
# from sklearn.svm import SVC
# from sklearn.model_selection import GridSearchCV

# # Step 1: Create prefit stacking (no retraining of base models)
# estimators = [
#     ('catboost', catboost_model_nondata),
#     ('rf1', best_rf1),
#     ('rf2', best_rf2)
# ]

# # Step 2: Get base model predictions on validation set
# val_predictions = np.column_stack([
#     catboost_model_nondata.predict_proba(X_val)[:, 1],
#     best_rf1.predict_proba(X_val_final)[:, 1],  # Use correct feature set
#     best_rf2.predict_proba(X_val_final)[:, 1]   # Use correct feature set
# ])

# # Step 3: Tune SVM meta-model with GridSearchCV
# param_grid = {
#     'C': [0.01, 0.1, 1, 10, 100],
#     'kernel': ['rbf', 'linear'],
#     'gamma': ['scale', 0.001, 0.01, 0.1]
# }

# svm_meta = GridSearchCV(
#     SVC(probability=True, random_state=seed),
#     param_grid=param_grid,
#     cv=5,
#     scoring='f1',
#     n_jobs=-1,
#     verbose=2
# )

# # Step 4: Fit SVM on stacked predictions
# svm_meta.fit(val_predictions, y_val_final)

# # Step 5: Get predictions
# stacking_proba = svm_meta.predict_proba(val_predictions)[:, 1]
# y_pred_stacking = (stacking_proba > 0.5).astype(int)

# print("Best SVM parameters:", svm_meta.best_params_)
# print("Best CV score:", svm_meta.best_score_)

# stacking_metrics = get_metrics(y_val_final, y_pred_stacking)
# print("Stacking metrics:", stacking_metrics)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
Best SVM parameters: {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
Best CV score: 0.8603875379654051
Stacking metrics: {'accuracy': 0.7808, 'precision': 0.7988, 'recall': 0.9335, 'f1': 0.8609}


###Бустинг SVM с помощью XGboost

In [None]:
!pip install xgboost



In [None]:
from xgboost import XGBClassifier

In [None]:
SVM_best = SVC(C=10, gamma='scale', kernel='linear', probability=True, random_state=seed)
SVM_best.fit(X_train_final, y_train_final)

# Compute probabilities separately for train and validation
proba_train = pd.DataFrame(SVM_best.predict_proba(X_train_final),
                           columns=['prob_class_0', 'prob_class_1'])
proba_val = pd.DataFrame(SVM_best.predict_proba(X_val_final),  # Use X_val_final here!
                         columns=['prob_class_0', 'prob_class_1'])

X_train_xg = pd.concat([X_train_final.reset_index(drop=True), proba_train], axis=1)
X_val_xg = pd.concat([X_val_final.reset_index(drop=True), proba_val], axis=1)

model_xg = XGBClassifier()
model_xg.fit(X_train_xg, y_train_final)
y_pred = model_xg.predict(X_val_xg)
print(get_metrics(y_val_final, y_pred))  # Use y_val_final, not y_train_final!


{'accuracy': 0.7617, 'precision': 0.8124, 'recall': 0.8739, 'f1': 0.842}


Best SVM parameters: {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}

Best CV score: 0.8603875379654051

Stacking metrics: {'accuracy': 0.7808, 'precision': 0.7988, 'recall': 0.9335, 'f1': 0.8609}

In [None]:
val_predictions = np.column_stack([
    catboost_model_nondata.predict_proba(X_val)[:, 1],       # catboost на X_val
    best_rf1.predict_proba(X_val_final)[:, 1],              # rf1 на X_val_final
    best_rf2.predict_proba(X_val_final)[:, 1]               # rf2 на X_val_final
])

meta_model_SVM = SVC(probability=True, C=10, gamma=0.1, kernel='rbf', random_state=seed)
meta_model_SVM.fit(val_predictions, y_val_final)

stacking_proba = meta_model_SVM.predict_proba(val_predictions)[:, 1]
y_pred_stacking = (stacking_proba > 0.5).astype(int)

stacking_metrics = get_metrics(y_val_final, y_pred_stacking)
print("Stacking metrics:", stacking_metrics)

Stacking metrics: {'accuracy': 0.7808, 'precision': 0.7988, 'recall': 0.9335, 'f1': 0.8609}


###Сохранение модели


In [None]:
model_path = '/content/drive/MyDrive/SBER_Base/models'

joblib.dump(catboost_model_nondata, os.path.join(model_path, 'catboost_model.pkl'))
joblib.dump(catboost_model_processedData, os.path.join(model_path, 'catboost_model_processedData.pkl'))
joblib.dump(best_rf1, os.path.join(model_path, 'random_forest_model1.pkl'))
joblib.dump(best_rf2, os.path.join(model_path, 'random_forest_model2.pkl'))
joblib.dump(best_logreg, os.path.join(model_path, 'logreg_model.pkl'))
joblib.dump(best_svm, os.path.join(model_path, 'svm_model.pkl'))
joblib.dump(meta_model, os.path.join(model_path, 'meta_model.pkl'))
joblib.dump(meta_model_SVM, os.path.join(model_path, 'meta_model_SVM.pkl'))

NameError: name 'catboost_model_nondata' is not defined

In [None]:
joblib.dump(model_xg, os.path.join(model_path, 'meta_model_XG.pkl'))

['/content/drive/MyDrive/SBER_Base/models/meta_model_XG.pkl']