# In [1]:
# HW06.ipynb — Домашнее задание по семинару S06: Деревья и ансамбли

# =============================================================================
# 0. Импорты и настройки
# =============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import joblib
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import (
    RandomForestClassifier,
    HistGradientBoostingClassifier,
    StackingClassifier
)
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score,
    roc_curve, confusion_matrix, ConfusionMatrixDisplay
)
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Pin the seed so repeated runs of the script are reproducible.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Artifact folders live in the current working directory; only the two that
# the script actually writes to are created (parent first, then the child).
ARTIFACTS_DIR = Path('artifacts')
for folder in (ARTIFACTS_DIR, ARTIFACTS_DIR / 'figures'):
    folder.mkdir(exist_ok=True)

def save_json(data, path):
    """Write *data* to *path* as pretty-printed UTF-8 JSON.

    The document is rendered in memory before the target file is opened, so
    a serialization failure is raised without truncating or partially
    overwriting a file that already exists at *path*.

    Args:
        data: Object accepted by ``json.dumps``.
        path: Destination file (``str`` or path-like); overwritten on success.

    Raises:
        TypeError: If *data* contains a value that is not JSON-serializable.
        ValueError: If *data* contains a circular reference.
        OSError: If the file cannot be opened or written.
    """
    # ensure_ascii=False keeps non-ASCII text (e.g. Cyrillic) readable in the
    # output instead of escaping it as \uXXXX sequences.
    payload = json.dumps(data, indent=4, ensure_ascii=False)
    with open(path, 'w', encoding='utf-8') as f:
        f.write(payload)

# =============================================================================
# 1. Data loading and basic EDA
# =============================================================================

# The dataset is read from the current working directory.
DATA_PATH = Path('S06-hw-dataset-01.csv')
df = pd.read_csv(DATA_PATH)


def _looks_categorical(series):
    """Return True for a column with at most 10 distinct values whose dtype
    is object, category, int64 or int32."""
    if series.nunique() > 10:
        return False
    dtype = series.dtype
    return bool(
        dtype == 'object'
        or dtype.name == 'category'
        or dtype in ['int64', 'int32']
    )


# 'id' and 'target' are dropped from X below, so they are never features.
feature_cols = [col for col in df.columns if col not in ['id', 'target']]

# Every feature is either categorical (by the rule above) or numeric.
cat_cols = [col for col in feature_cols if _looks_categorical(df[col])]
num_cols = [col for col in feature_cols if col not in cat_cols]

y = df['target']
X = df.drop(columns=['id', 'target'])

# =============================================================================
# 2. Train / test split
# =============================================================================

# A quarter of the rows is held out for testing; the split is stratified on y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=RANDOM_STATE
)

# =============================================================================
# 3. Preprocessing
# =============================================================================

# Numeric columns are standardized, categorical columns are one-hot encoded
# into a dense array.
numeric_step = ('num', StandardScaler(), num_cols)
categorical_step = (
    'cat',
    OneHotEncoder(handle_unknown='ignore', sparse_output=False),
    cat_cols,
)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# =============================================================================
# 4. Containers for results and the shared CV splitter
# =============================================================================

# metrics: test-set scores per model name
# search_summaries: grid-search outcome per tuned model
# models: fitted estimators per model name
metrics, search_summaries, models = {}, {}, {}

# One splitter object is reused by every grid search below.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# =============================================================================
# 5. Baseline models
# =============================================================================

# Dummy baseline: always predicts the most frequent class.
dummy = DummyClassifier(strategy='most_frequent', random_state=RANDOM_STATE)
dummy.fit(X_train, y_train)
y_pred, y_proba = dummy.predict(X_test), dummy.predict_proba(X_test)[:, 1]

# zero_division=0: a constant predictor may never output the positive class,
# in which case F1 is reported as 0.
metrics['Dummy'] = dict(
    accuracy=accuracy_score(y_test, y_pred),
    f1=f1_score(y_test, y_pred, zero_division=0),
    roc_auc=roc_auc_score(y_test, y_proba),
)

# Logistic regression behind the shared preprocessing step.
logreg_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=2000, random_state=RANDOM_STATE)),
])
logreg_pipe.fit(X_train, y_train)
models['LogReg'] = logreg_pipe

y_pred, y_proba = (
    logreg_pipe.predict(X_test),
    logreg_pipe.predict_proba(X_test)[:, 1],
)
metrics['LogReg'] = dict(
    accuracy=accuracy_score(y_test, y_pred),
    f1=f1_score(y_test, y_pred),
    roc_auc=roc_auc_score(y_test, y_proba),
)

# =============================================================================
# 6. Decision Tree
# =============================================================================

# Depth, minimum leaf size and cost-complexity pruning strength to search over.
param_grid_tree = {
    'max_depth': [3, 5, 7, 10],
    'min_samples_leaf': [1, 5, 10],
    'ccp_alpha': [0.0, 0.001, 0.01]
}

# max_depth is not set on the base estimator: every grid candidate supplies
# its own value, so a constructor setting would never take effect.
grid_tree = GridSearchCV(
    DecisionTreeClassifier(random_state=RANDOM_STATE),
    param_grid_tree, cv=cv, scoring='roc_auc', n_jobs=-1
)
grid_tree.fit(X_train, y_train)

# Keep the refitted best tree and a summary of the search.
models['Tree'] = grid_tree.best_estimator_
search_summaries['Tree'] = {
    'best_params': grid_tree.best_params_,
    'cv_roc_auc': round(grid_tree.best_score_, 4)
}

# Test-set metrics; column 1 of predict_proba is the positive-class score.
y_pred = models['Tree'].predict(X_test)
y_proba = models['Tree'].predict_proba(X_test)[:, 1]
metrics['Tree'] = {
    'accuracy': accuracy_score(y_test, y_pred),
    'f1': f1_score(y_test, y_pred),
    'roc_auc': roc_auc_score(y_test, y_proba)
}

# Tree visualization (top 3 levels only, to keep the figure readable).
# feature_names is passed as a plain list: some scikit-learn releases validate
# this parameter strictly and reject a pandas Index.
plt.figure(figsize=(14, 8))
plot_tree(models['Tree'], max_depth=3, filled=True,
          feature_names=list(X.columns), class_names=['0', '1'],
          rounded=True, fontsize=10)
plt.title("Decision Tree (первые 3 уровня)")
plt.savefig(ARTIFACTS_DIR / 'figures' / 'tree_visualization.png', dpi=150, bbox_inches='tight')
plt.close()

# =============================================================================
# 7. Random Forest
# =============================================================================

# Forest size, tree depth and minimum leaf size to search over.
param_grid_rf = dict(
    n_estimators=[100, 200],
    max_depth=[5, 10, 15],
    min_samples_leaf=[1, 5],
)

# oob_score=True makes the refitted forest expose oob_score_, recorded below.
rf_base = RandomForestClassifier(oob_score=True, random_state=RANDOM_STATE)
grid_rf = GridSearchCV(
    estimator=rf_base,
    param_grid=param_grid_rf,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
)
grid_rf.fit(X_train, y_train)

models['RF'] = grid_rf.best_estimator_
search_summaries['RF'] = dict(
    best_params=grid_rf.best_params_,
    cv_roc_auc=round(grid_rf.best_score_, 4),
    oob_score=round(models['RF'].oob_score_, 4),
)

# Test-set evaluation of the best forest.
y_pred, y_proba = (
    models['RF'].predict(X_test),
    models['RF'].predict_proba(X_test)[:, 1],
)
metrics['RF'] = dict(
    accuracy=accuracy_score(y_test, y_pred),
    f1=f1_score(y_test, y_pred),
    roc_auc=roc_auc_score(y_test, y_proba),
)

# =============================================================================
# 8. HistGradientBoosting
# =============================================================================

# Number of boosting iterations, tree depth and learning rate to search over.
param_grid_hgb = dict(
    max_iter=[100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.05, 0.1],
)

hgb_base = HistGradientBoostingClassifier(random_state=RANDOM_STATE)
grid_hgb = GridSearchCV(
    estimator=hgb_base,
    param_grid=param_grid_hgb,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
)
grid_hgb.fit(X_train, y_train)

models['HGB'] = grid_hgb.best_estimator_
search_summaries['HGB'] = dict(
    best_params=grid_hgb.best_params_,
    cv_roc_auc=round(grid_hgb.best_score_, 4),
)

# Test-set evaluation of the best boosting model.
y_pred, y_proba = (
    models['HGB'].predict(X_test),
    models['HGB'].predict_proba(X_test)[:, 1],
)
metrics['HGB'] = dict(
    accuracy=accuracy_score(y_test, y_pred),
    f1=f1_score(y_test, y_pred),
    roc_auc=roc_auc_score(y_test, y_proba),
)

# =============================================================================
# 9. Stacking Classifier
# =============================================================================

# Base learners: the three tuned tree-based models from the sections above.
estimators = [
    (label, models[key])
    for label, key in (('tree', 'Tree'), ('rf', 'RF'), ('hgb', 'HGB'))
]

# A logistic regression combines the base learners' outputs.
meta_learner = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
stack = StackingClassifier(
    estimators=estimators,
    final_estimator=meta_learner,
    cv=5,
    n_jobs=-1,
)
stack.fit(X_train, y_train)
models['Stacking'] = stack

# Test-set evaluation of the stacked ensemble.
y_pred, y_proba = stack.predict(X_test), stack.predict_proba(X_test)[:, 1]
metrics['Stacking'] = dict(
    accuracy=accuracy_score(y_test, y_pred),
    f1=f1_score(y_test, y_pred),
    roc_auc=roc_auc_score(y_test, y_proba),
)

# =============================================================================
# 10. Comparison of results
# =============================================================================

# One row per model, one column per metric, rounded to 4 decimals.
metrics_df = pd.DataFrame(metrics).T.round(4)

# Best model by test ROC-AUC. Only names that have a stored estimator are
# candidates: 'Dummy' is scored in `metrics` but never added to `models`, so
# ranking every row could select it and make the lookup below raise KeyError.
candidates = [name for name in metrics_df.index if name in models]
best_model_name = metrics_df.loc[candidates, 'roc_auc'].idxmax()
best_model = models[best_model_name]

# =============================================================================
# 11. Plots for the best model
# =============================================================================

y_pred_best = best_model.predict(X_test)
y_proba_best = best_model.predict_proba(X_test)[:, 1]

figures_dir = ARTIFACTS_DIR / 'figures'

# ROC curve, with the diagonal drawn as the random-chance reference.
fpr, tpr, _ = roc_curve(y_test, y_proba_best)
best_auc = roc_auc_score(y_test, y_proba_best)

plt.figure(figsize=(7, 6))
plt.plot(fpr, tpr, linewidth=2, label=f'ROC AUC = {best_auc:.4f}')
plt.plot([0, 1], [0, 1], 'k--', label='Random chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title(f'ROC-кривая — {best_model_name}')
plt.legend(loc='lower right')
plt.grid(True, alpha=0.3)
plt.savefig(figures_dir / 'roc_curve.png', dpi=150)
plt.close()

# Confusion matrix of the hard predictions.
cm = confusion_matrix(y_test, y_pred_best)
disp = ConfusionMatrixDisplay(cm, display_labels=['Class 0', 'Class 1'])
disp.plot(cmap='Blues')
plt.title(f'Confusion Matrix — {best_model_name}')
plt.tight_layout()
plt.savefig(figures_dir / 'confusion_matrix.png', dpi=150)
plt.close()

# =============================================================================
# 12. Permutation importance for the best model
# =============================================================================

# Computed on the test split against the raw input columns, scored by ROC-AUC.
perm_imp = permutation_importance(
    best_model, X_test, y_test,
    scoring='roc_auc',
    n_repeats=10,
    n_jobs=-1,
    random_state=RANDOM_STATE,
)

# Column indices of the (at most) 15 largest mean importances, descending.
imp_mean = perm_imp.importances_mean
sorted_idx = np.argsort(imp_mean)[::-1][:15]

bar_positions = range(len(sorted_idx))
bar_labels = X.columns[sorted_idx]

plt.figure(figsize=(10, 6))
plt.barh(bar_positions, imp_mean[sorted_idx], align='center')
plt.yticks(bar_positions, bar_labels)
plt.xlabel('Среднее падение ROC-AUC')
plt.title(f'Permutation Importance — {best_model_name} (top-15)')
plt.tight_layout()
plt.savefig(ARTIFACTS_DIR / 'figures' / 'permutation_importance.png', dpi=150)
plt.close()

# =============================================================================
# 13. Persisting the best results
# =============================================================================

joblib.dump(best_model, ARTIFACTS_DIR / 'best_model.joblib')

# Metadata describing the selected model. Models that were not tuned with a
# grid search (LogReg, Stacking) have no entry in search_summaries, hence the
# empty-dict fallback.
# NOTE(review): 'date' is a hard-coded literal — confirm whether it should be
# generated at run time.
best_test_metrics = metrics_df.loc[best_model_name].to_dict()
best_search_summary = search_summaries.get(best_model_name, {})
meta = {
    'best_model': best_model_name,
    'test_metrics': best_test_metrics,
    'search_summary': best_search_summary,
    'random_state': RANDOM_STATE,
    'dataset': 'S06-hw-dataset-01.csv',
    'date': '2026-01-17',
    'categorical_features': cat_cols,
    'numerical_features': num_cols,
}

# Write the three JSON artifacts: all test metrics, all search summaries,
# and the best-model metadata.
for payload, filename in (
    (metrics_df.to_dict(orient='index'), 'metrics_test.json'),
    (search_summaries, 'search_summaries.json'),
    (meta, 'best_model_meta.json'),
):
    save_json(payload, ARTIFACTS_DIR / filename)