## Предсказание вида поломки машин

In [1]:
import os
import shutil
import warnings

# Silence FutureWarning / UserWarning messages for the rest of the notebook.
warnings.simplefilter("ignore", FutureWarning)
warnings.simplefilter("ignore", UserWarning)

# Kaggle input directory with the competition data and the directory to copy into
input_dir = '/kaggle/input/competative-data-science-course-by-data-feeling/'
output_dir = '.'

# Copy every .csv file from the input directory into the working directory
csv_names = [name for name in os.listdir(input_dir) if name.endswith('.csv')]
for file_name in csv_names:
    shutil.copy(os.path.join(input_dir, file_name), output_dir)
    print(f'Скачан файл: {file_name}')

Скачан файл: rides_info.csv
Скачан файл: car_test.csv
Скачан файл: driver_info.csv
Скачан файл: car_train.csv
Скачан файл: fix_info.csv


In [2]:
!pip install catboost==1.2.7 -q
!pip install featuretools==1.31.0 -q
!pip install optuna==4.1.0 -q
!pip install optuna-integration==4.1.0 -q

# Скачиваем дополнительные модули из репо
!wget -q https://raw.githubusercontent.com/saspav/Introduction_Competitive_Data_Science/main/df_addons.py > /dev/null 2>&1
!wget -q https://raw.githubusercontent.com/saspav/Introduction_Competitive_Data_Science/main/print_time.py > /dev/null 2>&1
!wget -q https://raw.githubusercontent.com/saspav/Introduction_Competitive_Data_Science/main/data_process.py > /dev/null 2>&1

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.4/97.4 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import numpy as np
import pandas as pd

from catboost import CatBoostClassifier, Pool

from sklearn.model_selection import train_test_split

from data_process import RANDOM_SEED, WORK_PATH, PREDICTIONS_DIR
from data_process import DataTransform, set_all_seeds, make_predict


# Seed the random generators through the project helper from data_process.py
set_all_seeds(seed=RANDOM_SEED)

# Submission files are named '<submit_prefix>..._<max_num:03>.csv'
max_num = 0
submit_prefix = 'cb_'

numeric_columns = []
targets = ['target_reg', 'target_class']
cat_columns = ['model', 'car_type', 'fuel_type']

# Read and preprocess the data
data_cls = DataTransform(
    use_catboost=True,
    category_columns=cat_columns,
    drop_first=False,
)
train_df, test_df = data_cls.make_agg_data()

# Target-based groupings (data_cls.fit_transform / data_cls.transform) are
# intentionally not applied here.

features2drop = ['car_id']

# Columns kept out of the model: the hand-picked ones plus those the transformer reports
# (candidate that is currently NOT excluded: 'total_count')
exclude_columns = ['mode_work_type', *data_cls.exclude_columns]

model_columns = [col for col in test_df.columns.to_list()
                 if col not in exclude_columns]

# Features computed as a mode are categorical too. The list is extended in place
# (same object that was handed to DataTransform) and only then re-bound.
cat_columns.extend(col for col in model_columns
                   if col.upper().startswith('MODE_'))
cat_columns = [col for col in cat_columns if col in model_columns]

exclude_columns = features2drop + exclude_columns

print(f'Размер train_df = {train_df.shape}, test = {test_df.shape}')

Загрузка данных...
Время обработки: 9.7 сек
Сохраняем предобработанные данные...
Время обработки: 11.8 сек
Агрегация данных...
Время обработки: 8.1 сек
Сохраняем агрегированные данные...
Время обработки: 0.3 сек
Размер train_df = (2337, 139), test = (1913, 137)


In [4]:
# Feature matrix / target for the breakdown-class task
train = train_df[model_columns].drop(columns=features2drop, errors='ignore')
target = train_df['target_class']
# test_df keeps 'car_id' here: it is read again when the submission is built
test_df = test_df[model_columns].copy()

print('train.shape', train.shape,
      'пропусков:', train.isna().sum().sum())
print('test.shape',
      test_df.drop(columns=features2drop, errors='ignore').shape,
      'пропусков:', test_df.isna().sum().sum())

test_size = 0.2
stratified = ['target_class']

# Stratified hold-out split into training and validation parts
splited = tuple(train_test_split(train, target,
                                 test_size=test_size,
                                 stratify=target,
                                 random_state=RANDOM_SEED))
X_train, X_valid, y_train, y_valid = splited

print('X_train.shape', X_train.shape, 'пропусков:', X_train.isna().sum().sum())
print('X_valid.shape', X_valid.shape, 'пропусков:', X_valid.isna().sum().sum())

pool_train, pool_valid = (
    Pool(data=features, label=labels, cat_features=cat_columns)
    for features, labels in ((X_train, y_train), (X_valid, y_valid))
)

loss_function = 'MultiClass'  # multiclass classification
eval_metric = 'AUC:type=Mu'
iterations = 1000

# `iterations` is not passed to the model (as are learning_rate, task_type and
# border_count, which stay at library defaults); it only sets the early-stopping
# patience: 1/10 of the budget, or 1/20 when the budget exceeds 5000.
clf_params = {
    'cat_features': cat_columns,
    'loss_function': loss_function,
    'eval_metric': eval_metric,
    'early_stopping_rounds': iterations // (20 if iterations > 5_000 else 10),
    'random_seed': RANDOM_SEED,
}

clf = CatBoostClassifier(**clf_params)

train.shape (2337, 135) пропусков: 0
test.shape (1913, 135) пропусков: 0
X_train.shape (1869, 135) пропусков: 0
X_valid.shape (468, 135) пропусков: 0


In [5]:
# Train the model. The validation pool is the evaluation set used by the
# overfitting detector; with use_best_model=True the fitted model is shrunk to
# its best-scoring iteration. verbose=100 logs the metric every 100 iterations.
clf.fit(pool_train, eval_set=pool_valid, use_best_model=True, verbose=100)

Learning rate set to 0.109335
0:	test: 0.9966308	best: 0.9966308 (0)	total: 202ms	remaining: 3m 22s
100:	test: 1.0000000	best: 1.0000000 (5)	total: 14s	remaining: 2m 4s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 1
bestIteration = 5

Shrink model to first 6 iterations.


<catboost.core.CatBoostClassifier at 0x7f33acfc97b0>

In [6]:
# Test features with the same columns the model was trained on
test = (
    test_df[model_columns]
    .drop(columns=features2drop, errors='ignore')
    .copy()
)

# Predict class labels for the test set
predict_test = clf.predict(test)

# Write the predictions to the submission file
submit_csv = f'{submit_prefix}submit_{max_num:03}.csv'
file_submit_csv = PREDICTIONS_DIR.joinpath(submit_csv)
submission = test_df[['car_id']].assign(target_class=predict_test.flatten())
submission.to_csv(file_submit_csv, index=False)

#### На локальном ПК CatBoost выдал ROC-AUC = 1.0, на Kaggle почему-то чуть меньше.

### Возможно, подбор гиперпараметров поможет?

In [7]:
import optuna
from optuna.integration import CatBoostPruningCallback
from optuna.logging import set_verbosity, WARNING
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score

# Raise Optuna's log threshold to WARNING to suppress unneeded messages
set_verbosity(WARNING)

In [8]:
def fit_catboost(trial, train, valid):
    """Fit one multiclass CatBoost model with hyperparameters taken from ``trial``.

    Args:
        trial: Optuna trial object that supplies the hyperparameter values
            through its ``suggest_*`` methods.
        train: ``(features, labels)`` pair used for fitting.
        valid: ``(features, labels)`` pair used as the evaluation set.

    Returns:
        tuple: the fitted ``CatBoostClassifier`` and the ``predict_proba``
        output for the validation features.

    Note:
        ``check_pruned()`` is called after fitting; presumably it raises
        Optuna's pruning exception when the callback stopped the training —
        confirm against the optuna-integration documentation.
    """
    train_features, train_labels = train
    valid_features, valid_labels = valid

    train_pool = Pool(data=train_features, label=train_labels, cat_features=cat_columns)
    valid_pool = Pool(data=valid_features, label=valid_labels, cat_features=cat_columns)

    # Search space; the order of the suggest_* calls is deliberately unchanged.
    search_space = dict(
        depth=trial.suggest_int("depth", 3, 7),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.3),
        l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 0.1, 50),
        border_count=trial.suggest_int("border_count", 32, 512),
        random_strength=trial.suggest_float("random_strength", 0.1, 10),
        one_hot_max_size=trial.suggest_int("one_hot_max_size", 2, 10),
        rsm=trial.suggest_float("rsm", 0.1, 1.0),
        boosting_type=trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        bootstrap_type=trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
        leaf_estimation_method=trial.suggest_categorical("leaf_estimation_method", ["Newton", "Gradient"]),
    )

    # Parameters that are only sampled for a particular bootstrap type
    chosen_bootstrap = search_space["bootstrap_type"]
    if chosen_bootstrap == "Bayesian":
        search_space["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0.0, 20.0)
    elif chosen_bootstrap == "Bernoulli":
        search_space["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    metric_name = 'AUC:type=Mu'

    model = CatBoostClassifier(
        loss_function='MultiClass',
        eval_metric=metric_name,
        cat_features=cat_columns,
        random_seed=RANDOM_SEED,
        verbose=0,
        thread_count=-1,
        **search_space,
    )

    # The callback is given the trial and the name of the evaluation metric
    pruner = CatBoostPruningCallback(trial, metric_name)

    model.fit(
        train_pool,
        eval_set=valid_pool,
        verbose=0,
        early_stopping_rounds=50,
        callbacks=[pruner],
    )

    # Invoke pruning manually
    pruner.check_pruned()

    return model, model.predict_proba(valid_features)


def objective(trial, X_train, y_train, return_models=False, **kwargs):
    """Score one hyperparameter set with 5-fold stratified cross-validation.

    Args:
        trial: Optuna trial object handed unchanged to ``fit_catboost`` for
            every fold.
        X_train: feature table, indexed positionally with ``.iloc``.
        y_train: labels aligned with ``X_train``, indexed with ``.iloc``.
        return_models: when true, the per-fold models are returned as well.
        **kwargs: forwarded verbatim to ``fit_catboost``. As defined in this
            notebook ``fit_catboost`` takes no extra keyword arguments, so
            passing any raises ``TypeError``.

    Returns:
        The mean one-vs-rest ROC-AUC over the folds, or the tuple
        ``(mean ROC-AUC, list of fold models)`` when ``return_models`` is true.

    NOTE(review): the same ``trial`` object goes to every fold, so the pruning
    callback created in ``fit_catboost`` is attached to one trial five times —
    confirm this is the intended pruning behaviour.
    """
    splitter = StratifiedKFold(n_splits=5, random_state=RANDOM_SEED, shuffle=True)

    fold_scores = []
    fold_models = []

    for fit_rows, eval_rows in splitter.split(X_train, y_train):
        fit_part = (X_train.iloc[fit_rows, :], y_train.iloc[fit_rows])
        eval_part = (X_train.iloc[eval_rows, :], y_train.iloc[eval_rows])

        fold_model, fold_proba = fit_catboost(trial, fit_part, eval_part, **kwargs)
        fold_scores.append(roc_auc_score(eval_part[1], fold_proba, multi_class="ovr"))
        fold_models.append(fold_model)

    mean_score = np.mean(fold_scores)

    return (mean_score, fold_models) if return_models else mean_score

In [9]:
def _cv_objective(trial):
    """Objective bound to the full training table and target of this notebook."""
    return objective(trial, train, target)


# Maximise the cross-validated ROC-AUC with a seeded TPE sampler
study = optuna.create_study(
    sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED),
    direction="maximize",
)
study.optimize(_cv_objective, n_trials=50, show_progress_bar=True)

catboost_best_params = study.best_params

  0%|          | 0/50 [00:00<?, ?it/s]

In [10]:
# Display the best hyperparameter set found by the study
study.best_params

{'depth': 5,
 'learning_rate': 0.23916000908711055,
 'l2_leaf_reg': 38.201416599217566,
 'border_count': 198,
 'random_strength': 0.7012772832099687,
 'one_hot_max_size': 4,
 'rsm': 0.5642231218519764,
 'boosting_type': 'Ordered',
 'bootstrap_type': 'Bernoulli',
 'leaf_estimation_method': 'Gradient',
 'subsample': 0.5634572038278047}

In [11]:
# Re-run the cross-validation with the best hyperparameters fixed, this time
# keeping the per-fold models for the test-set ensemble.
best_trial = optuna.trial.FixedTrial(study.best_params)
catboost_best_value, catboost_best_models = objective(
    best_trial, train, target, return_models=True
)

print(f"Best CatBoost ROC-AUC: {catboost_best_value}")

Best CatBoost ROC-AUC: 0.9999941360570566


In [12]:
# Получим предсказания на тестовой выборке
preds = [catboost_model.predict_proba(test)
         for catboost_model in catboost_best_models]

mean_preds = np.mean(preds, axis=0)
y_pred = np.argmax(mean_preds, axis=1)
class_mapping = dict(zip(range(y_train.nunique()), sorted(y_train.unique())))
predict_test = np.vectorize(class_mapping.get)(y_pred)
# Сохранение предсказаний в файл
submit_csv = f'{submit_prefix}tuned_submit_{max_num:03}.csv'
file_submit_csv = PREDICTIONS_DIR.joinpath(submit_csv)
submission = pd.DataFrame({'car_id': test_df['car_id'], 'target_class': predict_test.flatten()})
submission.to_csv(file_submit_csv, index=False)

In [13]:
from IPython.display import display, FileLink
from zipfile import ZipFile, ZIP_DEFLATED as ZD
from glob import glob

# Bundle every prediction CSV into one archive and show a download link
files = glob(f'{PREDICTIONS_DIR}/*.csv')
zip_filename = WORK_PATH.joinpath('predictions.zip')

with ZipFile(zip_filename, mode='w', compression=ZD, compresslevel=7) as archive:
    for csv_path in files:
        print(csv_path)
        archive.write(csv_path)

FileLink(zip_filename)

predictions/cb_submit_000.csv
predictions/cb_tuned_submit_000.csv
