In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde


from imblearn.over_sampling import RandomOverSampler
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    log_loss,
    confusion_matrix,
    classification_report
)
import time
import joblib
import multiprocessing
multiprocessing.set_start_method("spawn", force=True)

np.random.seed(0)

In [9]:
df = pd.read_csv("Dataset_2_DATA.csv", encoding='cp1251')[lambda df: df['DAGSOORT'] == 1]
df_d = pd.read_csv("Dataset_2_DESCRIPTION.csv", encoding='cp1251')

## Подготовка данных

### 1. Обработка пропущенных значений 

In [10]:
# Проверяем датафрейм на содержание NaN-значений
has_any_missing = df.isna().any().any()
print("Есть ли NaN в датафрейме?", has_any_missing)

# Может показаться, что пропусков нет, но если проверить столбцы на соотвествие типу object можно понять,
# что два столбца потенциально могут содержать "нетипичные" пропуски.
print(df.dtypes[df.dtypes != 'int64'])

# Проверим уникальные значения столбцов KREISDUUR и PARKEERKOSTEN
for col in df.select_dtypes(include=['object']).columns:
    print(f"Уникальные значения в {col}:")
    print(df[col].unique())

# Бинго! Пропуски в наборе данных обозначаются одним пробелом(' ')

Есть ли NaN в датафрейме? False
KREISDUUR        object
PARKEERKOSTEN    object
dtype: object
Уникальные значения в KREISDUUR:
['11' '8' '7' '5' '4' '3' '10' '6' '2' '9' ' ' '1']
Уникальные значения в PARKEERKOSTEN:
['0' '9' '3' '11' ' ' '13' '5' '4' '14' '12' '16' '10' '7' '2' '6' '1' '8']


In [11]:
# Заменяем пропуски на NaN-значение
df = df.replace({' ': np.nan})

# Функция для замены пропусков модой (или средним) значением
def fill_with_mode(group):
    mode_vals = group.mode()
    if not mode_vals.empty:
        return group.fillna(mode_vals.iloc[0])
    return group.fillna(group.mean())

# Заменяем пропуски в найденных столбцах
cols_to_fill = ['KREISDUUR', 'PARKEERKOSTEN']
for col in cols_to_fill:
    df[col] = df.groupby('PERSID')[col].transform(fill_with_mode)

### 2. Удаление дубликатов

In [12]:
df = df.drop_duplicates()
display(df) # Как можно заметить, количество строк не изменилось
print(df.info())

Unnamed: 0,HHID,PERSID,VPLID,RitID,HH_VALID,P_VALID,KHVM,WEGGEWEEST,VERTREKP,AANTVPL,VPLDAGNR,VERPL,VERPLNR,TOER,TOER_TYPE,AANTRIT,KMOTIEF,VERTPROV,AANKPROV,KAFSTV,KVERTTIJD,KREISDUUR,ROLAUTO,PARKEERKOSTEN,VERTRAGING,DAGSOORT,OORDEEL_AUTO,OORDEEL_TREIN,OORDEEL_BTM,OORDEEL_FIETS,OORDEEL_BROMMER,OORDEEL_LOPEN,GEBRUIK_AUTO_STELLING3,GEBRUIK_AUTO_STELLING4,GEBRUIK_TREIN_STELLING3,GEBRUIK_TREIN_STELLING4,GEBRUIK_BTM_STELLING3,GEBRUIK_BTM_STELLING4,GEBRUIK_FIETS_STELLING2,GEBRUIK_FIETS_STELLING3,GEBRUIK_LOPEN_STELLING3,GEBRUIK_LOPEN_STELLING4,HHPERS,HHSAM,N_KIND,HHBRUTOINK2_w5,HHAUTO_N,HHAUTO,HHBESTEL,HHHYBRID,HHMOTOR,HHBROM,HHSNOR,HHFIETS,HHVOUWFIETS,HHEBIKE
0,30055622,3005562201,13957101,1395710101,2,3,2,1,2,2,1,3,1,0,9,1,2,0,0,13,8,11,2,0,0,1,4,3,3,4,4,4,4,4,3,4,3,4,4,3,2,4,3,3,0,2,3,1,0,0,1,0,0,1,0,1
1,30055622,3005562201,13957102,1395710201,2,3,2,1,2,2,1,3,2,0,9,1,2,0,0,13,13,8,2,0,0,1,4,3,3,4,4,4,4,4,3,4,3,4,4,3,2,4,3,3,0,2,3,1,0,0,1,0,0,1,0,1
2,30055622,3005562201,13957201,1395720101,2,3,1,1,2,4,2,3,1,0,9,1,2,0,0,8,5,7,1,0,0,1,4,3,3,4,4,4,4,4,3,4,3,4,4,3,2,4,3,3,0,2,3,1,0,0,1,0,0,1,0,1
3,30055622,3005562201,13957202,1395720201,2,3,1,1,2,4,2,3,2,0,9,1,9,0,0,8,8,5,1,0,0,1,4,3,3,4,4,4,4,4,3,4,3,4,4,3,2,4,3,3,0,2,3,1,0,0,1,0,0,1,0,1
7,30356310,3035631001,17204101,1720410101,2,3,8,1,2,1,1,3,1,1,6,3,8,0,0,8,5,11,0,0,0,1,4,3,3,4,2,4,4,4,2,4,2,4,4,4,2,4,2,2,0,4,2,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7304,30862294,3086229401,19042102,1904210201,2,3,1,1,2,5,1,1,2,0,9,1,1,27,20,13,2,10,1,0,0,1,3,4,3,5,3,5,4,3,3,4,2,3,5,5,2,4,1,1,0,3,1,1,0,0,0,0,0,1,0,0
7306,30862294,3086229401,19042103,1904210301,2,3,7,1,2,5,1,1,3,0,9,1,7,20,20,1,7,2,0,0,0,1,3,4,3,5,3,5,4,3,3,4,2,3,5,5,2,4,1,1,0,3,1,1,0,0,0,0,0,1,0,0
7307,30862294,3086229401,19042104,1904210401,2,3,7,1,2,5,1,1,4,0,9,1,1,20,20,1,8,2,0,0,0,1,3,4,3,5,3,5,4,3,3,4,2,3,5,5,2,4,1,1,0,3,1,1,0,0,0,0,0,1,0,0
7308,30862294,3086229401,19042201,1904220101,2,3,1,1,2,3,2,1,1,0,9,1,1,22,20,9,3,6,1,0,3,1,3,4,3,5,3,5,4,3,3,4,2,3,5,5,2,4,1,1,0,3,1,1,0,0,0,0,0,1,0,0


<class 'pandas.core.frame.DataFrame'>
Index: 5700 entries, 0 to 7309
Data columns (total 56 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   HHID                     5700 non-null   int64 
 1   PERSID                   5700 non-null   int64 
 2   VPLID                    5700 non-null   int64 
 3   RitID                    5700 non-null   int64 
 4   HH_VALID                 5700 non-null   int64 
 5   P_VALID                  5700 non-null   int64 
 6   KHVM                     5700 non-null   int64 
 7   WEGGEWEEST               5700 non-null   int64 
 8   VERTREKP                 5700 non-null   int64 
 9   AANTVPL                  5700 non-null   int64 
 10  VPLDAGNR                 5700 non-null   int64 
 11  VERPL                    5700 non-null   int64 
 12  VERPLNR                  5700 non-null   int64 
 13  TOER                     5700 non-null   int64 
 14  TOER_TYPE                5700 non-null   int6

### 3. Приведение типов данных

In [13]:
# Для качественной работы LightAutoML нужно перевести int64-столбцы к category
# Получение признаков и их типов
var_types = df_d[['Variable', 'Measurement_level']].drop_duplicates()

# Сопоставим с колонками датасета
matched_columns = pd.DataFrame(df.columns, columns=['Variable'])
merged = matched_columns.merge(var_types, on='Variable', how='left')

nominal_cols = list(merged[merged['Measurement_level'] == 'Nominal']['Variable'])
ordinal_cols = list(merged[merged['Measurement_level'] == 'Ordinal']['Variable'])

id_cols = ['HHID', 'PERSID', 'VPLID', 'RitID']
df = df.drop(columns=id_cols, errors='ignore')

for col in nominal_cols + ordinal_cols:
    if col in df.columns:
        df[col] = df[col].astype('category')

print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 5700 entries, 0 to 7309
Data columns (total 52 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   HH_VALID                 5700 non-null   category
 1   P_VALID                  5700 non-null   category
 2   KHVM                     5700 non-null   category
 3   WEGGEWEEST               5700 non-null   category
 4   VERTREKP                 5700 non-null   category
 5   AANTVPL                  5700 non-null   category
 6   VPLDAGNR                 5700 non-null   category
 7   VERPL                    5700 non-null   category
 8   VERPLNR                  5700 non-null   category
 9   TOER                     5700 non-null   category
 10  TOER_TYPE                5700 non-null   category
 11  AANTRIT                  5700 non-null   category
 12  KMOTIEF                  5700 non-null   category
 13  VERTPROV                 5700 non-null   category
 14  AANKPROV     

## 4. Разбиение на train/test выборки

In [14]:
target_name = 'KHVM'
y = df[target_name]
X = df.drop(columns=[target_name])


# Стратифицированное деление, чтобы сохранить баланс классов
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y
)


ros = RandomOverSampler(random_state=0)
X_res, y_res = ros.fit_resample(X_train, y_train)

# Объединяем обратно
train_data = X_res.copy()
train_data[target_name] = y_res

print(train_data)


     HH_VALID P_VALID WEGGEWEEST VERTREKP  ... HHFIETS HHVOUWFIETS HHEBIKE KHVM
0           2       3          1        1  ...       1           0       0    2
1           2       3          1        1  ...       1           0       0    1
2           2       3          1        1  ...       1           0       0    3
3           2       3          1        1  ...       1           0       0    2
4           2       3          1        1  ...       1           0       1    1
...       ...     ...        ...      ...  ...     ...         ...     ...  ...
8819        2       3          1        1  ...       1           1       1    8
8820        2       3          1        1  ...       1           1       1    8
8821        2       3          1        1  ...       1           0       0    8
8822        2       3          1        1  ...       0           0       0    8
8823        2       3          1        1  ...       1           0       0    8

[8824 rows x 52 columns]


## AutoML (LightAutoML)

Для реализации AutoML-модели я выбрал библиотеку LAMA от братьев наших меньших из сбера.
Далее идет описание AutoML-пайплайна

In [None]:
categorical_columns = list(train_data.select_dtypes(include=['category']).columns)

task = Task('multiclass', metric='crossentropy')

roles = {
    'target': 'KHVM'
}

automl = TabularAutoML(
    task=task,
    timeout=1500,
    cpu_limit=6,
    memory_limit=10000,
    reader_params={
        'n_jobs': 4,
        'cv': 5
    },
    lgb_params={
        'default_params': {
            'metric': 'multi_logloss',
        }
    },
    tuning_params={
        'max_iters': 20,
    }
)

start = time.time()
oof_preds = automl.fit_predict(train_data, roles=roles, verbose=1)
print("⏱ Обучение заняло:", round(time.time() - start, 2), "секунд")

joblib.dump(automl, "automl_khvm_model.joblib")
print("✅ Модель сохранена в 'automl_khvm_model.joblib'")


[14:58:54] Start automl [1mutilizator[0m with listed constraints:
[14:58:54] - time: 1500.00 seconds
[14:58:54] - CPU: 6 cores
[14:58:54] - memory: 10000 GB

[14:58:54] [1mIf one preset completes earlier, next preset configuration will be started[0m

[14:58:54] Start 0 automl preset configuration:
[14:58:54] [1mconf_0_sel_type_0.yml[0m, random state: {'reader_params': {'random_state': 42}, 'nn_params': {'random_state': 42}, 'general_params': {'return_all_predictions': False}}
[14:58:54] Stdout logging level is INFO.
[14:58:54] Task: multiclass

[14:58:54] Start automl preset with listed constraints:
[14:58:54] - time: 1500.00 seconds
[14:58:54] - CPU: 6 cores
[14:58:54] - memory: 10000 GB

[14:58:54] [1mTrain data shape: (8824, 52)[0m

[14:58:57] Layer [1m1[0m train process start. Time left 1496.63 secs
[14:58:58] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
[14:59:19] Fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m finished. score = [1m-0.08689711960926995[0m
[14:59:1

Optimization Progress:  15%|█▍        | 15/101 [02:31<14:28, 10.10s/it, best_trial=3, best_value=-0.0363]

[15:02:32] Hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m completed
[15:02:32] Start fitting [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m ...





[15:02:45] Fitting [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m finished. score = [1m-0.034090747644993646[0m
[15:02:45] [1mLvl_0_Pipe_1_Mod_1_Tuned_LightGBM[0m fitting and predicting completed
[15:02:45] Start fitting [1mLvl_0_Pipe_1_Mod_2_CatBoost[0m ...
[15:04:02] Fitting [1mLvl_0_Pipe_1_Mod_2_CatBoost[0m finished. score = [1m-0.03472071131500964[0m
[15:04:02] [1mLvl_0_Pipe_1_Mod_2_CatBoost[0m fitting and predicting completed
[15:04:02] Start hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m ... Time budget is 300.00 secs


Optimization Progress:  17%|█▋        | 17/101 [05:11<25:37, 18.30s/it, best_trial=4, best_value=-0.0342]

[15:09:13] Hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m completed
[15:09:13] Start fitting [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m ...





[15:11:12] Fitting [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m finished. score = [1m-0.03222211634483694[0m
[15:11:12] [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m fitting and predicting completed
[15:11:12] Time left 761.68 secs

[15:11:12] [1mLayer 1 training completed.[0m

[15:11:12] Layer [1m2[0m train process start. Time left 761.67 secs
[15:11:13] Start fitting [1mLvl_1_Pipe_0_Mod_0_LinearL2[0m ...
[15:11:22] Fitting [1mLvl_1_Pipe_0_Mod_0_LinearL2[0m finished. score = [1m-0.04007992077043304[0m
[15:11:22] [1mLvl_1_Pipe_0_Mod_0_LinearL2[0m fitting and predicting completed
[15:11:22] Time left 751.60 secs

[15:11:23] Start fitting [1mLvl_1_Pipe_1_Mod_0_LightGBM[0m ...
[15:11:37] Fitting [1mLvl_1_Pipe_1_Mod_0_LightGBM[0m finished. score = [1m-0.038476916982713924[0m
[15:11:37] [1mLvl_1_Pipe_1_Mod_0_LightGBM[0m fitting and predicting completed
[15:11:37] Time left 736.90 secs

[15:11:37] [1mLayer 2 training completed.[0m

[15:11:37] Blending: optimization starts wi

Далее тестируем модель и расчитываем метрики

In [22]:
# Предсказания
X_test_clean = X_test.copy()
test_preds = automl.predict(X_test_clean)

# Получаем метки классов в правильном порядке
class_labels = np.array(test_preds.features)  # список меток классов

# Обработка предсказаний
y_proba = test_preds.data  # вероятности
y_pred_idx = y_proba.argmax(axis=1)  # индексы наиболее вероятных классов
y_pred = class_labels[y_pred_idx]    # индексы → реальные метки

# Приводим y_true к тому же типу
y_true = y_test.astype(class_labels.dtype)

# Метрики
print("\n📊 Основные метрики:")
print(f"Accuracy:      {accuracy_score(y_true, y_pred):.4f}")
print(f"F1 Macro:      {f1_score(y_true, y_pred, average='macro'):.4f}")
print(f"F1 Weighted:   {f1_score(y_true, y_pred, average='weighted'):.4f}")
print(f"Log Loss:      {log_loss(y_true, y_proba, labels=class_labels):.4f}")

print("\n🧩 Матрица ошибок:")
print(confusion_matrix(y_true, y_pred, labels=class_labels))

print("\n📝 Классификационный отчёт:")
print(classification_report(y_true, y_pred, labels=class_labels))



📊 Основные метрики:
Accuracy:      0.9497
F1 Macro:      0.9187
F1 Weighted:   0.9489
Log Loss:      0.1500

🧩 Матрица ошибок:
[[473   0   0   0   0   0   0   0]
 [  0 109   0   0   0   0   0   0]
 [  0   0 199   0   0   0   0   0]
 [  0   0   0 217   0   0   0   0]
 [  0   0   0   0  19  11   1   0]
 [  0   0   0   0   0 341  26   0]
 [  0   0   0   0   0  32 231   0]
 [  0   0   0   0   0  15   1  35]]

📝 Классификационный отчёт:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00       473
           2       1.00      1.00      1.00       109
           3       1.00      1.00      1.00       199
           4       1.00      1.00      1.00       217
           5       1.00      0.61      0.76        31
           6       0.85      0.93      0.89       367
           7       0.89      0.88      0.89       263
           8       1.00      0.69      0.81        51

    accuracy                           0.95      1710
   macro avg       0.

In [19]:
print("y_true unique:", np.unique(y_true))
print("y_pred unique:", np.unique(y_pred))

y_true unique: [1 2 3 4 5 6 7 8]
y_pred unique: [1 2 3 4 5 6 7 8]
