## Задание 3

Цель задания: 
 - обработка кардиологического дата-сета для решения задач бинарной классификации


Ключевой навык:
 - применение AutoML моделей, расчет точности классификатора 

Датасет: Необходимо загрузить датасет, расположенный по адресу (см. файлы с названием «модуль 3….»): https://github.com/AI-is-out-there/data2lab.git.  

#### 1.	
Сформировать обучающую выборку из загруженного датасета, состоящую из столбцов: ['Count_subj', 'rr_interval', 'p_end', 'qrs_onset', 'qrs_end', 'p_axis', 'qrs_axis', 't_axis', 'Healthy_Status']. Датасет состоит из числовых параметров ЭКГ и классификационного признака Healthy_Status. 

In [1]:
import pandas as pd
from urllib.parse import quote

from sklearn.metrics import confusion_matrix, f1_score, classification_report, roc_auc_score
import numpy as np

import logging

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [2]:
# Raw-content root of the course repository and the dataset file name.
# NOTE(review): the task text refers to files named "модуль 3…", while this
# cell loads a "модуль 2 …" file — confirm the intended dataset.
base_url = "https://raw.githubusercontent.com/AI-is-out-there/data2lab/main/"
filename = "модуль 2 - датасет - практика.csv"
# Percent-encode the file name (Cyrillic letters, spaces) before building the URL.
encoded_url = f"{base_url}{quote(filename)}"

data = pd.read_csv(encoded_url)
print(data.shape)
data.head()

(10000, 36)


Unnamed: 0,subject_id,Count_subj,study_id,cart_id,Healthy_Status,eeg_time,eeg_date,report_0,report_1,report_2,...,filtering,rr_interval,p_onset,p_end,qrs_onset,qrs_end,t_end,p_axis,qrs_axis,t_axis
0,19557662,27,40000017,6848296,0,8:44 AM,27.06.2015,Sinus rhythm,Possible right atrial abnormality,,...,60 Hz notch Baseline filter,659,40,128,170,258,518,81,77,79
1,18477137,93,40000029,6848296,0,9:54 AM,27.06.2015,Sinus rhythm,Possible right atrial abnormality,,...,60 Hz notch Baseline filter,722,40,124,162,246,504,77,75,70
2,16598616,3,40000035,6376932,1,9:07 AM,28.06.2015,Sinus tachycardia,,Normal ECG except for rate,...,60 Hz notch Baseline filter,600,40,130,162,244,474,79,72,77
3,16368287,7,40000079,6214760,1,5:14 PM,15.07.2015,Sinus rhythm,,Normal ECG,...,60 Hz notch Baseline filter,659,40,146,180,254,538,79,66,69
4,18370366,2,40000084,6632385,0,1:52 PM,27.09.2015,Sinus rhythm,,,...,<not specified>,659,368,29999,504,590,868,84,80,77


In [3]:
# Columns required by the assignment: ECG parameters plus the class label.
columns = ['Count_subj', 'rr_interval', 'p_end', 'qrs_onset', 'qrs_end', 'p_axis', 'qrs_axis', 't_axis', 'Healthy_Status']
# Take an explicit copy: later cells assign columns on `data`, and writing into
# a column-subset of the loaded frame can raise SettingWithCopyWarning.
data = data[columns].copy()
target = 'Healthy_Status'

In [4]:
# Store the class label as a pandas categorical column.
data[target] = data[target].astype('category')

In [5]:
# Number of missing values in each selected column.
data.isna().sum()

Count_subj        0
rr_interval       0
p_end             0
qrs_onset         0
qrs_end           0
p_axis            0
qrs_axis          0
t_axis            0
Healthy_Status    0
dtype: int64

In [6]:
# Hold out 30% of the rows, then halve the hold-out into validation and test
# parts (7000 / 1500 / 1500 rows). The split is not stratified by the target.
train_df, temp_df = train_test_split(data, random_state=42, test_size=0.3)
val_df, test_df = train_test_split(temp_df, random_state=42, test_size=0.5)

# Separate the feature columns from the target for every split.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    (split.drop(columns=target), split[target])
    for split in (train_df, val_df, test_df)
)

In [7]:
# Missing-value counts per column; repeats the earlier check (`data` has not been reassigned since).
data.isna().sum()

Count_subj        0
rr_interval       0
p_end             0
qrs_onset         0
qrs_end           0
p_axis            0
qrs_axis          0
t_axis            0
Healthy_Status    0
dtype: int64

In [8]:
# (rows, columns) of the frame after column selection.
data.shape

(10000, 9)

In [9]:
# Column dtypes; the target is categorical after the cast above.
data.dtypes

Count_subj           int64
rr_interval          int64
p_end                int64
qrs_onset            int64
qrs_end              int64
p_axis               int64
qrs_axis             int64
t_axis               int64
Healthy_Status    category
dtype: object

In [10]:
# Class balance of the target: counts of rows per Healthy_Status value.
data[target].value_counts()

Healthy_Status
0    7993
1    2007
Name: count, dtype: int64

#### 2.
Используя выбранные ранее решения AutoML, найдите наиболее эффективное, используя такие метрики качества, как матрица ошибок (confusion matrix) и F1-метрика, для оценки обученного классификатора по признаку Healthy_Status на основе данных параметров ЭКГ. 

#### fedot

In [11]:
from fedot.core.data.data import InputData
from fedot.core.pipelines.pipeline import Pipeline
from fedot.core.pipelines.node import PipelineNode
from fedot.core.repository.tasks import Task, TaskTypesEnum

In [12]:
# Baseline graph: a 'scaling' node feeding the final 'rf' node.
# NOTE: `pipeline` is rebuilt in the next cell before anything is fitted,
# so this definition is never trained.
scaling_node = PipelineNode('scaling')
node_final = PipelineNode('rf', nodes_from=[scaling_node])
pipeline = Pipeline(node_final)

2025-04-18 16:11:02,336 - Topological features operation requires extra dependencies for time series forecasting, which are not installed. It can infuence the performance. Please install it by 'pip install fedot[extra]'


In [13]:

# Wrap every split into a FEDOT InputData object for a classification task.
train_input, val_input, test_input = (
    InputData.from_dataframe(
        features_df=split.drop(columns=target),
        target_df=split[target],
        task=Task(TaskTypesEnum.classification),
    )
    for split in (train_df, val_df, test_df)
)

# Pipeline graph: 'scaling' and 'pca' (5 components) are both direct inputs
# of the final 'rf' node.
scaling_node = PipelineNode('scaling')
pca_node = PipelineNode('pca', params={'n_components': 5})
node_final = PipelineNode('rf', nodes_from=[scaling_node, pca_node])
pipeline = Pipeline(node_final)

# Fit on the training split only.
pipeline.fit(train_input)
print('success fit')

2025-04-18 16:11:02,714 - TableTypesCorrector - Preprocessing was unable to define the categorical columns
success fit


In [14]:
# Display the fitted pipeline's summary (depth, length, node list).
pipeline

{'depth': 2, 'length': 3, 'nodes': [rf, scaling, pca]}

In [15]:
# Score the validation split and report the threshold-free ROC-AUC.
predicted = pipeline.predict(val_input)
val_auc = roc_auc_score(val_input.target, predicted.predict)
print(f"\nROC-AUC: {val_auc:.4f}")

# Scan 100 evenly spaced cut-offs in [0, 1] and keep the one with the best F1
# on validation (the first maximum wins on ties).
thresholds = np.linspace(0, 1, 100)
f1_scores = []
for cutoff in thresholds:
    labels_at_cutoff = (predicted.predict >= cutoff).astype(int)
    f1_scores.append(f1_score(val_input.target, labels_at_cutoff))
best_threshold = thresholds[np.argmax(f1_scores)]
print(f'best treshold: {best_threshold}')
binary_pred = (predicted.predict >= best_threshold).astype(int)

# Validation metrics at the selected threshold.
for caption, metric_fn in (
    ("Матрица ошибок:\n", confusion_matrix),
    ("F1-метрика:", f1_score),
    ("Полный отчет:\n", classification_report),
):
    print(caption, metric_fn(val_input.target, binary_pred))


ROC-AUC: 0.9038
best treshold: 0.24242424242424243
Матрица ошибок:
 [[955 233]
 [ 18 294]]
F1-метрика: 0.700834326579261
Полный отчет:
               precision    recall  f1-score   support

           0       0.98      0.80      0.88      1188
           1       0.56      0.94      0.70       312

    accuracy                           0.83      1500
   macro avg       0.77      0.87      0.79      1500
weighted avg       0.89      0.83      0.85      1500



In [16]:
train_predict = pipeline.predict(train_input)
train_auc = roc_auc_score(train_input.target, train_predict.predict)
print(f"\nROC-AUC: {train_auc:.4f}")

# Training-set metrics at the threshold selected on validation
# (compare with the validation numbers to gauge overfitting).
y_pred_train = (train_predict.predict >= best_threshold).astype(int)
print("\tTrain Metrics:")
for caption, metric_fn in (
    ("Confusion Matrix:\n", confusion_matrix),
    ("F1-score:", f1_score),
    ("Classification Report:\n", classification_report),
):
    print(caption, metric_fn(y_train, y_pred_train))


ROC-AUC: 1.0000
	Train Metrics:
Confusion Matrix:
 [[5429  185]
 [   0 1386]]
F1-score: 0.9374365911396686
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.97      0.98      5614
           1       0.88      1.00      0.94      1386

    accuracy                           0.97      7000
   macro avg       0.94      0.98      0.96      7000
weighted avg       0.98      0.97      0.97      7000



In [17]:
# Score the held-out test split.
predicted = pipeline.predict(test_input)
test_auc = roc_auc_score(test_input.target, predicted.predict)
print(f"\nROC-AUC: {test_auc:.4f}")

# Binarise with the threshold selected on validation (not re-tuned on test).
binary_pred = (predicted.predict >= best_threshold).astype(int)

# Test metrics.
for caption, metric_fn in (
    ("Матрица ошибок:\n", confusion_matrix),
    ("F1-метрика:", f1_score),
    ("Полный отчет:\n", classification_report),
):
    print(caption, metric_fn(test_input.target, binary_pred))



ROC-AUC: 0.9050
Матрица ошибок:
 [[930 261]
 [ 20 289]]
F1-метрика: 0.6728754365541327
Полный отчет:
               precision    recall  f1-score   support

           0       0.98      0.78      0.87      1191
           1       0.53      0.94      0.67       309

    accuracy                           0.81      1500
   macro avg       0.75      0.86      0.77      1500
weighted avg       0.89      0.81      0.83      1500



#### TPOT

In [18]:
from tpot import TPOTClassifier

In [19]:
banner = "=" * 50
print("\n" + banner + "\nTraining TPOT\n" + banner)

# TPOT search settings.
tpot_settings = {
    'generations': 5,             # number of generations
    'population_size': 20,        # population size
    'verbosity': 2,               # output level (0-3)
    'random_state': 42,
    'scoring': 'roc_auc',         # metric optimised by the search
    'n_jobs': -1,                 # use all cores
    'max_time_mins': 2,           # 2-minute time budget
    'config_dict': 'TPOT light',  # restrict the search to lightweight algorithms
}
tpot = TPOTClassifier(**tpot_settings)

# Run the search on the training split.
tpot.fit(X_train, y_train)


Training TPOT


Version 0.12.2 of tpot is outdated. Version 1.0.0 was released Wednesday February 26, 2025.


Optimization Progress:   0%|          | 0/20 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.8981548082631884

Generation 2 - Current best internal CV score: 0.8981548082631884

Generation 3 - Current best internal CV score: 0.8981548082631884

Generation 4 - Current best internal CV score: 0.8981548082631884

Generation 5 - Current best internal CV score: 0.8995407681609395

Best pipeline: DecisionTreeClassifier(BernoulliNB(input_matrix, alpha=10.0, fit_prior=False), criterion=gini, max_depth=10, min_samples_leaf=20, min_samples_split=4)


In [20]:
# Positive-class probabilities on validation and the threshold-free ROC-AUC.
val_predict = tpot.predict_proba(X_val)[:, 1]
val_auc = roc_auc_score(y_val, val_predict)
print(f"\nROC-AUC: {val_auc:.4f}")

# Scan 100 evenly spaced cut-offs in [0, 1] and keep the one with the best F1.
thresholds = np.linspace(0, 1, 100)
f1_scores = []
for cutoff in thresholds:
    labels_at_cutoff = (val_predict >= cutoff).astype(int)
    f1_scores.append(f1_score(y_val, labels_at_cutoff))
best_threshold = thresholds[np.argmax(f1_scores)]
print(f'Best threshold: {best_threshold:.4f}')

# Validation metrics at the selected threshold.
y_pred_val = (val_predict >= best_threshold).astype(int)
print("\nValidation Metrics:")
for caption, metric_fn in (
    ("Confusion Matrix:\n", confusion_matrix),
    ("F1-score:", f1_score),
    ("Classification Report:\n", classification_report),
):
    print(caption, metric_fn(y_val, y_pred_val))


ROC-AUC: 0.8850
Best threshold: 0.2525

Validation Metrics:
Confusion Matrix:
 [[955 233]
 [ 25 287]]
F1-score: 0.6899038461538461
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.80      0.88      1188
           1       0.55      0.92      0.69       312

    accuracy                           0.83      1500
   macro avg       0.76      0.86      0.79      1500
weighted avg       0.89      0.83      0.84      1500



In [21]:
train_predict = tpot.predict_proba(X_train)[:, 1]
train_auc = roc_auc_score(y_train, train_predict)
print(f"\nROC-AUC: {train_auc:.4f}")

# Training-set metrics at the threshold selected on validation.
y_pred_train = (train_predict >= best_threshold).astype(int)
print("\tTrain Metrics:")
for caption, metric_fn in (
    ("Confusion Matrix:\n", confusion_matrix),
    ("F1-score:", f1_score),
    ("Classification Report:\n", classification_report),
):
    print(caption, metric_fn(y_train, y_pred_train))


ROC-AUC: 0.9221
	Train Metrics:
Confusion Matrix:
 [[4594 1020]
 [  59 1327]]
F1-score: 0.7109563353870881
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.82      0.89      5614
           1       0.57      0.96      0.71      1386

    accuracy                           0.85      7000
   macro avg       0.78      0.89      0.80      7000
weighted avg       0.90      0.85      0.86      7000



In [22]:
# Positive-class probabilities on the held-out test split.
test_predict = tpot.predict_proba(X_test)[:, 1]
test_auc = roc_auc_score(y_test, test_predict)
print(f"\nTest ROC-AUC: {test_auc:.4f}")

# Binarise with the threshold selected on validation (not re-tuned on test).
y_pred_test = (test_predict >= best_threshold).astype(int)
print("\nTest Metrics:")
for caption, metric_fn in (
    ("Confusion Matrix:\n", confusion_matrix),
    ("F1-score:", f1_score),
    ("Classification Report:\n", classification_report),
):
    print(caption, metric_fn(y_test, y_pred_test))

# Write the code of the best pipeline found by the search to a file.
tpot.export('best_pipeline.py')


Test ROC-AUC: 0.8828

Test Metrics:
Confusion Matrix:
 [[929 262]
 [ 27 282]]
F1-score: 0.6611957796014069
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.78      0.87      1191
           1       0.52      0.91      0.66       309

    accuracy                           0.81      1500
   macro avg       0.75      0.85      0.76      1500
weighted avg       0.88      0.81      0.82      1500



#### lightautoml

In [23]:
from lightautoml.automl.presets.tabular_presets import TabularAutoML
# NOTE: this rebinds the name `Task`, which the FEDOT section imported from
# fedot.core.repository.tasks; FEDOT cells run after this point need that
# import executed again.
from lightautoml.tasks import Task
from lightgbm import LGBMClassifier  # NOTE(review): not referenced in the visible cells


# Raise the 'lightautoml' logger threshold to WARNING.
logging.getLogger('lightautoml').setLevel(logging.WARNING)

In [24]:

# LightAutoML is given an integer 0/1 target. Rebind each split to a new frame
# via `assign` instead of writing a column into the frames returned by
# train_test_split, which can raise SettingWithCopyWarning.
train_df = train_df.assign(**{target: train_df[target].astype(int)})
val_df = val_df.assign(**{target: val_df[target].astype(int)})
test_df = test_df.assign(**{target: test_df[target].astype(int)})


# Binary classification task and column roles.
task = Task('binary')
roles = {
    'target': target,
    'drop': []  # list columns to exclude here, if any
}


# NOTE(review): 'default_params' nested under `general_params` does not look
# like a documented TabularAutoML key (per-algorithm defaults are normally
# passed via `lgb_params` / `cb_params`) — confirm that the class-balancing
# options below are actually applied to the models.
automl = TabularAutoML(
    task=task,
    timeout=600,  # 10 minutes
    cpu_limit=4,
    general_params={
        'use_algos': [['lgb', 'cb']],  # LightGBM and CatBoost only
        'default_params': {
            'lgb': {
                'class_weight': 'balanced',  # account for class imbalance
                'random_state': 42
            },
            'cb': {
                'auto_class_weights': 'Balanced',
                'random_state': 42
            }
        }
    },
    reader_params={
        'cv': 5,
        'random_state': 42
    }
)

# Fit on the training split; the returned value is kept in `train_pred`.
train_pred = automl.fit_predict(train_df, roles=roles, verbose=0)

2025-04-18 16:11:13,007 - Stdout logging level is ERROR.
2025-04-18 16:11:13,008 - Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer
2025-04-18 16:11:13,009 - Task: binary

2025-04-18 16:11:13,009 - Start automl preset with listed constraints:
2025-04-18 16:11:13,009 - - time: 600.00 seconds
2025-04-18 16:11:13,010 - - CPU: 4 cores
2025-04-18 16:11:13,010 - - memory: 16 GB

2025-04-18 16:11:13,011 - [1mTrain data shape: (7000, 9)[0m

2025-04-18 16:11:15,530 - Layer [1m1[0m train process start. Time left 597.48 secs
2025-04-18 16:11:15,812 - [1mSelector_LightGBM[0m fitting and predicting completed
2025-04-18 16:11:15,841 - Start fitting [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m ...
2025-04-18 16:11:17,374 - Fitting [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m finished. score = [1m0.9161462968018009[0m
2025-04-18 16:11:17,375 - [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m fitting and predicting completed
2025-04-18 16:11:17,377 - Start fitting [1mLvl_0_P

In [25]:
# Model scores on validation and the threshold-free ROC-AUC.
val_predict = automl.predict(X_val).data
val_auc = roc_auc_score(y_val, val_predict)

print(f"\nROC-AUC: {val_auc:.4f}")

# Scan 100 evenly spaced cut-offs in [0, 1] and keep the one with the best F1.
thresholds = np.linspace(0, 1, 100)
f1_scores = []
for cutoff in thresholds:
    labels_at_cutoff = (val_predict >= cutoff).astype(int)
    f1_scores.append(f1_score(y_val, labels_at_cutoff))
best_threshold = thresholds[np.argmax(f1_scores)]
print(f'Best threshold: {best_threshold:.4f}')

# Validation metrics at the selected threshold.
y_pred_val = (val_predict >= best_threshold).astype(int)
print("\nValidation Metrics:")
for caption, metric_fn in (
    ("Confusion Matrix:\n", confusion_matrix),
    ("F1-score:", f1_score),
    ("Classification Report:\n", classification_report),
):
    print(caption, metric_fn(y_val, y_pred_val))


ROC-AUC: 0.9133
Best threshold: 0.2121

Validation Metrics:
Confusion Matrix:
 [[944 244]
 [  9 303]]
F1-score: 0.7054714784633295
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.79      0.88      1188
           1       0.55      0.97      0.71       312

    accuracy                           0.83      1500
   macro avg       0.77      0.88      0.79      1500
weighted avg       0.90      0.83      0.85      1500



In [26]:
train_predict = automl.predict(X_train).data
train_auc = roc_auc_score(y_train, train_predict)
print(f"\nROC-AUC: {train_auc:.4f}")

# Training-set metrics at the threshold selected on validation.
y_pred_train = (train_predict >= best_threshold).astype(int)
print("\tTrain Metrics:")
for caption, metric_fn in (
    ("Confusion Matrix:\n", confusion_matrix),
    ("F1-score:", f1_score),
    ("Classification Report:\n", classification_report),
):
    print(caption, metric_fn(y_train, y_pred_train))


ROC-AUC: 0.9627
	Train Metrics:
Confusion Matrix:
 [[4490 1124]
 [  16 1370]]
F1-score: 0.7061855670103093
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.80      0.89      5614
           1       0.55      0.99      0.71      1386

    accuracy                           0.84      7000
   macro avg       0.77      0.89      0.80      7000
weighted avg       0.91      0.84      0.85      7000



In [27]:
# Model scores on the held-out test split.
test_predict = automl.predict(X_test).data
test_auc = roc_auc_score(y_test, test_predict)
print(f"\nROC-AUC: {test_auc:.4f}")

# Binarise with the threshold selected on validation (not re-tuned on test).
y_pred_test = (test_predict >= best_threshold).astype(int)
print("\nTest Metrics:")
for caption, metric_fn in (
    ("Confusion Matrix:\n", confusion_matrix),
    ("F1-score:", f1_score),
    ("Classification Report:\n", classification_report),
):
    print(caption, metric_fn(y_test, y_pred_test))



ROC-AUC: 0.9070

Test Metrics:
Confusion Matrix:
 [[912 279]
 [ 14 295]]
F1-score: 0.6681766704416761
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.77      0.86      1191
           1       0.51      0.95      0.67       309

    accuracy                           0.80      1500
   macro avg       0.75      0.86      0.76      1500
weighted avg       0.89      0.80      0.82      1500



#### 3.	
Усовершенствуйте решения за счет предобработки исходных данных

#### preprocessing

In [28]:
def remove_outliers(df, column):
    """Replace IQR outliers in one column with that column's median.

    Values strictly below ``Q1 - 1.5 * IQR`` or strictly above
    ``Q3 + 1.5 * IQR`` are replaced by the column median (computed before any
    replacement). Despite the name, no rows are dropped.

    Args:
        df: pandas DataFrame that contains ``column``.
        column: name of the numeric column to clean.

    Returns:
        A copy of ``df`` with the cleaned column. The input frame is left
        untouched; the cleaned column may become float where the median is
        written.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # Vectorised outlier test; NaN compares False on both sides and is kept.
    is_outlier = (values < lower_bound) | (values > upper_bound)

    # Work on a copy so the caller's frame is never modified as a side effect.
    cleaned = df.copy()
    cleaned[column] = values.mask(is_outlier, values.median())
    return cleaned

# Clean the timing-related ECG columns one after another.
# NOTE(review): the describe() output still shows p_end == 29999 at the 75th
# percentile and the maximum, so those values are not caught by the IQR rule —
# presumably a sentinel for "not measured"; confirm and handle separately.
time_columns = ['rr_interval', 'p_end', 'qrs_onset', 'qrs_end']
for col in iter(time_columns):
    data = remove_outliers(df=data, column=col)

data[time_columns].describe()

Unnamed: 0,rr_interval,p_end,qrs_onset,qrs_end
count,10000.0,10000.0,10000.0,10000.0
mean,814.4024,8930.689,199.3193,297.5902
std,186.830027,13602.948503,23.467033,34.82657
min,314.0,0.0,128.0,183.0
25%,682.0,144.0,188.0,278.0
50%,810.0,158.0,200.0,300.0
75%,937.0,29999.0,208.0,312.0
max,1333.0,29999.0,288.0,438.0


In [29]:
# Heart rate derived from the RR interval.
# Assumes rr_interval is in milliseconds (60000 ms per minute), giving beats
# per minute — TODO confirm the unit.
rr_to_rate_factor = 60000
data['heart_rate'] = rr_to_rate_factor / data['rr_interval']
data[['rr_interval', 'heart_rate']].head()

Unnamed: 0,rr_interval,heart_rate
0,659.0,91.047041
1,722.0,83.102493
2,600.0,100.0
3,659.0,91.047041
4,659.0,91.047041


In [30]:
# Columns rescaled to [0, 1]; 'Count_subj' and the target are left as they are.
numeric_cols = ['rr_interval', 'p_end', 'qrs_onset', 'qrs_end', 'p_axis', 'qrs_axis', 't_axis', 'heart_rate']

# Min-max scaling, column by column.
# NOTE(review): min/max are taken over the whole data set before the
# train/validation/test split, so validation and test rows influence the
# scaling (data leakage); a constant column would also divide by zero.
for col in numeric_cols:
    column_values = data[col]
    low, high = column_values.min(), column_values.max()
    data[col] = (column_values - low) / (high - low)

data[numeric_cols].describe()

Unnamed: 0,rr_interval,p_end,qrs_onset,qrs_end,p_axis,qrs_axis,t_axis,heart_rate
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,0.491072,0.2977,0.445746,0.449373,0.492989,0.010672,0.012376,0.226363
std,0.183346,0.453447,0.146669,0.136575,0.20596,0.065053,0.072173,0.138569
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.361138,0.0048,0.375,0.372549,0.400692,0.005401,0.005979,0.13023
50%,0.486752,0.005267,0.45,0.458824,0.401058,0.006362,0.006768,0.198963
75%,0.611384,1.0,0.5,0.505882,0.401333,0.007423,0.007588,0.294139
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [31]:
# Re-split the preprocessed frame with the same fractions and seed as before:
# 30% hold-out, halved into validation and test parts. Not stratified.
train_df, temp_df = train_test_split(data, random_state=42, test_size=0.3)
val_df, test_df = train_test_split(temp_df, random_state=42, test_size=0.5)

# Separate the feature columns from the target for every split.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    (split.drop(columns=target), split[target])
    for split in (train_df, val_df, test_df)
)

#### 4.	
Проверьте свои модели на тестовой и валидационной выборке.

#### fedot

In [32]:
from fedot.core.data.data import InputData
from fedot.core.pipelines.pipeline import Pipeline
from fedot.core.pipelines.node import PipelineNode
from fedot.core.repository.tasks import Task, TaskTypesEnum

In [33]:
# Baseline graph: a 'scaling' node feeding the final 'rf' node.
# NOTE: `pipeline` is rebuilt in the next cell before anything is fitted,
# so this definition is never trained.
scaling_node = PipelineNode('scaling')
node_final = PipelineNode('rf', nodes_from=[scaling_node])
pipeline = Pipeline(node_final)

In [34]:

# Wrap every preprocessed split into a FEDOT InputData object (classification).
train_input, val_input, test_input = (
    InputData.from_dataframe(
        features_df=split.drop(columns=target),
        target_df=split[target],
        task=Task(TaskTypesEnum.classification),
    )
    for split in (train_df, val_df, test_df)
)

# Pipeline graph: 'scaling' and 'pca' (5 components) are both direct inputs
# of the final 'rf' node.
scaling_node = PipelineNode('scaling')
pca_node = PipelineNode('pca', params={'n_components': 5})
node_final = PipelineNode('rf', nodes_from=[scaling_node, pca_node])
pipeline = Pipeline(node_final)

# Fit on the training split only.
pipeline.fit(train_input)
print('success fit')

2025-04-18 16:11:20,302 - TableTypesCorrector - Preprocessing was unable to define the categorical columns
success fit


In [35]:
# Display the fitted pipeline's summary (depth, length, node list).
pipeline

{'depth': 2, 'length': 3, 'nodes': [rf, scaling, pca]}

In [36]:
# Score the validation split and report the threshold-free ROC-AUC.
predicted = pipeline.predict(val_input)
val_auc = roc_auc_score(val_input.target, predicted.predict)
print(f"\nROC-AUC: {val_auc:.4f}")

# Scan 100 evenly spaced cut-offs in [0, 1] and keep the one with the best F1
# on validation (the first maximum wins on ties).
thresholds = np.linspace(0, 1, 100)
f1_scores = []
for cutoff in thresholds:
    labels_at_cutoff = (predicted.predict >= cutoff).astype(int)
    f1_scores.append(f1_score(val_input.target, labels_at_cutoff))
best_threshold = thresholds[np.argmax(f1_scores)]
print(f'best treshold: {best_threshold}')
binary_pred = (predicted.predict >= best_threshold).astype(int)

# Validation metrics at the selected threshold.
for caption, metric_fn in (
    ("Матрица ошибок:\n", confusion_matrix),
    ("F1-метрика:", f1_score),
    ("Полный отчет:\n", classification_report),
):
    print(caption, metric_fn(val_input.target, binary_pred))


ROC-AUC: 0.9038
best treshold: 0.24242424242424243
Матрица ошибок:
 [[958 230]
 [ 18 294]]
F1-метрика: 0.7033492822966507
Полный отчет:
               precision    recall  f1-score   support

           0       0.98      0.81      0.89      1188
           1       0.56      0.94      0.70       312

    accuracy                           0.83      1500
   macro avg       0.77      0.87      0.79      1500
weighted avg       0.89      0.83      0.85      1500



In [37]:
train_predict = pipeline.predict(train_input)
train_auc = roc_auc_score(train_input.target, train_predict.predict)
print(f"\nROC-AUC: {train_auc:.4f}")

# Training-set metrics at the threshold selected on validation
# (compare with the validation numbers to gauge overfitting).
y_pred_train = (train_predict.predict >= best_threshold).astype(int)
print("\tTrain Metrics:")
for caption, metric_fn in (
    ("Confusion Matrix:\n", confusion_matrix),
    ("F1-score:", f1_score),
    ("Classification Report:\n", classification_report),
):
    print(caption, metric_fn(y_train, y_pred_train))


ROC-AUC: 1.0000
	Train Metrics:
Confusion Matrix:
 [[5435  179]
 [   0 1386]]
F1-score: 0.9393425957302609
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.97      0.98      5614
           1       0.89      1.00      0.94      1386

    accuracy                           0.97      7000
   macro avg       0.94      0.98      0.96      7000
weighted avg       0.98      0.97      0.97      7000



In [38]:
# Score the held-out test split.
predicted = pipeline.predict(test_input)
test_auc = roc_auc_score(test_input.target, predicted.predict)
print(f"\nROC-AUC: {test_auc:.4f}")

# Binarise with the threshold selected on validation (not re-tuned on test).
binary_pred = (predicted.predict >= best_threshold).astype(int)

# Test metrics.
for caption, metric_fn in (
    ("Матрица ошибок:\n", confusion_matrix),
    ("F1-метрика:", f1_score),
    ("Полный отчет:\n", classification_report),
):
    print(caption, metric_fn(test_input.target, binary_pred))



ROC-AUC: 0.9042
Матрица ошибок:
 [[932 259]
 [ 19 290]]
F1-метрика: 0.675990675990676
Полный отчет:
               precision    recall  f1-score   support

           0       0.98      0.78      0.87      1191
           1       0.53      0.94      0.68       309

    accuracy                           0.81      1500
   macro avg       0.75      0.86      0.77      1500
weighted avg       0.89      0.81      0.83      1500



#### TPOT

In [39]:
from tpot import TPOTClassifier

In [40]:
banner = "=" * 50
print("\n" + banner + "\nTraining TPOT\n" + banner)

# TPOT search settings (identical to the run on the raw features).
tpot_settings = {
    'generations': 5,             # number of generations
    'population_size': 20,        # population size
    'verbosity': 2,               # output level (0-3)
    'random_state': 42,
    'scoring': 'roc_auc',         # metric optimised by the search
    'n_jobs': -1,                 # use all cores
    'max_time_mins': 2,           # 2-minute time budget
    'config_dict': 'TPOT light',  # restrict the search to lightweight algorithms
}
tpot = TPOTClassifier(**tpot_settings)

# Run the search on the preprocessed training split.
tpot.fit(X_train, y_train)


Training TPOT


Version 0.12.2 of tpot is outdated. Version 1.0.0 was released Wednesday February 26, 2025.


Optimization Progress:   0%|          | 0/20 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.8949651913380368

Generation 2 - Current best internal CV score: 0.8949651913380368

Generation 3 - Current best internal CV score: 0.895529220785939

Generation 4 - Current best internal CV score: 0.895529220785939

Generation 5 - Current best internal CV score: 0.8972786022309922

Best pipeline: DecisionTreeClassifier(StandardScaler(DecisionTreeClassifier(input_matrix, criterion=gini, max_depth=5, min_samples_leaf=19, min_samples_split=9)), criterion=gini, max_depth=7, min_samples_leaf=20, min_samples_split=4)


In [41]:
# Positive-class probabilities on validation and the threshold-free ROC-AUC.
val_predict = tpot.predict_proba(X_val)[:, 1]
val_auc = roc_auc_score(y_val, val_predict)
print(f"\nROC-AUC: {val_auc:.4f}")

# Scan 100 evenly spaced cut-offs in [0, 1] and keep the one with the best F1.
thresholds = np.linspace(0, 1, 100)
f1_scores = []
for cutoff in thresholds:
    labels_at_cutoff = (val_predict >= cutoff).astype(int)
    f1_scores.append(f1_score(y_val, labels_at_cutoff))
best_threshold = thresholds[np.argmax(f1_scores)]
print(f'Best threshold: {best_threshold:.4f}')

# Validation metrics at the selected threshold.
y_pred_val = (val_predict >= best_threshold).astype(int)
print("\nValidation Metrics:")
for caption, metric_fn in (
    ("Confusion Matrix:\n", confusion_matrix),
    ("F1-score:", f1_score),
    ("Classification Report:\n", classification_report),
):
    print(caption, metric_fn(y_val, y_pred_val))


ROC-AUC: 0.8958
Best threshold: 0.2222

Validation Metrics:
Confusion Matrix:
 [[948 240]
 [ 19 293]]
F1-score: 0.693491124260355
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.80      0.88      1188
           1       0.55      0.94      0.69       312

    accuracy                           0.83      1500
   macro avg       0.77      0.87      0.79      1500
weighted avg       0.89      0.83      0.84      1500



In [42]:
train_predict = tpot.predict_proba(X_train)[:, 1]
train_auc = roc_auc_score(y_train, train_predict)
print(f"\nROC-AUC: {train_auc:.4f}")

# Training-set metrics at the threshold selected on validation.
y_pred_train = (train_predict >= best_threshold).astype(int)
print("\tTrain Metrics:")
for caption, metric_fn in (
    ("Confusion Matrix:\n", confusion_matrix),
    ("F1-score:", f1_score),
    ("Classification Report:\n", classification_report),
):
    print(caption, metric_fn(y_train, y_pred_train))


ROC-AUC: 0.9126
	Train Metrics:
Confusion Matrix:
 [[4494 1120]
 [  40 1346]]
F1-score: 0.6988577362409139
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.80      0.89      5614
           1       0.55      0.97      0.70      1386

    accuracy                           0.83      7000
   macro avg       0.77      0.89      0.79      7000
weighted avg       0.90      0.83      0.85      7000



In [43]:
# Positive-class probabilities on the held-out test split.
test_predict = tpot.predict_proba(X_test)[:, 1]
test_auc = roc_auc_score(y_test, test_predict)
print(f"\nTest ROC-AUC: {test_auc:.4f}")

# Binarise with the threshold selected on validation (not re-tuned on test).
y_pred_test = (test_predict >= best_threshold).astype(int)
print("\nTest Metrics:")
for caption, metric_fn in (
    ("Confusion Matrix:\n", confusion_matrix),
    ("F1-score:", f1_score),
    ("Classification Report:\n", classification_report),
):
    print(caption, metric_fn(y_test, y_pred_test))

# Write the code of the best pipeline found by the search to a file
# (same file name as the export in the raw-feature run, so it is overwritten).
tpot.export('best_pipeline.py')


Test ROC-AUC: 0.8913

Test Metrics:
Confusion Matrix:
 [[923 268]
 [ 14 295]]
F1-score: 0.676605504587156
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.77      0.87      1191
           1       0.52      0.95      0.68       309

    accuracy                           0.81      1500
   macro avg       0.75      0.86      0.77      1500
weighted avg       0.89      0.81      0.83      1500



#### lightautoml

In [44]:
from lightautoml.automl.presets.tabular_presets import TabularAutoML
# NOTE: this rebinds the name `Task`, which the FEDOT cells of this section
# imported from fedot.core.repository.tasks.
from lightautoml.tasks import Task
from lightgbm import LGBMClassifier  # NOTE(review): not referenced in the visible cells


# Raise the 'lightautoml' logger threshold to WARNING.
logging.getLogger('lightautoml').setLevel(logging.WARNING)

In [45]:

# Cast the target column to int in each split before handing the frames to LightAutoML.
train_df[target] = train_df[target].astype(int)
val_df[target] = val_df[target].astype(int)
test_df[target] = test_df[target].astype(int)


# Binary classification task; `roles` tells LightAutoML which column is the target.
task = Task('binary')
roles = {
    'target': target,
    'drop': []  # List columns to exclude here, if any
}


# NOTE(review): 'default_params' is passed inside general_params here; LightAutoML presets
# appear to take per-algorithm defaults via the separate lgb_params / cb_params arguments,
# so these class-weight and seed settings may be silently ignored — TODO confirm against the docs.
automl = TabularAutoML(
    task=task,
    timeout=600,  # 10 minutes
    cpu_limit=4,
    general_params={
        'use_algos': [['lgb', 'cb']],  # LightGBM and CatBoost only
        'default_params': {
            'lgb': {
                'class_weight': 'balanced',  # Account for class imbalance
                'random_state': 42
            },
            'cb': {
                'auto_class_weights': 'Balanced',
                'random_state': 42
            }
        }
    },
    reader_params={
        'cv': 5,
        'random_state': 42
    }
)

# Fit on the training frame; the returned value holds the predictions produced during fitting.
train_pred = automl.fit_predict(train_df, roles=roles, verbose=0)

2025-04-18 16:11:28,304 - Stdout logging level is ERROR.
2025-04-18 16:11:28,305 - Task: binary

2025-04-18 16:11:28,305 - Start automl preset with listed constraints:
2025-04-18 16:11:28,306 - - time: 600.00 seconds
2025-04-18 16:11:28,306 - - CPU: 4 cores
2025-04-18 16:11:28,306 - - memory: 16 GB

2025-04-18 16:11:28,307 - [1mTrain data shape: (7000, 10)[0m

2025-04-18 16:11:30,760 - Layer [1m1[0m train process start. Time left 597.55 secs
2025-04-18 16:11:31,166 - [1mSelector_LightGBM[0m fitting and predicting completed
2025-04-18 16:11:31,446 - Start fitting [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m ...
2025-04-18 16:11:32,915 - Fitting [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m finished. score = [1m0.9142753300216785[0m
2025-04-18 16:11:32,915 - [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m fitting and predicting completed
2025-04-18 16:11:32,919 - Start fitting [1mLvl_0_Pipe_0_Mod_1_CatBoost[0m ...
2025-04-18 16:11:35,242 - Fitting [1mLvl_0_Pipe_0_Mod_1_CatBoost[0m finished. score = [1m0.9

In [46]:
# Predicted scores on the validation split. `.data` is a 2-D prediction array, so
# column 0 is taken to give the metrics below a flat 1-D score vector (the same form
# the MLflow wrapper and the later comparison cells use).
val_predict = automl.predict(X_val).data[:, 0]

print(f"\nROC-AUC: {roc_auc_score(y_val, val_predict):.4f}")

# Search for the decision threshold that maximises F1 on the validation split.
thresholds = np.linspace(0, 1, 100)
# zero_division=0 keeps the score at 0.0 without an UndefinedMetricWarning for
# thresholds at which no sample is predicted positive.
f1_scores = [
    f1_score(y_val, (val_predict >= t).astype(int), zero_division=0)
    for t in thresholds
]
best_threshold = thresholds[np.argmax(f1_scores)]
print(f'Best threshold: {best_threshold:.4f}')

# Validation metrics at the selected threshold.
y_pred_val = (val_predict >= best_threshold).astype(int)
print("\nValidation Metrics:")
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred_val))
print("F1-score:", f1_score(y_val, y_pred_val))
print("Classification Report:\n", classification_report(y_val, y_pred_val))


ROC-AUC: 0.9098
Best threshold: 0.3636

Validation Metrics:
Confusion Matrix:
 [[1008  180]
 [  45  267]]
F1-score: 0.7035573122529645
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.85      0.90      1188
           1       0.60      0.86      0.70       312

    accuracy                           0.85      1500
   macro avg       0.78      0.85      0.80      1500
weighted avg       0.88      0.85      0.86      1500



In [47]:
# Predicted scores on the training split. `.data` is a 2-D prediction array, so
# column 0 is taken to work with a flat 1-D score vector.
train_predict = automl.predict(X_train).data[:, 0]
print(f"\nROC-AUC: {roc_auc_score(y_train, train_predict):.4f}")

# Train metrics at the threshold selected on the validation split.
y_pred_train = (train_predict >= best_threshold).astype(int)
print("\tTrain Metrics:")
print("Confusion Matrix:\n", confusion_matrix(y_train, y_pred_train))
print("F1-score:", f1_score(y_train, y_pred_train))
print("Classification Report:\n", classification_report(y_train, y_pred_train))


ROC-AUC: 0.9549
	Train Metrics:
Confusion Matrix:
 [[4867  747]
 [ 107 1279]]
F1-score: 0.7497069167643611
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.87      0.92      5614
           1       0.63      0.92      0.75      1386

    accuracy                           0.88      7000
   macro avg       0.80      0.89      0.83      7000
weighted avg       0.91      0.88      0.89      7000



In [48]:
# Predictions on the test split. `.data` is a 2-D prediction array, so column 0
# is taken to work with a flat 1-D score vector.
test_predict = automl.predict(X_test).data[:, 0]
print(f"\nROC-AUC: {roc_auc_score(y_test, test_predict):.4f}")

# Test metrics at the threshold selected on the validation split.
y_pred_test = (test_predict >= best_threshold).astype(int)
print("\nTest Metrics:")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_test))
print("F1-score:", f1_score(y_test, y_pred_test))
print("Classification Report:\n", classification_report(y_test, y_pred_test))



ROC-AUC: 0.9053

Test Metrics:
Confusion Matrix:
 [[983 208]
 [ 48 261]]
F1-score: 0.6709511568123393
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.83      0.88      1191
           1       0.56      0.84      0.67       309

    accuracy                           0.83      1500
   macro avg       0.75      0.84      0.78      1500
weighted avg       0.87      0.83      0.84      1500



#### MLFLOW

In [62]:
import mlflow
from mlflow.pyfunc import PythonModel

class LightAutoMLWrapper(PythonModel):
    """Adapter that exposes a fitted LightAutoML model through the MLflow pyfunc interface."""

    def __init__(self, automl):
        """Keep a reference to the fitted `automl` object used for inference."""
        self.automl = automl

    def predict(self, context, model_input):
        """Run the wrapped model on `model_input` and return column 0 of its predictions.

        `context` is accepted to match the pyfunc signature and is not used here.
        """
        dataset = self.automl.predict(model_input)
        # The prediction object is not returned as-is: only the first column of
        # its underlying array is handed back, which is a serializable format.
        first_column = dataset.data[:, 0]
        return first_column

# Point MLflow at the tracking server and select the experiment for the runs below.
# NOTE(review): the tracking URI is hard-coded to a local server — it must be running before this cell.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("ppa_lab")



<Experiment: artifact_location='/Users/vladimir/Desktop/University/ppa/labs/artifacts/3', creation_time=1744904699558, experiment_id='3', last_update_time=1744904699558, lifecycle_stage='active', name='ppa_lab', tags={}>

In [63]:
# Wrap the fitted AutoML model and log it to MLflow as a pyfunc model
# under the artifact path "lr3" inside a run named "lr3".
with mlflow.start_run(run_name="lr3"):
    wrapped_model = LightAutoMLWrapper(automl)
    log_kwargs = {"artifact_path": "lr3", "python_model": wrapped_model}
    mlflow.pyfunc.log_model(**log_kwargs)



🏃 View run lr3 at: http://localhost:5000/#/experiments/3/runs/f5e7e3a53931486b9c214b3f048f338d
🧪 View experiment at: http://localhost:5000/#/experiments/3


In [64]:
# Check that the model can be loaded back from MLflow.
# Use the run created in this kernel session so the cell keeps working after
# Restart & Run All (every execution of the logging cell produces a new run id);
# fall back to the originally recorded run id when this session has no run yet.
_last_run = mlflow.last_active_run()
run_id = _last_run.info.run_id if _last_run is not None else 'f5e7e3a53931486b9c214b3f048f338d'
loaded_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/lr3")
predictions_mlflow = loaded_model.predict(X_test)

In [65]:
# Save the artifacts locally: download the "lr3" artifact directory of the run
# into ./downloaded_model (the call's return value is displayed as the cell output).
mlflow.artifacts.download_artifacts(
    run_id=run_id,
    artifact_path="lr3", 
    dst_path="./downloaded_model"
)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

'/Users/vladimir/Desktop/University/ppa/labs/downloaded_model/lr3'

In [66]:
# The serving command is kept commented out — presumably the server is started
# separately (e.g. from a terminal) before running this cell:
# !mlflow models serve -m "./downloaded_model/lr3" --port 1234 --no-conda
# Check that the service is up
!curl http://localhost:1234/ping




In [68]:

import requests

# Send a POST request with the test features to the deployed scoring service.
response = requests.post(
    "http://localhost:1234/invocations",
    json={"dataframe_records": X_test.to_dict(orient="records")},
    timeout=60,  # fail instead of hanging indefinitely if the server is not reachable
)
# Surface HTTP errors here instead of as a confusing KeyError when the JSON body is parsed later.
response.raise_for_status()


In [73]:
# Extract the 'predictions' field from the JSON body returned by the scoring service.
predictions_mlflow_server = response.json()['predictions']

In [74]:
# Preview the first 10 predictions returned by the HTTP service.
predictions_mlflow_server[:10]

[0.012369676493108273,
 0.6041978597640991,
 0.0067258113995194435,
 0.5716642737388611,
 0.006006792653352022,
 0.706897497177124,
 0.13147854804992676,
 0.02307809516787529,
 0.7169305086135864,
 0.4455425441265106]

In [75]:
# Preview the first 10 predictions of the model loaded back from MLflow.
predictions_mlflow[:10]

array([0.01236968, 0.60419786, 0.00672581, 0.5716643 , 0.00600679,
       0.7068975 , 0.13147855, 0.0230781 , 0.7169305 , 0.44554254],
      dtype=float32)

In [79]:
# Predictions on the test split from the in-memory model, flattened to 1-D
# so they can be compared with the MLflow outputs.
test_predict = automl.predict(X_test).data.flatten()
test_predict[:10]

array([0.01236968, 0.60419786, 0.00672581, 0.5716643 , 0.00600679,
       0.7068975 , 0.13147855, 0.0230781 , 0.7169305 , 0.44554254],
      dtype=float32)

In [80]:
# The model loaded from MLflow should reproduce the in-memory predictions (within float tolerance).
np.allclose(predictions_mlflow, test_predict)

True

In [81]:
# The predictions returned by the HTTP service should match the in-memory predictions as well.
np.allclose(predictions_mlflow_server, test_predict)

True

Сохранение модели в MLflow и её развёртывание с помощью mlflow server прошли корректно: результаты работы модели совпадают с исходными.