In [57]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

# Fix the global NumPy seed for reproducibility; the same constant is reused
# as the random state of the train/validation split and of the model.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Load the training data from a parquet file in the working directory.
# NOTE(review): later cells read the columns 'dates', 'values', 'id' and
# 'label' from this frame.
train = pd.read_parquet('train.parquet')

In [58]:
# Summary-statistics feature extractor for a single series
def compute_features(dates_array, values_array):
    """Summarise one series of values as a flat dict of scalar features.

    Parameters
    ----------
    dates_array : array-like
        Accepted so the signature mirrors the (dates, values) pair of a row;
        it is not read by this function.
    values_array : array-like or None
        Observations of one series. ``None`` or an empty sequence is treated
        as a series without observations.

    Returns
    -------
    dict
        Keys in insertion order: ``mean``, ``std``, ``max``, ``min``,
        ``median``, ``skew``, ``kurtosis``, ``length``. ``std`` is NumPy's
        default (ddof=0) standard deviation; ``skew`` and ``kurtosis`` come
        from pandas. For an empty series every statistic is NaN and
        ``length`` is 0.

    Notes
    -----
    NaN entries are not filtered: the NumPy reductions propagate them, while
    the pandas ``skew``/``kurtosis`` calls skip them by default.
    """
    stat_names = ('mean', 'std', 'max', 'min', 'median', 'skew', 'kurtosis')
    values = np.array([]) if values_array is None else np.array(values_array)

    # np.max / np.min raise ValueError on a zero-size array, so an empty
    # series is answered up front with NaN statistics instead of crashing
    # the whole feature-building loop.
    if values.size == 0:
        features = {name: np.nan for name in stat_names}
        features['length'] = 0
        return features

    # One Series serves both higher-moment statistics.
    series = pd.Series(values)

    # Key order matters: it becomes the column order of the feature table.
    features = {
        'mean': np.mean(values),
        'std': np.std(values),
        'max': np.max(values),
        'min': np.min(values),
        'median': np.median(values),
        'skew': series.skew(),
        'kurtosis': series.kurtosis(),
        'length': len(values),
    }
    return features

# Build the training feature table: one dict of summary statistics per row,
# followed by that row's 'id' and 'label'.
# The four needed columns are walked in lockstep instead of using
# DataFrame.iterrows(), which constructs a fresh Series for every row.
train_features_list = [
    {**compute_features(dates, values), 'id': series_id, 'label': label}
    for dates, values, series_id, label in zip(
        train['dates'], train['values'], train['id'], train['label']
    )
]

# A single DataFrame construction from the list of records.
train_features_df = pd.DataFrame(train_features_list)

print('\nОбучающие признаки:')
print(train_features_df.head())


Обучающие признаки:
       mean       std   max       min    median      skew  kurtosis  length  \
0  0.023710  1.337272  3.49 -4.840000 -0.110000 -0.311759  2.154785      62   
1 -0.310777  1.556337  2.92 -3.534409 -0.274409 -0.012368  0.025976      45   
2 -0.046042  1.065023  2.61 -1.950000 -0.145000  0.389424 -0.167206      48   
3 -0.130000  1.158201  2.49 -2.220000 -0.345000  0.116053 -0.809032      48   
4 -0.270536  1.270529  2.87 -2.500000 -0.475000  0.409712 -0.527906      56   

      id  label  
0  19114    0.0  
1  22769    1.0  
2  76935    0.0  
3  66297    0.0  
4   2191    0.0  


In [59]:
# Separate the feature matrix from the target column.
X_train = train_features_df.drop(columns=['id', 'label'])
y_train = train_features_df['label']

# Hold out a stratified validation fold BEFORE fitting any preprocessing, so
# that validation rows cannot influence the scaler's mean/variance.
X_tr_raw, X_val_raw, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=RANDOM_STATE, stratify=y_train
)

# Fit the scaler on the training fold only, then apply the same transform to
# the validation fold.
scaler = StandardScaler()
X_tr = scaler.fit_transform(X_tr_raw)
X_val = scaler.transform(X_val_raw)

# Full scaled matrix, kept under its original name for any cell that still
# refers to it; it is produced by the scaler fitted on the training fold.
X_train_scaled = scaler.transform(X_train)


In [60]:
# Model training: wrap both folds as LightGBM datasets; the validation set
# references the training set.
train_data = lgb.Dataset(data=X_tr, label=y_tr)
val_data = lgb.Dataset(data=X_val, label=y_val, reference=train_data)

# Booster configuration.
params = dict(
    objective='binary',
    metric='auc',
    boosting_type='gbdt',
    random_state=RANDOM_STATE,
    max_depth=5,        # tree depth setting
    is_unbalance=True,  # account for class imbalance
)

# Train for 100 boosting rounds; both the training and the validation set
# are passed as valid_sets.
model = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=100,
    valid_sets=[train_data, val_data],
)


[LightGBM] [Info] Number of positive: 17744, number of negative: 46256
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003051 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1848
[LightGBM] [Info] Number of data points in the train set: 64000, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.277250 -> initscore=-0.958144
[LightGBM] [Info] Start training from score -0.958144


In [61]:
# Predict on the validation fold and compute ROC AUC.
# NOTE(review): the training call in this notebook configures no early
# stopping, so `best_iteration` is presumably not a tuned value and all
# boosted rounds are used here — confirm against the LightGBM docs.
y_val_pred = model.predict(data=X_val, num_iteration=model.best_iteration)
roc_auc = roc_auc_score(y_true=y_val, y_score=y_val_pred)
print(f'\nROC AUC на валидационной выборке: {roc_auc:.4f}')


ROC AUC на валидационной выборке: 0.8314


In [62]:
# Persist the trained model and the fitted scaler to the working directory.
for artifact, filename in ((model, 'trained_model.pkl'), (scaler, 'scaler.pkl')):
    joblib.dump(artifact, filename)
print('Сохранено')

Сохранено
