In [1]:
# Models
from lightgbm import LGBMClassifier

# Tools: data splitting, cross-validation and metrics
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.model_selection import KFold, RepeatedKFold
from sklearn.metrics import confusion_matrix

# Hyperparameter tuning
from functools import partial
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
from hyperopt.pyll.base import scope

# Data analysis
import pandas as pd
import seaborn as sns

# Features: scaling, feature expansion, resampling and custom transformers
from sklearn.preprocessing import StandardScaler, RobustScaler, SplineTransformer, PolynomialFeatures
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
from sklearn.base import TransformerMixin, BaseEstimator

# Linear algebra
import numpy as np

# Scoring
from sklearn.metrics import make_scorer, f1_score
# NOTE(review): `partial` is already imported in the tuning section above;
# this second import is redundant.
from functools import partial

# Display
sns.set_style('darkgrid')
import warnings
# Suppresses every warning for the rest of the session (no category filter).
warnings.filterwarnings('ignore')

In [2]:
# Every frame is loaded the same way: read the CSV, sort by index and expose
# the target column `y` under the name `Result`.

# Updated training file; this is the frame the model is fitted on below.
train_data = (
    pd.read_csv('/kaggle/input/prod-2023-new/updated_train_AIC_day_pricol.csv')
    .sort_index()
    .rename(columns={'y': 'Result'})
)

# Competition training file; used below to compare column sets.
data = (
    pd.read_csv('/kaggle/input/aiijc-prod-2023/train_AIC.csv')
    .sort_index()
    .rename(columns={'y': 'Result'})
)

# Competition test file.
test_data = (
    pd.read_csv('/kaggle/input/aiijc-prod-2023/test_AIC.csv')
    .sort_index()
    .rename(columns={'y': 'Result'})
)

In [3]:
# Column names of `data` (the frame read from train_AIC.csv).
data.columns

Index(['Поставщик', 'Материал', 'Категорийный менеджер',
       'Операционный менеджер', 'Завод', 'Закупочная организация',
       'Группа закупок', 'Балансовая единица', 'ЕИ', 'Группа материалов',
       'Вариант поставки', 'НРП', 'Длительность', 'До поставки', 'Месяц1',
       'Месяц2', 'Месяц3', 'День недели 2', 'Сумма', 'Количество позиций',
       'Количество', 'Количество обработчиков 7', 'Количество обработчиков 15',
       'Количество обработчиков 30', 'Согласование заказа 1',
       'Согласование заказа 2', 'Согласование заказа 3',
       'Изменение даты поставки 7', 'Изменение даты поставки 15',
       'Изменение даты поставки 30',
       'Отмена полного деблокирования заказа на закупку',
       'Изменение позиции заказа на закупку: изменение даты поставки на бумаге',
       'Изменение позиции заказа на закупку: дата поставки',
       'Количество циклов согласования',
       'Количество изменений после согласований', 'Дней между 0_1',
       'Дней между 1_2', 'Дней между 2_3'

In [4]:
# Columns present in `data` but missing from `train_data`.
# The result is sorted so the listing is identical on every run: iteration
# order of a set of strings changes between interpreter sessions.
sorted(set(data.columns) - set(train_data.columns))

['Согласование заказа 3',
 'Дней между 1_2',
 'Дней между 6_7',
 'Месяц1',
 'Согласование заказа 2',
 'Дней между 2_3',
 'Дней между 5_6',
 'Дней между 0_1',
 'Дней между 3_4',
 'Месяц2']

In [None]:
# Print count/mean/min/max for every column except the last one.
# The summary table is computed once up front; calling describe() inside the
# loops recomputed the full table for every column/statistic pair.
summary = train_data.describe()
for column in train_data.columns[:-1]:
    for stat in ['count', 'mean', 'min', 'max']:
        print(f"{column}:{stat} = {summary.loc[stat, column]}")
    print()

In [6]:
# Columns that DataPreprocessor removes when it is created with
# transform_test=False.
features_not_to_use = [
    'Согласование заказа 3',
    'Материал',
    'Количество позиций',
    'Дней между 3_4',
    'Дней между 6_7',
    'Дней между 2_3',
    'Месяц1',
    'Дней между 0_1',
    'Поставщик',
    'Дней между 5_6',
    'Дней между 1_2',
    'Согласование заказа 2',
    'Месяц2',
]

In [7]:
class DataPreprocessor(BaseEstimator, TransformerMixin):
    """Stateless cleaning step usable on its own or as a pipeline stage.

    Rows that contain missing values are always removed. Depending on
    ``transform_test`` the columns listed in the module-level
    ``features_not_to_use`` are removed as well.

    Parameters
    ----------
    transform_test : bool, default=True
        If True, ``transform`` only removes rows with missing values.
        If False, it additionally drops the ``features_not_to_use`` columns.
    """

    def __init__(self, transform_test=True):
        self.transform_test = transform_test

    def fit(self, X, y=None):
        """Learn nothing and return the instance itself."""
        return self

    def transform(self, X):
        """Return a cleaned copy of ``X``; the input object is left untouched.

        NOTE(review): removing rows changes the number of samples while any
        target passed alongside is not adjusted - confirm the inputs contain
        no missing values when this runs inside a Pipeline.
        """
        cleaned = X.dropna()

        # Column pruning is applied only when the flag is switched off.
        if not self.transform_test:
            cleaned = cleaned.drop(columns=features_not_to_use)

        return cleaned


In [8]:
# Target is the 'Result' column; features are every column except the last.
y = train_data['Result']
X = train_data.iloc[:, :-1]

# Keep 10% of the rows aside for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=1,
)

# Oversample the minority class with ADASYN on the training part only;
# the held-out part keeps its original class balance.
adasyn = ADASYN(sampling_strategy='minority', random_state=0)
X_train, y_train = adasyn.fit_resample(X_train, y_train)
print("Class distribution after ADASYN:", Counter(y_train))

Class distribution after ADASYN: Counter({1: 179213, 0: 173385})


In [9]:
# Stand-alone instances of the cleaning step, the spline feature expansion and
# a LightGBM classifier with default hyperparameters (all cores via n_jobs=-1).
# NOTE(review): within this notebook they are referenced only by the
# commented-out tuning cells; `datetime_preproc` and `spline` are created
# again before the final pipeline is assembled.
datetime_preproc = DataPreprocessor()
spline = SplineTransformer()
estimator_lgbm = LGBMClassifier(n_jobs=-1)

In [None]:
# # Define the scoring metric as 'f1'
# scoring = {
#     'f1': make_scorer(f1_score, average='binary')
# }

# def objective(params, pipeline, X, y):
#     pipeline.set_params(**params)
#     cv_scores = cross_validate(pipeline, X, y, scoring=scoring, cv=3, n_jobs=-1)
#     mean_f1 = cv_scores['test_f1'].mean()
#     return {'loss': -mean_f1, 'params': params, 'status': STATUS_OK}

# X_train_preproc = datetime_preproc.transform(X_train)
# X_train_preproc = spline.fit_transform(X_train_preproc)
# optimize_fn = partial(objective, pipeline=estimator_lgbm, X=X_train_preproc, y=y_train)

In [None]:
# space = {
#     'n_estimators': scope.int(hp.quniform('n_estimators', 100, 1500, 50)),
#     'max_depth': scope.int(hp.quniform('max_depth', 5, 25, 1)),
#     'num_leaves': scope.int(hp.quniform('num_leaves', 16, 272, 16)),
#     'learning_rate': hp.loguniform('learning_rate', np.log(0.1), np.log(0.7)),
#     'reg_lambda': hp.uniform('reg_lambda', 0.1, 0.7),
#     'max_bin': scope.int(hp.quniform('max_bin', 10, 156, 2))
# }

In [None]:
# trials = Trials()
# tune_lgbm = fmin(
#     fn=optimize_fn,
#     space=space,
#     algo=tpe.suggest,
#     max_evals=50,
#     trials=trials,
#     rstate=np.random.default_rng(42),
#     show_progressbar=True,
# )

In [None]:
# print(tune_lgbm)

{'learning_rate': 0.3641464381222404, 'max_bin': 10.0, 'max_depth': 18.0, 'n_estimators': 1400.0, 'num_leaves': 112.0, 'reg_lambda': 0.4971933561810771}

{'learning_rate': 0.21749116621829656,
 'max_bin': 78.0,
 'max_depth': 14.0,
 'n_estimators': 1200.0,
 'num_leaves': 256.0,
 'reg_lambda': 0.1978057698370551}

In [10]:
# Final classifier; the hyperparameter values match the tuning result
# recorded in this notebook's output.
lgbm_model = LGBMClassifier(
    n_estimators=1200,
    learning_rate=0.21749116621829656,
    num_leaves=256,
    max_depth=14,
    max_bin=78,
    reg_lambda=0.1978057698370551,
)

# Fresh instances of each preprocessing stage.
datetime_preproc = DataPreprocessor()
spline = SplineTransformer()
scaler = RobustScaler()

# Stage order: row cleaning -> spline expansion -> robust scaling -> model.
pipeline = Pipeline(
    steps=[
        ('datetime', datetime_preproc),
        ('spline', spline),
        ('scaler', scaler),
        ('model', lgbm_model),
    ]
)

In [None]:
# Fit every pipeline stage on the ADASYN-resampled training split.
pipeline.fit(X_train, y_train)

In [17]:
# cv_score = cross_val_score(pipeline, X_train, y_train, cv=RepeatedKFold(n_splits=5, n_repeats=3, random_state=1), n_jobs=-1)
# cv_score.mean()

In [18]:
# cv_score = cross_val_score(pipeline, X_test, y_test, cv=RepeatedKFold(n_splits=5, n_repeats=3, random_state=1), n_jobs=-1)
# cv_score.mean()

In [None]:
# F1 score of the fitted pipeline on the held-out split.
f1_score(y_test, pipeline.predict(X_test))

0.8117392029348007 - 0.891584

In [None]:
# Score of the fitted pipeline on the held-out split
# (presumably mean accuracy, the default score of a classifier - confirm).
pipeline.score(X_test, y_test)

0.9498222222222222

In [37]:
# Confusion matrix on the held-out split: true labels first, predictions second.
confusion_matrix(y_test, pipeline.predict(X_test))

array([[18937,   341],
       [  788,  2434]])

In [None]:
# Re-read the test file, rebinding `test_data`. Unlike the first load at the
# top of the notebook, no sort_index() or column rename is applied here.
test_data = pd.read_csv('/kaggle/input/aiijc-prod-2023/test_AIC.csv')

In [None]:
# Default transform_test=True: transform() only removes rows with missing values.
data_preproc = DataPreprocessor()

In [None]:
# Clean the test frame, then display the result.
preproc_data = data_preproc.transform(test_data)
preproc_data

In [None]:
# Predict with the fitted pipeline. Its first stage is another
# DataPreprocessor, so the same cleaning runs again inside predict().
preds = pipeline.predict(preproc_data)

In [None]:
# Assemble the submission table: one row per prediction.
#
# The ids are taken from the index of the frame the predictions were made on,
# so each prediction stays attached to its original row position. With no rows
# removed this yields the same 0..N-1 ids as a plain range, but it no longer
# depends on a hard-coded row count (a fixed range(25000) raised a
# length-mismatch error whenever the number of predictions differed).
#
# The dict has its own name so that the `data` DataFrame loaded at the top of
# the notebook is not overwritten, which keeps earlier cells re-runnable.
submission = {
    'id': preproc_data.index.to_numpy(),
    'value': preds,
}

df = pd.DataFrame(submission)
df.to_csv('submit_newestest.csv', index=False)

In [None]:
# Show the last 20 rows of the submission frame.
df.tail(20)