# Начинаем решение задания №5 - ПРОИЗВОДСТВО

## Наша цель: 
Cоздать модель, которая сможет предсказать фактор *y*=`Result` с наибольшим F1-score. Ориниентируясь на лидреборды, нам нужно набрать миниум 0.91 f1-score на валидац. выборке.

## Задачи:
- Выбрать факторы, которые смогут нам помочь в решении задания и провести Feature-Engineering
- Подготовить данные к началу обучения модели
- Подобрать идеальную мат.модель МО
- Затюнить модель для максимального результата


## GitHub и README файл проекта: https://github.com/plugg1N/aiijc-team-task-2023
___

# Импортируем все нужные библиотеки

In [1]:
# Models
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier

# Model selection
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.model_selection import KFold, RepeatedKFold

# Model tuning
from functools import partial
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
from hyperopt.pyll.base import scope

# Data handling
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Feature handling
from sklearn.preprocessing import StandardScaler, RobustScaler, SplineTransformer, PolynomialFeatures
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import ADASYN
from collections import Counter
from sklearn.base import TransformerMixin, BaseEstimator

# Math
import numpy as np

# Scoring
from sklearn.metrics import make_scorer, f1_score
from functools import partial

# Display
sns.set_style('darkgrid')
import warnings
warnings.filterwarnings('ignore')

___

# Давайте посмотрим на наш Датасет

In [2]:
train_data = pd.read_csv('/kaggle/input/aiijc-prod-2023/train_AIC.csv').sort_index().rename(columns={'y': 'Result'})
test_data = pd.read_csv('/kaggle/input/aiijc-prod-2023/test_AIC.csv').sort_index().rename(columns={'y': 'Result'})

In [3]:
features_to_use = ['Материал', 'Завод', 'Закупочная организация', 'Группа закупок',
       'Балансовая единица', 'ЕИ', 'Вариант поставки', 'Длительность',
       'Месяц1', 'Месяц2', 'Месяц3', 'Сумма', 'Количество обработчиков 7',
       'Количество обработчиков 15', 'Количество обработчиков 30']

In [4]:
class DataPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, transform_train=True):
        self.transform_train = transform_train
        

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_ = X.copy()
        
        X_.dropna(inplace=True)
        
        if self.transform_train:
            X_ = X_[features_to_use]

        return X_


In [5]:
data_preprocessor = DataPreprocessor(transform_train=False)
train_data_preproc = data_preprocessor.transform(train_data)
train_data_preproc.sample(3)

Unnamed: 0,Поставщик,Материал,Категорийный менеджер,Операционный менеджер,Завод,Закупочная организация,Группа закупок,Балансовая единица,ЕИ,Группа материалов,...,Количество изменений после согласований,Дней между 0_1,Дней между 1_2,Дней между 2_3,Дней между 3_4,Дней между 4_5,Дней между 5_6,Дней между 6_7,Дней между 7_8,Result
66080,192,27439,2,4,1,1,6,1,4,8,...,18,14.0,0.0,1.0,0.0,0.0,-1.0,-1.0,-1.0,1
168593,54,6199,8,9,8,3,72,7,1,20,...,11,1.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,0
17291,18,3716,7,17,1,1,29,1,1,30,...,14,39.0,0.0,0.0,2.0,0.0,-1.0,-1.0,-1.0,1


In [6]:
train_data_preproc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 225000 entries, 0 to 224999
Data columns (total 44 columns):
 #   Column                                                                  Non-Null Count   Dtype  
---  ------                                                                  --------------   -----  
 0   Поставщик                                                               225000 non-null  int64  
 1   Материал                                                                225000 non-null  int64  
 2   Категорийный менеджер                                                   225000 non-null  int64  
 3   Операционный менеджер                                                   225000 non-null  int64  
 4   Завод                                                                   225000 non-null  int64  
 5   Закупочная организация                                                  225000 non-null  int64  
 6   Группа закупок                                                      

In [7]:
X = train_data.iloc[:, :-1]
y = train_data['Result']

adasyn = ADASYN(sampling_strategy='minority', random_state=0)
X, y = adasyn.fit_resample(X, y)
print("Class distribution after ADASYN:", Counter(y))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

Class distribution after ADASYN: Counter({1: 197141, 0: 192663})


In [8]:
data_processor = DataPreprocessor()
poly = PolynomialFeatures(degree=2)
scaler = RobustScaler()

estimator_lgbm = LGBMClassifier(
    n_jobs=-1, 
    n_estimators=750, 
    learning_rate=0.1, 
    num_leaves=80, 
    max_depth=12, 
    max_bin=128, 
    reg_lambda=0.1,
    verbose=0,
    force_col_wise=True
)

estimator_pipeline = Pipeline([
    ('data', data_processor),
    ('poly', poly),
    ('scaler', scaler),
    ('model', estimator_lgbm)
])

In [9]:
estimator_pipeline.fit(X_train, y_train)



In [10]:
cv_score = cross_val_score(estimator_pipeline, X_train, y_train, scoring='f1', n_jobs=-1)
print(f'Mean CV: {cv_score.mean()}')

Mean CV: 0.8840394091777843


In [11]:
datetime_preproc = DataPreprocessor()
spline = SplineTransformer()
estimator_lgbm = LGBMClassifier(n_jobs=-1)

In [15]:
# Define the scoring metric as 'f1'
scoring = {
    'f1': make_scorer(f1_score, average='binary')
}

def objective(params, pipeline, X, y):
    pipeline.set_params(**params)
    cv_scores = cross_validate(pipeline, X, y, scoring=scoring, cv=3, n_jobs=-1)
    mean_f1 = cv_scores['test_f1'].mean()
    return {'loss': -mean_f1, 'params': params, 'status': STATUS_OK}

X_train_preproc = datetime_preproc.transform(X_train)
X_train_preproc = spline.fit_transform(X_train_preproc)
optimize_fn = partial(objective, pipeline=estimator_lgbm, X=X_train_preproc, y=y_train)

In [16]:
space = {
    'n_estimators': scope.int(hp.quniform('n_estimators', 100, 1200, 50)),
    'max_depth': scope.int(hp.quniform('max_depth', 6, 16, 1)),
    'num_leaves': scope.int(hp.quniform('num_leaves', 32, 256, 16)),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.1), np.log(0.7)),
    'reg_lambda': hp.uniform('reg_lambda', 0.1, 0.7),
    'max_bin': scope.int(hp.quniform('max_bin', 16, 128, 2))
}

In [17]:
trials = Trials()
tune_lgbm = fmin(
    fn=optimize_fn,
    space=space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=np.random.default_rng(42),
    show_progressbar=True,
)

100%|██████████| 50/50 [3:03:12<00:00, 219.85s/trial, best loss: -0.9102432881755754]


In [18]:
tune_lgbm

{'learning_rate': 0.21749116621829656,
 'max_bin': 78.0,
 'max_depth': 14.0,
 'n_estimators': 1200.0,
 'num_leaves': 256.0,
 'reg_lambda': 0.1978057698370551}

In [20]:
lgbm_model = LGBMClassifier(
    learning_rate=0.21749116621829656, 
    max_bin=78, 
    max_depth=14, 
    num_leaves=256, 
    n_estimators=1200, 
    reg_lambda=0.1978057698370551
    )

datetime_preproc = DataPreprocessor()
spline = SplineTransformer()
scaler = RobustScaler()
pipeline = Pipeline([
    ('datetime', datetime_preproc),
    ('spline', spline),
    ('scaler', scaler),
    ('model', lgbm_model)
])

In [21]:
cv_score = cross_val_score(pipeline, X_train, y_train, cv=RepeatedKFold(n_splits=5, n_repeats=3, random_state=1), n_jobs=-1)
cv_score.mean()

0.9160801892608301

In [None]:
cv_score = cross_val_score(pipeline, X_test, y_test, cv=RepeatedKFold(n_splits=5, n_repeats=3, random_state=1), n_jobs=-1)
cv_score.mean()

In [32]:
pipeline.fit(X_train, y_train)

In [26]:
print("Counter:", Counter(y_test))

Counter: Counter({1: 19817, 0: 19164})


In [27]:
X_test

Unnamed: 0,Поставщик,Материал,Категорийный менеджер,Операционный менеджер,Завод,Закупочная организация,Группа закупок,Балансовая единица,ЕИ,Группа материалов,...,Количество циклов согласования,Количество изменений после согласований,Дней между 0_1,Дней между 1_2,Дней между 2_3,Дней между 3_4,Дней между 4_5,Дней между 5_6,Дней между 6_7,Дней между 7_8
188719,798,27439,2,3,1,1,2,1,1,2,...,1.000000,11,15.000000,0.000000,0.000000,0.0,1.0,-1.000000,-1.000000,-1.0
80433,272,12195,1,13,6,5,20,5,1,23,...,0.000000,10,-1.000000,-1.000000,-1.000000,-1.0,-1.0,-1.000000,-1.000000,-1.0
190106,7,1249,1,1,12,9,8,1,3,12,...,0.000000,14,12.000000,-1.000000,-1.000000,-1.0,-1.0,-1.000000,-1.000000,-1.0
169461,64,27439,1,10,2,2,13,2,1,3,...,1.000000,13,7.000000,0.000000,1.000000,0.0,0.0,-1.000000,-1.000000,1.0
177336,43,27439,4,8,1,1,38,1,1,1,...,1.000000,0,1.000000,0.000000,0.000000,0.0,0.0,-1.000000,-1.000000,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
264572,44,17071,1,10,2,2,13,2,1,43,...,1.000000,11,9.764750,0.000000,1.745083,0.0,0.0,-1.000000,-1.000000,-1.0
62113,3,27439,1,1,7,1,124,6,1,6,...,1.000000,10,13.000000,0.000000,0.000000,0.0,0.0,-1.000000,-1.000000,-1.0
376865,46,27439,1,6,1,1,34,1,1,85,...,1.191821,0,29.779942,0.959104,0.959104,0.0,0.0,16.779942,17.588121,-1.0
267595,58,2,1,7,30,1,56,1,2,49,...,9.332995,75,4.444163,2.668020,2.668020,0.0,0.0,4.332995,-1.000000,-1.0


In [34]:
f1_score(y_test, pipeline.predict(X_test))

0.924091404803571

In [33]:
pipeline.score(X_test, y_test)

0.9232190041302173

In [37]:
test_data = pd.read_csv('/kaggle/input/aiijc-prod-2023/test_AIC.csv')
test_data.sample(1)

Unnamed: 0,Поставщик,Материал,Категорийный менеджер,Операционный менеджер,Завод,Закупочная организация,Группа закупок,Балансовая единица,ЕИ,Группа материалов,...,Количество циклов согласования,Количество изменений после согласований,Дней между 0_1,Дней между 1_2,Дней между 2_3,Дней между 3_4,Дней между 4_5,Дней между 5_6,Дней между 6_7,Дней между 7_8
1231,19,2007,1,7,1,1,4,1,2,97,...,3.0,0,21.0,0.0,1.0,0.0,0.0,3.0,9.0,-1.0


In [43]:
preds = pipeline.predict(test_data)

In [44]:
submit_df = pd.DataFrame(preds, columns=['value'])

In [48]:
submit_df.sample(20)

Unnamed: 0,value
23479,0
15447,1
3093,0
20443,0
1748,0
6318,0
11188,0
17182,0
24659,0
1630,0
