# Notebook goal: 
This notebook tries to simplify experiments with models, feature selection, and hyperparameter tuning by using an sklearn pipeline.
If you find it useful, you know what to do :-)

In [None]:
import os

import pandas as pd
import numpy as np

# Let pandas print up to 500 columns and 500 rows before truncating output.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, 500)

# Render every float with exactly six digits after the decimal point.
pd.set_option('display.float_format', '{:.6f}'.format)

from sklearn.dummy import DummyClassifier

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer
from sklearn.ensemble import  RandomForestClassifier

from sklearn.feature_selection import SelectFromModel, SelectPercentile

from xgboost import XGBClassifier

### Settings

In [None]:
# Seed intended for any randomized step of the notebook.
RANDOM_STATE = 42
# Base name (without extension) of the submission CSV written at the end.
OUTPUT_NAME = 'submission_pipeline_poc'

### Data

In [None]:
# List every input file, and remember the directory that actually contains
# train.csv. Relying on the loop variable after the walk would pick whichever
# directory happened to be visited last (or nothing at all if the tree is empty).
data_dir = None
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
    if data_dir is None and 'train.csv' in filenames:
        data_dir = dirname

if data_dir is None:
    raise FileNotFoundError("train.csv was not found under /kaggle/input")

# Later cells read test.csv from `dirname`, so keep it pointing at the data directory.
dirname = data_dir
data = pd.read_csv(os.path.join(dirname, 'train.csv'), index_col='id')

### Pipeline

In [None]:
def _columns_with(token):
    """Return the names of the columns of `data` whose name contains `token`."""
    return [name for name in data.columns if token in name]


# Columns named '*cont*' are rescaled with MinMaxScaler.
numeric_features = _columns_with('cont')
numeric_transformer = Pipeline([('scaler', MinMaxScaler())])

# Columns named '*cat*' are one-hot encoded; categories unseen during fit are
# ignored at transform time instead of raising.
cat_features = _columns_with('cat')
cat_transformer = OneHotEncoder(handle_unknown='ignore')

# sparse_threshold=0 makes the combined preprocessor output always dense.
column_steps = [
    ('num', numeric_transformer, numeric_features),
    ('cat', cat_transformer, cat_features),
]
preprocessor = ColumnTransformer(transformers=column_steps, sparse_threshold=0)

# Full pipeline: preprocessing -> feature selection -> classifier.
# The step names ('f_selection', 'model') are the prefixes used by the
# hyperparameter grid keys.
# Alternative selector that can be swapped in for 'f_selection':
#   SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE))
model = Pipeline([
    ('preprocessor', preprocessor),
    ('f_selection', SelectPercentile(percentile=75)),
    ('model', XGBClassifier(n_estimators=100)),
])



In [None]:

# Hold out 25% of the labelled rows as a dev set.
# - random_state uses the notebook-wide RANDOM_STATE setting instead of a
#   separate hard-coded seed, so one constant controls the split.
# - stratify keeps the class proportions of `target` equal in both parts.
features = data.drop(columns=['target'])
target = data['target']
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=RANDOM_STATE,
    stratify=target,
)



### Grid search

In [None]:
%%time
param_grid ={
    'f_selection__percentile': [50, 75, 90, 95],
    'model__max_depth': [8, 12, 16],
#     'model__n_estimators': [100, 120],
}
search = GridSearchCV(model, param_grid, n_jobs=4, scoring=make_scorer(roc_auc_score))
search.fit(x_train, y_train)


In [None]:
# Report the best mean cross-validated score and the parameters that produced it.
best_cv_score = search.best_score_
print(f"Best parameter (CV score={best_cv_score:.3f}):")
print(search.best_params_)


In [None]:
# Copy the best hyperparameters found by the grid search onto the pipeline;
# the pipeline itself is (re)fitted in the following cells.
model.set_params(**search.best_params_)

In [None]:
%%time
model.fit(x_train, y_train)

y_pred = model.predict(x_test)
metric_dev = roc_auc_score(y_test, y_pred)
metric_train = roc_auc_score(y_train, model.predict(x_train))
print(f'metric_train = {metric_train:.5} \n  metric_dev = {metric_dev:.5}')

### Build prediction 

In [None]:
# Refit the pipeline on every labelled row (train + dev) before predicting on test.
full_features = data.drop(columns=['target'])
full_target = data['target']
model.fit(full_features, full_target)

In [None]:
test = pd.read_csv(os.path.join(dirname, 'test.csv'), index_col='id')

# Start from a fresh frame that only carries the test ids. Assigning a column
# to `test[[]]` would write into a derived frame and trigger pandas'
# chained-assignment warning.
submission = pd.DataFrame(index=test.index)

# The notebook evaluates with ROC AUC, so submit the positive-class probability
# (column 1 of predict_proba, assuming a binary target) rather than hard labels.
submission['target'] = model.predict_proba(test)[:, 1]
submission.to_csv(f'/kaggle/working/{OUTPUT_NAME}.csv', index=True, index_label='id')

### Notes
The table below tracks our progress across experiments.

In [None]:
# Print a ready-to-paste row for the progress table below: train metric,
# dev metric, a placeholder for the Kaggle score, and the submission name.
print(f'{metric_train:.5} | {metric_dev:.5} | *kaggle_score* | {OUTPUT_NAME}')

#### Model progress:

 train | dev | kaggle | comment 
-------|-----|--------|---------
0.50088 | 0.49967 | *-* | submission_pipeline_dummy


    