# Pipeline to automate data preprocessing and model training

## Load data

In [4]:
import pandas as pd

# Location of the input table, relative to this notebook.
path = '../../data/simplified_features_cat.csv'

# Parse the CSV into a DataFrame; the bare `df` below renders it as the cell output.
df = pd.read_csv(filepath_or_buffer=path)
df

Unnamed: 0,Industry,Ethnicity,Gender,Age,CivilStatus,YearsEmployed,Income,Approved
0,Industrials,White,Male,30,Married,1.25,0,1
1,Materials,Black,Female,58,Married,3.04,560,1
...,...,...,...,...,...,...,...,...
688,ConsumerStaples,White,Male,17,Married,0.04,750,0
689,Energy,Black,Male,35,Married,8.29,0,0


## Feature selection

In [5]:
target = 'Approved'

# Features are every column except the label; the label is the `target` column itself.
X = df.drop(columns=[target])
y = df[target]

## Train test split

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## Pipeline

### Data preprocessing

In [7]:
# Per-column dtypes of the feature matrix, indexed by column name.
features = X.dtypes

# Columns stored with the generic `object` dtype are treated as categorical.
# NOTE(review): this relies on text columns being loaded as `object`; confirm
# that holds for the pandas version in use.
features_categorical = features.loc[features == 'object'].index
features_categorical

Index(['Industry', 'Ethnicity', 'Gender', 'CivilStatus'], dtype='object')

In [8]:
# Every column whose dtype is not `object` is treated as numerical.
features_numerical = features.loc[features != 'object'].index
features_numerical

Index(['Age', 'YearsEmployed', 'Income'], dtype='object')

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

# Route each column group through its own transformer:
#   - categorical columns are one-hot encoded
#   - numerical columns are rescaled with MinMaxScaler
# Columns listed in neither group are dropped (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore' encodes a category that was not present at fit
        # time as an all-zero row instead of raising. The fitted pipeline is
        # exported for production use, where previously unseen values (e.g. a new
        # Industry) must not crash prediction.
        ('onehot', OneHotEncoder(handle_unknown='ignore'), features_categorical),
        ('scaler', MinMaxScaler(), features_numerical)
    ])

### Model

In [16]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

# Seed the estimator (same seed as the train/test split) so that the selected
# hyper-parameters and the reported scores are repeatable across kernel restarts.
# GridSearchCV clones this estimator for every candidate, so the seed carries over.
model = GradientBoostingClassifier(random_state=42)

# Candidate values: 3 x 4 x 4 = 48 combinations in total.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 7, 9, 11],
    'learning_rate': [0.001, 0.01, 0.1, 1]
}

# Exhaustive search over the grid, using GridSearchCV's default
# cross-validation scheme and the estimator's own scoring.
cv = GridSearchCV(model, param_grid)

## All together

In [17]:
from sklearn.pipeline import Pipeline

# Chain the column preprocessing with the grid-searched classifier.
# NOTE(review): because the grid search is itself a pipeline step, the
# preprocessor is fitted once on all of X_train before cross-validation runs,
# rather than being refitted inside each fold.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', cv),
])

# Fit the preprocessing and run the hyper-parameter search on the training split.
pipeline.fit(X_train, y=y_train)

In [18]:
# Hyper-parameter combination selected by the grid search held in the 'model' step.
pipeline.named_steps['model'].best_params_

{'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 50}

In [19]:
# Score of the fitted pipeline on the held-out test split.
pipeline.score(X_test, y_test)

0.7101449275362319

In [20]:
# Score on the training split, for comparison with the test score to gauge overfitting.
pipeline.score(X_train, y_train)

0.8488612836438924

## Export model to use in production

In [22]:
import pickle
from pathlib import Path

path = '../../artifacts/pipeline.pkl'

# Make sure the output folder exists; open() would otherwise raise
# FileNotFoundError on a fresh checkout without an artifacts directory.
Path(path).parent.mkdir(parents=True, exist_ok=True)

# Serialize the whole fitted pipeline (preprocessing + tuned model) in one file.
# NOTE: load this pickle only from a trusted location and with the same
# scikit-learn version that produced it; unpickling executes arbitrary code.
with open(path, 'wb') as file:
    pickle.dump(pipeline, file)