Feature Engineering Automation

In [76]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split

# libraries for feature engineering
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# libraries for model building
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# libraries for model evaluation
from sklearn.metrics import accuracy_score

# Hyperparameter tuning
from sklearn.model_selection import GridSearchCV

In [4]:
# Load the 'tips' example dataset through seaborn and display it.
df = sns.load_dataset('tips')
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


Split the data into the dependent variable (y) and the independent features (X)

In [8]:
# Target: the 'time' column. Predictors: every other column of df.
y = df['time']
X = df.drop(columns='time')
X.columns, y.name

(Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'size'], dtype='object'),
 'time')

Split the data into training and test sets

In [10]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Show the shape of each of the four pieces as a sanity check.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((183, 6), (61, 6), (183,), (61,))

In [25]:
# Column groups: numeric columns are imputed and scaled, categorical columns
# are imputed and one-hot encoded.
num_cols = ['total_bill', 'tip', 'size']
cat_cols = ['sex', 'smoker', 'day']

# Numeric pipeline: fill missing values with the column median, then rescale
# with MinMaxScaler. The step is still called "encoder" (although it is a
# scaler) so that any existing `num__encoder__*` parameter path keeps working.
num_pipe = Pipeline(
    [
        ("imputer", SimpleImputer(strategy='median')),
        ("encoder", MinMaxScaler())
    ]
)

# Categorical pipeline: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' makes transform() tolerate a
# category that was absent from the training split (e.g. a rare 'day' value)
# instead of raising, which is the default behaviour.
cat_pipe = Pipeline(
    [
        ("imputer", SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ]
)

# Route each column group through its own pipeline and concatenate the results.
fe = ColumnTransformer([
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols)
])


In [26]:
# Fit the preprocessing on the training split only, then apply the already
# fitted transformer to the test split so no test statistics leak into it.
X_train_fe = fe.fit_transform(X_train)
X_test_fe = fe.transform(X_test)

Model building

In [67]:
# Candidate classifier classes (not instances); each is instantiated with its
# default hyperparameters when the candidates are compared.
models=[LogisticRegression,DecisionTreeClassifier,SVC,RandomForestClassifier]

In [68]:
# Fit every candidate classifier with default hyperparameters on the
# engineered training features and record its test-set accuracy (rounded to
# two decimals), keyed by the classifier's class name.
d = {}
for model_cls in models:
    # Keep the instance under its own name: rebinding `models` inside the
    # loop would replace the list of classes and break a re-run of this cell.
    estimator = model_cls()
    estimator.fit(X_train_fe, y_train)
    y_pred = estimator.predict(X_test_fe)
    acc = accuracy_score(y_test, y_pred)
    # __name__ gives the bare class name without parsing str(model_cls).
    d[model_cls.__name__] = round(acc, 2)


In [69]:
# Display the accuracy recorded for each candidate classifier.
d

{'LogisticRegression': 1.0,
 'DecisionTreeClassifier': 0.95,
 'SVC': 1.0,
 'RandomForestClassifier': 0.97}

Hyperparameter tuning

In [77]:
# Search space for the random forest.
grid = {
    "n_estimators":[100,200,300,400,500],
    "criterion":("gini", "entropy", "log_loss"),
    # "auto" is intentionally not listed: for classifiers it was an alias of
    # "sqrt", was deprecated in scikit-learn 1.1 and removed in 1.3. On newer
    # versions it makes those fits fail, and with warnings silenced at the top
    # of the notebook the failures would go unnoticed.
    "max_features":("sqrt", None)
}

# Fixed seed so the grid search selects the same parameters on every run.
model = RandomForestClassifier(random_state=42)

# 5-fold cross-validated exhaustive search over `grid`, fit on the
# preprocessed training features.
clf = GridSearchCV(model, param_grid=grid, cv=5)
clf.fit(X_train_fe, y_train)

In [75]:
# Display the hyperparameter combination selected by the grid search.
clf.best_params_

{'criterion': 'entropy', 'max_features': None, 'n_estimators': 100}

In [78]:
# Build the final forest from the hyperparameters the grid search actually
# selected, rather than hard-coding values that can drift from
# clf.best_params_. The fixed seed makes the final score repeatable.
model = RandomForestClassifier(**clf.best_params_, random_state=42)

In [82]:
# Train the final model on the preprocessed training features.
model.fit(X_train_fe,y_train)

In [83]:
# Predict the target for the held-out, preprocessed test features.
y_preds=model.predict(X_test_fe)

In [84]:
# Accuracy of the final model on the test split.
accuracy_score(y_test,y_preds)

0.9672131147540983