<h1> Pipeline example to preprocess data and classify income range </h>

In [1]:
# import basic modules

import numpy as np
import pandas as pd

# ignore warnings
from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
# Read data from csv file

df = pd.read_csv('D:/Study/DataScience/Data/income_type.csv')

In [3]:
# define feature and target variables

x = df.iloc[:,:-1]

y = df.iloc[:,-1]

In [4]:
# split the data into training and testing datasets

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 0)

In [5]:
# define numerical and categorical variables

numerical_variables = df.select_dtypes(include=['int64']).columns
categorical_variables = df.select_dtypes(include=['object']).drop(['income'], axis=1).columns

In [7]:
# Create preprocessing pipeline :
# Numerical variables : SimpleImputer for missing values and Standard Scaler to scale the data
# Categorical variables : SimpleImputer for missing values and One Hot Encoder

# import necessory modules for above:

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Create seperate pipelines for numerical and categorical variables

numerical_pipe = Pipeline([('imputer', SimpleImputer(strategy = 'median')), ('scaler', StandardScaler())])

categorical_pipe = Pipeline([('imputer', SimpleImputer(strategy = 'constant', fill_value = 'missing')),('ohe', OneHotEncoder())])

In [8]:
# Create ColumnsTransformer to map respective pipelines to columns

from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer([('num', numerical_pipe, numerical_variables), 
                                  ('cat', categorical_pipe, categorical_variables)])

In [9]:
# Create a pipeline using preprocessor and Random Forest Classifier 

from sklearn.ensemble import RandomForestClassifier

rf = Pipeline([('preprocessor', preprocessor), ('classifier', RandomForestClassifier())])

In [10]:
# Fit the model with training and testing datasets

rf.fit(x_train, y_train)

Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('num', Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbo...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [11]:
# Print Score

print('Test Score: \n {:0.2f}'.format(rf.score(x_test, y_test)))

Test Score: 
 0.85


<b> Find best parameters using Grid Search CV </b>

In [12]:
# import module

from sklearn.model_selection import GridSearchCV

# define parameter grid

param_grid = {'classifier__n_estimators' : [10, 50, 100],
              'classifier__max_features' : ['auto', 'sqrt', 'log2'],
              'classifier__max_depth' : [4,5,6,7,8],
              'classifier__criterion' : ['gini', 'entropy']}

# run Grid Search CV

grid = GridSearchCV(rf, param_grid = param_grid, cv = 5)

In [13]:
# Fit the grid to training dataset

grid.fit(x_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('num', Pipeline(memory=None,
     steps=[('imputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbo...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'classifier__n_estimators': [10, 50, 100], 'classifier__max_features': ['auto', 'sqrt', 'log2'], 'classifier__max_depth': [4, 5, 6, 7, 8], 'classifier__criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [14]:
# Print best parameters for classifier

print('Best Classifier Parameters:\n{}'.format(grid.best_params_))

Best Classifier Parameters:
{'classifier__criterion': 'gini', 'classifier__max_depth': 8, 'classifier__max_features': 'sqrt', 'classifier__n_estimators': 100}
