<a href="https://colab.research.google.com/github/shakebkhan/bdda1/blob/main/BDDA_Project_2_035051.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing Packages & Calling Libraries

In [None]:
# 0.0
# For skopt routines
! pip install scikit-optimize

# 0.1 For plotting skopt results
! pip install 'scikit-optimize[plots]'

Collecting scikit-optimize
  Downloading scikit_optimize-0.9.0-py2.py3-none-any.whl (100 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/100.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━[0m [32m71.7/100.3 kB[0m [31m2.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.3/100.3 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-23.7.0-py3-none-any.whl (17 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-23.7.0 scikit-optimize-0.9.0


In [None]:
# 1.1 Data manipulation and plotting modules
import numpy as np
import pandas as pd


# 1.2 Data pre-processing
from sklearn.preprocessing import OrdinalEncoder # For Encoding Categorical Data
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer # For Imputation of Missing Data
from sklearn.preprocessing import StandardScaler # For Rescaling Data (later cells call StandardScaler() by this name)
from sklearn.preprocessing import StandardScaler as ss # For Rescaling Data
from sklearn.model_selection import train_test_split # For Splitting Data into Training & Testing Sets

# 1.3 Dimensionality reduction and noise removal
from sklearn.decomposition import PCA

# 1.4 Data splitting and model parameter search
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

# 1.5 Model pipelining
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

# 1.6 Hyperparameter optimization
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# 1.7.1
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# 1.8 Modeling modules
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier

In [None]:
# 1.9 Model evaluation metrics
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import confusion_matrix

# 1.10
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import plot_importance

# 1.11 Permutation feature importance
from sklearn.inspection import permutation_importance

In [None]:
# 1.12 Misc
import time
import os
import gc
import random

# 1.13 Used in Randomized parameter search
from scipy.stats import uniform

In [None]:
# 1.14 Show the value of every bare expression in a cell, not only the last
#      one. Later cells rely on this to display several results per cell
#      (e.g. shape, columns and value counts in the exploration cell).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Display all columns of a DataFrame: max_columns=None removes pandas'
# limit on how many columns are rendered.
pd.set_option('display.max_columns', None)

# Mounting gdrive

In [None]:
# Mount Google Drive at /gdrive so the dataset folder can be read like a
# local path. Requires a Google Colab runtime (google.colab is Colab-only).
from google.colab import drive
drive.mount('/gdrive')


Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


# Reading Data

In [None]:
# 2.1 Where is our data file:
#     Folder inside the mounted Google Drive; it is only reachable after
#     drive.mount('/gdrive') has run.
pathToFolder = "/gdrive/MyDrive/bdda1/"

In [None]:
from  pathlib import Path

In [None]:
# 2.2 Full file path:
#     Join the folder and the CSV file name with pathlib's `/` operator.
path = Path(pathToFolder) / "superstore_data.csv"

In [None]:
# 3.0 Read data
#     Load the CSV into a DataFrame and preview the first five rows.
data = pd.read_csv(path)
data.head()

Unnamed: 0,Id,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Complain,Response
0,1826,1970,Graduation,Divorced,84835.0,0,0,6/16/2014,0,189,104,379,111,189,218,1,4,4,6,1,0,1
1,1,1961,Graduation,Single,57091.0,0,0,6/15/2014,0,464,5,64,7,0,37,1,7,3,7,5,0,1
2,10476,1958,Graduation,Married,67267.0,0,1,5/13/2014,0,134,11,59,15,2,30,1,3,2,5,2,0,0
3,1386,1967,Graduation,Together,32474.0,1,1,11-05-2014,0,10,0,1,0,0,0,1,1,0,2,7,0,0
4,5371,1989,Graduation,Single,21474.0,1,0,08-04-2014,0,6,16,24,11,0,34,2,3,1,2,7,0,1


# Exploring and Processing data

In [None]:
# 3.0 Explore data
#     Each bare expression below is displayed because
#     InteractiveShell.ast_node_interactivity is set to "all".
print("\n Shape")
data.shape              # (2240, 22)
print("\n\n Columns")
data.columns.values     # 'Response' column is the last one
print("\n")

# 3.0.1 Class balance of the target
print("\n\nTarget distribution")
data.Response.value_counts()   # Binary data, imbalanced
                               # 0: 1906 , 1: 334

# 3.0.2 Count of columns per dtype
print("\n\nData types")
data.dtypes.value_counts()    # 18 int64, 3 object, 1 float64; 'Response' is int64, not object
print("\n")
data.head(5)


 Shape


(2240, 22)



 Columns


array(['Id', 'Year_Birth', 'Education', 'Marital_Status', 'Income',
       'Kidhome', 'Teenhome', 'Dt_Customer', 'Recency', 'MntWines',
       'MntFruits', 'MntMeatProducts', 'MntFishProducts',
       'MntSweetProducts', 'MntGoldProds', 'NumDealsPurchases',
       'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases',
       'NumWebVisitsMonth', 'Complain', 'Response'], dtype=object)





Target distribution


0    1906
1     334
Name: Response, dtype: int64



Data types


int64      18
object      3
float64     1
dtype: int64





Unnamed: 0,Id,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Complain,Response
0,1826,1970,Graduation,Divorced,84835.0,0,0,6/16/2014,0,189,104,379,111,189,218,1,4,4,6,1,0,1
1,1,1961,Graduation,Single,57091.0,0,0,6/15/2014,0,464,5,64,7,0,37,1,7,3,7,5,0,1
2,10476,1958,Graduation,Married,67267.0,0,1,5/13/2014,0,134,11,59,15,2,30,1,3,2,5,2,0,0
3,1386,1967,Graduation,Together,32474.0,1,1,11-05-2014,0,10,0,1,0,0,0,1,1,0,2,7,0,0
4,5371,1989,Graduation,Single,21474.0,1,0,08-04-2014,0,6,16,24,11,0,34,2,3,1,2,7,0,1


In [None]:
# 3.1 Data statistics
#     Summary statistics (count, mean, std, min, quartiles, max) for the
#     numeric columns; a count below 2240 flags missing values.
data.describe()

Unnamed: 0,Id,Year_Birth,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Complain,Response
count,2240.0,2240.0,2216.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0
mean,5592.159821,1968.805804,52247.251354,0.444196,0.50625,49.109375,303.935714,26.302232,166.95,37.525446,27.062946,44.021875,2.325,4.084821,2.662054,5.790179,5.316518,0.009375,0.149107
std,3246.662198,11.984069,25173.076661,0.538398,0.544538,28.962453,336.597393,39.773434,225.715373,54.628979,41.280498,52.167439,1.932238,2.778714,2.923101,3.250958,2.426645,0.096391,0.356274
min,0.0,1893.0,1730.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2828.25,1959.0,35303.0,0.0,0.0,24.0,23.75,1.0,16.0,3.0,1.0,9.0,1.0,2.0,0.0,3.0,3.0,0.0,0.0
50%,5458.5,1970.0,51381.5,0.0,0.0,49.0,173.5,8.0,67.0,12.0,8.0,24.0,2.0,4.0,2.0,5.0,6.0,0.0,0.0
75%,8427.75,1977.0,68522.0,1.0,1.0,74.0,504.25,33.0,232.0,50.0,33.0,56.0,3.0,6.0,4.0,8.0,7.0,0.0,0.0
max,11191.0,1996.0,666666.0,2.0,2.0,99.0,1493.0,199.0,1725.0,259.0,263.0,362.0,15.0,27.0,28.0,13.0,20.0,1.0,1.0


In [None]:
# Dataset Information
# Per-column non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Id                   2240 non-null   int64  
 1   Year_Birth           2240 non-null   int64  
 2   Education            2240 non-null   object 
 3   Marital_Status       2240 non-null   object 
 4   Income               2216 non-null   float64
 5   Kidhome              2240 non-null   int64  
 6   Teenhome             2240 non-null   int64  
 7   Dt_Customer          2240 non-null   object 
 8   Recency              2240 non-null   int64  
 9   MntWines             2240 non-null   int64  
 10  MntFruits            2240 non-null   int64  
 11  MntMeatProducts      2240 non-null   int64  
 12  MntFishProducts      2240 non-null   int64  
 13  MntSweetProducts     2240 non-null   int64  
 14  MntGoldProds         2240 non-null   int64  
 15  NumDealsPurchases    2240 non-null   i

In [None]:
# Variable-wise Missing Data Information
# Number of missing (NaN) entries per column.
variable_missing_data = data.isna().sum()
variable_missing_data

Id                      0
Year_Birth              0
Education               0
Marital_Status          0
Income                 24
Kidhome                 0
Teenhome                0
Dt_Customer             0
Recency                 0
MntWines                0
MntFruits               0
MntMeatProducts         0
MntFishProducts         0
MntSweetProducts        0
MntGoldProds            0
NumDealsPurchases       0
NumWebPurchases         0
NumCatalogPurchases     0
NumStorePurchases       0
NumWebVisitsMonth       0
Complain                0
Response                0
dtype: int64

In [None]:
# Data Bifurcation
# Split the raw frame column-wise into a categorical subset and a
# non-categorical subset. 'Dt_Customer' is in neither subset, so it is
# left out of everything built from them.

# Categorical Data (includes the target column 'Response')
data_cat = data.loc[:, [
    'Education',
    'Marital_Status',
    'Kidhome',
    'Teenhome',
    'Complain',
    'Response',
]]

# Non-Categorical Data
data_noncat = data.loc[:, [
    'Id',
    'Year_Birth',
    'Income',
    'Recency',
    'MntWines',
    'MntFruits',
    'MntMeatProducts',
    'MntFishProducts',
    'MntSweetProducts',
    'MntGoldProds',
    'NumDealsPurchases',
    'NumWebPurchases',
    'NumCatalogPurchases',
    'NumStorePurchases',
    'NumWebVisitsMonth',
]]

### Missing Data Treatment

In [None]:
# Imputing Missing Non-Categorical Data (Income) using Descriptive Statistics : Central Tendency: mean
# Every NaN in the non-categorical subset is replaced by its column mean.

si_noncat = SimpleImputer(strategy='mean', missing_values=np.nan)

# Despite its name, si_noncat_fit holds the imputed values (a NumPy array).
si_noncat_fit = si_noncat.fit_transform(data_noncat)

# Missing Non-Categorical Data Imputed Subset using Simple Imputer
data_noncat_si = pd.DataFrame(data=si_noncat_fit, columns=data_noncat.columns)
data_noncat_si
data_noncat_si.info()

Unnamed: 0,Id,Year_Birth,Income,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth
0,1826.0,1970.0,84835.0,0.0,189.0,104.0,379.0,111.0,189.0,218.0,1.0,4.0,4.0,6.0,1.0
1,1.0,1961.0,57091.0,0.0,464.0,5.0,64.0,7.0,0.0,37.0,1.0,7.0,3.0,7.0,5.0
2,10476.0,1958.0,67267.0,0.0,134.0,11.0,59.0,15.0,2.0,30.0,1.0,3.0,2.0,5.0,2.0
3,1386.0,1967.0,32474.0,0.0,10.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,7.0
4,5371.0,1989.0,21474.0,0.0,6.0,16.0,24.0,11.0,0.0,34.0,2.0,3.0,1.0,2.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,10142.0,1976.0,66476.0,99.0,372.0,18.0,126.0,47.0,48.0,78.0,2.0,5.0,2.0,11.0,4.0
2236,5263.0,1977.0,31056.0,99.0,5.0,10.0,13.0,3.0,8.0,16.0,1.0,1.0,0.0,3.0,8.0
2237,22.0,1976.0,46310.0,99.0,185.0,2.0,88.0,15.0,5.0,14.0,2.0,6.0,1.0,5.0,8.0
2238,528.0,1978.0,65819.0,99.0,267.0,38.0,701.0,149.0,165.0,63.0,1.0,5.0,4.0,10.0,3.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Id                   2240 non-null   float64
 1   Year_Birth           2240 non-null   float64
 2   Income               2240 non-null   float64
 3   Recency              2240 non-null   float64
 4   MntWines             2240 non-null   float64
 5   MntFruits            2240 non-null   float64
 6   MntMeatProducts      2240 non-null   float64
 7   MntFishProducts      2240 non-null   float64
 8   MntSweetProducts     2240 non-null   float64
 9   MntGoldProds         2240 non-null   float64
 10  NumDealsPurchases    2240 non-null   float64
 11  NumWebPurchases      2240 non-null   float64
 12  NumCatalogPurchases  2240 non-null   float64
 13  NumStorePurchases    2240 non-null   float64
 14  NumWebVisitsMonth    2240 non-null   float64
dtypes: float64(15)
memory usage: 262.6 KB


###  Numeric Encoding of Categorical Data

In [None]:
# Using Scikit Learn : Ordinal Encoder
# Map each category of every column in data_cat to an integer code
# (returned as floats).
oe = OrdinalEncoder()

# Despite its name, oe_fit holds the encoded values (a NumPy array).
oe_fit = oe.fit_transform(data_cat)

# Reuse data_cat's own column labels instead of retyping the list.
data_cat_oe = pd.DataFrame(data=oe_fit, columns=list(data_cat.columns))
data_cat_oe

Unnamed: 0,Education,Marital_Status,Kidhome,Teenhome,Complain,Response
0,2.0,2.0,0.0,0.0,0.0,1.0
1,2.0,4.0,0.0,0.0,0.0,1.0
2,2.0,3.0,0.0,1.0,0.0,0.0
3,2.0,5.0,1.0,1.0,0.0,0.0
4,2.0,4.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...
2235,4.0,2.0,0.0,1.0,0.0,0.0
2236,0.0,3.0,1.0,0.0,0.0,0.0
2237,2.0,2.0,1.0,0.0,0.0,0.0
2238,2.0,3.0,0.0,0.0,0.0,0.0


### Feature Scaling (Standardization of Income)

In [None]:
# 3.1. Standardization
# Standardizing Non-Categorical Dataset using Scikit Learn Standard Scaler

# Scaling Variable : Income
# The scaler is fitted on the *imputed* frame (data_noncat_si). Fitting on
# the raw data_noncat would leave the missing incomes as NaN in Income_std,
# which is then joined onto the imputed frame in the next cell.

# StandardScaler must be imported under its own name; the imports cell also
# binds the alias `ss` to the class, which this assignment rebinds to an instance.
ss = StandardScaler()
ss_fit = ss.fit_transform(data_noncat_si[['Income']])

# Keep the imputed frame's index so the later join lines up row for row.
data_noncat_std = pd.DataFrame(ss_fit, columns=['Income_std'], index=data_noncat_si.index)
data_noncat_std

Unnamed: 0,Income_std
0,1.294840
1,0.192461
2,0.596794
3,-0.785669
4,-1.222743
...,...
2235,0.565364
2236,-0.842012
2237,-0.235910
2238,0.539259


In [None]:
# Append the standardized Income column to the imputed non-categorical frame
# (index-aligned join).
# NOTE(review): data_ppd below is built from data_noncat_si, not from this
# frame, so Income_std does not reach the modelling data in the visible cells.
data_noncat_mdt_std = data_noncat_si.join(data_noncat_std)
data_noncat_mdt_std

Unnamed: 0,Id,Year_Birth,Income,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Income_std
0,1826.0,1970.0,84835.0,0.0,189.0,104.0,379.0,111.0,189.0,218.0,1.0,4.0,4.0,6.0,1.0,1.294840
1,1.0,1961.0,57091.0,0.0,464.0,5.0,64.0,7.0,0.0,37.0,1.0,7.0,3.0,7.0,5.0,0.192461
2,10476.0,1958.0,67267.0,0.0,134.0,11.0,59.0,15.0,2.0,30.0,1.0,3.0,2.0,5.0,2.0,0.596794
3,1386.0,1967.0,32474.0,0.0,10.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,7.0,-0.785669
4,5371.0,1989.0,21474.0,0.0,6.0,16.0,24.0,11.0,0.0,34.0,2.0,3.0,1.0,2.0,7.0,-1.222743
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,10142.0,1976.0,66476.0,99.0,372.0,18.0,126.0,47.0,48.0,78.0,2.0,5.0,2.0,11.0,4.0,0.565364
2236,5263.0,1977.0,31056.0,99.0,5.0,10.0,13.0,3.0,8.0,16.0,1.0,1.0,0.0,3.0,8.0,-0.842012
2237,22.0,1976.0,46310.0,99.0,185.0,2.0,88.0,15.0,5.0,14.0,2.0,6.0,1.0,5.0,8.0,-0.235910
2238,528.0,1978.0,65819.0,99.0,267.0,38.0,701.0,149.0,165.0,63.0,1.0,5.0,4.0,10.0,3.0,0.539259


In [None]:
# Pre-Processed Dataset
# Imputed non-categorical columns joined (on index) with the ordinal-encoded
# categorical columns; 'Response' comes from data_cat_oe and ends up last.
data_ppd = data_noncat_si.join(data_cat_oe)
data_ppd

Unnamed: 0,Id,Year_Birth,Income,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Education,Marital_Status,Kidhome,Teenhome,Complain,Response
0,1826.0,1970.0,84835.0,0.0,189.0,104.0,379.0,111.0,189.0,218.0,1.0,4.0,4.0,6.0,1.0,2.0,2.0,0.0,0.0,0.0,1.0
1,1.0,1961.0,57091.0,0.0,464.0,5.0,64.0,7.0,0.0,37.0,1.0,7.0,3.0,7.0,5.0,2.0,4.0,0.0,0.0,0.0,1.0
2,10476.0,1958.0,67267.0,0.0,134.0,11.0,59.0,15.0,2.0,30.0,1.0,3.0,2.0,5.0,2.0,2.0,3.0,0.0,1.0,0.0,0.0
3,1386.0,1967.0,32474.0,0.0,10.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,7.0,2.0,5.0,1.0,1.0,0.0,0.0
4,5371.0,1989.0,21474.0,0.0,6.0,16.0,24.0,11.0,0.0,34.0,2.0,3.0,1.0,2.0,7.0,2.0,4.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,10142.0,1976.0,66476.0,99.0,372.0,18.0,126.0,47.0,48.0,78.0,2.0,5.0,2.0,11.0,4.0,4.0,2.0,0.0,1.0,0.0,0.0
2236,5263.0,1977.0,31056.0,99.0,5.0,10.0,13.0,3.0,8.0,16.0,1.0,1.0,0.0,3.0,8.0,0.0,3.0,1.0,0.0,0.0,0.0
2237,22.0,1976.0,46310.0,99.0,185.0,2.0,88.0,15.0,5.0,14.0,2.0,6.0,1.0,5.0,8.0,2.0,2.0,1.0,0.0,0.0,0.0
2238,528.0,1978.0,65819.0,99.0,267.0,38.0,701.0,149.0,165.0,63.0,1.0,5.0,4.0,10.0,3.0,2.0,3.0,0.0,0.0,0.0,0.0


In [None]:
# 3.2 We do not need Id column, Year_Birth column
#     Rebind instead of dropping in place, and ignore columns that are
#     already gone, so re-running this cell does not raise a KeyError.
data_ppd = data_ppd.drop(columns=['Id', 'Year_Birth'], errors='ignore')
data_ppd

Unnamed: 0,Income,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Education,Marital_Status,Kidhome,Teenhome,Complain,Response
0,84835.0,0.0,189.0,104.0,379.0,111.0,189.0,218.0,1.0,4.0,4.0,6.0,1.0,2.0,2.0,0.0,0.0,0.0,1.0
1,57091.0,0.0,464.0,5.0,64.0,7.0,0.0,37.0,1.0,7.0,3.0,7.0,5.0,2.0,4.0,0.0,0.0,0.0,1.0
2,67267.0,0.0,134.0,11.0,59.0,15.0,2.0,30.0,1.0,3.0,2.0,5.0,2.0,2.0,3.0,0.0,1.0,0.0,0.0
3,32474.0,0.0,10.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,7.0,2.0,5.0,1.0,1.0,0.0,0.0
4,21474.0,0.0,6.0,16.0,24.0,11.0,0.0,34.0,2.0,3.0,1.0,2.0,7.0,2.0,4.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,66476.0,99.0,372.0,18.0,126.0,47.0,48.0,78.0,2.0,5.0,2.0,11.0,4.0,4.0,2.0,0.0,1.0,0.0,0.0
2236,31056.0,99.0,5.0,10.0,13.0,3.0,8.0,16.0,1.0,1.0,0.0,3.0,8.0,0.0,3.0,1.0,0.0,0.0,0.0
2237,46310.0,99.0,185.0,2.0,88.0,15.0,5.0,14.0,2.0,6.0,1.0,5.0,8.0,2.0,2.0,1.0,0.0,0.0,0.0
2238,65819.0,99.0,267.0,38.0,701.0,149.0,165.0,63.0,1.0,5.0,4.0,10.0,3.0,2.0,3.0,0.0,0.0,0.0,0.0


In [None]:
# 3.3 Dividing data into predictors and target
#     Select by column name rather than by position, so the split stays
#     correct if the column order or count of data_ppd ever changes.
#     Predictors: every column except 'Response' (18 columns).

X = data_ppd.drop(columns=['Response'])
X.head(2)

# 3.3.1 'Response' is the target
print("\n\nTarget,y, values")
y = data_ppd['Response']
y.head()

Unnamed: 0,Income,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Education,Marital_Status,Kidhome,Teenhome,Complain
0,84835.0,0.0,189.0,104.0,379.0,111.0,189.0,218.0,1.0,4.0,4.0,6.0,1.0,2.0,2.0,0.0,0.0,0.0
1,57091.0,0.0,464.0,5.0,64.0,7.0,0.0,37.0,1.0,7.0,3.0,7.0,5.0,2.0,4.0,0.0,0.0,0.0




Target,y, values


0    1.0
1    1.0
2    0.0
3    0.0
4    1.0
Name: Response, dtype: float64

In [None]:
# 3.3.2 Changing datatype to float32
#       astype returns a new frame, so re-running this cell is harmless.

X = X.astype('float32')


### Splitting data into train/test

In [None]:

# 4. Split dataset into train and validation parts
#    80/20 split, stratified on y so both parts keep the class ratio.
#    random_state is fixed so the split (and every score computed from it)
#    is the same on each run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    shuffle = True,
                                                    stratify = y,
                                                    random_state = 42
                                                    )

# 4.1
X_train.shape        # (1792, 18)
X_test.shape         # (448, 18)
y_train.shape        # (1792,)
y_test.shape         # (448,)

(1792, 18)

(448, 18)

(1792,)

(448,)

### Creating a pipeline

In [None]:
from sklearn.compose import ColumnTransformer

In [None]:
# Missing data Imputation
# Address the column by name: in X (the frame the pipeline is fitted on)
# position 4 is 'MntMeatProducts', not 'Income'. Index 4 is Income's position
# only in the raw `data` frame.
# Output layout: the imputed column first, then the untouched columns in
# their original order (remainder='passthrough').
trf1 = ColumnTransformer([('impute_Income', SimpleImputer(), ['Income'])], remainder = 'passthrough')

In [None]:
# Categorical Data Encoding
# trf2 receives the NumPy array produced by trf1, so columns must be given by
# position. Positions 2 and 3 are numeric spend columns there; look up the
# real positions of 'Education' and 'Marital_Status' in X instead.
# These positions are unchanged by trf1 as long as the column it imputes
# sits before both categorical columns in X (trf1 moves only that column
# to the front).
categorical_positions = [X.columns.get_loc(name) for name in ('Education', 'Marital_Status')]

# `sparse_output` replaces the `sparse` argument, which was deprecated in
# scikit-learn 1.2 and later removed.
trf2 = ColumnTransformer([('ohe_Education_Marital_Status', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_positions)], remainder = 'passthrough')

In [None]:
# Scaling
# Each ColumnTransformer entry must be (name, transformer, columns); the
# two-element tuple used before fails when the transformer is fitted.
# The callable selects every column of whatever input it is given, so the
# scaler is applied to all features regardless of how many columns the
# previous steps produced.
# NOTE(review): this also scales one-hot columns if placed after trf2 —
# confirm that is the intent. trf3 is not part of `pipe` in the visible cells.
trf3 = ColumnTransformer([('scale', StandardScaler(), lambda features: list(range(features.shape[1])))])

In [None]:
# Chain imputation (trf1) and one-hot encoding (trf2) into one pipeline.
# The scaling step (trf3) and a final estimator are not included here.
pipe=make_pipeline(trf1,trf2)

In [None]:
# Render estimators as an HTML diagram, then fit the preprocessing pipeline
# on the training split only.
# NOTE(review): the model cells below train on X_train directly, so the
# fitted `pipe` is not applied to the data they use.
from sklearn import set_config
set_config(display='diagram')
pipe.fit(X_train,y_train)



### Prediction Model using Random Forest

In [None]:
# Baseline random forest with default hyperparameters.
# random_state is fixed so the bootstrap samples and feature draws, and
# therefore the reported accuracy, are the same on every run.
rf = RandomForestClassifier(random_state=42)

In [None]:
# Train the baseline forest on the training split, predict the held-out
# split and report plain accuracy.
# NOTE(review): the target is imbalanced (1906 vs 334), so always predicting
# class 0 already gives about 85% accuracy; f1_score is imported and may be
# a more informative metric here.
rf.fit(X_train, y_train)
y_pred=rf.predict(X_test)
accuracy_score(y_test, y_pred)

0.8683035714285714

### Hyperparameter Tuning: Grid Search

In [None]:
# Candidate values for the random-forest grid search (4 * 3 * 3 * 3 = 108 combinations).
n_estimators = [20, 60, 100, 120]   # number of trees
max_features = [0.2, 0.6, 1.0]      # fraction of features tried per split
max_depth = [2, 8, None]            # None = no depth limit
max_samples = [0.5, 0.75, 1.0]      # fraction of rows drawn for each tree

In [None]:
# Bundle the candidate lists into the mapping GridSearchCV expects:
# hyperparameter name -> list of values to try.
param_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    max_samples=max_samples,
)
print(param_grid)

{'n_estimators': [20, 60, 100, 120], 'max_features': [0.2, 0.6, 1.0], 'max_depth': [2, 8, None], 'max_samples': [0.5, 0.75, 1.0]}


In [None]:
# Fresh, unfitted forest used as the template estimator for the grid search.
# random_state is fixed so each candidate is scored on the same random draws
# and the selected best_params_ are the same on every run.
rf = RandomForestClassifier(random_state=42)

In [None]:
# Exhaustive search over param_grid with 5-fold cross-validation.
# No `scoring` is passed, so the estimator's own score method is used.
# verbose=2 prints progress messages; n_jobs=-1 uses all available CPU cores.
rf_grid = GridSearchCV(estimator = rf,
                       param_grid = param_grid,
                       cv = 5,
                       verbose = 2,
                       n_jobs = -1)

In [None]:
# Run the search on the training split only (108 candidates x 5 folds = 540 fits).
rf_grid.fit(X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [None]:
# Hyperparameter combination with the best mean cross-validation score.
rf_grid.best_params_

{'max_depth': None,
 'max_features': 0.6,
 'max_samples': 1.0,
 'n_estimators': 20}

In [None]:
# Accuracy score
# best_score_ is the mean cross-validated score of the best parameter
# combination on the training folds; it is not the accuracy on X_test.
accuracy = rf_grid.best_score_
accuracy * 100.0

87.0538896064487