# Data Preprocessing and Feature Engineering
### Data Preprocessing is a data mining technique which is used to transform the raw data in a useful and efficient format. 
<img src="assets/Data-Preprocessing.png" width="350" height="400">

### Import required libraries

In [216]:
import pandas as pd
import numpy as np

### Read the data

In [217]:
df = pd.read_csv("data/Bank_data.csv", index_col=0)
df.head()

Unnamed: 0_level_0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
RowNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [218]:
pd.set_option('display.max_columns', 50) 
pd.set_option('display.max_rows', 50)

In [219]:
df.shape

(10000, 13)

In [220]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 1 to 10000
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CustomerId       10000 non-null  int64  
 1   Surname          10000 non-null  object 
 2   CreditScore      10000 non-null  int64  
 3   Geography        10000 non-null  object 
 4   Gender           10000 non-null  object 
 5   Age              10000 non-null  int64  
 6   Tenure           10000 non-null  int64  
 7   Balance          10000 non-null  float64
 8   NumOfProducts    10000 non-null  int64  
 9   HasCrCard        10000 non-null  int64  
 10  IsActiveMember   10000 non-null  int64  
 11  EstimatedSalary  10000 non-null  float64
 12  Exited           10000 non-null  int64  
dtypes: float64(2), int64(8), object(3)
memory usage: 1.1+ MB


### Check for NaN or null values

In [221]:
df.isnull().sum()

CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

### Outlier Observation Analysis

In [222]:
print("Outliers Present")
print("")
for feature in df[['CreditScore','Tenure', 'Balance','EstimatedSalary']]:
    
    Q1 = df[feature].quantile(0.25)
    Q3 = df[feature].quantile(0.75)
    IQR = Q3-Q1
    lower = Q1- 1.5*IQR
    upper = Q3 + 1.5*IQR
    
    if df[(df[feature] > upper)].any(axis=None):
        print(feature,"- yes")
    else:
        print(feature, "- no")

Outliers Present

CreditScore - no
Tenure - no
Balance - no
EstimatedSalary - no


# Feature Engineering
### Feature engineering is the process of using domain knowledge to extract features (characteristics, properties, attributes) from raw data. A feature is a property shared by independent units on which analysis or prediction is to be done. Features are used by predictive models and influence results.

In [223]:
df["NewAGT"] = df["Age"] - df["Tenure"]
df["CreditsScore"] = pd.qcut(df['CreditScore'], 10, labels = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
df["AgeScore"] = pd.qcut(df['Age'], 8, labels = [1, 2, 3, 4, 5, 6, 7, 8])
df["BalanceScore"] = pd.qcut(df['Balance'].rank(method="first"), 10, labels = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
df["EstSalaryScore"] = pd.qcut(df['EstimatedSalary'], 10, labels = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
df["NewEstimatedSalary"] = df["EstimatedSalary"] / 12

In [224]:
df.head()

Unnamed: 0_level_0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,NewAGT,CreditsScore,AgeScore,BalanceScore,EstSalaryScore,NewEstimatedSalary
RowNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1,40,4,6,1,6,8445.74
2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0,40,4,6,5,6,9378.548333
3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1,34,1,6,10,6,9494.2975
4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0,38,7,5,1,5,7818.885833
5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0,41,10,6,8,4,6590.341667


In [225]:
df = pd.get_dummies(df, columns =["Geography", "Gender"])

In [226]:
df.head()

Unnamed: 0_level_0,CustomerId,Surname,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,NewAGT,CreditsScore,AgeScore,BalanceScore,EstSalaryScore,NewEstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
RowNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,15634602,Hargrave,619,42,2,0.0,1,1,1,101348.88,1,40,4,6,1,6,8445.74,1,0,0,1,0
2,15647311,Hill,608,41,1,83807.86,1,0,1,112542.58,0,40,4,6,5,6,9378.548333,0,0,1,1,0
3,15619304,Onio,502,42,8,159660.8,3,1,0,113931.57,1,34,1,6,10,6,9494.2975,1,0,0,1,0
4,15701354,Boni,699,39,1,0.0,2,0,0,93826.63,0,38,7,5,1,5,7818.885833,1,0,0,1,0
5,15737888,Mitchell,850,43,2,125510.82,1,1,1,79084.1,0,41,10,6,8,4,6590.341667,0,0,1,1,0


In [227]:
df = df.drop(["CustomerId","Surname"], axis = 1)

In [228]:
categorical_data = df[["Geography_France", "Geography_Germany", "Geography_Spain", "Gender_Female", "Gender_Male", 
                       "HasCrCard","IsActiveMember"]]

In [229]:
X = df.drop(["Exited", "Geography_France", "Geography_Germany", "Geography_Spain", "Gender_Female", "Gender_Male", 
             "HasCrCard","IsActiveMember"], axis = 1)
y = df["Exited"]
cols = X.columns
index = X.index

In [230]:
print(X.shape, y.shape)

(10000, 12) (10000,)


In [231]:
y.head()

RowNumber
1    1
2    0
3    1
4    0
5    0
Name: Exited, dtype: int64

In [232]:
X.head()

Unnamed: 0_level_0,CreditScore,Age,Tenure,Balance,NumOfProducts,EstimatedSalary,NewAGT,CreditsScore,AgeScore,BalanceScore,EstSalaryScore,NewEstimatedSalary
RowNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,619,42,2,0.0,1,101348.88,40,4,6,1,6,8445.74
2,608,41,1,83807.86,1,112542.58,40,4,6,5,6,9378.548333
3,502,42,8,159660.8,3,113931.57,34,1,6,10,6,9494.2975
4,699,39,1,0.0,2,93826.63,38,7,5,1,5,7818.885833
5,850,43,2,125510.82,1,79084.1,41,10,6,8,4,6590.341667


## Scaling or Standardization
### Is a common requirement for many machine learning estimators. Typically this is done by removing the mean and scaling to unit variance in such a way that they are robust to outliers.

In [233]:
from sklearn.preprocessing import RobustScaler

transformer = RobustScaler().fit(X)
X = transformer.transform(X)
X = pd.DataFrame(X, columns = cols, index = index)

In [234]:
df = pd.concat([X,categorical_data,y], axis = 1)

In [235]:
df.head()

Unnamed: 0_level_0,CreditScore,Age,Tenure,Balance,NumOfProducts,EstimatedSalary,NewAGT,CreditsScore,AgeScore,BalanceScore,EstSalaryScore,NewEstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male,HasCrCard,IsActiveMember,Exited
RowNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,-0.246269,0.416667,-0.75,-0.76148,0.0,0.011739,0.666667,-0.2,0.5,-0.9,0.1,0.011739,1,0,0,1,0,1,1,1
2,-0.328358,0.333333,-1.0,-0.104906,0.0,0.125512,0.666667,-0.2,0.5,-0.1,0.1,0.125512,0,0,1,1,0,0,1,0
3,-1.119403,0.416667,0.75,0.489346,2.0,0.13963,0.166667,-0.8,0.5,0.9,0.1,0.13963,1,0,0,1,0,1,0,1
4,0.350746,0.166667,-1.0,-0.76148,1.0,-0.064717,0.5,0.4,0.25,-0.9,-0.1,-0.064717,1,0,0,1,0,0,0,0
5,1.477612,0.5,-0.75,0.221806,0.0,-0.214561,0.75,1.0,0.5,0.5,-0.3,-0.214561,0,0,1,1,0,1,1,0


### Save the processed data to a csv file so can be used directly for modelling

In [236]:
df.to_csv("data/preprocessed_data.csv")