## Part 2 : Feature Reduction (Extraction/Selection)

In [19]:
# Load Libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import xlrd

In [20]:
#Step 1:  Load data into a dataframe
DataFile = "Data/BankCustomers.xlsx"

data = pd.read_excel(DataFile)

List of columns

In [21]:
data.columns

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

# Step 11- remove columns **RowNumber**,**CustomerId**, and **Surname**

In [22]:
# Step 11- remove unrelated columns
data = data.drop(['RowNumber','CustomerId','Surname'],axis=1)

In [23]:
data.columns

Index(['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
       'Exited'],
      dtype='object')

# Step 12 - Onehot code Geography

In [24]:
# Step 12 - Onehot code Geography
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer

feature = np.array(data['Geography'])
one_hot = LabelBinarizer()

one_hot.fit_transform(feature)

array([[1, 0, 0],
       [0, 0, 1],
       [1, 0, 0],
       ...,
       [1, 0, 0],
       [0, 1, 0],
       [1, 0, 0]], dtype=int32)

In [25]:
one_hot.classes_

array(['France', 'Germany', 'Spain'], dtype='<U7')

In [26]:
dummies = pd.get_dummies(feature)
dummies.head()

Unnamed: 0,France,Germany,Spain
0,1,0,0
1,0,0,1
2,1,0,0
3,1,0,0
4,0,0,1


In [27]:
# Drop Geography column
data = data.drop(['Geography'],axis=1)

# Add Geography dummies to the dataset

In [28]:
# Add dummies
data[dummies.columns] = dummies
data.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,France,Germany,Spain
0,619,Female,42,2,0.0,1,1,1,101348.88,1,1,0,0
1,608,Female,41,1,83807.86,1,0,1,112542.58,0,0,0,1
2,502,Female,42,8,159660.8,3,1,0,113931.57,1,1,0,0
3,699,Female,39,1,0.0,2,0,0,93826.63,0,1,0,0
4,850,Female,43,2,125510.82,1,1,1,79084.1,0,0,0,1


# Onehot code Gender

In [29]:
# one-hot code Gender
feature = np.array(data['Gender'])
one_hot = LabelBinarizer()

one_hot.fit_transform(feature)
dummies = pd.get_dummies(feature)
dummies

Unnamed: 0,Female,Male
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0
...,...,...
9995,0,1
9996,0,1
9997,1,0
9998,0,1


In [30]:
 #drop Gender and add dummies
data = data.drop(['Gender'],axis=1)
data[dummies.columns] = dummies
data.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,France,Germany,Spain,Female,Male
0,619,42,2,0.0,1,1,1,101348.88,1,1,0,0,1,0
1,608,41,1,83807.86,1,0,1,112542.58,0,0,0,1,1,0
2,502,42,8,159660.8,3,1,0,113931.57,1,1,0,0,1,0
3,699,39,1,0.0,2,0,0,93826.63,0,1,0,0,1,0
4,850,43,2,125510.82,1,1,1,79084.1,0,0,0,1,1,0


# Dataset with geography and gender dummied and 1 dummy removed to avoid dummy trap.

In [31]:
# Drop spain and male to avoid dummy trap
data = data.drop(['Male','Spain'],axis=1)
data.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,France,Germany,Female
0,619,42,2,0.0,1,1,1,101348.88,1,1,0,1
1,608,41,1,83807.86,1,0,1,112542.58,0,0,0,1
2,502,42,8,159660.8,3,1,0,113931.57,1,1,0,1
3,699,39,1,0.0,2,0,0,93826.63,0,1,0,1
4,850,43,2,125510.82,1,1,1,79084.1,0,0,0,1


#  Move dependent variable to last column

In [32]:
# Move the dependent variable column to the last position.

Exited = data['Exited']

In [33]:
data = data.drop(['Exited'],axis=1)
data['Exited'] = Exited
data.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,France,Germany,Female,Exited
0,619,42,2,0.0,1,1,1,101348.88,1,0,1,1
1,608,41,1,83807.86,1,0,1,112542.58,0,0,1,0
2,502,42,8,159660.8,3,1,0,113931.57,1,0,1,1
3,699,39,1,0.0,2,0,0,93826.63,1,0,1,0
4,850,43,2,125510.82,1,1,1,79084.1,0,0,1,0


# Step 13 - Set up independet variable and depndent variables and perform feature reduction

In [34]:
# Step 13 - Set up independet variable and depndent variables and perform feature reduction
Independents = data.iloc[:-1].values
print(Independents)
Dependent = data.iloc[:,-1].values
print(Dependent)
X = Independents
y = Dependent

[[619.  42.   2. ...   0.   1.   1.]
 [608.  41.   1. ...   0.   1.   0.]
 [502.  42.   8. ...   0.   1.   1.]
 ...
 [516.  35.  10. ...   0.   0.   0.]
 [709.  36.   7. ...   0.   1.   1.]
 [772.  42.   3. ...   1.   0.   1.]]
[1 0 1 ... 1 1 0]


# Attempt at feature reduction using PCA Before feature scaling

In [35]:
# Attempt at feature reduction using PCA Before feature scaling
#Load libraries
from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA

# Create a PCA that will retain 99% of variance
pca = PCA(n_components=0.99, whiten=True)

# Conduct PCA
features_pca = pca.fit_transform(X)

# Show results
print("Original number of features:", X.shape[1])
print("Reduced number of features:", features_pca.shape[1])


Original number of features: 12
Reduced number of features: 2


In [36]:
# Feature scaling will normalize all variable to the same scale
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X)
print(X)

[[-0.32609367  0.29341451 -1.041749   ... -0.57877454  1.09610816
   1.97704053]
 [-0.4399147   0.19806052 -1.38751174 ... -0.57877454  1.09610816
  -0.50580653]
 [-1.53673561  0.29341451  1.03282743 ... -0.57877454  1.09610816
   1.97704053]
 ...
 [-1.39187247 -0.37406345  1.72435291 ... -0.57877454 -0.91231872
  -0.50580653]
 [ 0.60516937 -0.27870946  0.68706469 ... -0.57877454  1.09610816
   1.97704053]
 [ 1.25705349  0.29341451 -0.69598626 ...  1.7277885  -0.91231872
   1.97704053]]


# # Attempt at feature reduction using PCA After feature scaling

In [37]:
# Attempt at feature reduction using PCA After feature scaling
#Load libraries
from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA

# Create a PCA that will retain 99% of variance
pca = PCA(n_components=0.99, whiten=True)

# Conduct PCA
features_pca = pca.fit_transform(X)

# Show results
print("Original number of features:", X.shape[1])
print("Reduced number of features:", features_pca.shape[1])


Original number of features: 12
Reduced number of features: 12
