## Feature Engineering

### Overview
In this file, we focused on feature engineering to enhance the predictive power of the dataset for predicting bank customer churn.


In [51]:
import numpy as np
import pandas as pd

In [52]:
pd.set_option("display.max_columns", None)

In [56]:
df = pd.read_csv("data/Churn_Modelling.csv")
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


## Creating Features

In [57]:
# Combining location and gender
df['Geo_Gender'] = df['Geography'] + "_" + df['Gender']

In [58]:
# Extracting age groups
df['AgeGroup'] = df['Age'] // 10 * 10

In [59]:
# Age tenure interaction
df['Age_Tenure_Interaction'] = df['Age'] * df['Tenure']

In [60]:
# isSenior
df['IsSenior'] = df['Age'].apply(lambda x: 1 if x >= 65 else 0)

In [61]:
# Binerizing high balance
df['High_Balance'] = np.where(df['Balance'] > 125000, 1, 0)

In [62]:
# Balance to Salary ratio
df['Balance_to_Salary_Ratio'] = df['Balance'] / df['EstimatedSalary']

In [63]:
# Credit score binning
df['CreditScoreTier'] = pd.cut(df['CreditScore'], bins=[0, 650, 750, 850], labels=['Low', 'Medium', 'High'])

In [64]:
# Products to tenure ratio
df['Products_Per_Tenure'] = df['NumOfProducts'] / (df['Tenure'] + 0.0001)

In [65]:
# Aggregating tenure and NumOfProdcuts
df['Average_Product_Holding_Duration'] = df['Tenure'] / df['NumOfProducts']

In [66]:
# is Active credit card holder?
df['IsActive_by_CreditCard'] = df['HasCrCard'] * df['IsActiveMember']

In [67]:
# Costomer status
df['Customer_Status'] = df['Tenure'].apply(lambda x: 'New' if x < 2 else 'Long-term')

In [68]:
# Credit score with balance interaction
df['CreditScore_Balance_Interaction'] = df['CreditScore'] * df['Balance']

In [69]:
# Binarizing high credit score
df['High_CreditScore'] = np.where(df['CreditScore'] > 700, 1, 0)

In [70]:
# Aggregating Balance and NumOfProducts
df['Balance_per_Product'] = df['Balance'] / df['NumOfProducts']

In [71]:
df.sample(5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geo_Gender,AgeGroup,Age_Tenure_Interaction,IsSenior,High_Balance,Balance_to_Salary_Ratio,CreditScoreTier,Products_Per_Tenure,Average_Product_Holding_Duration,IsActive_by_CreditCard,Customer_Status,CreditScore_Balance_Interaction,High_CreditScore,Balance_per_Product
2229,2230,15756125,Booth,757,Spain,Male,44,5,140856.16,2,1,0,158735.1,0,Spain_Male,40,220,0,1,0.887366,High,0.399992,2.5,0,Long-term,106628100.0,1,70428.08
5092,5093,15596303,White,688,France,Female,39,0,0.0,2,1,0,53222.15,1,France_Female,30,0,0,0,0.0,Medium,20000.0,0.0,0,New,0.0,0,0.0
3273,3274,15646091,Frankland,560,Spain,Female,43,4,95140.44,2,1,0,123181.44,1,Spain_Female,40,172,0,0,0.77236,Low,0.499988,2.0,0,Long-term,53278650.0,0,47570.22
3789,3790,15765415,King,609,Spain,Female,45,4,89122.3,1,1,1,199256.98,0,Spain_Female,40,180,0,0,0.447273,Low,0.249994,4.0,1,Long-term,54275480.0,0,89122.3
1565,1566,15649523,Kennedy,581,France,Male,38,1,0.0,2,1,0,46176.22,0,France_Male,30,38,0,0,0.0,Low,1.9998,0.5,0,New,0.0,0,0.0


In [72]:
# Drop unnecessary columns
df.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1, inplace=True)

In [73]:
df.shape

(10000, 25)

In [74]:
df.sample(5)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geo_Gender,AgeGroup,Age_Tenure_Interaction,IsSenior,High_Balance,Balance_to_Salary_Ratio,CreditScoreTier,Products_Per_Tenure,Average_Product_Holding_Duration,IsActive_by_CreditCard,Customer_Status,CreditScore_Balance_Interaction,High_CreditScore,Balance_per_Product
5983,619,Germany,Female,28,6,99152.73,2,1,0,48475.12,0,Germany_Female,20,168,0,0,2.045435,Low,0.333328,3.0,0,Long-term,61375540.0,0,49576.365
9726,773,Spain,Male,43,7,138150.57,1,1,1,177357.16,0,Spain_Male,40,301,0,1,0.77894,High,0.142855,7.0,1,Long-term,106790400.0,1,138150.57
7700,752,Spain,Female,31,4,144637.86,2,1,0,40496.72,0,Spain_Female,30,124,0,1,3.571594,High,0.499988,2.0,0,Long-term,108767700.0,1,72318.93
6186,618,France,Male,37,5,0.0,1,0,1,178705.45,1,France_Male,30,185,0,0,0.0,Low,0.199996,5.0,0,Long-term,0.0,0,0.0
4245,732,France,Female,34,8,122338.43,2,1,0,187985.85,0,France_Female,30,272,0,0,0.650785,Medium,0.249997,4.0,0,Long-term,89551730.0,1,61169.215


In [75]:
df.to_csv("data/churn_modelling_feature_engineering.csv", index=False)