# ANN Classification - Bank Customer Retention
## Part 1 - DATA PREPROCESSING
In this notebook, we load the raw dataset file and implement initial cleaning and preprocessing to prepare it for the model training phase.

> **INPUT:** the raw dataset file as downloaded from its original source.<br>
> **OUTPUT:** a cleaned version of the dataset stored to an intermediate csv file.

### 1. INITIALIZATION

In [209]:
# Import necessary libraries and modules
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

### 2. LOADING DATASET FILE

In [210]:
# Prepare file location and load dataset
data_file_location = "..\\data\\raw\\"
data_file_name = "churn_modelling"
data_file_ext = "csv"
data = pd.read_csv(data_file_location + data_file_name + "." + data_file_ext)

In [211]:
# Check dataset head
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [212]:
# Check dataset shape
data.shape

(10000, 14)

In [213]:
# Check dataset columns
data.columns

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [214]:
# Check column types
data.dtypes

RowNumber            int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

### 3. DATA CLEANING AND PREPROCESSING

#### Drop irrelevant columns

In [215]:
# Drop irrelevant columns such as identifiers and names
data.drop(["RowNumber", "CustomerId", "Surname"], axis=1, inplace=True)

# Check dataset head
data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


#### Encoding categorical features

In [216]:
# Encoding "Gender" feature with label encoding
le = LabelEncoder()
data["Gender"] = le.fit_transform(data["Gender"])

# Check encoded feature
data["Gender"].unique()

array([0, 1])

In [217]:
# Encoding "Geography" feature with one hot encoding
ohe = OneHotEncoder(drop="first")
ct = ColumnTransformer(transformers=[
    ("one_hot_encoder", ohe, ["Geography"])
], remainder="passthrough")

In [218]:
# Apply transformations
data_encoded = ct.fit_transform(data)

In [219]:
# Get column names after one-hot encoding
encoded_columns = ct.named_transformers_['one_hot_encoder'].get_feature_names_out(["Geography"])

# Create DataFrame with encoded data and corresponding columns
data = pd.DataFrame(data_encoded, columns=list(encoded_columns) + [col for col in data.columns if col != 'Geography'])

In [220]:
# Check dataset after encoding
data

Unnamed: 0,Geography_Germany,Geography_Spain,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0.0,0.0,619.0,0.0,42.0,2.0,0.00,1.0,1.0,1.0,101348.88,1.0
1,0.0,1.0,608.0,0.0,41.0,1.0,83807.86,1.0,0.0,1.0,112542.58,0.0
2,0.0,0.0,502.0,0.0,42.0,8.0,159660.80,3.0,1.0,0.0,113931.57,1.0
3,0.0,0.0,699.0,0.0,39.0,1.0,0.00,2.0,0.0,0.0,93826.63,0.0
4,0.0,1.0,850.0,0.0,43.0,2.0,125510.82,1.0,1.0,1.0,79084.10,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.0,0.0,771.0,1.0,39.0,5.0,0.00,2.0,1.0,0.0,96270.64,0.0
9996,0.0,0.0,516.0,1.0,35.0,10.0,57369.61,1.0,1.0,1.0,101699.77,0.0
9997,0.0,0.0,709.0,0.0,36.0,7.0,0.00,1.0,0.0,1.0,42085.58,1.0
9998,1.0,0.0,772.0,1.0,42.0,3.0,75075.31,2.0,1.0,0.0,92888.52,1.0


#### Splitting dataset into training and testing

In [221]:
# Split into independent and dependent features
X = data.iloc[:,0:-1]
y = data.iloc[:,-1]

In [222]:
# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=0)

#### Scaling features

In [223]:
# Scaling independent features using standard scaler
sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train)
X_test_scaled = sc.transform(X_test)

### 4. STORING PREPROCESSED DATASET

In [224]:
# Restore the original DataFrame structure to maintain column names
data_train = pd.DataFrame(X_train_scaled, columns=X_train.columns)
data_train["Exited"] = y_train.values
data_test = pd.DataFrame(X_test_scaled, columns=X_test.columns)
data_test["Exited"] = y_test.values

In [225]:
# Prepare file location and names
data_file_location = "..\\data\\interim\\"
data_train_file_name = "churn_modelling_preprocessed_train"
data_test_file_name = "churn_modelling_preprocessed_test"
data_file_ext = "csv"

# Store training and testing datasets as csv files
data_train.to_csv(data_file_location + "\\" + data_train_file_name + "." + data_file_ext, index = False)
data_test.to_csv(data_file_location + "\\" + data_test_file_name + "." + data_file_ext, index = False)