# 4 Pre-Processing and Training Data<a id='4_Pre-Processing_and_Training_Data'></a>

## Contents <a id ="Content" > </a>

## Introduction <a id = 'Introduction'></a>

## Imports <a id="Imports"></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
import os
#from sklearn.dummy import DummyRegressor
#from imblearn.over_sampling import SMOTE

In [None]:
# Load the input dataset (presumably the output of the EDA step, judging by the
# file name). The path is relative to the current working directory.
ccattr_data = pd.read_csv("../data/3.ccattr_data_EDA_final.csv")

In [None]:
ccattr_data.dtypes

In [None]:
ccattr_data.head()

In [None]:
ccattr_data.shape

In [None]:
ccattr_data.head().T

In [None]:
ccattr_data.dtypes

In [None]:
# Ordinal-encode the ordered categorical columns: each label is replaced by its rank.
# Series.map returns NaN for any label that is missing from its dictionary, so the
# keys must match the values stored in the data exactly.
# Marital_Status_sorted is intentionally not mapped here (its mapping was disabled).

# NOTE(review): 'Graduate' is ranked below 'College' - confirm this is the intended order.
education = {'Uneducated': 1, 'High School': 2, 'Graduate': 3, 'College': 4,
             'Post-Graduate': 5, 'Doctorate': 6}
ccattr_data['Education_Level_sorted'] = ccattr_data['Education_Level_sorted'].map(education)

# Ranks follow ascending income, so '60K - 80K' sits below '80K - 120K'.
# NOTE(review): the '40k − 60K' key uses a lowercase 'k' and a Unicode minus sign,
# unlike the other keys - verify that it matches the label stored in the CSV.
income = {'0K - 40K': 1, '40k − 60K': 2, '60K - 80K': 3, '80K - 120K': 4, '120K +': 5}
ccattr_data['Income_Category_sorted'] = ccattr_data['Income_Category_sorted'].map(income)

cc_cat = {'Blue': 1, 'Silver': 2, 'Gold': 3, 'Platinum': 4}
ccattr_data['Card_Category_sorted'] = ccattr_data['Card_Category_sorted'].map(cc_cat)


In [None]:
# Check that no column has missing values. Series.map yields NaN for labels that
# are absent from an encoding dictionary, so unmapped categories show up here too.
ccattr_data.isnull().sum()

In [None]:
# Correlation overview: pairwise plots of the selected columns, coloured by the
# attrition label. corner=True draws only the lower triangle of the grid.
list_cols = [
    'Attrition_Numeric',
    'Customer_Age',
    'Dependent_count',
    'Months_on_book',
    'Total_Relationship_Count',
    'Months_Inactive_12_mon',
    'Contacts_Count_12_mon',
    'Credit_Limit',
    'Total_Revolving_Bal',
    'Total_Trans_Amt',
    'Avg_Utilization_Ratio',
]
sns.set_palette(sns.color_palette("Set1", 8))
pairplot_frame = ccattr_data[list_cols]
sns.pairplot(pairplot_frame, hue="Attrition_Numeric", corner=True)
plt.show()

Looking at the plot, it is evident that classification models are a better fit for this data than a linear regressor.

## Pre-Processing

In [None]:
# Separate the target (y) from the feature matrix (X). Besides the target itself,
# the CLIENTNUM, naive_cls1 and naive_cls2 columns are excluded from the features.
y = ccattr_data['Attrition_Numeric']
X = ccattr_data.drop(
    columns=['Attrition_Numeric', 'CLIENTNUM', 'naive_cls1', 'naive_cls2'],
)

In [None]:
# Summarise column dtypes and non-null counts before the features are one-hot encoded.
# No dtype conversion happens in this cell; the earlier experiment that cast the
# categorical columns to 'category' was disabled and has been removed.
ccattr_data.info()

In [None]:
# One-hot encode the remaining non-numeric columns of X. drop_first=True drops the
# first level of every encoded column, so k categories become k-1 dummy columns.
X=pd.get_dummies(X,drop_first=True)
X.head()

In [None]:
X.describe()

In [None]:
X.dtypes

All features are now numeric and ready for scaling.

## Train Test Split <a id="Train_Test_Split"></a>

In [None]:
# Hold out 30% of the rows as the test set. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=1,
    stratify=y,
)
X_train.shape, X_test.shape

In [None]:
y_train.shape, y_test.shape

In [None]:
# Scale copies of the splits so the unscaled X_train / X_test remain available.
X_trained_df = X_train.copy()
X_test_df = X_test.copy()

In [None]:
X_trained_df.head()

## Applying the Scaling Methods Found Best in EDA

In [None]:
#zScore scaling for Customer_Age,Months_on_book
#zScore scaling function
def calczScore(ccdf, col_name, col_mean=None, col_std=None):
    """Return the z-score standardised values of one DataFrame column.

    Parameters
    ----------
    ccdf : pandas.DataFrame
        Frame that holds the column. It is not modified.
    col_name : str
        Name of the column to standardise.
    col_mean, col_std : float, optional
        Statistics to standardise with. When omitted, they are computed from
        ``ccdf[col_name]`` itself (population standard deviation, ``ddof=0``).
        Pass the training-set values to transform another split consistently.

    Returns
    -------
    pandas.Series
        ``(ccdf[col_name] - col_mean) / col_std``, indexed like ``ccdf``.
    """
    column = ccdf[col_name]
    if col_mean is None:
        col_mean = column.mean()
    if col_std is None:
        col_std = column.std(ddof=0)
    # The arithmetic builds a new Series, so the caller's frame stays untouched
    # without copying the whole DataFrame first.
    return (column - col_mean) / col_std

In [None]:
# Standardise Customer_Age and Months_on_book in the training copy, using the
# mean and standard deviation of the training data itself.
zScoreCol = {"Customer_Age", "Months_on_book"}
for z_col in zScoreCol:
    X_trained_df[z_col] = calczScore(X_trained_df, z_col)

In [None]:
X_trained_df

In [None]:
# Apply z-score scaling to the test copy using the TRAINING-set mean and standard
# deviation. Standardising the test set with its own statistics would put train and
# test on different scales and let test-set information shape the preprocessing.
# The statistics come from the unscaled X_train / X_test, so re-running the cell
# gives the same result.
zScoreCol = {"Customer_Age", "Months_on_book"}
for col in zScoreCol:
    train_mean = X_train[col].mean()
    train_std = X_train[col].std(ddof=0)
    X_test_df[col] = (X_test[col] - train_mean) / train_std

In [None]:
X_test_df

In [None]:
# Scale Total_Revolving_Bal in the training copy by dividing it by its own median.
revolving_bal = X_trained_df['Total_Revolving_Bal']
col_median = revolving_bal.median()
X_trained_df['Total_Revolving_Bal'] = revolving_bal.div(col_median)
X_trained_df

In [None]:
# Scale Total_Revolving_Bal in the test copy by the TRAINING-set median, so the test
# data receives exactly the same transformation as the training data.
# The median is taken from the unscaled X_train (X_trained_df has already been scaled).
col_median = X_train['Total_Revolving_Bal'].median()
X_test_df['Total_Revolving_Bal'] = X_test['Total_Revolving_Bal'] / col_median
X_test_df

In [None]:
# Natural-log transform of Credit_Limit and Total_Trans_Amt in the training copy.
# NOTE(review): assumes both columns are strictly positive - np.log gives -inf/NaN otherwise.
log_col = ["Credit_Limit", "Total_Trans_Amt"]
X_trained_df[log_col] = np.log(X_trained_df[log_col])
X_trained_df

In [None]:
# Natural-log transform of Credit_Limit and Total_Trans_Amt in the test copy.
# NOTE(review): assumes both columns are strictly positive - np.log gives -inf/NaN otherwise.
log_col = ["Credit_Limit", "Total_Trans_Amt"]
X_test_df[log_col] = np.log(X_test_df[log_col])
X_test_df

In [None]:
X_test_df.info()

In [None]:
# Write the scaled feature frames and the target series to CSV, without the index.
datapath = '../data'

step4_outputs = {
    '4.X_train.csv': X_trained_df,
    '4.y_train.csv': y_train,
    '4.X_test.csv': X_test_df,
    '4.y_test.csv': y_test,
}
for file_name, data in step4_outputs.items():
    datapath_step4 = os.path.join(datapath, file_name)
    data.to_csv(datapath_step4, index=False)