### NoteBook Contents ( DataPreprocessing Transformations & FeatureSelection):
        > 1. Separating numeric and categorical features, and recoding category values that are not in the dataset description 
        > 2. Splitting the Data into Training and Test Set
        > 3. Missing Values Imputation
        > 4. Data standardization
        > 5. Saving training and test data

#### Importing Essential Data handling libraries

In [99]:
import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

import time
from math import floor
from time import perf_counter
from tqdm import tqdm

# Display options: show up to 100 columns and render floats with three decimals.
pd.set_option('display.max_columns', 100)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# `display` and `HTML` come from the public IPython.display module; importing
# them from IPython.core.display is deprecated.
from IPython.display import display, HTML
# Inject CSS that stretches elements with class "container" to the full width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [100]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.preprocessing import QuantileTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler

#### Loading Data

In [101]:
## Load the dataset from the "0_Modeling_Data" folder, one directory up.
## Forward slashes keep the relative path valid on Windows, Linux and macOS
## (a backslash is only a path separator on Windows).
df_master = pd.read_csv("../0_Modeling_Data/data.csv")

In [102]:
# Work on a copy so the frame read from disk (df_master) is left untouched
# by the transformations applied to df_raw below.
df_raw = df_master.copy()

In [103]:
## "ID" is a unique row identifier, so it is removed from the modelling frame.
df_raw = df_raw.drop(columns=["ID"])

In [104]:
# (rows, columns) of the frame after removing "ID".
df_raw.shape 

(30000, 24)

In [105]:
# Preview the first rows of the frame.
df_raw.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,20000.0,2,2,1,24,2,2,-1,-1,-2,-2,3913.0,3102.0,689.0,0.0,0.0,0.0,0,689,0,0,0,0,1
1,120000.0,2,2,2,26,-1,2,0,0,0,2,2682.0,1725.0,2682.0,3272.0,3455.0,3261.0,0,1000,1000,1000,0,2000,1
2,90000.0,2,2,2,34,0,0,0,0,0,0,29239.0,14027.0,13559.0,14331.0,14948.0,15549.0,1518,1500,1000,1000,1000,5000,0
3,50000.0,2,2,1,37,0,0,0,0,0,0,46990.0,48233.0,49291.0,28314.0,28959.0,29547.0,2000,2019,1200,1100,1069,1000,0
4,50000.0,1,2,1,57,-1,0,-1,0,0,0,8617.0,5670.0,35835.0,20940.0,19146.0,19131.0,2000,36681,10000,9000,689,679,0


### 1. Separating numeric and categorical features:

In [106]:
# Name of the label column; it is excluded from the predictors and used as
# the target when the data is split.
target_variable = "default payment next month"

In [107]:
## Assumption: if the number of unique values in a feature is < 0.05 % of the
## total observations, the feature is treated as categorical.

categorical_threshold = df_raw.shape[0]*0.05/100
print(categorical_threshold)

# Count the distinct values of every predictor exactly once.
# DataFrame.nunique() gives the number of distinct values per column, which is
# what the assumption above describes. Taking np.unique() of value_counts()
# would instead count distinct *frequencies*, and under-counts whenever two
# values occur equally often.
unique_value_counts = df_raw.drop(target_variable, axis = 1).nunique()

categorical_variables = list(unique_value_counts.index[unique_value_counts < categorical_threshold])
print(categorical_variables)

15.0
['SEX', 'EDUCATION', 'MARRIAGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']


In [108]:
### Numeric predictors: every column that is neither the target nor categorical.
numeric_variables = df_raw.drop(columns=[target_variable, *categorical_variables]).columns.to_list()
print(numeric_variables)

['LIMIT_BAL', 'AGE', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']


In [109]:
# Number of categorical predictors, then number of numeric predictors.
print(len(categorical_variables), len(numeric_variables), sep="\n")

9
14


In [110]:
##### Frequency of every level in each categorical feature
for column_name in categorical_variables:
    # Feature name, its value counts, then blank lines as a separator.
    print(column_name, df_raw[column_name].value_counts(), "\n", sep="\n")

SEX
2    18112
1    11888
Name: SEX, dtype: int64


EDUCATION
2    14030
1    10585
3     4917
5      280
4      123
6       51
0       14
Name: EDUCATION, dtype: int64


MARRIAGE
2    15964
1    13659
3      323
0       54
Name: MARRIAGE, dtype: int64


PAY_0
 0    14737
-1     5686
 1     3688
-2     2759
 2     2667
 3      322
 4       76
 5       26
 8       19
 6       11
 7        9
Name: PAY_0, dtype: int64


PAY_2
 0    15730
-1     6050
 2     3927
-2     3782
 3      326
 4       99
 1       28
 5       25
 7       20
 6       12
 8        1
Name: PAY_2, dtype: int64


PAY_3
 0    15764
-1     5938
-2     4085
 2     3819
 3      240
 4       76
 7       27
 6       23
 5       21
 1        4
 8        3
Name: PAY_3, dtype: int64


PAY_4
 0    16455
-1     5687
-2     4348
 2     3159
 3      180
 4       69
 7       58
 5       35
 6        5
 1        2
 8        2
Name: PAY_4, dtype: int64


PAY_5
 0    16947
-1     5539
-2     4546
 2     2626
 3      178
 4       84
 7 

In [111]:
### Mapping unknown values to respective categories (as per data analysis)
# Each mapping lists every expected code explicitly. Series.map() turns any
# value that is missing from the dictionary into NaN.
education_mapping = {0: 6, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6}   # code 0 is folded into 6
marriage_mapping = {0: 3, 1: 1, 2: 2, 3: 3}                      # code 0 is folded into 3
# Codes -2, -1 and 0 are collapsed into 0; codes 1..9 are kept as they are.
repayment_status_mapping = {-2: 0, -1: 0, 0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9}

df_raw["EDUCATION"] = df_raw["EDUCATION"].map(education_mapping)
df_raw["MARRIAGE"] = df_raw["MARRIAGE"].map(marriage_mapping)

# All repayment-status columns share one mapping, so apply it in a loop
# instead of repeating the same dictionary for every column.
for pay_column in ["PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6"]:
    df_raw[pay_column] = df_raw[pay_column].map(repayment_status_mapping)

In [112]:
# Display the list of categorical predictors for reference.
categorical_variables

['SEX',
 'EDUCATION',
 'MARRIAGE',
 'PAY_0',
 'PAY_2',
 'PAY_3',
 'PAY_4',
 'PAY_5',
 'PAY_6']

In [113]:
# Display the list of numeric predictors for reference.
numeric_variables

['LIMIT_BAL',
 'AGE',
 'BILL_AMT1',
 'BILL_AMT2',
 'BILL_AMT3',
 'BILL_AMT4',
 'BILL_AMT5',
 'BILL_AMT6',
 'PAY_AMT1',
 'PAY_AMT2',
 'PAY_AMT3',
 'PAY_AMT4',
 'PAY_AMT5',
 'PAY_AMT6']

### 2. Splitting the Data into Training and Test Set

In [114]:
# Hold out 20 % of the rows as the test set. The rows are shuffled before the
# split, and the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_raw.drop(columns=target_variable),
    df_raw[target_variable],
    test_size=0.20,
    random_state=8,
    shuffle=True,
)

In [115]:
# Replace the shuffled row labels with a fresh 0..n-1 index, discarding the old one.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

In [116]:
# Shape of the training features, then of the test features.
print(X_train.shape, X_test.shape, sep="\n")

(24000, 23)
(6000, 23)


In [117]:
# Relative frequency of each target class in the training labels.
y_train.value_counts(normalize=True)

0   0.779
1   0.221
Name: default payment next month, dtype: float64

In [118]:
# Relative frequency of each target class in the test labels.
y_test.value_counts(normalize=True)

0   0.779
1   0.221
Name: default payment next month, dtype: float64

In [119]:
# Convert the label Series to plain NumPy arrays. np.asarray() returns the
# same array as `.values` for a Series and is a no-op for an ndarray, so the
# cell can be re-run without raising AttributeError.
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

In [120]:
# Column dtypes of the training features.
X_train.dtypes

LIMIT_BAL    float64
SEX            int64
EDUCATION      int64
MARRIAGE       int64
AGE            int64
PAY_0          int64
PAY_2          int64
PAY_3          int64
PAY_4          int64
PAY_5          int64
PAY_6          int64
BILL_AMT1    float64
BILL_AMT2    float64
BILL_AMT3    float64
BILL_AMT4    float64
BILL_AMT5    float64
BILL_AMT6    float64
PAY_AMT1       int64
PAY_AMT2       int64
PAY_AMT3       int64
PAY_AMT4       int64
PAY_AMT5       int64
PAY_AMT6       int64
dtype: object

##### Missing value imputation process
    > All features are numeric, so missing values can be imputed with a numeric imputer (here: KNN).

### 3. Missing Values Imputation

In [121]:
# KNN imputer: a missing entry is filled from the 11 nearest neighbours,
# all weighted equally ("uniform").
missing_value_imputer = KNNImputer(n_neighbors = 11, weights="uniform")

##### Fit the imputer on the training data

In [122]:
# Fit on the training features only; the fitted imputer is then applied to
# both the training and the test features.
missing_value_imputer.fit(X_train)

KNNImputer(n_neighbors=11)

##### Impute missing values in train and test data accordingly

In [123]:
t0 = perf_counter()

# Apply the fitted imputer to both splits. transform() returns a bare array,
# so each result is wrapped in a DataFrame with the original column names.
X_train, X_test = (
    pd.DataFrame(missing_value_imputer.transform(split), columns=split.columns)
    for split in (X_train, X_test)
)

t1 = perf_counter()
print("Time Taken in minutes = ", np.round((t1 - t0) / 60, 2))

Time Taken in minutes =  0.01


In [124]:
# Total number of missing entries left in the training features (all columns).
X_train.isnull().sum().sum()

0

In [125]:
# Total number of missing entries left in the test features (all columns).
X_test.isnull().sum().sum()

0

### 4. Data standardization

In [126]:
# Scaler with default settings: centres each feature and scales it to unit variance.
standard_scaler = StandardScaler()

In [127]:
# Fit the scaler on the training data only; the same fitted scaler is later
# used to transform both the training and the test features.
# NOTE(review): every column is scaled, including the ones listed in
# categorical_variables - confirm this is intended.
standard_scaler.fit(X_train)

StandardScaler()

In [128]:
# Display the imputed (not yet scaled) training features.
X_train

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,130000.000,2.000,2.000,2.000,28.000,0.000,0.000,0.000,0.000,0.000,0.000,390.000,7412.000,390.000,4134.000,540.000,390.000,7412.000,390.000,4134.000,540.000,390.000,2596.000
1,350000.000,1.000,1.000,2.000,52.000,0.000,0.000,0.000,0.000,0.000,0.000,713.000,2272.000,722.000,867.000,1150.000,5263.000,2272.000,722.000,867.000,1150.000,5263.000,5011.000
2,260000.000,1.000,2.000,2.000,32.000,1.000,0.000,0.000,0.000,0.000,0.000,3581.000,-9.000,-9.000,1935.000,999.000,0.000,0.000,0.000,1944.000,999.000,0.000,0.000
3,280000.000,2.000,2.000,2.000,30.000,0.000,0.000,0.000,0.000,0.000,0.000,61282.000,53394.000,17644.000,0.000,5468.000,0.000,7900.000,4157.000,0.000,5468.000,0.000,330.000
4,80000.000,2.000,2.000,1.000,25.000,0.000,0.000,0.000,0.000,0.000,0.000,76117.000,76660.000,46137.000,48218.000,48286.000,48207.000,2300.000,2200.000,3207.000,2000.000,2000.000,2000.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23995,110000.000,2.000,3.000,2.000,31.000,0.000,0.000,0.000,0.000,0.000,0.000,2705.000,2487.000,2500.000,2500.000,0.000,0.000,2487.000,2700.000,2500.000,0.000,0.000,0.000
23996,360000.000,1.000,1.000,1.000,32.000,0.000,0.000,0.000,0.000,0.000,0.000,2500.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
23997,110000.000,2.000,2.000,1.000,24.000,1.000,1.000,0.000,0.000,0.000,0.000,9787.000,-1256.000,65064.000,66102.000,54275.000,51791.000,5.000,68500.000,3114.000,4016.000,2000.000,2000.000
23998,360000.000,2.000,2.000,2.000,30.000,0.000,0.000,0.000,0.000,0.000,0.000,7410.000,6783.000,7672.000,8380.000,4581.000,3853.000,1115.000,1161.000,1026.000,1523.000,3872.000,2626.000


In [129]:
## Scale both splits with the fitted scaler and restore the column names,
## since transform() returns a bare array.
transformed_std_scaled_train_df = pd.DataFrame(
    standard_scaler.transform(X_train), columns=list(X_train.columns)
)
transformed_std_scaled_test_df = pd.DataFrame(
    standard_scaler.transform(X_test), columns=list(X_test.columns)
)


### 5. Saving training and test data after imputation and standardization

In [130]:
# Copy the scaled feature frames so they stay unchanged, then append the
# label arrays as the target column.
X_train_df, X_test_df = transformed_std_scaled_train_df.copy(), transformed_std_scaled_test_df.copy()
X_train_df[target_variable], X_test_df[target_variable] = y_train, y_test

In [131]:
# Write the standardized training and test sets (features + target) as CSV
# files without the row index. Forward slashes keep the relative paths valid
# on Windows, Linux and macOS; with backslashes, non-Windows systems would
# treat the whole string as a single file name.
X_train_df.to_csv("../1_Modeling_Data_After_Transformations_TestTrainSplits/Training_data_std.csv", index  = False)
X_test_df.to_csv("../1_Modeling_Data_After_Transformations_TestTrainSplits/Testing_data_std.csv", index  = False)