In [1]:
#Import libraries
import os
import pandas as pd
import numpy as np
import sys
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer


%matplotlib inline

In [2]:
#Read the processed training set from disk into a DataFrame
train_csv = '~/Documents/GitHub/springboard/Capstone project 2/processed data/df_train.csv'
df_train = pd.read_csv(train_csv)

In [3]:
#Feature-name prefixes and the variable group each one denotes:
#D_* = Delinquency variables
#S_* = Spend variables
#P_* = Payment variables
#B_* = Balance variables
#R_* = Risk variables

In [4]:
#Print a summary of the frame: index range, column count, dtype counts and memory usage
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Columns: 177 entries, S_2 to target
dtypes: float64(172), int64(2), object(3)
memory usage: 108.0+ MB


In [5]:
#List the columns stored with the 'object' dtype (the categorical/string candidates)
df_train.select_dtypes(include='object').columns

Index(['S_2', 'D_63', 'D_64'], dtype='object')

In [6]:
#Show the three object-typed columns side by side
df_train.loc[:, ['S_2', 'D_63', 'D_64']]

Unnamed: 0,S_2,D_63,D_64
0,2018-02-14,CO,O
1,2017-08-26,CO,R
2,2017-08-17,CR,O
3,2017-05-04,CR,-1
4,2017-04-17,CO,O
...,...,...,...
79995,2017-11-08,CO,U
79996,2017-09-19,CO,O
79997,2017-08-06,CO,R
79998,2017-07-27,CR,R


In [7]:
#Frequency of each category in 'D_63'
df_train.loc[:, 'D_63'].value_counts()


CO    59058
CR    13858
CL     6452
XZ      381
XM      127
XL      124
Name: D_63, dtype: int64

In [8]:
#Frequency of each category in 'D_64'
df_train.loc[:, 'D_64'].value_counts()

O     42680
U     21880
R     11595
0      3298
-1      547
Name: D_64, dtype: int64

In [9]:
#Remove the 'S_2' column, then expand 'D_63' and 'D_64' into one-hot indicator columns
df_train = pd.get_dummies(
    df_train.drop(columns=['S_2']),
    columns=['D_63', 'D_64'],
)

We now standardize the numeric features with a scaler, leaving the one-hot (dummy) columns unchanged.

In [10]:
#get all the numeric columns which are not dummy
#The dummy columns are recognised by the 'D_63_' / 'D_64_' prefixes that
#pd.get_dummies gives them, instead of a hard-coded list of category names:
#a hard-coded list makes list.remove raise ValueError when a category is absent
#from the data and silently lets the dummy of any new category be scaled.
#Column order is preserved. Note that 'target' is still part of this list.
dummy_prefixes = ('D_63_', 'D_64_')
columns_name = [col for col in df_train.columns.tolist()
                if not col.startswith(dummy_prefixes)]

In [11]:
#we apply the StandardScaler to the numeric columns which are not dummy
#The label must never be standardized, so 'target' is excluded from the scaled
#set even if it is present in columns_name.
scale_cols = [col for col in columns_name if col != 'target']
scale_set = set(scale_cols)
#Everything else (label and dummy columns) is passed through untouched.
passthrough_cols = [col for col in df_train.columns if col not in scale_set]

ct = ColumnTransformer([
        ('somename', StandardScaler(), scale_cols)
    ], remainder='passthrough')
scaled_values = ct.fit_transform(df_train)

#ColumnTransformer emits the transformed columns first and appends the
#passthrough columns on the right, so label the array in that order rather
#than assuming it matches df_train.columns.
output_columns = scale_cols + passthrough_cols
all_columns = df_train.columns
scaled_df = pd.DataFrame(scaled_values, columns=output_columns, index=df_train.index)
#Restore the original column order so scaled_df lines up with df_train.
scaled_df = scaled_df.loc[:, all_columns]

In [12]:
#split to train set and test set
#Features are taken from scaled_df so that the standardization is actually used;
#splitting df_train here would discard it and feed unscaled features downstream.
#The label is read from df_train so it is always the original, unscaled target.
#scaled_df keeps df_train's row order, and train_test_split pairs X and y by position.
y = df_train['target']
X = scaled_df.drop(columns=['target'])
#NOTE(review): the scaler was fitted on all rows before this split, so the test
#rows influenced the scaling statistics - consider fitting it on X_train only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)