### Importing Relevant Libraries and Data

In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
train_data = pd.read_csv('credit_card_default_train.csv')
test_data = pd.read_csv('credit_card_default_test.csv')

In [3]:
cat_cols = ['Gender','EDUCATION_STATUS','MARITAL_STATUS','AGE','PAY_JULY','PAY_AUG','PAY_SEP','PAY_OCT','PAY_NOV','PAY_DEC']
target = 'NEXT_MONTH_DEFAULT'
ID = 'Client_ID'
num_cols = [col for col in train_data.columns.tolist() if col not in cat_cols +[target]+[ID]]

### Feature Engineering

In [4]:
train_data['PAY_TOT'] = train_data['PAY_JULY'] + train_data['PAY_AUG'] + train_data['PAY_SEP'] + train_data['PAY_OCT'] + train_data['PAY_NOV'] + train_data['PAY_DEC']
test_data['PAY_TOT'] = test_data['PAY_JULY'] + test_data['PAY_AUG'] + test_data['PAY_SEP'] + test_data['PAY_OCT'] + test_data['PAY_NOV'] + test_data['PAY_DEC']

In [5]:
def isZero(row):
    if row['PAY_TOT'] == 0:
        val = 1
    else:
        val = 0
    return val

In [6]:
train_data['PAY_TOT_0'] = train_data.apply(isZero, axis=1)
test_data['PAY_TOT_0'] = test_data.apply(isZero, axis=1)

In [7]:
LabelEncoder = LabelEncoder()
train_data['Balance_Limit_V1_cat'] = LabelEncoder.fit_transform(train_data['Balance_Limit_V1'])
train_data['Gender_cat'] = LabelEncoder.fit_transform(train_data['Gender'])
train_data['EDUCATION_STATUS_cat'] = LabelEncoder.fit_transform(train_data['EDUCATION_STATUS'])
train_data['MARITAL_STATUS_cat'] = LabelEncoder.fit_transform(train_data['MARITAL_STATUS'])
train_data['AGE_cat'] = LabelEncoder.fit_transform(train_data['AGE'])

test_data['Balance_Limit_V1_cat'] = LabelEncoder.fit_transform(test_data['Balance_Limit_V1'])
test_data['Gender_cat'] = LabelEncoder.fit_transform(test_data['Gender'])
test_data['EDUCATION_STATUS_cat'] = LabelEncoder.fit_transform(test_data['EDUCATION_STATUS'])
test_data['MARITAL_STATUS_cat'] = LabelEncoder.fit_transform(test_data['MARITAL_STATUS'])
test_data['AGE_cat'] = LabelEncoder.fit_transform(test_data['AGE'])

### Model Selection and Evaluation

In [12]:
X_train, X_test, y_train, y_test = train_test_split(train_data.drop([target,ID,'Balance_Limit_V1','Gender','EDUCATION_STATUS','MARITAL_STATUS','AGE'],axis=1), 
                                                    train_data[target], test_size=0.30)

In [17]:
X = train_data.drop([target,ID,'Balance_Limit_V1','Gender','EDUCATION_STATUS','MARITAL_STATUS','AGE'],axis=1)
y = train_data[target]
                    
random_grid = {'n_estimators': [200,400],
               'max_features': ['log2','auto'],
               'criterion': ['gini','entropy']}

grid = GridSearchCV(RandomForestClassifier(),random_grid,refit=True,verbose=3,cv=3)
grid.fit(X,y)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] criterion=gini, max_features=log2, n_estimators=200 .............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  criterion=gini, max_features=log2, n_estimators=200, score=0.805, total=   7.4s
[CV] criterion=gini, max_features=log2, n_estimators=200 .............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.3s remaining:    0.0s


[CV]  criterion=gini, max_features=log2, n_estimators=200, score=0.815, total=   7.3s
[CV] criterion=gini, max_features=log2, n_estimators=200 .............


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   14.7s remaining:    0.0s


[CV]  criterion=gini, max_features=log2, n_estimators=200, score=0.820, total=   7.4s
[CV] criterion=gini, max_features=log2, n_estimators=400 .............
[CV]  criterion=gini, max_features=log2, n_estimators=400, score=0.803, total=  14.5s
[CV] criterion=gini, max_features=log2, n_estimators=400 .............
[CV]  criterion=gini, max_features=log2, n_estimators=400, score=0.818, total=  14.5s
[CV] criterion=gini, max_features=log2, n_estimators=400 .............
[CV]  criterion=gini, max_features=log2, n_estimators=400, score=0.819, total=  14.5s
[CV] criterion=gini, max_features=auto, n_estimators=200 .............
[CV]  criterion=gini, max_features=auto, n_estimators=200, score=0.805, total=   8.8s
[CV] criterion=gini, max_features=auto, n_estimators=200 .............
[CV]  criterion=gini, max_features=auto, n_estimators=200, score=0.817, total=   8.9s
[CV] criterion=gini, max_features=auto, n_estimators=200 .............
[CV]  criterion=gini, max_features=auto, n_estimators=200,

[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed:  5.8min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [18]:
grid.best_params_

{'criterion': 'entropy', 'max_features': 'auto', 'n_estimators': 400}

In [19]:
model_rf = RandomForestClassifier(n_estimators=400,criterion='entropy')
model_rf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=400,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [20]:
preds_rf = model_rf.predict(X_test)

print(classification_report(y_test,preds_rf))
print ('\n')
print(confusion_matrix(y_test,preds_rf))

              precision    recall  f1-score   support

           0       0.84      0.94      0.89      5589
           1       0.64      0.37      0.47      1611

    accuracy                           0.81      7200
   macro avg       0.74      0.65      0.68      7200
weighted avg       0.79      0.81      0.79      7200



[[5258  331]
 [1022  589]]


### Implementation of the model in test data

In [21]:
model_rf = RandomForestClassifier(n_estimators=400,criterion='gini')
model_rf.fit(train_data.drop([target,ID,'Balance_Limit_V1','Gender','EDUCATION_STATUS','MARITAL_STATUS','AGE'],axis=1),train_data[target])

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=400,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [22]:
preds_rf = model_rf.predict(test_data.drop([ID,'Balance_Limit_V1','Gender','EDUCATION_STATUS','MARITAL_STATUS','AGE'],axis=1))

In [23]:
sample_submission = pd.DataFrame(columns=[ID,target])
sample_submission[ID]=test_data[ID]
sample_submission[target] = preds_rf

In [25]:
submission = sample_submission.to_csv('data-storm-day1-1 check2.csv',index = None)