### Importing Relevant Libraries and Data

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [13]:
train_data = pd.read_csv('credit_card_default_train.csv')
test_data = pd.read_csv('credit_card_default_test.csv')

In [14]:
cat_cols = ['Gender','EDUCATION_STATUS','MARITAL_STATUS','AGE','PAY_JULY','PAY_AUG','PAY_SEP','PAY_OCT','PAY_NOV','PAY_DEC']
target = 'NEXT_MONTH_DEFAULT'
ID = 'Client_ID'
num_cols = [col for col in train_data.columns.tolist() if col not in cat_cols +[target]+[ID]]

### Feature Engineering

In [15]:
def Paid_Due_July(row):
    if row['PAID_AMT_JULY'] == 0:
        val = row['DUE_AMT_JULY']
    else:
        val = row['DUE_AMT_JULY']/row['PAID_AMT_JULY']
    return val

def Paid_Due_Aug(row):
    if row['PAID_AMT_AUG'] == 0:
        val = row['DUE_AMT_AUG']
    else:
        val = row['DUE_AMT_AUG']/row['PAID_AMT_AUG']
    return val

def Paid_Due_Sep(row):
    if row['PAID_AMT_SEP'] == 0:
        val = row['DUE_AMT_SEP']
    else:
        val = row['DUE_AMT_SEP']/row['PAID_AMT_SEP']
    return val

def Paid_Due_Oct(row):
    if row['PAID_AMT_OCT'] == 0:
        val = row['DUE_AMT_OCT']
    else:
        val = row['DUE_AMT_OCT']/row['PAID_AMT_OCT']
    return val

def Paid_Due_Nov(row):
    if row['PAID_AMT_NOV'] == 0:
        val = row['DUE_AMT_NOV']
    else:
        val = row['DUE_AMT_NOV']/row['PAID_AMT_NOV']
    return val

def Paid_Due_Dec(row):
    if row['PAID_AMT_DEC'] == 0:
        val = row['DUE_AMT_DEC']
    else:
        val = row['DUE_AMT_DEC']/row['PAID_AMT_DEC']
    return val

In [16]:
train_data['PAID_DUE_JULY'] = train_data.apply(Paid_Due_July, axis=1)
train_data['PAID_DUE_AUG'] = train_data.apply(Paid_Due_Aug, axis=1)
train_data['PAID_DUE_SEP'] = train_data.apply(Paid_Due_Sep, axis=1)
train_data['PAID_DUE_OCT'] = train_data.apply(Paid_Due_Oct, axis=1)
train_data['PAID_DUE_NOV'] = train_data.apply(Paid_Due_Nov, axis=1)
train_data['PAID_DUE_DEC'] = train_data.apply(Paid_Due_Dec, axis=1)

test_data['PAID_DUE_JULY'] = test_data.apply(Paid_Due_July, axis=1)
test_data['PAID_DUE_AUG'] = test_data.apply(Paid_Due_Aug, axis=1)
test_data['PAID_DUE_SEP'] = test_data.apply(Paid_Due_Sep, axis=1)
test_data['PAID_DUE_OCT'] = test_data.apply(Paid_Due_Oct, axis=1)
test_data['PAID_DUE_NOV'] = test_data.apply(Paid_Due_Nov, axis=1)
test_data['PAID_DUE_DEC'] = test_data.apply(Paid_Due_Dec, axis=1)

In [17]:
train_data['PAY_TOT'] = train_data['PAY_JULY'] + train_data['PAY_AUG'] + train_data['PAY_SEP'] + train_data['PAY_OCT'] + train_data['PAY_NOV'] + train_data['PAY_DEC']
test_data['PAY_TOT'] = test_data['PAY_JULY'] + test_data['PAY_AUG'] + test_data['PAY_SEP'] + test_data['PAY_OCT'] + test_data['PAY_NOV'] + test_data['PAY_DEC']

In [18]:
def isZero(row):
    if row['PAY_TOT'] == 0:
        val = 1
    else:
        val = 0
    return val

In [19]:
train_data['PAY_TOT_0'] = train_data.apply(isZero, axis=1)
test_data['PAY_TOT_0'] = test_data.apply(isZero, axis=1)

In [20]:
LabelEncoder = LabelEncoder()
train_data['Balance_Limit_V1_cat'] = LabelEncoder.fit_transform(train_data['Balance_Limit_V1'])
train_data['Gender_cat'] = LabelEncoder.fit_transform(train_data['Gender'])
train_data['EDUCATION_STATUS_cat'] = LabelEncoder.fit_transform(train_data['EDUCATION_STATUS'])
train_data['MARITAL_STATUS_cat'] = LabelEncoder.fit_transform(train_data['MARITAL_STATUS'])
train_data['AGE_cat'] = LabelEncoder.fit_transform(train_data['AGE'])

test_data['Balance_Limit_V1_cat'] = LabelEncoder.fit_transform(test_data['Balance_Limit_V1'])
test_data['Gender_cat'] = LabelEncoder.fit_transform(test_data['Gender'])
test_data['EDUCATION_STATUS_cat'] = LabelEncoder.fit_transform(test_data['EDUCATION_STATUS'])
test_data['MARITAL_STATUS_cat'] = LabelEncoder.fit_transform(test_data['MARITAL_STATUS'])
test_data['AGE_cat'] = LabelEncoder.fit_transform(test_data['AGE'])

### Model Selection and Evaluation

In [21]:
X_train, X_test, y_train, y_test = train_test_split(train_data.drop([target,ID,'Balance_Limit_V1','Gender','EDUCATION_STATUS','MARITAL_STATUS','AGE'],axis=1), 
                                                    train_data[target], test_size=0.30)

In [22]:
X = train_data.drop([target,ID,'Balance_Limit_V1','Gender','EDUCATION_STATUS','MARITAL_STATUS','AGE'],axis=1)
y = train_data[target]
                    
random_grid = {'n_estimators': [100,200,400],
               'learning_rate': [0.1,0.01,0.03]}

grid = GridSearchCV(AdaBoostClassifier(),random_grid,refit=True,verbose=3,cv=3)
grid.fit(X,y)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] learning_rate=0.1, n_estimators=100 .............................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] . learning_rate=0.1, n_estimators=100, score=0.808, total=   4.0s
[CV] learning_rate=0.1, n_estimators=100 .............................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.9s remaining:    0.0s


[CV] . learning_rate=0.1, n_estimators=100, score=0.823, total=   4.0s
[CV] learning_rate=0.1, n_estimators=100 .............................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    7.9s remaining:    0.0s


[CV] . learning_rate=0.1, n_estimators=100, score=0.824, total=   3.9s
[CV] learning_rate=0.1, n_estimators=200 .............................
[CV] . learning_rate=0.1, n_estimators=200, score=0.808, total=   7.8s
[CV] learning_rate=0.1, n_estimators=200 .............................
[CV] . learning_rate=0.1, n_estimators=200, score=0.823, total=   8.0s
[CV] learning_rate=0.1, n_estimators=200 .............................
[CV] . learning_rate=0.1, n_estimators=200, score=0.824, total=   8.0s
[CV] learning_rate=0.1, n_estimators=400 .............................
[CV] . learning_rate=0.1, n_estimators=400, score=0.809, total=  15.7s
[CV] learning_rate=0.1, n_estimators=400 .............................
[CV] . learning_rate=0.1, n_estimators=400, score=0.822, total=  15.7s
[CV] learning_rate=0.1, n_estimators=400 .............................
[CV] . learning_rate=0.1, n_estimators=400, score=0.825, total=  15.6s
[CV] learning_rate=0.01, n_estimators=100 ............................
[CV]  

[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:  4.2min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=AdaBoostClassifier(algorithm='SAMME.R',
                                          base_estimator=None,
                                          learning_rate=1.0, n_estimators=50,
                                          random_state=None),
             iid='deprecated', n_jobs=None,
             param_grid={'learning_rate': [0.1, 0.01, 0.03],
                         'n_estimators': [100, 200, 400]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [23]:
print (grid.best_params_)
print (grid.best_score_)

{'learning_rate': 0.1, 'n_estimators': 400}
0.8186666666666667


In [27]:
model_ada = AdaBoostClassifier(n_estimators=400,learning_rate = 0.1)
model_ada.fit(X_train,y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.1,
                   n_estimators=400, random_state=None)

In [28]:
preds_ada = model_ada.predict(X_test)

print(classification_report(y_test,preds_ada))
print ('\n')
print(confusion_matrix(y_test,preds_ada))

              precision    recall  f1-score   support

           0       0.83      0.95      0.89      5553
           1       0.68      0.33      0.45      1647

    accuracy                           0.81      7200
   macro avg       0.76      0.64      0.67      7200
weighted avg       0.79      0.81      0.79      7200



[[5299  254]
 [1101  546]]


### Implementation of the model in test data

In [29]:
model_ada = AdaBoostClassifier(n_estimators=200,learning_rate = 0.01)
model_ada.fit(train_data.drop([target,ID,'Balance_Limit_V1','Gender','EDUCATION_STATUS','MARITAL_STATUS','AGE'],axis=1),train_data[target])

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.01,
                   n_estimators=200, random_state=None)

In [30]:
preds_ada = model_ada.predict(test_data.drop([ID,'Balance_Limit_V1','Gender','EDUCATION_STATUS','MARITAL_STATUS','AGE'],axis=1))

In [31]:
sample_submission = pd.DataFrame(columns=[ID,target])
sample_submission[ID]=test_data[ID]
sample_submission[target] = preds_ada

In [32]:
submission = sample_submission.to_csv('data-storm-day1-2 check.csv',index = None)