In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesRegressor
import xgboost as xgb

In [2]:
data = pd.read_csv('dataset.csv', index_col=0)

In [3]:
data.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,200000,2,3,1,30,2,2,2,2,2,...,147273,149244,151973,6600,6000,5860,6000,5000,0,0
1,200000,2,4,2,27,-2,-2,-2,-2,-2,...,0,0,0,0,0,0,0,0,0,0
2,20000,2,2,2,28,0,0,2,0,0,...,8168,6894,11424,3353,3,5101,6,6530,8103,1
3,50000,1,2,2,23,0,0,0,-1,0,...,48437,18712,19129,4175,41000,51705,700,718,700,0
4,20000,1,2,1,47,-1,-1,-1,-2,-2,...,0,0,0,780,0,0,0,0,0,0


In [4]:
data.columns

Index(['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'default payment next month'],
      dtype='object')

In [None]:
data['MARRIAGE'].value_counts()

In [None]:
data[data['EDUCATION'] == 0].LIMIT_BAL.value_counts()

In [None]:
data['MARRIAGE'].value_counts()

In [None]:
sns.pairplot(data)

In [None]:
data['log_bal'] = np.log(data['LIMIT_BAL'])
data['sqrt_bal'] = np.sqrt(data['LIMIT_BAL'])

In [None]:
data.EDUCATION.unique()

In [None]:
data.plot.scatter('LIMIT_BAL', 'SEX')

In [None]:
data.plot.hist('log_bal')

In [None]:
data.plot.hist('sqrt_bal')

In [None]:
data.LIMIT_BAL.describe()

In [None]:
data.plot.bar('SEX', 'LIMIT_BAL')

# functions for dataset that need to be done on validation set as well

In [5]:
data['EDUCATION'].replace([0,5,6],4, inplace=True)
data['MARRIAGE'].replace(0,3,inplace=True)

In [None]:
# data['PAY_0'].replace(0,-3,inplace=True)
# data['PAY_2'].replace(0,-3,inplace=True)
# data['PAY_3'].replace(0,-3,inplace=True)
# data['PAY_4'].replace(0,-3,inplace=True)
# data['PAY_5'].replace(0,-3,inplace=True)
# data['PAY_6'].replace(0,-3,inplace=True)

In [None]:
# data['PAY_0'].replace(-1,0,inplace=True)
# data['PAY_2'].replace(-1,0,inplace=True)
# data['PAY_3'].replace(-1,0,inplace=True)
# data['PAY_4'].replace(-1,0,inplace=True)
# data['PAY_5'].replace(-1,0,inplace=True)
# data['PAY_6'].replace(-1,0,inplace=True)

In [None]:
# # data[data['PAY_6'] != -2] or data[data['PAY_6'] != -3]:
# for x in data['PAY_6']:
#     if x != -3 or x!= -2:
#         data['PAY_5'] = np.where((data['PAY_AMT6']+data['PAY_AMT5']) >= data['BILL_AMT6'], 0, data['PAY_6']+1)
#         data['PAY_4'] = np.where(data['PAY_AMT4'] >= data['BILL_AMT5'], 0, data['PAY_5']+1)
#         data['PAY_3'] = np.where(data['PAY_AMT3'] >= data['BILL_AMT4'], 0, data['PAY_4']+1)
#         data['PAY_2'] = np.where(data['PAY_AMT2'] >= data['BILL_AMT3'], 0, data['PAY_3']+1)
#         data['PAY_0'] = np.where(data['PAY_AMT1'] >= data['BILL_AMT2'], 0, data['PAY_2']+1)

In [6]:
data['payment_over_6'] = sum([x for x in [data['PAY_AMT1'], data['PAY_AMT2'], data['PAY_AMT3'], data['PAY_AMT4'],
                                      data['PAY_AMT5'], data['PAY_AMT6']]])

In [None]:
data['payment_over_6'].head()

In [7]:
data['credit_spent'] = [x for x in (data['BILL_AMT1'] + data['payment_over_6']) - data['BILL_AMT6']]

In [None]:
data['credit_spent'].head()

In [None]:
# if the person spent more than they paid, there's a higher chance that they default

In [8]:
data[data['credit_spent'] > data['payment_over_6']]['default payment next month'].value_counts()

0    10245
1     2308
Name: default payment next month, dtype: int64

# creating features n target and train and test sets

In [9]:
target = data['default payment next month']
features = data.drop(columns = 'default payment next month')

In [37]:
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=23)

# handling class imbalances

#### SMOTE

In [32]:
from imblearn.over_sampling import SMOTE



In [34]:
sm = SMOTE(sampling_strategy='minority', random_state=23)
X_train, y_train = sm.fit_sample(X_train, y_train)



In [35]:
smote_tree = DecisionTreeClassifier(random_state=23, class_weight='balanced', max_depth=2, max_features=11)
smote_tree.fit(X_train, y_train)
smote_tree_pred = smote_tree.predict(X_test)
print('Test F1 score: ', f1_score(y_test, smote_tree_pred))
# balanced depth2 features11 gini

Test F1 score:  0.5218039508013418


In [36]:
smote_forest = RandomForestClassifier(class_weight='balanced', max_features=8, max_depth=7, n_estimators=10, warm_start=True)
smote_forest.fit(X_train, y_train)
smote_forest_pred = smote_forest.predict(X_test)
print('Test F1 score: ', f1_score(y_test, smote_forest_pred))
# balanced gini feature8 depth7 estimators10 wstart TRUE

  warn('class_weight presets "balanced" or '


Test F1 score:  0.5075593952483801


#### Tomek Links

In [None]:
from collections import Counter
from imblearn.under_sampling import TomekLinks # doctest: +NORMALIZE_WHITESPACE

In [None]:
tl = TomekLinks()
X_res, y_res = tl.fit_resample(X_train, y_train)
print('Resampled dataset shape %s' % Counter(y_res))

In [None]:
tomek_lr = LogisticRegression(solver='liblinear')

tomek_lr.fit(X_resampled, y_resampled)

tomek_pred = tomek_lr.predict(X_test)

# checking accuracy
print('Test Accuracy score: ', accuracy_score(y_test, tomek_pred))


# checking accuracy
print('Test F1 score: ', f1_score(y_test, tomek_pred))

# instantiating knn

In [12]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)
print('F1:' + str(f1_score(y_test, knn_pred)))

F1:0.2793682132280355


# instantiating decision trees

In [13]:
dtc = DecisionTreeClassifier(random_state=23)
dtc.fit(X_train, y_train)
dtc_pred = dtc.predict(X_test)
print('F1:' + str(f1_score(y_test, dtc_pred)))

F1:0.42353852109949675


# instantiating gridsearch

#### knn grid

In [14]:
knn_grid = {'n_neighbors': list(range(23,34,2)),
            'weights': ['uniform', 'distance'],
            'metric': ['minkowski', 'euclidean', 'manhattan']}
# manhattan distance neighbors31

In [15]:
grid_knn = GridSearchCV(KNeighborsClassifier(), knn_grid, n_jobs=-1, verbose=1, cv=5)
grid_knn.fit(X_train, y_train)
# grid_knn_pred = grid_knn.predict(X_test)
# print('Accuracy:' + str(metrics.f1_score(y_test, grid_knn_pred)))

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:   28.6s finished


GridSearchCV(cv=5, estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'metric': ['minkowski', 'euclidean', 'manhattan'],
                         'n_neighbors': [23, 25, 27, 29, 31, 33],
                         'weights': ['uniform', 'distance']},
             verbose=1)

In [16]:
grid_knn.best_params_

{'metric': 'manhattan', 'n_neighbors': 31, 'weights': 'distance'}

#### tree grid

In [17]:
tree_grid = {'criterion': ['gini', 'entropy'],
             'max_depth': list(range(1,11)),
             'max_features': list(range(3,16)),
             'class_weight': ['None', 'balanced']}
# balanced depth2 features11 gini 

In [18]:
grid_tree = GridSearchCV(DecisionTreeClassifier(random_state=23), tree_grid, n_jobs=-1, verbose=1, cv=5)
grid_tree.fit(X_train, y_train)

Fitting 5 folds for each of 520 candidates, totalling 2600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 2593 out of 2600 | elapsed:   25.8s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 2600 out of 2600 | elapsed:   25.9s finished


GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=23), n_jobs=-1,
             param_grid={'class_weight': ['None', 'balanced'],
                         'criterion': ['gini', 'entropy'],
                         'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'max_features': [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                                          14, 15]},
             verbose=1)

In [19]:
grid_tree.best_params_

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 2,
 'max_features': 11}

In [20]:
grid_knn_pred = grid_knn.best_estimator_.predict(X_test)
f1gkp = f1_score(y_test, grid_knn_pred)

grid_tree_pred = grid_tree.best_estimator_.predict(X_test)
f1gtp = f1_score(y_test, grid_tree_pred)

# print('knn f1: ' (f1_score(y_test, grid_knn_pred)) 'vs.tree f1:' f1_score(y_test, grid_knn_pred)
print(f1gkp, f1gtp)

0.16589861751152074 0.5265866209262435


# voting classifier between tree and knn

In [21]:
treevote = DecisionTreeClassifier(criterion='entropy', max_depth=1, max_features=13, class_weight='balanced')

In [22]:
knnvote = KNeighborsClassifier(n_neighbors=33, weights='uniform', metric='manhattan')

In [23]:
voting = VotingClassifier(estimators=[('knneighbors', knnvote), ('tree', treevote)], voting='hard')

voting.fit(X_train, y_train)

vote_pred = voting.predict(X_test)

vote_f1 = f1_score(y_test, vote_pred)

In [24]:
vote_f1

0.08840579710144927

# instantiating random forest

In [27]:
forest_grid = {'n_estimators': [10],
                'max_depth': list(range(1,11)),
                'max_features': list(range(3,16)),
                'class_weight': ['balanced_subsample','balanced', None],
                'criterion': ['gini', 'entropy'],
                'warm_start': [True, False]}
# balanced gini feature8 depth7 estimators10 wstart TRUE

In [28]:
grid_forest = GridSearchCV(RandomForestClassifier(),forest_grid, cv=5, n_jobs=-1, scoring='f1', verbose=1)
grid_forest.fit(X_train, y_train)


Fitting 5 folds for each of 1560 candidates, totalling 7800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   13.8s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   32.1s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done 4992 tasks      | elapsed:  8.8min
[Parallel(n_jobs=-1)]: Done 6042 tasks      | elapsed: 10.4min
[Parallel(n_jobs=-1)]: Done 7192 tasks      | elapsed: 12.3min
[Parallel(n_jobs=-1)]: Done 7800 out of 7800 | elapsed: 14.1min finished
  warn('class_weight presets "balanced" or '


GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'class_weight': ['balanced_subsample', 'balanced',
                                          None],
                         'criterion': ['gini', 'entropy'],
                         'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'max_features': [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
                                          14, 15],
                         'n_estimators': [10], 'warm_start': [True, False]},
             scoring='f1', verbose=1)

In [29]:
grid_forest.best_params_

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 7,
 'max_features': 8,
 'n_estimators': 10,
 'warm_start': True}

In [30]:
grid_forest_pred = grid_forest.best_estimator_.predict(X_test)
grid_forest_f1 = f1_score(y_test, grid_forest_pred)

In [31]:
grid_forest_f1

0.5355417529330573

# final model

In [55]:
final = RandomForestClassifier(random_state=45, class_weight='balanced', max_depth=7, max_features=8, n_estimators=10, warm_start=10)
final.fit(X_train, y_train)




  warn('class_weight presets "balanced" or '


RandomForestClassifier(class_weight='balanced', max_depth=7, max_features=8,
                       n_estimators=10, random_state=45, warm_start=10)

In [56]:
final_pred = reg_deg.predict(X_test)
final_f1 = f1_score(y_test, reg_deg_pred)

In [57]:
final_f1

0.5477617201268946

# calling in validation

In [58]:
validation = pd.read_csv('validation_set.csv', index_col=0)

In [66]:
validation['payment_over_6'] = sum([x for x in [validation['PAY_AMT1'], validation['PAY_AMT2'], validation['PAY_AMT3'], validation['PAY_AMT4'],
                                                validation['PAY_AMT5'], validation['PAY_AMT6']]])

In [67]:
validation['credit_spent'] = [x for x in (validation['BILL_AMT1'] + validation['payment_over_6']) - validation['BILL_AMT6']]

In [73]:
pd.DataFrame(final.predict(validation)).to_csv('validation_prediction_MOSR.csv')