In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU
from imblearn.ensemble import EasyEnsembleClassifier

from imblearn.over_sampling import SMOTE
from sklearn.utils import class_weight

In [4]:
# Load the labelled frame from its Feather file; it is encoded and split
# into train/test sets further down.
data = pd.read_feather('./output/full_df_labelled')
# Print row count, column count, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 390891 entries, 0 to 390890
Columns: 150 entries, B_37 to Default_Flag
dtypes: category(4), float32(145), int64(1)
memory usage: 220.7 MB


In [5]:
# Show the observed values and declared categories of every categorical column.
for cat_name, cat_series in data.select_dtypes('category').items():
    print(cat_name, cat_series.unique())

D_36 ['CO', 'CL', 'CR', 'XM', 'XZ', 'XL']
Categories (6, object): ['CL', 'CO', 'CR', 'XL', 'XM', 'XZ']
D_44 ['O', 'R', 'U']
Categories (3, object): ['O', 'R', 'U']
B_41 [1, 0]
Categories (2, int64): [0, 1]
D_130 [1, 0]
Categories (2, int64): [0, 1]


In [6]:
# String-valued categoricals to one-hot encode. The integer-coded
# categoricals (B_41, D_130) are not listed here.
cat_columns = ['D_36','D_44']

# drop_first=True drops one level per feature so the dummies are not redundant.
# dtype is pinned explicitly: the get_dummies default changed from uint8 to
# bool in pandas 2.0, and pinning keeps the encoded dtypes the same on every
# pandas version.
data_encoded = pd.get_dummies(
    data,
    columns=cat_columns,
    drop_first=True,
    dtype='uint8',
)

In [7]:
# Cast every column still stored with the pandas 'category' dtype to uint8.
data_encoded = data_encoded.astype(
    dict.fromkeys(data_encoded.select_dtypes('category').columns, 'uint8')
)

In [8]:
# Re-check column count, dtypes and memory usage after encoding.
data_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 390891 entries, 0 to 390890
Columns: 155 entries, B_37 to D_44_U
dtypes: float32(145), int64(1), uint8(9)
memory usage: 222.6 MB


In [66]:
# data_encoded.head()

In [9]:
# Separate the target column from the predictors.
y = data_encoded['Default_Flag']
X = data_encoded.drop(columns=['Default_Flag'])

# Hold out 25% of the rows for evaluation; the fixed seed makes the
# shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

### Random Forest

In [22]:
# Random forest baseline with class_weight="balanced", fitted on the
# training split and evaluated on the held-out split.
rf = RandomForestClassifier(
    bootstrap=True,
    class_weight="balanced",
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)
rf_preds = rf.predict(X_test)

# Per-class precision/recall/F1 followed by the raw confusion matrix.
print(
    classification_report(y_test, rf_preds),
    confusion_matrix(y_test, rf_preds),
    sep='\n',
)

              precision    recall  f1-score   support

           0       0.85      0.98      0.91     72686
           1       0.71      0.53      0.61      7144
           2       0.43      0.37      0.39      9786
           3       0.33      0.01      0.01      8107

    accuracy                           0.81     97723
   macro avg       0.58      0.47      0.48     97723
weighted avg       0.76      0.81      0.76     97723

[[71291   224  1136    35]
 [ 1740  3808  1576    20]
 [ 5173   942  3605    66]
 [ 5531   364  2152    60]]


### Easy Ensemble Classifier

In [10]:
# Easy-ensemble classifier. sampling_strategy maps each class label to a
# sample count (30000 for class 0, 10000 for each of classes 1-3).
ee = EasyEnsembleClassifier(
    sampling_strategy={
        0: 30000,
        1: 10000,
        2: 10000,
        3: 10000,
    },
    random_state=42,
).fit(X_train, y_train)
ee_preds = ee.predict(X_test)

# Per-class precision/recall/F1 followed by the raw confusion matrix.
print(
    classification_report(y_test, ee_preds),
    confusion_matrix(y_test, ee_preds),
    sep='\n',
)

              precision    recall  f1-score   support

           0       0.94      0.92      0.93     72686
           1       0.61      0.60      0.61      7144
           2       0.41      0.40      0.40      9786
           3       0.28      0.35      0.31      8107

    accuracy                           0.79     97723
   macro avg       0.56      0.57      0.56     97723
weighted avg       0.81      0.79      0.80     97723

[[66569   602  1605  3910]
 [  550  4305  1538   751]
 [ 1652  1456  3879  2799]
 [ 2230   637  2370  2870]]


### XGBoost

In [23]:
# Gradient-boosted trees: 200 estimators, learning rate 0.1, and a 0.5
# subsample ratio.
xgb = XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    subsample=0.5,
    random_state=42,
)
xgb.fit(X_train, y_train)

xgb_preds = xgb.predict(X_test)

# Per-class precision/recall/F1 followed by the raw confusion matrix.
print(
    classification_report(y_test, xgb_preds),
    confusion_matrix(y_test, xgb_preds),
    sep='\n',
)

              precision    recall  f1-score   support

           0       0.90      0.96      0.93     72686
           1       0.72      0.55      0.62      7144
           2       0.41      0.51      0.46      9786
           3       0.34      0.09      0.14      8107

    accuracy                           0.82     97723
   macro avg       0.59      0.53      0.54     97723
weighted avg       0.79      0.82      0.79     97723

[[70057   277  1818   534]
 [  970  3943  2054   177]
 [ 3158   928  5003   697]
 [ 3817   348  3228   714]]


In [25]:
# Load the frame that will be scored with the fitted model further down.
test = pd.read_feather('./output/val_allx_Optmzd_feather')

In [72]:
# test.head()

In [26]:
# Columns removed from the scoring frame before encoding.
# NOTE(review): presumably the same columns that were removed from the
# training data for being mostly null - confirm against the preprocessing step.
null_cols = [
    'D_17', 'D_38', 'D_43', 'D_96', 'D_39', 'B_7', 'D_73', 'B_22', 'D_12', 'D_132',
    'D_114', 'D_80', 'D_97', 'R_8', 'B_26', 'D_110', 'R_7', 'D_11', 'D_95', 'D_105',
    'D_48', 'D_141', 'D_142', 'D_10', 'D_68', 'D_92', 'D_31', 'D_106', 'B_29', 'D_133',
    'S_23', 'D_89', 'S_19', 'D_8', 'S_25', 'S_7', 'D_64', 'D_40', 'D_22',
]

# Rebind instead of dropping in place so this cell can be re-run: with
# inplace=True a second execution raises KeyError because the columns are
# already gone. errors='ignore' turns that second run into a no-op.
test = test.drop(columns=null_cols, errors='ignore')

# Same encoding as the training frame. dtype is pinned because the
# get_dummies default changed from uint8 to bool in pandas 2.0.
test_encoded = pd.get_dummies(
    test,
    columns=cat_columns,
    drop_first=True,
    dtype='uint8',
)

In [89]:
# test_encoded.isnull().sum().sort_values(ascending=False).iloc[70:79]

B_5        39
B_9        39
B_10       39
B_18       39
B_33       39
R_28        0
S_5         0
D_36_CO     0
D_36_CR     0
dtype: int64

In [27]:
# Find every column that still contains missing values by inspection,
# rather than taking a fixed number of top entries from the sorted null
# counts, so the cell stays correct if that number changes.
null_counts = test_encoded.isnull().sum()
null_row_cols = null_counts[null_counts > 0].index

# Mean-impute each affected column. The result is assigned back to the frame:
# calling fillna(inplace=True) on test_encoded[col] is chained assignment,
# which warns on recent pandas and leaves the frame unchanged under
# Copy-on-Write.
for col in null_row_cols:
    test_encoded[col] = test_encoded[col].fillna(test_encoded[col].mean())

# Sanity check: the largest remaining null counts should all be zero.
test_encoded.isnull().sum().sort_values(ascending=False).head()

ID      0
D_23    0
S_11    0
S_26    0
D_87    0
dtype: int64

In [28]:
# Cast every column still stored with the pandas 'category' dtype to uint8.
test_encoded = test_encoded.astype(
    dict.fromkeys(test_encoded.select_dtypes('category').columns, 'uint8')
)

In [29]:
# Predict with the XGBoost model; ID is excluded from the inputs passed to it.
# Random-forest alternative, kept for reference:
# valx_pred = rf.predict(test_encoded.drop('ID',axis=1))
valx_pred = xgb.predict(test_encoded.drop(columns=['ID']))

In [30]:
# Submission frame: the ID column alongside the predicted label.
sub_df = test_encoded[['ID']].assign(Label=valx_pred)
sub_df.head()

Unnamed: 0,ID,Label
0,3337446730,0
1,7888784125,0
2,9871378905,2
3,8891869609,0
4,2006443827,0


In [31]:
# Count how many rows were assigned to each predicted label.
sub_df['Label'].value_counts()

0    374925
2     56788
1     25641
3     10612
Name: Label, dtype: int64

In [32]:
# Write the ID/Label frame to CSV; index=False leaves the row index out of the file.
sub_df.to_csv('./output/sub_xgb_02.csv',index=False)