In [18]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU
from imblearn.ensemble import EasyEnsembleClassifier

from imblearn.over_sampling import SMOTE
from sklearn.utils import class_weight

In [28]:
# Read the labelled training frame from its Feather file and print a
# schema / memory summary. This frame still contains NaN values.
# Alternative input that was used before (presumably the NaN-free variant):
#   './output/full_df_labelled'
data = pd.read_feather(path='./output/full_df_wNA_labelled')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458913 entries, 0 to 458912
Columns: 189 entries, B_37 to Default_Flag
dtypes: category(4), float64(184), int64(1)
memory usage: 649.5 MB


In [29]:
# Show the distinct values of every categorical column so we can decide
# which ones need one-hot encoding and which are already 0/1 flags.
for cat_name in data.select_dtypes(include='category').columns:
    distinct_values = data[cat_name].unique()
    print(cat_name, distinct_values)

D_36 ['CO', 'CL', 'CR', 'XM', 'XZ', 'XL']
Categories (6, object): ['CL', 'CO', 'CR', 'XL', 'XM', 'XZ']
D_44 ['O', 'R', 'U', NaN]
Categories (3, object): ['O', 'R', 'U']
B_41 [1, 0]
Categories (2, int64): [0, 1]
D_130 [1, 0]
Categories (2, int64): [0, 1]


In [30]:
# String-valued categoricals that must be one-hot encoded. The same list is
# reused later for the scoring frame.
cat_columns = [
    'D_36',
    'D_44',
]

# drop_first=True removes the first level of each column as the baseline.
# NOTE: D_44 contains NaN and dummy_na is left at its default (False), so
# rows with a missing D_44 get all-zero D_44_* dummies, which is the same
# encoding as the dropped baseline level.
data_encoded = pd.get_dummies(
    data=data,
    columns=cat_columns,
    drop_first=True,
)

In [31]:
# The columns that are still `category` after one-hot encoding are cast to
# compact unsigned 8-bit integers so every feature is numeric.
leftover_cat_cols = data_encoded.select_dtypes('category').columns
data_encoded = data_encoded.astype(dict.fromkeys(leftover_cat_cols, 'uint8'))

In [64]:
# Sanity check after encoding: print the column count, dtypes and memory
# footprint of the encoded training frame.
data_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458913 entries, 0 to 458912
Columns: 194 entries, B_37 to D_44_U
dtypes: float64(184), int64(1), uint8(9)
memory usage: 651.7 MB


In [65]:
# Preview the first rows of the encoded frame, including the new dummy columns.
data_encoded.head()

Unnamed: 0,B_37,S_24,S_4,S_14,B_25,D_38,B_30,D_138,P_2,S_7,...,R_28,S_5,Default_Flag,D_36_CO,D_36_CR,D_36_XL,D_36_XM,D_36_XZ,D_44_R,D_44_U
0,0.006462,0.008118,0.853027,0.556641,1.003906,,0.021301,0.51123,1.004883,0.081543,...,0.008514,0.004307,0,1,0,0,0,0,0,0
1,0.007748,0.00631,0.758301,0.686035,0.001836,,0.088501,0.557129,0.687988,0.353027,...,0.003426,0.009224,0,0,0,0,0,0,0,0
2,0.005405,0.005219,0.766602,0.284668,1.00293,,0.005821,0.85498,0.927734,0.059479,...,0.009224,0.009636,0,1,0,0,0,0,0,0
3,0.004135,3.2e-05,0.363281,0.004623,0.533691,,0.002443,0.526367,0.700684,0.334473,...,0.009529,0.007523,0,0,1,0,0,0,0,0
4,0.002968,0.002892,0.757324,0.427734,1.001953,,0.008247,0.820312,1.001953,0.144287,...,0.00102,0.000535,0,0,0,0,1,0,0,0


In [33]:
# Separate the target (Default_Flag) from the predictors.
y = data_encoded['Default_Flag']
X = data_encoded.drop(columns='Default_Flag')

# Hold out 25% of the rows for evaluation; stratify on the target so every
# class keeps the same proportion in both splits. Fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)
# SMOTE oversampling of the training split was tried and is currently disabled:
# X_train_rs, y_train_rs = SMOTE(n_jobs=-1,).fit_resample(X_train, y_train)

### Random Forest

In [26]:
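# NOTE: disabled cell. The shapes shown below (154 columns, 293,168 training
# rows) are left over from an earlier run on a different frame and do not
# describe the current X_train.
# X_train_rs.shape, X_train.shape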

((871340, 154), (293168, 154))

In [22]:
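# NOTE: disabled cell. The report below has a support of 97,723 rows, whereas
# the live XGBoost evaluation uses 114,729 test rows, so it comes from an
# earlier run on a different frame and is not directly comparable.
# rf = RandomForestClassifier(random_state=42,class_weight="balanced",n_jobs=-1,bootstrap=True,)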

# rf.fit(X_train,y_train)
# rf_preds = rf.predict(X_test)

# print(classification_report(y_test, rf_preds))
# print(confusion_matrix(y_test, rf_preds))

              precision    recall  f1-score   support

           0       0.85      0.98      0.91     72686
           1       0.71      0.53      0.61      7144
           2       0.43      0.37      0.39      9786
           3       0.33      0.01      0.01      8107

    accuracy                           0.81     97723
   macro avg       0.58      0.47      0.48     97723
weighted avg       0.76      0.81      0.76     97723

[[71291   224  1136    35]
 [ 1740  3808  1576    20]
 [ 5173   942  3605    66]
 [ 5531   364  2152    60]]


### Easy Ensemble Classifier

In [14]:
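# NOTE: disabled cell. The report below has a support of 97,723 rows, whereas
# the live XGBoost evaluation uses 114,729 test rows, so it comes from an
# earlier run on a different frame and is not directly comparable.
# ee = EasyEnsembleClassifier(random_state=42,sampling_strategy={0:50000,1:10000,2:10000,3:10000},n_estimators=30,replacement=True,n_jobs=-1,)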

# ee.fit(X_train,y_train)
# ee_preds = ee.predict(X_test)

# print(classification_report(y_test, ee_preds))
# print(confusion_matrix(y_test, ee_preds))

              precision    recall  f1-score   support

           0       0.92      0.93      0.93     72686
           1       0.62      0.61      0.62      7144
           2       0.42      0.38      0.40      9786
           3       0.29      0.30      0.29      8107

    accuracy                           0.80     97723
   macro avg       0.56      0.56      0.56     97723
weighted avg       0.80      0.80      0.80     97723

[[67872   552  1460  2802]
 [  690  4357  1434   663]
 [ 2138  1464  3723  2461]
 [ 2776   641  2289  2401]]


### XGBoost

In [59]:
# Hyper-parameters for the hold-out evaluation model.
xgb_params = dict(
    random_state=42,
    n_estimators=200,
    subsample=0.5,
    learning_rate=0.1,
    max_depth=7,
    gamma=0.25,
    colsample_bytree=0.8,
)

# Fit on the training split only, then score the held-out 25%.
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)
xgb_preds = xgb.predict(X_test)

# Per-class precision/recall/F1, followed by the confusion matrix
# (rows = true class, columns = predicted class).
xgb_report = classification_report(y_test, xgb_preds)
print(xgb_report)
xgb_confusion = confusion_matrix(y_test, xgb_preds)
print(xgb_confusion)

              precision    recall  f1-score   support

           0       0.90      0.97      0.93     85022
           1       0.71      0.56      0.63      8625
           2       0.43      0.49      0.46     11606
           3       0.36      0.11      0.17      9476

    accuracy                           0.82    114729
   macro avg       0.60      0.53      0.55    114729
weighted avg       0.79      0.82      0.80    114729

[[82057   348  1895   722]
 [ 1190  4866  2364   205]
 [ 3755  1150  5713   988]
 [ 4500   458  3429  1089]]


In [60]:
# Final model: same hyper-parameters as the hold-out model above, but fitted
# on every labelled row (train + test split) before scoring the unseen frame.
xgb_full_params = {
    'random_state': 42,
    'n_estimators': 200,
    'subsample': 0.5,
    'learning_rate': 0.1,
    'max_depth': 7,
    'gamma': 0.25,
    'colsample_bytree': 0.8,
}
xgbFull = XGBClassifier(**xgb_full_params)

xgbFull.fit(X, y)

### Grid Search for Best parameters

In [55]:
# Search space for the XGBoost grid search. Each list currently holds a single
# value; the trailing comments record values explored in earlier runs.
param_grid = dict(
    max_depth=[7],            # 3, 5, 10 already tried
    learning_rate=[0.1],      # 0.2, 0.05, 0.075 already tried
    gamma=[0.25],             # 0, 1 already tried
    # reg_lambda=[0, 1, 10],
    # scale_pos_weight=[3, 5, 10],
    subsample=[0.5],
    colsample_bytree=[0.5],
)
# 0.81439, 0.81508, 0.81521  (presumably best CV scores of successive runs)

In [56]:
# 3-fold cross-validated grid search over `param_grid`, using all cores.
# NOTE(review): scoring is plain accuracy; class 0 makes up roughly three
# quarters of the rows, so this metric is dominated by the majority class.
# Confirm that this is the intended selection criterion.
gridCV = GridSearchCV(
    estimator=XGBClassifier(random_state=42),
    param_grid=param_grid,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
)

# Assign to `_` so the fitted-estimator repr is not echoed as cell output.
_ = gridCV.fit(X_train, y_train)

In [57]:
# Report the best mean cross-validated score and the parameter combination
# that achieved it.
grid_summary = (
    ('Best Score :', gridCV.best_score_),
    ('Best parameters :', gridCV.best_params_),
)
for caption, value in grid_summary:
    print(caption, value)

Best Score : 0.8152122120726123
Best parameters : {'colsample_bytree': 0.5, 'gamma': 0.25, 'learning_rate': 0.1, 'max_depth': 7, 'subsample': 0.5}


In [36]:
# Load the frame to be scored for the submission (it carries an ID column
# instead of the Default_Flag target).
test = pd.read_feather(path='./output/val_allx_Optmzd_feather')

In [72]:
# test.head()

In [37]:
# null_cols = ['D_17','D_38', 'D_43', 'D_96', 'D_39', 'B_7', 'D_73', 'B_22', 'D_12', 'D_132', 
# 'D_114', 'D_80', 'D_97', 'R_8', 'B_26', 'D_110', 'R_7', 'D_11', 'D_95', 'D_105', 'D_48', 'D_141', 
# 'D_142', 'D_10', 'D_68', 'D_92', 'D_31', 'D_106', 'B_29', 'D_133', 'S_23', 'D_89', 'S_19', 'D_8', 'S_25', 'S_7', 'D_64', 'D_40', 'D_22']

# test.drop(null_cols,axis=1,inplace=True)

# One-hot encode the scoring frame with the same columns and the same
# drop-first baseline that were used for the training frame.
test_encoded = pd.get_dummies(
    data=test,
    columns=cat_columns,
    drop_first=True,
)

In [89]:
# test_encoded.isnull().sum().sort_values(ascending=False).iloc[70:79]

B_5        39
B_9        39
B_10       39
B_18       39
B_33       39
R_28        0
S_5         0
D_36_CO     0
D_36_CR     0
dtype: int64

In [27]:
# null_row_cols = test_encoded.isnull().sum().sort_values(ascending=False).head(75).index

# for col in null_row_cols:
#     test_encoded[col].fillna(test_encoded[col].mean(), inplace=True)

# test_encoded.isnull().sum().sort_values(ascending=False).head()

ID      0
D_23    0
S_11    0
S_26    0
D_87    0
dtype: int64

In [38]:
# Cast the columns that are still `category` to uint8, mirroring the
# treatment of the training frame.
test_leftover_cat_cols = test_encoded.select_dtypes('category').columns
test_encoded = test_encoded.astype(dict.fromkeys(test_leftover_cat_cols, 'uint8'))

In [63]:
# Sanity check: print the column count, dtypes and memory footprint of the
# encoded scoring frame.
test_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 467966 entries, 0 to 467965
Columns: 194 entries, ID to D_44_U
dtypes: float64(184), object(1), uint8(9)
memory usage: 664.5+ MB


In [61]:
# valx_pred = rf.predict(test_encoded.drop('ID',axis=1))

# The scoring frame was one-hot encoded independently of the training frame,
# so nothing guarantees that it has the same columns in the same order as the
# matrix the model was fitted on. Align it explicitly before predicting.
test_features = test_encoded.drop(columns='ID')

# Fail early, with a readable message, if a training feature is absent.
missing_features = X.columns.difference(test_features.columns)
if len(missing_features) > 0:
    raise ValueError(
        f"Scoring frame is missing {len(missing_features)} training feature(s): "
        f"{list(missing_features)}"
    )

# Selecting by X.columns puts the columns in training order and drops any
# column the model never saw.
valx_pred = xgbFull.predict(test_features[X.columns])

In [40]:
# Build the submission table: one row per ID with its predicted class label.
sub_df = test_encoded[['ID']].assign(Label=valx_pred)
sub_df.head()

Unnamed: 0,ID,Label
0,3337446730,0
1,7888784125,0
2,9871378905,2
3,8891869609,0
4,2006443827,0


In [41]:
# Distribution of predicted labels: a quick check that the model is not
# predicting a single class for every row.
sub_df['Label'].value_counts()

0    374372
2     56757
1     25836
3     11001
Name: Label, dtype: int64

In [62]:
# Write the submission file without the pandas row index.
sub_df.to_csv(path_or_buf='./output/sub_XGBFull_wNA_01.csv', index=False)