In [17]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer
from xgboost import XGBClassifier
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU, Dropout
import tensorflow as tf
from imblearn.ensemble import EasyEnsembleClassifier

# from imblearn.over_sampling import SMOTE
# from sklearn.utils import class_weight

In [3]:
data = pd.read_feather('./output/full_df_wNA_labelled')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458913 entries, 0 to 458912
Columns: 189 entries, B_37 to Default_Flag
dtypes: category(4), float64(184), int64(1)
memory usage: 649.5 MB


In [13]:
null_cols = ['D_17','D_38', 'D_43', 'D_96', 'D_39', 'B_7', 'D_73', 'B_22', 'D_12', 'D_132', 
'D_114', 'D_80', 'D_97', 'R_8', 'B_26', 'D_110', 'R_7', 'D_11', 'D_95', 'D_105', 'D_48', 'D_141', 
'D_142', 'D_10', 'D_68', 'D_92', 'D_31', 'D_106', 'B_29', 'D_133', 'S_23', 'D_89', 'S_19', 'D_8', 'S_25', 'S_7', 'D_64', 'D_40', 'D_22']

# data_encoded = data.drop(null_cols,axis=1)

cat_columns = ['D_36','D_44']
data_encoded = pd.get_dummies(data,columns=cat_columns,drop_first=True)
data_encoded = data_encoded.astype({col: 'uint8' for col in data_encoded.select_dtypes('category').columns})

scaler = StandardScaler()
std_train_data = scaler.fit_transform(data_encoded.drop('Default_Flag',axis=1))
std_train_data = pd.DataFrame(std_train_data).fillna(1000)

In [12]:
# std_train_data = pd.DataFrame(std_train_data)
# imputer = KNNImputer(n_neighbors=100,weights='distance',add_indicator=True)
# std_train_data = imputer.fit_transform(std_train_data)


In [14]:
X = std_train_data
y = data_encoded['Default_Flag']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
# X_train_rs, y_train_rs = SMOTE(n_jobs=-1,).fit_resample(X_train, y_train)

### Random Forest

In [15]:
rf = RandomForestClassifier(random_state=42,class_weight="balanced",n_jobs=-1,bootstrap=True,)

rf.fit(X_train,y_train)
rf_preds = rf.predict(X_test)

print(classification_report(y_test, rf_preds))
print(confusion_matrix(y_test, rf_preds))

              precision    recall  f1-score   support

           0       0.85      0.98      0.91     85022
           1       0.71      0.54      0.61      8625
           2       0.43      0.34      0.38     11606
           3       0.33      0.03      0.05      9476

    accuracy                           0.80    114729
   macro avg       0.58      0.47      0.49    114729
weighted avg       0.75      0.80      0.76    114729

[[83411   276  1192   143]
 [ 2176  4650  1711    88]
 [ 6213  1164  3974   255]
 [ 6481   442  2316   237]]


### Easy Ensemble Classifier

In [18]:
ee = EasyEnsembleClassifier(random_state=42,sampling_strategy={0:50000,1:10000,2:10000,3:10000},n_estimators=30,replacement=True,n_jobs=-1,)

ee.fit(X_train,y_train)
ee_preds = ee.predict(X_test)

print(classification_report(y_test, ee_preds))
print(confusion_matrix(y_test, ee_preds))

              precision    recall  f1-score   support

           0       0.91      0.95      0.93     85022
           1       0.63      0.61      0.62      8625
           2       0.42      0.35      0.38     11606
           3       0.30      0.25      0.27      9476

    accuracy                           0.80    114729
   macro avg       0.56      0.54      0.55    114729
weighted avg       0.79      0.80      0.79    114729

[[80599   614  1466  2343]
 [ 1071  5259  1689   606]
 [ 3298  1752  4083  2473]
 [ 3984   752  2399  2341]]


### XGBoost

In [19]:
xgb = XGBClassifier(random_state=42, n_estimators=175, subsample=0.5, learning_rate=0.1, max_depth=7, gamma=0.25, colsample_bytree=0.5)

xgb.fit(X_train,y_train)
xgb_preds = xgb.predict(X_test)

print(classification_report(y_test, xgb_preds))
print(confusion_matrix(y_test, xgb_preds))

              precision    recall  f1-score   support

           0       0.89      0.97      0.93     85022
           1       0.71      0.57      0.63      8625
           2       0.42      0.49      0.45     11606
           3       0.36      0.10      0.16      9476

    accuracy                           0.82    114729
   macro avg       0.60      0.53      0.54    114729
weighted avg       0.79      0.82      0.80    114729

[[82120   327  1880   695]
 [ 1215  4883  2325   202]
 [ 3864  1170  5677   895]
 [ 4556   450  3482   988]]


### Grid Search for Best parameters - XGB

In [None]:
param_grid = {
    "n_estimators" : [175],
    "max_depth": [7], # 3,5,9,10 done
    "learning_rate": [0.1], # 0.2,0.05,0.075 done
    "gamma": [0.25], #0, 1 done
    # "reg_lambda": [0, 1, 10],
    # "scale_pos_weight": [3, 5, 10],
    "subsample": [0.5],
    "colsample_bytree": [0.5] #0.8 done
}
# 0.81439, 0.81508, 0.81521

In [None]:
gridCV = GridSearchCV(XGBClassifier(random_state=42),param_grid, n_jobs=-1, cv=3, scoring="accuracy")

_ = gridCV.fit(X_train,y_train)

print('Best Score :',gridCV.best_score_)
print('Best parameters :',gridCV.best_params_)

### Neural Network

In [26]:
model = Sequential()

model.add(tf.keras.Input(shape=(193,)))
model.add(Dense(512))
model.add(Dropout(0.3))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(128))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(8, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(4, activation='softmax'))

model.compile(optimizer='Adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.fit(X_train, y_train,
          class_weight = {0:3 , 1:12, 2:10, 3:11},
          batch_size=2048,
          epochs=18,
          verbose=1,
          validation_data=(X_test, y_test))
          

Epoch 1/18
Epoch 2/18
Epoch 3/18
Epoch 4/18
Epoch 5/18
Epoch 6/18
Epoch 7/18
Epoch 8/18
Epoch 9/18
Epoch 10/18
Epoch 11/18
Epoch 12/18
Epoch 13/18
Epoch 14/18
Epoch 15/18
Epoch 16/18
Epoch 17/18
Epoch 18/18


<keras.callbacks.History at 0x2bfb0bf40>

In [28]:
tf_preds_prob = model.predict(X_test)
tf_preds = tf_preds_prob.argmax(axis=-1)
tf_preds[:5]



array([0, 2, 2, 0, 0])

In [29]:
from sklearn.metrics import accuracy_score


print(classification_report(y_test,tf_preds))
print(confusion_matrix(y_test,tf_preds))
print("Accuracy :",accuracy_score(y_test, tf_preds))

              precision    recall  f1-score   support

           0       0.93      0.87      0.90     85022
           1       0.00      0.00      0.00      8625
           2       0.27      0.82      0.41     11606
           3       0.00      0.00      0.00      9476

    accuracy                           0.73    114729
   macro avg       0.30      0.42      0.33    114729
weighted avg       0.72      0.73      0.71    114729

[[74330     0 10692     0]
 [  832     0  7793     0]
 [ 2101     0  9505     0]
 [ 2483     0  6993     0]]
Accuracy : 0.7307219621891587


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Grid Search - Neural Network

In [32]:
def build_model():    
    cv_tf = Sequential()
    cv_tf.add(tf.keras.Input(shape=(193,)))
    cv_tf.add(Dense(512))
    cv_tf.add(Dropout(0.3))
    cv_tf.add(Dense(256, activation='relu'))
    cv_tf.add(Dropout(0.2))
    cv_tf.add(Dense(128))
    cv_tf.add(Dropout(0.3))
    cv_tf.add(Dense(64, activation='relu'))
    cv_tf.add(Dropout(0.2))
    cv_tf.add(Dense(32, activation='relu'))
    cv_tf.add(Dropout(0.2))
    cv_tf.add(Dense(16, activation='relu'))
    cv_tf.add(Dropout(0.2))
    cv_tf.add(Dense(8, activation='relu'))
    cv_tf.add(Dropout(0.2))
    cv_tf.add(Dense(4, activation='softmax'))
    cv_tf.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])
    return cv_tf

k_model = KerasClassifier(build_fn=build_model, verbose=0)

parameters = {'batch_size': [2048],
              'epochs': [17, 18, 19]
              }
grid_search = GridSearchCV(estimator = k_model,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 5,
                           verbose=0)
_ = grid_search.fit(X_train, y_train, verbose = 1)

  k_model = KerasClassifier(build_fn=build_model, verbose=0)


Epoch 1/17
Epoch 2/17
Epoch 3/17
Epoch 4/17
Epoch 5/17
Epoch 6/17
Epoch 7/17
Epoch 8/17
Epoch 9/17
Epoch 10/17
Epoch 11/17
Epoch 12/17
Epoch 13/17
Epoch 14/17
Epoch 15/17
Epoch 16/17
Epoch 17/17
Epoch 1/17
Epoch 2/17
Epoch 3/17
Epoch 4/17
Epoch 5/17
Epoch 6/17
Epoch 7/17
Epoch 8/17
Epoch 9/17
Epoch 10/17
Epoch 11/17
Epoch 12/17
Epoch 13/17
Epoch 14/17
Epoch 15/17
Epoch 16/17
Epoch 17/17
Epoch 1/17
Epoch 2/17
Epoch 3/17
Epoch 4/17
Epoch 5/17
Epoch 6/17
Epoch 7/17
Epoch 8/17
Epoch 9/17
Epoch 10/17
Epoch 11/17
Epoch 12/17
Epoch 13/17
Epoch 14/17
Epoch 15/17
Epoch 16/17
Epoch 17/17
Epoch 1/17
Epoch 2/17
Epoch 3/17
Epoch 4/17
Epoch 5/17
Epoch 6/17
Epoch 7/17
Epoch 8/17
Epoch 9/17
Epoch 10/17
Epoch 11/17
Epoch 12/17
Epoch 13/17
Epoch 14/17
Epoch 15/17
Epoch 16/17
Epoch 17/17
Epoch 1/17
Epoch 2/17
Epoch 3/17
Epoch 4/17
Epoch 5/17
Epoch 6/17
Epoch 7/17
Epoch 8/17
Epoch 9/17
Epoch 10/17
Epoch 11/17
Epoch 12/17
Epoch 13/17
Epoch 14/17
Epoch 15/17
Epoch 16/17
Epoch 17/17
Epoch 1/18
Epoch 2/18
Epo

In [33]:
print(_.best_params_)
print(_.best_score_)

{'batch_size': 2048, 'epochs': 18}
0.8095350205795349


### Prediction for Submission

In [27]:
test = pd.read_feather('./output/val_allx_Optmzd_feather')

In [30]:
null_cols = ['D_17','D_38', 'D_43', 'D_96', 'D_39', 'B_7', 'D_73', 'B_22', 'D_12', 'D_132', 
'D_114', 'D_80', 'D_97', 'R_8', 'B_26', 'D_110', 'R_7', 'D_11', 'D_95', 'D_105', 'D_48', 'D_141', 
'D_142', 'D_10', 'D_68', 'D_92', 'D_31', 'D_106', 'B_29', 'D_133', 'S_23', 'D_89', 'S_19', 'D_8', 'S_25', 'S_7', 'D_64', 'D_40', 'D_22']

# test.drop(null_cols,axis=1,inplace=True)


test_encoded = pd.get_dummies(test,columns=cat_columns,drop_first=True)

In [31]:
test_encoded = test_encoded.astype({col: 'uint8' for col in test_encoded.select_dtypes('category').columns})

In [139]:
# for col in test_encoded.columns:
    # test_encoded[col].fillna(100)

In [33]:
# norm_test_data = test_encoded.copy()
scaler = StandardScaler()

std_val_data = scaler.fit_transform(test_encoded.drop('ID',axis=1))
std_val_data = pd.DataFrame(std_val_data).fillna(1000)

In [36]:
rf_preds = rf.predict(std_val_data)
rfPred = pd.DataFrame(test_encoded['ID'])
rfPred['Label'] = rf_preds

In [37]:
ee_preds = ee.predict(std_val_data)
eePred = pd.DataFrame(test_encoded['ID'])
eePred['Label'] = ee_preds

In [38]:
# valx_pred = rf.predict(test_encoded.drop('ID',axis=1))
xgp_preds = xgb.predict(std_val_data) 
xgbPred = pd.DataFrame(test_encoded['ID'])
xgbPred['Label'] = xgp_preds

In [39]:
rfPred['Label'].value_counts()

0    406704
2     33522
1     25926
3      1814
Name: Label, dtype: int64

In [40]:
eePred['Label'].value_counts()

0    360282
2     39797
1     37692
3     30195
Name: Label, dtype: int64

In [41]:
xgbPred['Label'].value_counts()

0    353441
2     48433
1     42804
3     23288
Name: Label, dtype: int64

In [42]:
rfPred.to_csv('./output/class_0.csv',index=False)
eePred.to_csv('./output/class_1_3.csv',index=False)
xgbPred.to_csv('./output/class_2.csv',index=False)

In [58]:
fullPred =  rfPred.join(eePred['Label'],rsuffix="1&3").join(xgbPred['Label'],rsuffix="2")
fullPred.to_csv('./output/Full-Class.csv',index=False)
fullPred.head(10)

Unnamed: 0,ID,Label,Label1&3,Label2
0,3337446730,0,0,0
1,7888784125,0,0,0
2,9871378905,1,2,2
3,8891869609,0,0,0
4,2006443827,0,0,0
5,7340888752,0,0,0
6,1646892613,0,0,0
7,3726705791,0,0,0
8,7542013488,0,0,0
9,4183652505,0,0,2


In [53]:
class_2 = fullPred[fullPred['Label2']==2][['ID','Label2']]

In [57]:
# class_1_3 = 
fullPred[fullPred['Label1&3']==1][['ID','Label1&3']] #.merge(fullPred[fullPred['Label1&3']==3][['ID','Label1&3']])

Unnamed: 0,ID,Label1&3
10,1275012945,1
19,6226919132,1
23,9178484336,1
49,5117410894,1
50,6917559729,1
...,...,...
467912,9644730765,1
467927,1388125088,1
467946,6285658361,1
467961,5512100313,1


In [69]:
finalPreds = pd.read_csv('./output/mode.csv')
finalPreds.head()

Unnamed: 0,ID,Label,Label1&3,Label2,Final,Unnamed: 5,Mode++
0,3337446730,0,0,0,0,0.0,0
1,7888784125,0,0,0,0,0.0,0
2,9871378905,1,2,2,2,2.0,2
3,8891869609,0,0,0,0,0.0,0
4,2006443827,0,0,0,0,0.0,0


In [70]:
finalPreds['Mode++'].value_counts()

0    371964
2     37023
1     35852
3     23127
Name: Mode++, dtype: int64

In [71]:
finalfinal = finalPreds[['ID','Mode++']]
finalfinal

Unnamed: 0,ID,Mode++
0,3337446730,0
1,7888784125,0
2,9871378905,2
3,8891869609,0
4,2006443827,0
...,...,...
467961,5512100313,1
467962,2488191588,0
467963,4896427435,2
467964,9244915426,0


In [72]:
finalfinal.to_csv('./output/FINAL_FINAL_FINAL_Buahaha.csv',index=False)