In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report

In [None]:
# Read the engineered training table and keep only rows with no missing values
df_train = pd.read_csv(
    '../input/home-credit-engineeredhome-credit-competition/home_credit_train_engineered.csv'
).dropna()


In [None]:
# Model inputs: every column except the TARGET label and the columns listed
# below (these look like id/index columns — confirm against the dataset schema)
features = [
    column
    for column in df_train.columns
    if column not in ['Unnamed: 0','TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']
]


# Obtaining Undersampled and Oversampled Data

In [None]:
# Hold out 33% of the rows for testing, then standardise the features
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

x_train, x_test, y_train, y_test = train_test_split(
    df_train[features],
    df_train['TARGET'],
    test_size=0.33,
    random_state=42,
)

# The scaler is fitted on the training rows only and then applied to both
# splits, so no test-set statistics enter the scaling parameters.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# Randomly undersample the training split (sampling with replacement, fixed seed)
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(replacement=True, random_state=42)
# Features and labels are resampled together so rows stay aligned
x_u, y_u = rus.fit_resample(x_train, y_train)

# Row counts before and after resampling
print('Original train set shape:', len(x_train))
print('Resample train set shape :', len(x_u))


In [None]:
# Oversample the training split with SMOTE (fixed seed)
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42)
# Only the training split is resampled; the test split is left as-is
x_o, y_o = sm.fit_resample(x_train, y_train)

# Row counts before and after resampling
print('Original train set shape:', len(x_train))
print('Resample train set shape :', len(x_o))



# print('\nBalance of positive and negative classes (%):')
# y_sm.value_counts(normalize=True) * 100

In [None]:
# Experimenting with classifiers

In [None]:
import joblib

## Defining helper functions to keep the code as DRY as possible

In [None]:
def normal_classifier(clf,x_train = x_train,y_train = y_train,x_test = x_test,y_test = y_test):
    """Fit ``clf`` on the training split and print train/test classification reports.

    Parameters
    ----------
    clf : object exposing ``fit(X, y)`` and ``predict(X)``.
    x_train, y_train : data the estimator is fitted on and first scored against.
        The defaults are the module-level arrays as they exist when this cell
        runs (default values are bound at definition time).
    x_test, y_test : data used for the second report; same default rule.

    Returns
    -------
    None. Both reports are printed.
    """
    print("Classification on data from dataset\n")
    clf.fit(x_train,y_train)

    # Heading first, then the report, for each split in turn
    stages = (
        ("\n\nTraining report", x_train, y_train),
        ("\n\nTesting report", x_test, y_test),
    )
    for heading, inputs, labels in stages:
        print(heading)
        print(classification_report(labels, clf.predict(inputs)))


In [None]:
def undersampled_classifier(clf,x_u = x_u,y_u = y_u,x_test = x_test,y_test = y_test):
    """Fit ``clf`` on the undersampled data and print train/test classification reports.

    Parameters
    ----------
    clf : object exposing ``fit(X, y)`` and ``predict(X)``.
    x_u, y_u : undersampled features/labels used for fitting and for the
        training report. The defaults are the module-level arrays as they exist
        when this cell runs (default values are bound at definition time).
    x_test, y_test : data used for the testing report; same default rule.

    Returns
    -------
    None. Both reports are printed.
    """
    print("Classification on undersampled data\n")
    clf.fit(x_u,y_u)

    # Heading first, then the report, for each split in turn
    stages = (
        ("\n\nTraining report", x_u, y_u),
        ("\n\nTesting report", x_test, y_test),
    )
    for heading, inputs, labels in stages:
        print(heading)
        print(classification_report(labels, clf.predict(inputs)))


In [None]:
def oversampled_classifier(clf,x_o=x_o,y_=y_o,x_test=x_test,y_test=y_test):
    """Fit ``clf`` on the oversampled data and print train/test classification reports.

    Parameters
    ----------
    clf : object exposing ``fit(X, y)`` and ``predict(X)``.
    x_o : oversampled training features.
    y_ : labels matching ``x_o``. The parameter keeps its original name so
        existing keyword callers continue to work.
    x_test, y_test : data used for the testing report.

    All data defaults are the module-level arrays as they exist when this cell
    runs (default values are bound at definition time).

    Returns
    -------
    None. Both reports are printed.
    """
    print("\n\n For Oversampled data\n")
    # Use the labels that were passed in. Previously the body read the global
    # `y_o`, so a caller supplying its own `x_o`/`y_` pair silently trained and
    # scored against the wrong labels.
    clf.fit(x_o,y_)
    print("\n\nTraining report")
    train_report = classification_report(y_,clf.predict(x_o))
    print(train_report)
    print("\n\nTesting report")
    test_report = classification_report(y_test,clf.predict(x_test))
    print(test_report)



# LightGBM classifier

In [None]:
# LightGBM classifier created with its default hyperparameters (no arguments
# are passed); this one instance is handed to each helper call that follows.
import lightgbm as ltb
model = ltb.LGBMClassifier()

In [None]:
# Fit on the original (unresampled) training split and print both reports
normal_classifier(model)

In [None]:
# Refit the same estimator on the undersampled training data
undersampled_classifier(model)

In [None]:
# Refit on the SMOTE-oversampled training data; the test split is not resampled
oversampled_classifier(model)

# SVM

In [None]:
from sklearn.svm import SVC
# Support-vector classifier; only `gamma` is set explicitly, everything else is default.
# NOTE(review): SVC fitting may be slow on the full and oversampled training
# sets — confirm the runtime is acceptable before a full Run All.
model = SVC(gamma='auto')

In [None]:
# Fit on the original (unresampled) training split and print both reports
normal_classifier(model)

In [None]:
# Refit the same estimator on the undersampled training data
undersampled_classifier(model)

In [None]:
# Refit on the SMOTE-oversampled training data; the test split is not resampled
oversampled_classifier(model)

# Decision trees

In [None]:
from sklearn import tree
# Decision tree with default settings. The seed is fixed so repeated runs give
# the same tree, matching the random_state=42 used for the split and resamplers.
model = tree.DecisionTreeClassifier(random_state=42)

In [None]:
# Fit on the original (unresampled) training split and print both reports
normal_classifier(model)

In [None]:
# Refit the same estimator on the undersampled training data
undersampled_classifier(model)

In [None]:
# Refit on the SMOTE-oversampled training data; the test split is not resampled
oversampled_classifier(model)

# Neural Networks

In [None]:
import tensorflow as tf
# Without sampling: feed-forward binary classifier for the original training split.
# The input width is taken from one row of the scaled training matrix.
model = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=x_train[0].shape),
    tf.keras.layers.Dense(100,activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(50,activation='relu'),
    tf.keras.layers.Dense(1,activation='sigmoid')  # single sigmoid output for the binary target
])
# `metrics` is given as a list: a bare string is rejected by newer Keras
# releases, while the list form is accepted by every version.
model.compile(metrics = ['accuracy'],optimizer='adam',loss='binary_crossentropy')

In [None]:
# Train for 10 epochs on the original training split; the test split is passed as validation data.
# NOTE(review): Keras documents `verbose` as 'auto', 0, 1 or 2 — confirm what 4 is intended to do.
history = model.fit(x_train,y_train,validation_data=(x_test,y_test),epochs=10,verbose = 4)

In [None]:
# `Sequential.predict_classes` was removed in TensorFlow 2.6. For a single
# sigmoid output the equivalent is thresholding the predicted probability at 0.5.
c = classification_report(y_train,(model.predict(x_train) > 0.5).astype("int32").ravel())
print("\n\nTraining\n",c)
c = classification_report(y_test,(model.predict(x_test) > 0.5).astype("int32").ravel())
print("Testing\n",c)

In [None]:
# With undersampling: same architecture as before, trained on the undersampled data.
# The input width is taken from one row of the undersampled matrix.
model = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=x_u[0].shape),
    tf.keras.layers.Dense(100,activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(50,activation='relu'),
    tf.keras.layers.Dense(1,activation='sigmoid')  # single sigmoid output for the binary target
])

# `metrics` is given as a list: a bare string is rejected by newer Keras
# releases, while the list form is accepted by every version.
model.compile(metrics = ['accuracy'],optimizer='adam',loss='binary_crossentropy')
# The test split is passed as validation data.
model.fit(x_u,y_u,validation_data=(x_test,y_test),epochs=10,verbose = 5)
 



In [None]:
# `Sequential.predict_classes` was removed in TensorFlow 2.6. For a single
# sigmoid output the equivalent is thresholding the predicted probability at 0.5.
c = classification_report(y_u,(model.predict(x_u) > 0.5).astype("int32").ravel())
print("Training\n",c)
c = classification_report(y_test,(model.predict(x_test) > 0.5).astype("int32").ravel())
print("Testing\n",c)

In [None]:
# With oversampling: same architecture as before, trained on the SMOTE-oversampled data.
# The input width is taken from one row of the oversampled matrix.
model = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=x_o[0].shape),
    tf.keras.layers.Dense(100,activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(50,activation='relu'),
    tf.keras.layers.Dense(1,activation='sigmoid')  # single sigmoid output for the binary target
])

# `metrics` is given as a list: a bare string is rejected by newer Keras
# releases, while the list form is accepted by every version.
model.compile(metrics = ['accuracy'],optimizer='adam',loss='binary_crossentropy')
# The test split is passed as validation data.
model.fit(x_o,y_o,validation_data=(x_test,y_test),epochs=10,verbose = 5)
 

In [None]:
# `Sequential.predict_classes` was removed in TensorFlow 2.6. For a single
# sigmoid output the equivalent is thresholding the predicted probability at 0.5.
c = classification_report(y_o,(model.predict(x_o) > 0.5).astype("int32").ravel())
print("Training\n",c)
c = classification_report(y_test,(model.predict(x_test) > 0.5).astype("int32").ravel())
print("Testing\n",c)

# Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with default settings (no arguments are passed).
# NOTE(review): if fitting emits a convergence warning, the default iteration
# limit is probably too low for this data — confirm and raise `max_iter` if so.
model = LogisticRegression()

In [None]:
# Fit on the original (unresampled) training split and print both reports
normal_classifier(model)

In [None]:
# Refit the same estimator on the undersampled training data
undersampled_classifier(model)

In [None]:
# Refit on the SMOTE-oversampled training data; the test split is not resampled
oversampled_classifier(model)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with default settings. The seed is fixed so repeated runs give
# the same forest, matching the random_state=42 used for the split and resamplers.
model = RandomForestClassifier(random_state=42)

In [None]:
# Fit on the original (unresampled) training split and print both reports
normal_classifier(model)

In [None]:
# Refit the same estimator on the undersampled training data
undersampled_classifier(model)

In [None]:
# Refit on the SMOTE-oversampled training data; the test split is not resampled
oversampled_classifier(model)