# Imbalanced classification using TensorFlow

In [None]:
import random
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import sklearn.metrics
from sklearn.metrics import confusion_matrix as cm, classification_report as cr
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

random.seed(0)
%matplotlib inline

In [None]:
# Load the bankruptcy dataset CSV from the relative input directory.
df = pd.read_csv('../input/company-bankruptcy-prediction/data.csv')
# Bare expression: the notebook renders the DataFrame as the cell output.
df

# EDA

In [None]:
# Summary statistics of df, shown as the cell output.
df.describe()

In [None]:
# Count the rows of each class in the 'Bankrupt?' label column:
# label 0 -> n_neg, label 1 -> n_pos.
counts = df['Bankrupt?'].value_counts()
n_neg, n_pos = (counts[counts.index == label].values[0] for label in (0, 1))
print(n_neg, n_pos)

In [None]:
# Drop every column whose standard deviation is exactly zero
# (original note names ' Net Income Flag' here).
zero_variance = df.std() == 0
df = df.drop(columns=zero_variance[zero_variance].index)

In [None]:
# Hold out 20% of the rows as the test set, then carve the validation set out
# of the remaining training rows only, so train / val / test are disjoint.
# (Splitting the full `df` a second time would put test rows back into the
# train and validation sets and inflate the reported test metrics.)
# A fixed random_state makes the split reproducible; `random.seed` does not
# affect scikit-learn's shuffling.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=0)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=0)

# Form np arrays of labels and features.
# `pop` removes the label column in place, leaving only feature columns behind.
y_train = np.array(train_df.pop('Bankrupt?'))
y_val = np.array(val_df.pop('Bankrupt?'))
y_test = np.array(test_df.pop('Bankrupt?'))

x_train = np.array(train_df)
x_val = np.array(val_df)
x_test = np.array(test_df)

In [None]:
# Fit the standardiser on the training features only, then apply that same
# fitted transform to every split.
scaler = StandardScaler().fit(x_train)
x_train, x_val, x_test = (scaler.transform(part) for part in (x_train, x_val, x_test))

In [None]:
# Rebuild a labelled DataFrame from the scaled training features so that
# feature/label correlations can be inspected.
columns = train_df.columns  # [kbest.get_support()]
temp_df = pd.DataFrame(x_train, columns=columns).assign(y=y_train)

# Heat map of the full pairwise correlation matrix (features plus label 'y').
corr = temp_df.corr()
plt.figure(figsize=(15, 15))
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns)
plt.show()
plt.close()

# Rank the features by absolute correlation with the label, strongest first.
label_corr = corr['y'].drop('y')
corr = label_corr.abs().sort_values(ascending=False)
print(corr)
# How many features have an absolute correlation with the label above 0.05.
print(int((corr > 0.05).sum()))

In [None]:
# Side-by-side density histograms of one feature for class 0 (left) and
# class 1 (right); the right plot reuses the left plot's bin edges.
negatives = temp_df.loc[temp_df['y'] == 0, ' Net Income to Total Assets']
positives = temp_df.loc[temp_df['y'] == 1, ' Net Income to Total Assets']

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8), sharey=True, sharex=True)
bins = ax1.hist(negatives, bins='auto', density=True)
ax2.hist(positives, bins=bins[1], density=True)  # bins[1] holds the bin edges
plt.show()
plt.close()

In [None]:
# Scatter one feature against the class label, coloured by class.
fig, ax = plt.subplots()
scatter = ax.scatter(
    temp_df[' Net Income to Total Assets'],
    temp_df['y'],
    c=temp_df['y'],
    s=1,
)
legend_handles, legend_labels = scatter.legend_elements()
legend1 = ax.legend(legend_handles, legend_labels, loc="upper left", title="Classes")
ax.add_artist(legend1)
plt.show()
plt.close()

In [None]:
from sklearn.feature_selection import SelectKBest

# Keep the 50 best-scoring features (SelectKBest's default scoring function).
# The selector is fitted on the training split only and then applied to all splits.
kbest = SelectKBest(k=50).fit(x_train, y_train)
x_train = kbest.transform(x_train)
x_test = kbest.transform(x_test)
x_val = kbest.transform(x_val)

# Model

In [None]:
class PrintLRCallback(tf.keras.callbacks.Callback):
    """Keras callback that prints the optimizer's learning rate after each epoch.

    The optimizer's ``lr`` attribute is called with the current iteration
    count, so the learning rate is expected to be a callable schedule.
    """

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate the schedule at the current step and print the result."""
        optimizer = self.model.optimizer
        current_lr = tf.keras.backend.eval(optimizer.lr(optimizer.iterations))
        print(f'LR at end of epoch {epoch} = {current_lr}')
        

def plot_history(history):
    """Plot training vs. validation curves for loss, PRC, precision and recall.

    Args:
        history: object exposing ``epoch`` (x values) and a ``history``
            mapping that holds each metric and its ``val_``-prefixed
            counterpart, e.g. the return value of ``model.fit``.
    """
    tracked = ('loss', 'prc', 'precision', 'recall')
    for position, metric in enumerate(tracked, start=1):
        plt.subplot(2, 2, position)
        plt.plot(history.epoch, history.history[metric], label='Train')
        plt.plot(history.epoch, history.history['val_' + metric], linestyle="--", label='Val')
        plt.xlabel('Epoch')
        plt.ylabel(metric.replace("_", " ").capitalize())

        # Loss keeps its auto-scaled upper bound; everything else is a ratio.
        if metric == 'loss':
            lower, upper = 0, plt.ylim()[1]
        elif metric == 'auc':
            lower, upper = 0.8, 1
        else:
            lower, upper = 0, 1
        plt.ylim([lower, upper])

        plt.legend()
    plt.show()

    
def report(y_true, y_pred, p=0.5):
    """Show a confusion matrix and classification report at threshold ``p``.

    Args:
        y_true: ground-truth binary labels (0 or 1).
        y_pred: predicted scores/probabilities; 1-D or a single column.
        p: decision threshold; scores ``>= p`` are counted as class 1.
    """
    # Binarise with the caller's threshold (not a hard-coded 0.5) so the
    # matrix agrees with the threshold shown in the plot title.
    y_pred = np.where(np.asarray(y_pred) >= p, 1, 0).reshape(-1)
    # Pin the label order so the matrix is always 2x2, even when only one
    # class occurs in y_true / y_pred; the indexing below relies on that.
    cm_ = cm(y_true=y_true, y_pred=y_pred, labels=[0, 1])
    plt.figure(figsize=(5, 5))
    sns.heatmap(cm_, annot=True, fmt="d")
    plt.title('Confusion matrix @{:.2f}'.format(p))
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.show()

    # Rows are actual labels, columns are predicted labels.
    print('TN: ', cm_[0][0])
    print('FP: ', cm_[0][1])
    print('FN: ', cm_[1][0])
    print('TP: ', cm_[1][1])

    print(cr(y_true=y_true, y_pred=y_pred))
    
    
def plot_roc(y_true, y_pred, **kwargs):
    """Plot the ROC curve in percent, zoomed on the top-left corner.

    Args:
        y_true: ground-truth binary labels.
        y_pred: predicted scores passed to ``sklearn.metrics.roc_curve``.
        **kwargs: extra keyword arguments forwarded to ``plt.plot``.
    """
    fpr, tpr, _thresholds = sklearn.metrics.roc_curve(y_true, y_pred)

    plt.plot(100 * fpr, 100 * tpr, linewidth=2, **kwargs)
    axes = plt.gca()
    axes.set_xlabel('False positives [%]')
    axes.set_ylabel('True positives [%]')
    # Only the 0-20% FP / 80-100% TP region is shown.
    axes.set_xlim([-0.5, 20])
    axes.set_ylim([80, 100.5])
    axes.grid(True)
    axes.set_aspect('equal')
    plt.show()

## Stupid baselines

In [None]:
def guess_all_neg(x):
    """Baseline predictor: return a float32 score of 0 for every row of ``x``."""
    n_rows = len(x)
    return np.full(n_rows, 0, dtype='float32')


# Score the always-negative baseline on the training split, then the test split.
for split_x, split_y in ((x_train, y_train), (x_test, y_test)):
    y_pred = guess_all_neg(split_x)
    report(split_y, y_pred)

## Model with class weights + bias init

In [None]:
# Mini-batch size passed to model.fit / model.evaluate.
BATCH_SIZE = 2048
# Metrics tracked during training and evaluation. The `name` values are the
# keys used later in the training history (e.g. 'prc' / 'val_prc').
METRICS = [
      keras.metrics.TruePositives(name='tp'),
      keras.metrics.FalsePositives(name='fp'),
      keras.metrics.TrueNegatives(name='tn'),
      keras.metrics.FalseNegatives(name='fn'), 
      keras.metrics.BinaryAccuracy(name='accuracy'),
      keras.metrics.Precision(name='precision'),
      keras.metrics.Recall(name='recall'),
      keras.metrics.AUC(name='auc'),
      keras.metrics.AUC(name='prc', curve='PR'), # precision-recall curve
]


def make_model(metrics=METRICS, output_bias=None, lr=1e-3, dropout=0.5):
    """Build and compile a small fully connected binary classifier.

    The input width is read from the module-level ``x_train`` at call time.

    Args:
        metrics: Keras metrics to track; defaults to the module-level METRICS.
        output_bias: optional initial value for the output layer's bias;
            wrapped in a Constant initializer when given.
        lr: learning rate (or schedule object) handed to Adam.
        dropout: dropout rate applied before the output layer.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    bias_init = None if output_bias is None else tf.keras.initializers.Constant(output_bias)
    n_features = x_train.shape[-1]

    model = keras.Sequential()
    model.add(keras.layers.Dense(128, activation='relu', input_shape=(n_features,)))
    model.add(keras.layers.Dense(128, activation='relu'))
    model.add(keras.layers.Dropout(dropout))
    model.add(keras.layers.Dense(1, activation='sigmoid', bias_initializer=bias_init))

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=lr),
        loss=keras.losses.BinaryCrossentropy(),
        metrics=metrics,
    )
    return model

In [None]:
# We want the untrained network to predict the class prior,
# n_pos/(n_pos + n_neg)=220/6819~0.03, for every sample, so the initial loss
# starts near the entropy of that prior instead of being dominated by a wrong bias.
# https://karpathy.github.io/2019/04/25/recipe/#2-set-up-the-end-to-end-trainingevaluation-skeleton--get-dumb-baselines

# To do this we set the initial_bias = log(n_pos/n_neg)
EPOCHS = 200

initial_bias = np.log([n_pos / n_neg])
print(initial_bias)

random.seed(0)
# Keras weight initialisation draws from TensorFlow's RNG, which `random.seed`
# does not touch; seed it as well so the saved initial weights are reproducible.
tf.random.set_seed(0)

# The schedule advances once per optimizer step (one batch), so it has to span
# EPOCHS * ceil(samples / batch size) steps. Flooring the overall product
# ignores the partial last batch of every epoch and drives the learning rate
# to zero before training finishes.
steps_per_epoch = int(np.ceil(len(x_train) / BATCH_SIZE))
lr = 1e-3
lr = tf.keras.experimental.CosineDecay(lr, decay_steps=EPOCHS * steps_per_epoch)
model = make_model(output_bias=initial_bias, lr=lr)
model.summary()
# Saved so later experiments can start from the same initial weights.
model.save_weights('initial_weights')

# Metrics of the untrained model on the training split (sanity check).
results = model.evaluate(x_train, y_train, batch_size=BATCH_SIZE, verbose=0)
print(results)

In [None]:
random.seed(0)

# Stop once validation PR-AUC has not improved for 20 epochs and restore the
# best weights seen.
callbacks = [
    tf.keras.callbacks.EarlyStopping(
        monitor='val_prc',
        mode='max',
        patience=20,
        restore_best_weights=True,
        verbose=1,
    ),
    # PrintLRCallback()
    # tf.keras.callbacks.ReduceLROnPlateau(monitor='val_prc', verbose=1, patience=10, mode='max', min_lr=0, factor=0.2)
]

# Weight each class inversely to its frequency; the division by 2 applies the
# same constant scale to both weights.
total = n_pos + n_neg
class_weight = {label: (total / count) / 2.0 for label, count in ((0, n_neg), (1, n_pos))}
print(class_weight)

history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    class_weight=class_weight,
    callbacks=callbacks,
    verbose=1,
)

# Print every tracked metric for the training split, then the validation split.
for split_x, split_y in ((x_train, y_train), (x_val, y_val)):
    results = model.evaluate(split_x, split_y, batch_size=BATCH_SIZE, verbose=0)
    for name, value in zip(model.metrics_names, results):
        print(name, ': ', value)

plot_history(history)

In [None]:
# Confusion matrix / classification report and ROC curve on the training split.
y_pred = model.predict(x_train)
report(y_train, y_pred)
plot_roc(y_train, y_pred)

In [None]:
# Confusion matrix / classification report and ROC curve on the test split.
y_pred = model.predict(x_test)
report(y_test, y_pred)
plot_roc(y_test, y_pred)

# Print every compiled metric for the test split.
results = model.evaluate(x_test, y_test, batch_size=BATCH_SIZE, verbose=0)
for name, value in zip(model.metrics_names, results):
    print(name, ': ', value)

## SMOTE

In [None]:
from imblearn.over_sampling import SMOTE 

# Resample the training split only with a seeded SMOTE; the validation and
# test splits are left untouched.
sm = SMOTE(random_state=0)
x_train_res, y_train_res = sm.fit_resample(x_train, y_train)

In [None]:
random.seed(0)

# Fresh model with default settings, reset to the previously saved initial weights.
model = make_model()
model.load_weights('initial_weights')
# Overwrite the output bias with zero (presumably because the resampled
# training set no longer needs the class-prior bias — confirm).
model.layers[-1].bias.assign([0])

history = model.fit(
    x_train_res,
    y_train_res,
    validation_data=(x_val, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=callbacks,
    verbose=1,
)

# Print every tracked metric for the resampled training split, then validation.
for split_x, split_y in ((x_train_res, y_train_res), (x_val, y_val)):
    results = model.evaluate(split_x, split_y, batch_size=BATCH_SIZE, verbose=0)
    for name, value in zip(model.metrics_names, results):
        print(name, ': ', value)

plot_history(history)

In [None]:
# Evaluate on the original (not resampled) training split.
y_pred = model.predict(x_train)
report(y_train, y_pred)
plot_roc(y_train, y_pred)

In [None]:
# Confusion matrix / classification report and ROC curve on the test split.
y_pred = model.predict(x_test)
report(y_test, y_pred)
plot_roc(y_test, y_pred)

# Print every compiled metric for the test split.
results = model.evaluate(x_test, y_test, batch_size=BATCH_SIZE, verbose=0)
for name, value in zip(model.metrics_names, results):
    print(name, ': ', value)