# Insurance Policy Binary Classification - TensorFlow CNN

## [1] Import the Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer 

from sklearn.pipeline import Pipeline

## [2] Load and Split the Data

In [None]:
# Whole Dataset
# Read the complete training CSV; the path is relative to the working directory.
data = pd.read_csv("TrainingDataset_2023Qualification.csv")

In [None]:
# Split the Features and Target Variables
# Column 1 is taken as the target and columns 2 onward as the features.
# Column 0 is left out of both (presumably a row/policy identifier -- TODO confirm).
features = data.iloc[:, 2:]
target = data.iloc[:, 1]

In [None]:
# Split the Data into Training, Validation, and Testing Sets
# Stratify on the target so every subset keeps the same class ratio, and fix
# the seed so the split (and every number derived from it further down) can be
# reproduced after a kernel restart.
SPLIT_SEED = 42

x_train_val, x_test, y_train_val, y_test = train_test_split(
    features, target, test_size=0.20, stratify=target, random_state=SPLIT_SEED
)

# Split the Data into Training and Validation Sets
x_train, x_val, y_train, y_val = train_test_split(
    x_train_val, y_train_val, test_size=0.20, stratify=y_train_val, random_state=SPLIT_SEED
)

## [3] Exploratory Data Analysis

### [3.1] Explore the Data

In [None]:
# Check Data Types
x_train.dtypes

#### 3.1.1 Numerical Continuous Variables

In [None]:
# Explore Numerical Continuous Variables for Anomalies [Outliers]
x_train[["policyHolderAge", "homeInsurancePremium", "nbWeeksInsured"]].describe()

In [None]:
# Plot a Histogram of the Numerical Variables
fig, ax = plt.subplots(1, 3, figsize=(15, 5))

# One panel per continuous feature, each drawn in its own colour.
hist_specs = [
    ("policyHolderAge", 'blue'),
    ("homeInsurancePremium", 'red'),
    ("nbWeeksInsured", 'green'),
]
for axis, (column_name, colour) in zip(ax, hist_specs):
    axis.hist(x_train[column_name], bins=20, color=colour)
    axis.set_title(column_name)

fig.tight_layout()

plt.show()

In [None]:
# Plot a Scatterplot of the Numerical Variables
fig, ax = plt.subplots(1, 3, figsize=(15, 5))

# (x column, y column, colour) for each pairwise comparison panel.
scatter_specs = [
    ("policyHolderAge", "homeInsurancePremium", 'blue'),
    ("policyHolderAge", "nbWeeksInsured", 'red'),
    ("homeInsurancePremium", "nbWeeksInsured", 'green'),
]
for axis, (x_name, y_name, colour) in zip(ax, scatter_specs):
    axis.scatter(x_train[x_name], x_train[y_name], color=colour)
    axis.set_title(f'{x_name} vs. {y_name}')

fig.tight_layout()

plt.show()

In [None]:
# Check Outliers for homeInsurancePremium
# (The bare string below is a no-op expression used as an inline note.)
""" Outliers do exist within the homeInsurancePremium variable. However, having more than one outlier means they did not come by mistake. Hence, they will be kept but with caution. """
# Boolean mask of training rows whose premium exceeds 4000; its sum is the row count.
mask = x_train["homeInsurancePremium"] > 4000
print(mask.sum())

In [None]:
# Check for Collinearity
# Pairwise correlation matrix of the three continuous features.
x_train[["policyHolderAge", "homeInsurancePremium", "nbWeeksInsured"]].corr()

#### 3.1.2 Categorical Variables - High Cardinality

In [None]:
# One can use: frequency encoding, target encoding, hashing trick encoding, or embedding
# Check the Number of Distinct Values - Variables are already Label Encoded!
x_train[["territory", "saleChannel"]].nunique()

In [None]:
# Plot a Histogram of the Variables
# Bind the new figure to `fig` so that tight_layout() below acts on this
# figure rather than on one left over from an earlier cell.
fig, ax = plt.subplots(1, 2, figsize=(15, 5))

ax[0].hist(x_train["territory"], bins=20, color='blue')
ax[0].set_title('territory')
ax[1].hist(x_train["saleChannel"], bins=20, color='red')
ax[1].set_title('saleChannel')

fig.tight_layout()

plt.show()

In [None]:
# Check for Categorical Dependence
# Comment: Some p-values are zero which indicates there is a relationship between their corresponding variables.
#          A low p-value doesn't necessarily mean the relationship is strong, and it doesn't provide information about the nature or strength of the relationship.
#          In addition, a low p-value could also mean that the sample size is large enough to detect even small differences between observed and expected frequencies.
from scipy.stats import chi2_contingency

# Cross-tabulate the two high-cardinality variables and run the chi-square test on the table.
contingency_table = pd.crosstab(index=x_train["territory"], columns=x_train["saleChannel"])
stat, p, dof, expected = chi2_contingency(contingency_table)

# Report the statistic, the p-value and the degrees of freedom, one per line.
chi2_report = (
    ("Chi-square Statistic is: ", stat),
    ("p-value is: ", p),
    ("Number of DOFs is: ", dof),
)
for caption, value in chi2_report:
    print(caption + str(value))


#### 3.1.3 Categorical Variables - Low Cardinality

In [None]:
# Check Frequencies of Categorical Variables
low_card_columns = [
    "Gender",
    "hasCanadianDrivingLicense",
    "hasAutoInsurance",
    "hadVehicleClaimInPast",
    "isOwner",
    "rentedVehicle",
    "hasMortgage",
    "vehicleStatus",
]

# Print one frequency table per column, with a dashed rule between tables.
for position, column_name in enumerate(low_card_columns):
    if position > 0:
        print("---------")
    print(x_train[[column_name]].value_counts())

In [None]:
# Plot a Histogram of the Variables
columns = ['Gender', 'hasCanadianDrivingLicense', "hasAutoInsurance", "hadVehicleClaimInPast", "isOwner", "rentedVehicle", "hasMortgage", "vehicleStatus"]

# 2 x 4 grid: one panel per categorical column
nrows, ncols = 2, 4
fig, ax = plt.subplots(nrows, ncols, figsize=(16, 8))

# pair every axis of the flattened grid with the column it displays
for axi, col in zip(ax.flat, columns):
    # bar chart of how often each category occurs in the column
    x_train[col].value_counts().plot(kind='bar', ax=axi)
    axi.set_title(col)
    axi.set_xlabel(col)
    axi.set_ylabel('Frequency')

# tidy the spacing between panels, then render
plt.tight_layout()
plt.show()

In [None]:
# Check for Dependence
# [1] Establish contingency tables
# [2] Perform Chi-squared test
# [3] Comment: Some p-values are zero which indicates there is a relationship between their corresponding variables.
#              A low p-value doesn't necessarily mean the relationship is strong, and it doesn't provide information about the nature or strength of the relationship.
#              In addition, a low p-value could also mean that the sample size is large enough to detect even small differences between observed and expected frequencies.
#              Proceed with caution!
from scipy.stats import chi2_contingency

n_vars = len(columns)

# -1 marks the cells that are never tested (diagonal and lower triangle).
p_matrix = np.full((n_vars, n_vars), -1.0)

# Fill the upper triangle with the p-value of every unordered pair of columns.
for i, row_var in enumerate(columns):
    for j in range(i + 1, n_vars):
        contingency_table = pd.crosstab(index=x_train[row_var], columns=x_train[columns[j]])
        stat, p, dof, expected = chi2_contingency(contingency_table)
        p_matrix[i, j] = np.round(p, 5)

print(p_matrix)

#### 3.1.4 Response Variable

In [None]:
# Check Response Variable for Class Imbalance
target_imbalance = (y_train.value_counts()/x_train.shape[0])
target_imbalance

In [None]:
# Plot Classes
frequency = y_train.value_counts()

# Draw the class counts as a bar chart and label both axes on the returned Axes
class_axis = frequency.plot(kind='bar')
class_axis.set_xlabel('Category')
class_axis.set_ylabel('Frequency')

# Display the plot
plt.show()

### [3.2] Check for Missing Values

In [None]:
# NA values in "hasMortgage" mean the house is rented. Hence, they need to be imputed as a third category!
# Count the missing values per feature column of the training split.
x_train.isna().sum()

### [3.3] Data Pre-processing Pipelines
Create a set of data preprocessing pipelines before building the CNN model.

In [None]:
# Create Columns Lists
num_features = ["policyHolderAge", "homeInsurancePremium", "nbWeeksInsured"]
cat_long_features = ["territory", "saleChannel"]
cat_features = ['Gender', 'hasCanadianDrivingLicense', "hasAutoInsurance", "hadVehicleClaimInPast", "isOwner", "rentedVehicle", "hasMortgage", "vehicleStatus"]

##### Transformer #1
Imputation + Categorical Encodings

In [None]:
# Sub-pipeline used for "hasMortgage": fill missing values with the constant 2,
# then one-hot encode. Presumably 2 is not an existing category code, so that
# "missing" becomes its own indicator column -- TODO confirm.
impute_and_onehot_pipe = Pipeline(steps=[
                                            ('imputer', SimpleImputer(strategy='constant', fill_value=2)),
                                            ('onehot', OneHotEncoder(handle_unknown='ignore'))
                                        ])


# One-hot encode the four listed categorical columns; every column not named
# here is forwarded unchanged (remainder='passthrough').
# The order of this list determines the order of the output columns.
preprocessing_transformer1 = ColumnTransformer(transformers=[
                                                                ('genderPipe', OneHotEncoder(handle_unknown='ignore'), ["Gender"]),
                                                                ('vehicleClaimPipe', OneHotEncoder(handle_unknown='ignore'), ["hadVehicleClaimInPast"]),
                                                                ('hasMortgagePipe', impute_and_onehot_pipe, ["hasMortgage"]),
                                                                ('vehicleStatusPipe', OneHotEncoder(handle_unknown='ignore'), ["vehicleStatus"])
                                                            ], remainder='passthrough')

# Single-step pipeline wrapping the column transformer.
preprocessing_pipeline1 = Pipeline([
                                      ('transform_column', preprocessing_transformer1)
                                   ])
# Last expression of the cell: displays the pipeline in the notebook output.
preprocessing_pipeline1


##### Transformer #3
Imputation + Categorical Encodings + Numeric Standardization + Normalization (cat_long_features)

In [None]:
# NOTE(review): `ce` is imported here but not used anywhere in this cell.
import category_encoders as ce

# Sub-pipeline used for "hasMortgage": fill missing values with the constant 2,
# then one-hot encode.
impute_and_onehot_pipe = Pipeline(steps=[
                                            ('imputer', SimpleImputer(strategy='constant', fill_value=2)),
                                            ('onehot', OneHotEncoder(handle_unknown='ignore'))
                                        ])


# Column-wise preprocessing; the order of this list determines the order of the
# output columns, and columns not named here are passed through unchanged.
# NOTE(review): policyHolderAge and homeInsurancePremium are also members of
# num_features, so each is emitted twice: once as an unscaled natural log
# (numericPipe_log) and once standardized (numericPipe) -- confirm this is intended.
# NOTE(review): np.log yields -inf/NaN for zero or negative inputs -- confirm
# both columns are strictly positive.
# The label-encoded territory/saleChannel codes are standardized as if numeric.
preprocessing_transformer3 = ColumnTransformer(transformers=[
                                                                ("numericPipe_log", FunctionTransformer(np.log, validate=True), ["policyHolderAge", "homeInsurancePremium"]),
                                                                ("numericPipe", StandardScaler(), num_features),
                                                                ("catLongPipe", StandardScaler(), cat_long_features),
                                                                ('genderPipe', OneHotEncoder(handle_unknown='ignore'), ["Gender"]),
                                                                ('vehicleClaimPipe', OneHotEncoder(handle_unknown='ignore'), ["hadVehicleClaimInPast"]),
                                                                ('hasMortgagePipe', impute_and_onehot_pipe, ["hasMortgage"]),
                                                                ('vehicleStatusPipe', OneHotEncoder(handle_unknown='ignore'), ["vehicleStatus"]),
                                                            ], remainder='passthrough')

# Single-step pipeline wrapping the column transformer.
preprocessing_pipeline3 = Pipeline([
                                      ('transform_column', preprocessing_transformer3)
                                   ])
# Last expression of the cell: displays the pipeline in the notebook output.
preprocessing_pipeline3

##### Transformer #5
Imputation + Categorical Encodings + Numeric Standardization + Count Encoding (cat_long_features) + Normalization (cat_long_features)

In [None]:
import category_encoders as ce

# Sub-pipeline used for "hasMortgage": fill missing values with the constant 2,
# then one-hot encode.
impute_and_onehot_pipe = Pipeline(steps=[
                                            ('imputer', SimpleImputer(strategy='constant', fill_value=2)),
                                            ('onehot', OneHotEncoder(handle_unknown='ignore'))
                                        ])


# Column-wise preprocessing applied after the count encoders below; the order of
# this list determines the order of the output columns, and columns not named
# here are passed through unchanged.
# NOTE(review): policyHolderAge and homeInsurancePremium are also members of
# num_features, so each is emitted twice: once as an unscaled natural log
# (numericPipe_log) and once standardized (numericPipe) -- confirm this is intended.
# NOTE(review): np.log yields -inf/NaN for zero or negative inputs -- confirm
# both columns are strictly positive.
preprocessing_transformer5 = ColumnTransformer(transformers=[
                                                                ("numericPipe_log", FunctionTransformer(np.log, validate=True), ["policyHolderAge", "homeInsurancePremium"]),
                                                                ("numericPipe", StandardScaler(), num_features),
                                                                ("catLongPipe", StandardScaler(), cat_long_features),
                                                                ('genderPipe', OneHotEncoder(handle_unknown='ignore'), ["Gender"]),
                                                                ('vehicleClaimPipe', OneHotEncoder(handle_unknown='ignore'), ["hadVehicleClaimInPast"]),
                                                                ('hasMortgagePipe', impute_and_onehot_pipe, ["hasMortgage"]),
                                                                ('vehicleStatusPipe', OneHotEncoder(handle_unknown='ignore'), ["vehicleStatus"]),
                                                              ], remainder='passthrough')

# Count-encode territory and saleChannel first (each encoder returns a DataFrame,
# so the column names are still available to the ColumnTransformer), then apply
# the column-wise preprocessing, which standardizes the resulting counts.
preprocessing_pipeline5 = Pipeline([
                                        ("terr_Encoder", ce.CountEncoder(cols=["territory"], return_df=True)),
                                        ("saleChannel_Encoder", ce.CountEncoder(cols=["saleChannel"], return_df=True)),
                                        ('transform_column', preprocessing_transformer5)
                                   ])

# Last expression of the cell: displays the pipeline in the notebook output.
preprocessing_pipeline5

##### Transformer #7
Imputation + Categorical Encodings + Numeric Standardization + Target Encoding (cat_long_features) + Normalization (cat_long_features)

In [None]:
import category_encoders as ce

# Sub-pipeline used for "hasMortgage": fill missing values with the constant 2,
# then one-hot encode.
impute_and_onehot_pipe = Pipeline(steps=[
                                            ('imputer', SimpleImputer(strategy='constant', fill_value=2)),
                                            ('onehot', OneHotEncoder(handle_unknown='ignore'))
                                        ])


# Column-wise preprocessing applied after the target encoders below; the order
# of this list determines the order of the output columns, and columns not named
# here are passed through unchanged.
# NOTE(review): policyHolderAge and homeInsurancePremium are also members of
# num_features, so each is emitted twice: once as an unscaled natural log
# (numericPipe_log) and once standardized (numericPipe) -- confirm this is intended.
# NOTE(review): np.log yields -inf/NaN for zero or negative inputs -- confirm
# both columns are strictly positive.
preprocessing_transformer7 = ColumnTransformer(transformers=[
                                                                ("numericPipe_log", FunctionTransformer(np.log, validate=True), ["policyHolderAge", "homeInsurancePremium"]),
                                                                ("numericPipe", StandardScaler(), num_features),
                                                                ("catLongPipe", StandardScaler(), cat_long_features),
                                                                ('genderPipe', OneHotEncoder(handle_unknown='ignore'), ["Gender"]),
                                                                ('vehicleClaimPipe', OneHotEncoder(handle_unknown='ignore'), ["hadVehicleClaimInPast"]),
                                                                ('hasMortgagePipe', impute_and_onehot_pipe, ["hasMortgage"]),
                                                                ('vehicleStatusPipe', OneHotEncoder(handle_unknown='ignore'), ["vehicleStatus"]),
                                                              ], remainder='passthrough')

# Target-encode territory and saleChannel first, then apply the column-wise
# preprocessing, which standardizes the encoded values.
# NOTE(review): target encoding is supervised, so fitting this pipeline
# presumably requires the labels, e.g. fit(x_train, y_train) -- verify before use.
preprocessing_pipeline7 = Pipeline([
                                        ("terr_Encoder", ce.TargetEncoder(cols=["territory"], return_df=True)),
                                        ("saleChannel_Encoder", ce.TargetEncoder(cols=["saleChannel"], return_df=True)),
                                        ('transform_column', preprocessing_transformer7)
                                   ])

# Last expression of the cell: displays the pipeline in the notebook output.
preprocessing_pipeline7

## [4] Prediction Models

In [None]:
import tensorflow as tf
from tensorflow import keras

import os
import tempfile

import sklearn
from sklearn.metrics import confusion_matrix

In [None]:
# Transform the Data
# Fit the preprocessing (count encodings, scaler statistics, one-hot categories)
# on the training split only and re-use that fitted pipeline for the validation
# and test splits. Re-fitting on each split would encode and scale every split
# differently and could even produce a different number of one-hot columns.
x_train_trans = preprocessing_pipeline5.fit_transform(x_train)
x_val_trans = preprocessing_pipeline5.transform(x_val)
x_test_trans = preprocessing_pipeline5.transform(x_test)

### [4.1] Model Metrics & Network Architecture

In [None]:
# Metrics tracked during training and evaluation; the `name` of each metric is
# the key under which it appears in the Keras history (e.g. 'prc', 'val_prc').
METRICS = [
              keras.metrics.TruePositives(name='tp'),
              keras.metrics.FalsePositives(name='fp'),
              keras.metrics.TrueNegatives(name='tn'),
              keras.metrics.FalseNegatives(name='fn'),
              keras.metrics.BinaryAccuracy(name='accuracy'),
              keras.metrics.Precision(name='precision'),
              keras.metrics.Recall(name='recall'),
              keras.metrics.AUC(name='auc'),
              keras.metrics.AUC(name='prc', curve='PR'), # precision-recall curve
        ]

def make_model(metrics=METRICS, output_bias=None, *, hidden_layers=16,
               hidden_units=250, dropout_rate=0.50, learning_rate=1e-3):
  """Build and compile the fully connected binary classifier.

  The network is a stack of Dense layers (there are no convolutional layers):
  a 19-unit input layer, `hidden_layers` hidden layers of `hidden_units` units,
  a dropout layer and a single sigmoid output unit. The input width is read
  from the module-level `x_train_trans`, so that array must exist beforehand.

  Args:
    metrics: Keras metrics passed to `compile`.
    output_bias: Optional initial value for the output layer's bias; when given
      it is wrapped in a Constant initializer.
    hidden_layers: Number of hidden Dense layers (default 16, as before).
    hidden_units: Width of each hidden Dense layer (default 250, as before).
    dropout_rate: Rate of the dropout layer before the output (default 0.50).
    learning_rate: Adam learning rate (default 1e-3).

  Returns:
    The compiled `keras.Sequential` model (binary cross-entropy loss).
  """
  if output_bias is not None:
    output_bias = tf.keras.initializers.Constant(output_bias)

  # Try different architectures by changing the keyword arguments above.
  layers = [keras.layers.Dense(19, activation='relu', input_shape=(x_train_trans.shape[-1],))]
  layers += [keras.layers.Dense(hidden_units, activation='relu') for _ in range(hidden_layers)]
  layers += [
      keras.layers.Dropout(dropout_rate),
      keras.layers.Dense(1, activation='sigmoid', bias_initializer=output_bias),
  ]
  model = keras.Sequential(layers)

  model.compile(
                  optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
                  loss=keras.losses.BinaryCrossentropy(),
                  metrics=metrics
              )

  return model


### [4.2] Baseline Model

#### 4.2.1 Build the Model

In [None]:
EPOCHS = 100
BATCH_SIZE = 2048

early_stopping = tf.keras.callbacks.EarlyStopping(
                                                    monitor='val_prc', 
                                                    verbose=1,
                                                    patience=10,
                                                    mode='max',
                                                    restore_best_weights=True
                                                )

model = make_model()
model.summary()

In [None]:
# Test the Model
model.predict(x_train_trans[:10])

In [None]:
# Evaluate Losses
results = model.evaluate(x_train_trans, y_train, batch_size=BATCH_SIZE, verbose=0)
print("Loss: {:0.4f}".format(results[0]))

#### 4.2.2 Correct for Initial Bias

In [None]:
# Count the classes on the training split only, so that the initial bias, the
# class weights and the resampling step count derived from these numbers do
# not depend on validation or test labels.
neg, pos = np.bincount(y_train)
total = neg + pos

In [None]:
initial_bias = np.log([pos/neg])
initial_bias

In [None]:
# Try the Model with the New Initial Bias
model = make_model(output_bias=initial_bias)
model.predict(x_train_trans[:10])

In [None]:
# Check Initial Loss
results = model.evaluate(x_train_trans, y_train, batch_size=BATCH_SIZE, verbose=0)
print("Loss: {:0.4f}".format(results[0])) # Loss Reduced Dramatically from 0.8197 to 0.3730!

#### 4.2.3 Save Initial Weights

In [None]:
# Keras 3 only accepts weight files whose name ends in ".weights.h5"; older
# tf.keras releases treat the ".h5" suffix as HDF5, so this name works on both.
initial_weights = os.path.join(tempfile.mkdtemp(), 'initial.weights.h5')
model.save_weights(initial_weights)

#### 4.2.4 Confirm Initial Bias Effectiveness

In [None]:
# No Bias Model
model = make_model()
model.load_weights(initial_weights)
model.layers[-1].bias.assign([0.0])
zero_bias_history = model.fit(
                                x_train_trans,
                                y_train,
                                batch_size=BATCH_SIZE,
                                epochs=20,
                                validation_data=(x_val_trans, y_val), 
                                verbose=0
                            )

In [None]:
# Biased Model
model = make_model()
model.load_weights(initial_weights)
careful_bias_history = model.fit(
                                    x_train_trans,
                                    y_train,
                                    batch_size=BATCH_SIZE,
                                    epochs=20,
                                    validation_data=(x_val_trans, y_val), 
                                    verbose=0
                                )


In [None]:
# Plot the Results
def plot_loss(history, label, color):
  """Plot training (solid) and validation (dashed) loss of one run on a log y-axis.

  Args:
    history: object exposing `.epoch` and `.history` with 'loss' and 'val_loss'.
    label: run name appended to the 'Train ' / 'Val ' legend entries.
    color: colour shared by both curves of the run.
  """
  epochs = history.epoch

  # (history key, legend prefix, extra line styling); a log scale copes with
  # the wide range of loss values.
  curves = (
      ('loss', 'Train ', {}),
      ('val_loss', 'Val ', {'linestyle': "--"}),
  )
  for key, prefix, styling in curves:
    plt.semilogy(epochs, history.history[key], color=color, label=prefix + label, **styling)

  plt.title("Loss vs. # of Epochs")
  plt.xlabel('Epoch')
  plt.ylabel('Loss')

In [None]:
plot_loss(zero_bias_history, "Zero Bias", "blue")
plot_loss(careful_bias_history, "Careful Bias", "green")

#### 4.2.5 Train the Model

In [None]:
model = make_model()
model.load_weights(initial_weights)
baseline_history = model.fit(
                                x_train_trans,
                                y_train,
                                batch_size=BATCH_SIZE,
                                epochs=EPOCHS,
                                callbacks=[early_stopping],
                                validation_data=(x_val_trans, y_val)
                            )

#### 4.2.6 Plot Training History

In [None]:
def plot_metrics(history, metrics=('loss', 'prc', 'precision', 'recall')):
  """Plot train vs. validation curves for the selected metrics, two panels per row.

  Args:
    history: object exposing `.epoch` and `.history`; for every metric `m` the
      history must contain both `m` and `'val_' + m`.
    metrics: names of the metrics to plot. The default reproduces the original
      2 x 2 grid; pass e.g. ('loss', 'auc') to plot other recorded metrics.
  """
  n_cols = 2
  # Ceiling division: enough rows to hold every requested metric.
  n_rows = max(1, -(-len(metrics) // n_cols))

  for n, metric in enumerate(metrics):
    name = metric.replace("_"," ").capitalize()
    plt.subplot(n_rows, n_cols, n+1)

    plt.plot(history.epoch, history.history[metric], color="blue", label='Train')
    plt.plot(history.epoch, history.history['val_'+metric], color="blue", linestyle="--", label='Val')

    plt.xlabel('Epoch')
    plt.ylabel(name)
    if metric == 'loss':
      # Loss is unbounded above: anchor the axis at zero, keep the automatic top.
      plt.ylim([0, plt.ylim()[1]])
    elif metric == 'auc':
      # Zoom in on the upper range for AUC.
      plt.ylim([0.8,1])
    else:
      plt.ylim([0,1])

    plt.legend()

In [None]:
plot_metrics(baseline_history)

#### 4.2.7 Evaluate the Metrics

In [None]:
train_predictions_baseline = model.predict(x_train_trans, batch_size=BATCH_SIZE)
test_predictions_baseline = model.predict(x_test_trans, batch_size=BATCH_SIZE)

In [None]:
def plot_cm(labels, predictions, p=0.5):
  """Draw the confusion matrix at threshold `p` and print its four cells.

  Args:
    labels: true class labels.
    predictions: predicted scores; a score strictly greater than `p` counts as
      a positive prediction.
    p: decision threshold.
  """
  predicted_positive = predictions > p
  cm = confusion_matrix(labels, predicted_positive)

  plt.figure(figsize=(5,5))
  sns.heatmap(cm, annot=True, fmt="d")
  plt.title('Confusion matrix @{:.2f}'.format(p))
  plt.ylabel('Actual label')
  plt.xlabel('Predicted label')

  # (caption, (row, column)) -- rows are actual classes, columns predicted ones.
  cell_report = (
      ('No Buy Detected (True Negatives): ', (0, 0)),
      ('No Buy Incorrectly Detected (False Positives): ', (0, 1)),
      ('Buy Missed (False Negatives): ', (1, 0)),
      ('Buy Detected (True Positives): ', (1, 1)),
  )
  for caption, (row, col) in cell_report:
    print(caption, cm[row][col])
  print('Total Buy Decisions: ', np.sum(cm[1]))

In [None]:
baseline_results = model.evaluate(x_test_trans, y_test, batch_size=BATCH_SIZE, verbose=0)
for name, value in zip(model.metrics_names, baseline_results):
  print(name, ': ', value)
print()

plot_cm(y_test, test_predictions_baseline)

#### 4.2.8 Plot ROC

In [None]:
def plot_roc(name, labels, predictions, **kwargs):
  """Add one ROC curve, in percent, to the current axes.

  Args:
    name: legend label of the curve.
    labels: true class labels.
    predictions: predicted scores.
    **kwargs: extra styling forwarded to `plt.plot` (e.g. color, linestyle).
  """
  false_pos_rate, true_pos_rate, _thresholds = sklearn.metrics.roc_curve(labels, predictions)

  # Rates are fractions; scale them to percentages for the axes below.
  plt.plot(100*false_pos_rate, 100*true_pos_rate, label=name, linewidth=2, **kwargs)

  plt.xlabel('False positives [%]')
  plt.xlim([0,100])
  plt.ylabel('True positives [%]')
  plt.ylim([0,100.5])

  plt.grid(True)
  plt.gca().set_aspect('equal')

In [None]:
plot_roc("Train Baseline", y_train, train_predictions_baseline, color="green")
plot_roc("Test Baseline", y_test, test_predictions_baseline, color="green", linestyle='--')
plt.legend(loc='lower right');

#### 4.2.9 Plot AUPRC

In [None]:
def plot_prc(name, labels, predictions, **kwargs):
    """Add one precision/recall curve to the current axes.

    Precision is drawn on the x-axis and recall on the y-axis.

    Args:
        name: legend label of the curve.
        labels: true class labels.
        predictions: predicted scores.
        **kwargs: extra styling forwarded to `plt.plot` (e.g. color, linestyle).
    """
    precision, recall, _thresholds = sklearn.metrics.precision_recall_curve(labels, predictions)

    plt.plot(precision, recall, label=name, linewidth=2, **kwargs)

    plt.grid(True)
    plt.xlabel('Precision')
    plt.ylabel('Recall')
    plt.gca().set_aspect('equal')

In [None]:
plot_prc("Train Baseline", y_train, train_predictions_baseline, color="orange")
plot_prc("Test Baseline", y_test, test_predictions_baseline, color="blue", linestyle='--')
plt.legend(loc='lower right');

### [4.3] Class Weights Model

#### 4.3.1 Evaluate Class Weights

In [None]:
# Scaling by total/2 helps keep the loss to a similar magnitude.
# The sum of the weights of all examples stays the same.
weight_for_0 = (1 / neg) * (total / 2.0)
weight_for_1 = (1 / pos) * (total / 2.0)

class_weight = {0: weight_for_0, 1: weight_for_1}

print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))

#### 4.3.2 Train Model - Consider Class Weights

In [None]:
weighted_model = make_model()
weighted_model.load_weights(initial_weights)

weighted_history = weighted_model.fit(
                                        x_train_trans,
                                        y_train,
                                        batch_size=BATCH_SIZE,
                                        epochs=EPOCHS,
                                        callbacks=[early_stopping],
                                        validation_data=(x_val_trans, y_val),
                                        # The class weights go here
                                        class_weight=class_weight
                                    )

#### 4.3.3 Training History

In [None]:
plot_metrics(weighted_history)

#### 4.3.4 Evaluate Metrics

In [None]:
train_predictions_weighted = weighted_model.predict(x_train_trans, batch_size=BATCH_SIZE)
test_predictions_weighted = weighted_model.predict(x_test_trans, batch_size=BATCH_SIZE)

In [None]:
weighted_results = weighted_model.evaluate(x_test_trans, y_test, batch_size=BATCH_SIZE, verbose=0)

for name, value in zip(weighted_model.metrics_names, weighted_results):
  print(name, ': ', value)
print()

plot_cm(y_test, test_predictions_weighted)

#### 4.3.5 Plot ROC

In [None]:
plot_roc("Train Baseline", y_train, train_predictions_baseline, color="blue")
plot_roc("Test Baseline", y_test, test_predictions_baseline, color="blue", linestyle='--')

plot_roc("Train Weighted", y_train, train_predictions_weighted, color="orange")
plot_roc("Test Weighted", y_test, test_predictions_weighted, color="orange", linestyle='--')


plt.legend(loc='lower right');

#### 4.3.6 Plot AUPRC

In [None]:
plot_prc("Train Baseline", y_train, train_predictions_baseline, color="blue")
plot_prc("Test Baseline", y_test, test_predictions_baseline, color="blue", linestyle='--')

plot_prc("Train Weighted", y_train, train_predictions_weighted, color="green")
plot_prc("Test Weighted", y_test, test_predictions_weighted, color="green", linestyle='--')

plt.legend(loc='lower right');

### [4.4] Over-sampling
Correct for dataset response variable imbalance.

#### 4.4.1 Oversample Minority Class

In [None]:
# Form np arrays of labels and features
bool_train_labels = y_train != 0
bool_train_labels

In [None]:
# Count positive and negative features
pos_features = x_train_trans[bool_train_labels]
neg_features = x_train_trans[~bool_train_labels]

pos_labels = y_train[bool_train_labels]
neg_labels = y_train[~bool_train_labels]

In [None]:
# Do Resampling
BUFFER_SIZE = 100000

def make_ds(features, labels):
  """Wrap (features, labels) in a shuffled tf.data.Dataset that repeats indefinitely."""
  sliced = tf.data.Dataset.from_tensor_slices((features, labels))
  # Shuffle with the module-level BUFFER_SIZE, then repeat without end so the
  # dataset can be sampled from for as many steps as needed.
  shuffled = sliced.shuffle(BUFFER_SIZE)
  return shuffled.repeat()

pos_ds = make_ds(pos_features, pos_labels)
neg_ds = make_ds(neg_features, neg_labels)

In [None]:
for features, label in pos_ds.take(1):
  print("Features:\n", features.numpy())
  print()
  print("Label: ", label.numpy())

In [None]:
# Merge Data
resampled_ds = tf.data.Dataset.sample_from_datasets([pos_ds, neg_ds], weights=[0.5, 0.5])
resampled_ds = resampled_ds.batch(BATCH_SIZE).prefetch(2)

In [None]:
for features, label in resampled_ds.take(1):
  print(label.numpy().mean())

In [None]:
# Find the Number of Steps per Epoch
# Enough batches to draw about 2 * neg examples from the balanced stream.
# Keras expects an integer step count and np.ceil returns a float, so cast it.
resampled_steps_per_epoch = int(np.ceil(2.0*neg/BATCH_SIZE))
resampled_steps_per_epoch

#### 4.4.2 Train Model - Oversampled Data

In [None]:
resampled_model = make_model()
resampled_model.load_weights(initial_weights)

# Reset the bias to zero, since this dataset is balanced.
output_layer = resampled_model.layers[-1] 
output_layer.bias.assign([0])

val_ds = tf.data.Dataset.from_tensor_slices((x_val_trans, y_val)).cache()
val_ds = val_ds.batch(BATCH_SIZE).prefetch(2) 

resampled_history = resampled_model.fit(
                                            resampled_ds,
                                            epochs=EPOCHS,
                                            steps_per_epoch=resampled_steps_per_epoch,
                                            callbacks=[early_stopping],
                                            validation_data=val_ds
                                        )

#### 4.4.3 Check Training History

In [None]:
plot_metrics(resampled_history)

#### 4.4.4 Retrained Model

In [None]:
resampled_model = make_model()
resampled_model.load_weights(initial_weights)

# Reset the bias to zero, since this dataset is balanced.
output_layer = resampled_model.layers[-1] 
output_layer.bias.assign([0])

resampled_history = resampled_model.fit(
                                            resampled_ds,
                                            # These are not real epochs
                                            steps_per_epoch=20,
                                            epochs=10*EPOCHS,
                                            callbacks=[early_stopping],
                                            validation_data=(val_ds)
                                        )

#### 4.4.5 Check Training History - Retrained Model

In [None]:
plot_metrics(resampled_history)

#### 4.4.6 Evaluate Metrics

In [None]:
train_predictions_resampled = resampled_model.predict(x_train_trans, batch_size=BATCH_SIZE)
test_predictions_resampled = resampled_model.predict(x_test_trans, batch_size=BATCH_SIZE)

In [None]:
resampled_results = resampled_model.evaluate(x_test_trans, y_test, batch_size=BATCH_SIZE, verbose=0)
for name, value in zip(resampled_model.metrics_names, resampled_results):
  print(name, ': ', value)
print()

plot_cm(y_test, test_predictions_resampled)

#### 4.4.7 Plot ROC

In [None]:
plot_roc("Train Baseline", y_train, train_predictions_baseline, color="blue")
plot_roc("Test Baseline", y_test, test_predictions_baseline, color="blue", linestyle='--')

plot_roc("Train Weighted", y_train, train_predictions_weighted, color="orange")
plot_roc("Test Weighted", y_test, test_predictions_weighted, color="orange", linestyle='--')

plot_roc("Train Resampled", y_train, train_predictions_resampled, color="green")
plot_roc("Test Resampled", y_test, test_predictions_resampled, color="green", linestyle='--')
plt.legend(loc='lower right')


#### 4.4.8 Plot AUPRC

In [None]:
plot_prc("Train Baseline", y_train, train_predictions_baseline, color="blue")
plot_prc("Test Baseline", y_test, test_predictions_baseline, color="blue", linestyle='--')

plot_prc("Train Weighted", y_train, train_predictions_weighted, color="orange")
plot_prc("Test Weighted", y_test, test_predictions_weighted, color="orange", linestyle='--')

plot_prc("Train Resampled", y_train, train_predictions_resampled, color="green")
plot_prc("Test Resampled", y_test, test_predictions_resampled, color="green", linestyle='--')
plt.legend(loc='lower right')

## [5] Scoring Dataset Predictions

In [None]:
# Load Dataset
data_comp = pd.read_csv("ScoringDataset_2023Qualification.csv")

In [None]:
# Load Features
x_comp = data_comp.iloc[:, 1:]

In [None]:
# Transform the Data
# Never fit the preprocessing on the scoring set: its encodings, scaling and
# one-hot columns must match what the model was trained on. Re-fit on the
# training features (the pipeline may have been re-fitted on another split in
# the meantime) and only *transform* the scoring features.
preprocessing_pipeline5.fit(x_train)
x_comp_trans = preprocessing_pipeline5.transform(x_comp)

In [None]:
# Predict Results
test_predictions_weighted = weighted_model.predict(x_comp_trans, batch_size=BATCH_SIZE)
test_predictions_weighted

In [None]:
threshold = 0.5

# Convert probabilities into zeros and ones using the threshold value
binary_predictions = np.where(test_predictions_weighted > threshold, 1, 0)

# Print the binary predictions
binary_predictions.shape

In [None]:
# Export to DF
pred_df = pd.DataFrame(binary_predictions)
pred_df.to_csv('comp_dataframe.csv', index=False)