## Setup

In [1]:
import random
random.seed(2020)

import tensorflow as tf
from tensorflow import keras

import os
import tempfile

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from scipy import sparse

import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_curve, auc

In [2]:
# `random.seed` in the import cell only seeds Python's own generator. Seed the
# NumPy and TensorFlow generators as well so repeated runs are comparable.
np.random.seed(2020)
tf.random.set_seed(2020)

# Default size applied to every figure created below.
mpl.rcParams['figure.figsize'] = (12, 10)
# Colours of the active matplotlib property cycle; the plotting helpers index
# into this list to pick a line colour.
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

In [3]:
# Utilities 
def plot_loss(history, label, n):
    """Plot the training and validation loss of one run on a log-scaled y-axis.

    Args:
        history: object exposing `.epoch` and a `.history` mapping that holds
            'loss' and 'val_loss' series.
        label: text appended to the 'Train ' / 'Val ' legend entries.
        n: index into the module-level `colors` list; both curves use it.
    """
    epochs = history.epoch
    series = history.history
    curve_color = colors[n]

    # A log scale keeps a wide range of loss values readable.
    plt.semilogy(epochs, series['loss'], color=curve_color, label='Train ' + label)
    plt.semilogy(epochs, series['val_loss'], color=curve_color,
                 label='Val ' + label, linestyle="--")

    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    
def plot_metrics(history):
    """Plot loss, AUC, precision and recall over epochs in a 2x2 grid.

    Each panel shows the training series as a solid line and the matching
    'val_'-prefixed series as a dashed line, both in `colors[0]`.

    Args:
        history: object exposing `.epoch` and a `.history` mapping that holds
            each metric name and its 'val_' counterpart.
    """
    epochs = history.epoch
    series = history.history
    for panel, metric in enumerate(('loss', 'auc', 'precision', 'recall'), start=1):
        plt.subplot(2, 2, panel)
        plt.plot(epochs, series[metric], color=colors[0], label='Train')
        plt.plot(epochs, series['val_' + metric],
                 color=colors[0], linestyle="--", label='Val')
        plt.xlabel('Epoch')
        # 'some_metric' -> 'Some metric' for the axis label.
        plt.ylabel(metric.replace("_", " ").capitalize())
        plt.legend()

def plot_cm(labels, predictions, p=0.5):
    """Draw the confusion matrix at threshold `p` and print its four cells.

    Args:
        labels: true binary labels (0 = negative, 1 = positive).
        predictions: predicted scores; a score strictly greater than `p`
            counts as a positive prediction.
        p: decision threshold applied to `predictions`.
    """
    # Pin the class order to (0, 1) so the matrix is always 2x2. Without it,
    # inputs containing a single class yield a 1x1 matrix and the indexing
    # below raises IndexError.
    cm = confusion_matrix(labels, predictions > p, labels=[0, 1])
    plt.figure(figsize=(5,5))
    sns.heatmap(cm, annot=True, fmt="d")
    plt.title('Confusion matrix @{:.2f}'.format(p))
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')

    # Rows are actual classes, columns are predicted classes.
    print('Legitimate Transactions Detected (True Negatives): ', cm[0][0])
    print('Legitimate Transactions Incorrectly Detected (False Positives): ', cm[0][1])
    print('Fraudulent Transactions Missed (False Negatives): ', cm[1][0])
    print('Fraudulent Transactions Detected (True Positives): ', cm[1][1])
    print('Total Fraudulent Transactions: ', np.sum(cm[1]))
    
def plot_roc(name, labels, predictions, **kwargs):
    """Add one ROC curve, with its AUC in the legend label, to the current axes.

    Args:
        name: legend prefix for the curve.
        labels: true binary labels.
        predictions: predicted scores for the positive class.
        **kwargs: forwarded to `plt.plot` (e.g. `color`, `linestyle`).
    """
    false_pos, true_pos, _thresholds = sklearn.metrics.roc_curve(labels, predictions)
    legend_text = name + ' (AUC = %0.3f)' % auc(false_pos, true_pos)

    # Rates are scaled to percentages to match the axis labels.
    plt.plot(100 * false_pos, 100 * true_pos, label=legend_text, linewidth=2, **kwargs)
    plt.xlabel('False positives [%]')
    plt.ylabel('True positives [%]')
    # Optional zoom on the top-left corner, left disabled:
    #     plt.xlim([-0.5,20])
    #     plt.ylim([80,100.5])
    plt.grid(True)
    plt.gca().set_aspect('equal')
    
def AUCcalc(y_val_val, y_pred):
    """Return the area under the ROC curve of `y_pred` scored against `y_val_val`.

    Args:
        y_val_val: true binary labels.
        y_pred: predicted scores for the positive class.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_val_val, y_pred)
    return auc(false_pos_rate, true_pos_rate)

## Data processing and exploration

In [4]:
# Read the preprocessed train and dev tables from the working directory.
# Missing values are replaced with a single space.
full_preprocessed_train = pd.read_csv(
    os.getcwd() + '/' + 'full_preprocessed_train.csv', sep=','
).replace(np.nan, " ")
full_preprocessed_val = pd.read_csv(
    os.getcwd() + '/' + 'full_preprocessed_dev.csv', sep=','
).replace(np.nan, " ")

In [5]:
# Document-vector tables for the train and validation splits. The files have
# no header row, so columns get integer names.
doc2vec_train = pd.read_csv(
    os.getcwd() + '/' + 'dv_train.csv', sep=',', header=None
)
doc2vec_val = pd.read_csv(
    os.getcwd() + '/' + 'dv_val.csv', sep=',', header=None
)

In [6]:
# Test split: the cleaned table (missing values replaced with a single space)
# and its header-less document-vector table.
test = pd.read_csv(
    os.getcwd() + '/' + 'test_clean.csv', sep=','
).replace(np.nan, " ")
dv_test = pd.read_csv(
    os.getcwd() + '/' + 'dv_test.csv', sep=',', header=None
)

In [7]:
# Place each split's document vectors next to its preprocessed columns.
# Rows are aligned on the index.
doc2vec_X_training_data = pd.concat(
    [full_preprocessed_train, doc2vec_train], axis='columns', sort=False
)
doc2vec_X_val_data = pd.concat(
    [full_preprocessed_val, doc2vec_val], axis='columns', sort=False
)
# NOTE: `test` is rebound here, so re-running this cell alone appends the
# vectors a second time.
test = pd.concat([test, dv_test], axis='columns', sort=False)

### Examine the class label imbalance

Let's look at the dataset imbalance:

In [8]:
# Count label 0 (neg) and label 1 (pos) rows. The two-way unpacking requires
# that the label column contains exactly the values 0 and 1.
neg, pos = np.bincount(doc2vec_X_training_data['label'])
total = neg + pos

summary_template = 'Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'
print(summary_template.format(total, pos, 100 * pos / total))

Examples:
    Total: 250874
    Positive: 25819 (10.29% of total)



### Clean and normalize the data

In [9]:
# Work on copies so the concatenated frames stay untouched, and drop the
# `date` column, which is not used as a model feature.
cleaned_train = doc2vec_X_training_data.copy().drop(columns=['date'])
cleaned_val = doc2vec_X_val_data.copy().drop(columns=['date'])
cleaned_test = test.copy().drop(columns=['date'])

# Convert `length` to log-space; `eps` keeps the log finite when a length is 0.
# pop + re-assign also moves `length` to the last column of each frame.
eps = 0.001
cleaned_train['length'] = np.log(cleaned_train.pop('length') + eps)
cleaned_val['length'] = np.log(cleaned_val.pop('length') + eps)
cleaned_test['length'] = np.log(cleaned_test.pop('length') + eps)

In [10]:
# The `review` column holds strings, which cannot be scaled; remove it.
cleaned_train = cleaned_train.drop(columns=['review'])
cleaned_val = cleaned_val.drop(columns=['review'])
cleaned_test = cleaned_test.drop(columns=['review'])

In [11]:
# The test frame's `label` column is not used as a feature; remove it.
cleaned_test = cleaned_test.drop(columns=['label'])

In [12]:
# Build NumPy label and feature arrays from the cleaned frames.
# These names alias the cleaned frames rather than copying them.
train_df = cleaned_train
val_df = cleaned_val
test_df = cleaned_test

# pop() removes `label` in place, so what remains in each frame is features.
train_labels = np.array(train_df.pop('label'))
train_features = np.array(train_df)

val_labels = np.array(val_df.pop('label'))
val_features = np.array(val_df)

test_features = np.array(test_df)

Normalize the input features using the sklearn StandardScaler.
This will set the mean to 0 and standard deviation to 1.

Note: The `StandardScaler` is only fit using the `train_features` to be sure the model is not peeking at the validation or test sets. 

In [13]:
# Fit the scaler on the training features only, apply it to every split, and
# clip the standardized values to [-5, 5].
# NOTE: the feature arrays are rebound here, so re-running this cell alone
# would re-fit the scaler on already-scaled data.
scaler = StandardScaler()
train_features = np.clip(scaler.fit_transform(train_features), -5, 5)
val_features = np.clip(scaler.transform(val_features), -5, 5)
test_features = np.clip(scaler.transform(test_features), -5, 5)

# No test label array is built in this notebook, so only label shapes for the
# train and validation splits are reported.
print('Training labels shape:', train_labels.shape)
print('Validation labels shape:', val_labels.shape)

print('Training features shape:', train_features.shape)
print('Validation features shape:', val_features.shape)
print('Test features shape:', test_features.shape)

Training labels shape: (250874,)
Validation labels shape: (35918,)
Training features shape: (250874, 145)
Validation features shape: (35918, 145)
Test features shape: (72165, 145)


## Define the model and metrics

Define a function that creates a simple neural network with a densely connected hidden layer, a [dropout](https://developers.google.com/machine-learning/glossary/#dropout_regularization) layer to reduce overfitting, and an output sigmoid layer that returns the probability that an example belongs to the positive class (label 1):

In [19]:
# Metrics tracked during training and validation. The `name` given to each
# metric is the key used in `history.history` (with a 'val_' prefix for the
# validation series), e.g. `plot_metrics` reads 'auc' and the early-stopping
# callback monitors 'val_auc'.
METRICS = [
      # Raw confusion-matrix counts.
      keras.metrics.TruePositives(name='tp'),
      keras.metrics.FalsePositives(name='fp'),
      keras.metrics.TrueNegatives(name='tn'),
      keras.metrics.FalseNegatives(name='fn'), 
      # Summary scores derived from the predictions.
      keras.metrics.BinaryAccuracy(name='accuracy'),
      keras.metrics.Precision(name='precision'),
      keras.metrics.Recall(name='recall'),
      keras.metrics.AUC(name='auc'),
]

def make_model(metrics = METRICS, output_bias=None, learning_rate=0.1):
    """Build and compile the binary classifier used in this notebook.

    The network is Dense(64, relu) -> Dropout(0.5) -> Dense(1, sigmoid), and
    its input width is taken from the module-level `train_features` array.

    Args:
        metrics: Keras metrics passed to `compile`.
        output_bias: optional initial value for the output layer's bias. When
            given, it is wrapped in a constant initializer.
        learning_rate: step size for the Adam optimizer. The default keeps the
            0.1 this notebook has always used, which is far above Adam's own
            default of 0.001; lower it if training is unstable.

    Returns:
        The compiled `keras.Sequential` model.
    """
    if output_bias is not None:
        output_bias = tf.keras.initializers.Constant(output_bias)

    # Sequential groups a linear stack of layers into a tf.keras.Model.
    # A second Dense(64, relu) + Dropout(0.5) pair was tried after the first
    # dropout layer and left disabled.
    model = keras.Sequential([
        keras.layers.Dense(64, activation='relu', input_shape=(train_features.shape[-1],)),
        keras.layers.Dropout(0.5),
        keras.layers.Dense(1, activation='sigmoid', bias_initializer=output_bias),
    ])

    model.compile(
        # `learning_rate` replaces the deprecated `lr` alias, which newer
        # Keras releases no longer accept.
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss=keras.losses.BinaryCrossentropy(),
        metrics=metrics)

    return model

In [20]:
# Upper bound on training epochs and the mini-batch size used for fit/predict.
EPOCHS = 100
BATCH_SIZE = 10000

# Stop once validation AUC has gone 10 epochs without improving, then restore
# the weights from the best epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_auc',
    mode='max',
    patience=10,
    restore_best_weights=True,
    verbose=1,
)

### Checkpoint the initial weights

To make the various training runs more comparable, keep this initial model's weights in a checkpoint file, and load them into each model before training.

In [21]:
# With a sigmoid output, a bias of log(pos/neg) makes the untrained model
# predict roughly the positive-class base rate pos/(pos+neg).
initial_bias = np.log([pos / neg])
model = make_model(output_bias=initial_bias)

# Save the freshly initialised weights in a new temporary directory so later
# models can load the same starting point.
initial_weights = os.path.join(tempfile.mkdtemp(), 'initial_weights')
model.save_weights(initial_weights)

### Train the model

## Class weights

### Calculate class weights

The goal is to identify the positive class, but you don't have very many of those positive samples to work with (about 10% of the training examples), so you would want to have the classifier heavily weight the few examples that are available. You can do this by passing Keras weights for each class through a parameter. These will cause the model to "pay more attention" to examples from an under-represented class.

In [22]:
# Weight each class inversely to its frequency. The extra total/2 factor keeps
# the loss at a similar magnitude: the weights summed over all examples still
# equal `total`.
weight_for_0 = (1 / neg) * total / 2.0
weight_for_1 = (1 / pos) * total / 2.0

# Keyed by class label, as passed to `fit(class_weight=...)`.
class_weight = {0: weight_for_0, 1: weight_for_1}

### Train a model with class weights

Now try re-training and evaluating the model with class weights to see how that affects the predictions.

Note: Using `class_weight` changes the range of the loss. This may affect the stability of the training depending on the optimizer. Optimizers whose step size is dependent on the magnitude of the gradient, like `optimizers.SGD`, may fail. The optimizer used here, `optimizers.Adam`, is unaffected by the scaling change. Also note that because of the weighting, the total losses are not comparable between the two models.

In [23]:
# Start from the checkpointed initial weights so runs are comparable.
weighted_model = make_model()
weighted_model.load_weights(initial_weights)

weighted_history = weighted_model.fit(
    x=train_features,
    y=train_labels,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(val_features, val_labels),
    callbacks=[early_stopping],
    # Per-class loss weights computed from the class frequencies.
    class_weight=class_weight,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 00018: early stopping


### Check training history

### Evaluate metrics

In [24]:
# Score the test features with the class-weighted model.
test_predictions_weighted = weighted_model.predict(
    x=test_features,
    batch_size=BATCH_SIZE,
)

In [25]:
# Wrap the prediction array in a DataFrame so it can be written with `to_csv`.
# NOTE: this rebinds the name from an array to a DataFrame.
test_predictions_weighted = pd.DataFrame(data=test_predictions_weighted)

In [26]:
# Write the test scores to the working directory without index or header.
test_predictions_weighted.to_csv(
    os.getcwd() + '/' + 'predictions.csv',
    header=False,
    index=False,
)

In [None]:
# Score both the training and validation splits. The ROC, average-precision
# and AUC cells below read `val_predictions_weighted`, so it must be computed
# here or they raise NameError on a fresh kernel.
train_predictions_weighted = weighted_model.predict(train_features, batch_size=BATCH_SIZE)
val_predictions_weighted = weighted_model.predict(val_features, batch_size=BATCH_SIZE)

### Plot the ROC

In [None]:
# Train (solid) and validation (dashed) ROC curves share one colour so they
# read as a pair.
plot_roc(
    "Train Weighted",
    train_labels,
    train_predictions_weighted,
    color=colors[1],
)
plot_roc(
    "Validation Weighted",
    val_labels,
    val_predictions_weighted,
    color=colors[1],
    linestyle='--',
)

plt.legend(loc='lower right')

In [None]:
# Average precision of the weighted model on the validation split.
print("Average Precision Report:")
print(
    "Validation Weighted: ",
    average_precision_score(val_labels, val_predictions_weighted),
)

In [None]:
# ROC AUC of the weighted model on the validation split.
print("AUC:")
print(
    "Validation Weighted: ",
    AUCcalc(val_labels, val_predictions_weighted),
)