# Load the data

In [None]:
!pip install -r ../src/train/requirements.txt --quiet

After running `pip`, restart the notebook kernel from the **Kernel** menu at the top so that the newly installed packages are picked up.

In [None]:
import os, mlflow

In [None]:
# Print the tracking URI taken from the environment.
# A missing MLFLOW_TRACKING_URI raises KeyError here.
print('The MLflow server is here: {}'.format(os.environ['MLFLOW_TRACKING_URI']))

In [None]:
# The experiment name comes from MLFLOW_EXPERIMENT_NAME and falls back to "Dev".
experiment_name = os.environ.get('MLFLOW_EXPERIMENT_NAME', 'Dev')

# Reuse the experiment when it already exists, otherwise create it.
experiment = mlflow.get_experiment_by_name(experiment_name)
experiment_id = (
    experiment.experiment_id
    if experiment
    else mlflow.create_experiment(name=experiment_name)
)

print('Experiment ID: {}'.format(experiment_id))

In [None]:
# Open an MLflow run named 'xgboost' under the experiment resolved above.
# It is closed by the mlflow.end_run() call in the last cell of the notebook.
run = mlflow.start_run(experiment_id=experiment_id, run_name = 'xgboost')

In [None]:
import numpy as np
import pandas as pd

In [None]:
import dvc.api
import s3fs
import os

# The creditcard.csv versioning metadata is kept in a git repository in os.environ['DATA_REPO'].
# We are pulling revision 'v1.0' of the data; the data itself lives at the S3 URL returned
# by dvc.api.get_url().
resource_url = dvc.api.get_url(
    path='creditcard.csv',
    repo=os.environ['DATA_REPO'],
    rev='v1.0')

print("Data is from this S3 bucket:\n{}".format(resource_url))

# Pandas doesn't support endpoint_url
# https://github.com/pandas-dev/pandas/pull/29050
# so the file is opened through s3fs with the endpoint from S3_ENDPOINT_URL.
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': os.environ['S3_ENDPOINT_URL']})

# Close the remote file handle as soon as the CSV has been parsed.
with fs.open(resource_url) as csv_file:
    df = pd.read_csv(csv_file)

In [None]:
# Column names of the loaded frame; most_corr() iterates over these.
features = df.columns.values

# Finding the features most correlated with the target (`Class`)

In [None]:
def most_corr(param, n, data=None):
    """Return the columns whose absolute correlation with ``param`` is at least ``n``.

    Parameters
    ----------
    param : str
        Name of the reference column (it is itself compared against ``n``).
    n : float
        Threshold applied to the absolute value of the correlation.
    data : pandas.DataFrame, optional
        Frame to analyse. When omitted, the notebook-level ``df`` is used and
        the candidate columns are taken from the notebook-level ``features``.

    Returns
    -------
    list
        Column names, in the order of the candidate columns. Columns whose
        correlation is NaN never pass the threshold.
    """
    if data is None:
        # Backward-compatible default: fall back to the notebook globals.
        data, columns = df, features
    else:
        columns = data.columns

    # No sorting is needed: the result follows the order of `columns`,
    # and the correlations are only looked up by name.
    target_corr = data.corr()[param]
    return [col for col in columns if np.abs(target_corr[col]) >= n]
# Keep every column whose absolute correlation with 'Class' is at least 0.1
# (i.e. >= 0.1 or <= -0.1). 'Class' itself is part of the result; it is dropped
# again when the predictors are separated from the target.
selected_features = most_corr('Class', 0.1)
# Bare expression: shown as the cell output.
selected_features

In [None]:
# Restrict the frame to the selected columns.
dataset = df[selected_features]

# Building XGBoost model

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report, recall_score, precision_score, \
    average_precision_score, plot_precision_recall_curve, f1_score, auc, \
    roc_curve, roc_auc_score, confusion_matrix, accuracy_score, precision_recall_curve

In [None]:
# Seed reused by the train/test split and the XGBoost parameters; recorded as a run parameter.
RANDOM_SEED = 42
mlflow.log_param('RANDOM_SEED', RANDOM_SEED)

In [None]:
# Value passed as test_size to train_test_split; recorded as a run parameter.
TEST_SIZE = 0.3
mlflow.log_param('TEST_SIZE', TEST_SIZE)

## Split the data into train and test sets (features `X`, target `y`)

In [None]:
# Separate the predictors from the target, then hold out TEST_SIZE of the rows.
# `columns=` replaces the positional axis argument (`drop('Class', 1)`), which
# pandas deprecated and later removed.
X = dataset.drop(columns='Class')
y = dataset['Class']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED)

In [None]:
# Shape of the training predictors, shown as the cell output.
X_train.shape

In [None]:
# Shape of the test predictors, shown as the cell output.
X_test.shape

In [None]:
import warnings
# Suppress DeprecationWarning messages from this point on.
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [None]:
import xgboost as xgb

# Wrap each split in XGBoost's DMatrix container, attaching the targets as labels.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [None]:
# Datasets evaluated after each boosting round, with the names used in the log output.
watchlist = [(dtrain, 'train'), (dtest, 'test')]

# Set xgboost parameters
params = {
    'objective': 'binary:logistic',
    'eta': 0.039,
    'max_depth': 2,
    'subsample': 0.8,
    'colsample_bytree': 0.9,
    'eval_metric': 'auc',
    'random_state': RANDOM_SEED,
}

In [None]:
# autolog results using mlflow
import mlflow.xgboost
mlflow.xgboost.autolog()

# Train for up to 1000 rounds, stopping early once 50 consecutive rounds bring
# no improvement on the watchlist (maximize=True: a higher metric is better).
# Everything after `dtrain` is passed by keyword because recent XGBoost
# releases deprecate passing these arguments positionally.
baseline = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=watchlist,
    early_stopping_rounds=50,
    maximize=True,
    verbose_eval=50,
)

# Predictions of the trained booster on the test set.
y_baseline = baseline.predict(dtest)

In [None]:
# Test-set scores from the trained booster; later cells threshold them and
# feed them to the ranking metrics.
y_proba_baseline = baseline.predict(dtest)

In [None]:
# Average precision computed from the raw test scores (no thresholding); logged to the MLflow run.
average_precision = average_precision_score(y_test, y_proba_baseline)
mlflow.log_metric('average_precision', average_precision)

In [None]:
# Decision threshold: scores above it are treated as the positive class.
# Logged like RANDOM_SEED and TEST_SIZE, because the thresholded metrics
# recorded for this run depend on it.
THRESHOLD = 0.7
mlflow.log_param('THRESHOLD', THRESHOLD)

In [None]:
# Hard predictions at the chosen decision threshold, computed once and reused below.
y_pred_baseline = y_proba_baseline > THRESHOLD
roc_auc = roc_auc_score(y_test, y_proba_baseline)

print(classification_report(y_test, y_pred_baseline))
print('\n')
# roc_auc_score returns a value between 0 and 1, so it is printed without a percent sign.
print('AUC: {}'.format(roc_auc))
print('Precision-Recall: {}'.format(average_precision))

# Log the per-class recall, F1 and precision to the MLflow run.
rpt = classification_report(y_test, y_pred_baseline, output_dict=True)
for lbl in ['0', '1']:
    mlflow.log_metric(lbl + '_recall', rpt[lbl]['recall'])
    mlflow.log_metric(lbl + '_f1_score', rpt[lbl]['f1-score'])
    mlflow.log_metric(lbl + '_precision', rpt[lbl]['precision'])

## AUC - ROC Curve

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's default styling to the figures drawn below.
sns.set()

In [None]:
# ROC curve of the baseline model on the test set.
fpr, tpr, thresholds = roc_curve(y_test, y_proba_baseline)

fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(fpr, tpr, color='orange', label='ROC')
ax.plot([0, 1], [0, 1], color='darkblue', linestyle='--')  # diagonal reference line
ax.fill_between(fpr, tpr, color='skyblue', alpha=0.3)      # shade the area under the curve
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend()
plt.show()

## Plot the Precision-Recall curve

In [None]:
# For unbalanced data
plt.figure(figsize = (10,7))

precision, recall, _ = precision_recall_curve(y_test, y_proba_baseline)
plt.plot(recall, precision, marker='.', label='{} (AP={:.4f})'.format(baseline.__class__.__name__, average_precision))

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('2-class Precision-Recall curve')
plt.legend()
plt.show()

## Confusion Matrix

In [None]:
LABELS = ["Normal", "Fraud"]

# Hard predictions at the decision threshold, computed once for both metrics below.
y_pred_baseline = y_proba_baseline > THRESHOLD

plt.figure(figsize = (10,7))

cm = confusion_matrix(y_test, y_pred_baseline) # rows = truth, cols = prediction
df_cm = pd.DataFrame(cm, index = (0, 1), columns = (0, 1))

sns.set(font_scale=1.4)
sns.heatmap(df_cm, xticklabels=LABELS, yticklabels=LABELS, annot=True, fmt="d", cmap='Blues')

accuracy = accuracy_score(y_test, y_pred_baseline)
mlflow.log_metric('accuracy', accuracy)

plt.title('Accuracy: {:.4f}'.format(accuracy))
plt.ylabel('True label')
plt.xlabel('Predicted label')

# Pin the y-axis to the full extent of the heatmap (one unit per row, first row
# on top). Nudging the current limits by 0.5 is only correct when the reported
# limits are half a cell short; otherwise it adds empty space above and below.
plt.ylim(len(df_cm), 0)

plt.tight_layout()
plt.show()

In [None]:
# Close the MLflow run opened by mlflow.start_run() earlier in the notebook.
mlflow.end_run()