In [None]:
# Data-size switch, read by the data-loading cell further down:
#   True  -> load the "_small" CSV (a subset of the data) for faster development
#   False -> load the CSV that holds the entire dataset
use_partial_data = True

In [None]:
from IPython.display import HTML, display

# Inject a CSS rule that widens the notebook container to 90% of the window.
wide_container_css = "<style>.container { width:90% !important; }</style>"
display(HTML(wide_container_css))

In [None]:
import sys

# Put the parent directory on the import path (presumably where the local
# modules imported below, model_training and plotting, live - confirm).
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate '..' entry each time.
if '..' not in sys.path:
    sys.path.append('..')

In [None]:
# Load IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import os

from model_training import train_model
from plotting import plot_precision_recall_curve

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import average_precision_score
# import shap

import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns


# The project root is taken to be the parent of the current working directory.
cwd_path = os.path.abspath(os.getcwd())
project_root = os.path.dirname(cwd_path)

# Raise the pandas column display limit to 500 so wide frames are shown in full.
pd.options.display.max_columns = 500

# DATA

In [None]:
# Pick the CSV to load; only the path (and what gets printed) differs between
# the two modes, the loading itself is shared.
if use_partial_data:
    data_path = os.path.join(project_root, 'data/preprocessed_data_with_feature_engineering_small.csv')
    print(data_path)
else:
    print('Not created yet!')
    data_path = os.path.join(project_root, 'data/preprocessed_data_with_feature_engineering.csv')

# The first CSV column becomes the frame index; 'Datetime' is parsed year-first.
data = pd.read_csv(data_path, index_col=0)
data['Datetime'] = pd.to_datetime(data['Datetime'], yearfirst=True)

# Quick look at the first rows and the overall shape.
display(data.head())
display(data.shape)

## Features and data standardization

In [None]:
# List every column name of the loaded frame as an array.
data.columns.to_numpy()

In [None]:
# Names of the model input columns. The order of this list defines the column
# order of the feature matrix built from it, so it must not be changed.
# The inline group comments only mirror the column-name prefixes.
features = [
    'Amount',
    'Has Chip',
    'Gender',
    'Online Transaction',
    'Amex',
    'Discover',
    'Mastercard',
    'Visa',
    'Credit',
    'Debit',
    'Debit (Prepaid)',
    'MCC_mean_encoding',
    'card_present_transaction',
    # fraud_*_rolling_mean_* columns
    'fraud_rolling_mean_30_days',
    'fraud_rolling_mean_60_days',
    'fraud_rolling_mean_365_days',
    'fraud_rolling_mean_2_years',
    'fraud_online_rolling_mean_30_days',
    'fraud_online_rolling_mean_60_days',
    'fraud_online_rolling_mean_365_days',
    'fraud_online_rolling_mean_2_years',
    'fraud_card_present_rolling_mean_30_days',
    'fraud_card_present_rolling_mean_60_days',
    'fraud_card_present_rolling_mean_365_days',
    'fraud_card_present_rolling_mean_2_years',
    # fraud_*_relative_to_365_days / _2_years columns
    'fraud_rolling_30_days_relative_to_365_days',
    'fraud_rolling_30_days_relative_to_2_years',
    'fraud_rolling_60_days_relative_to_365_days',
    'fraud_rolling_60_days_relative_to_2_years',
    'fraud_online_rolling_30_days_relative_to_365_days',
    'fraud_online_rolling_30_days_relative_to_2_years',
    'fraud_online_rolling_60_days_relative_to_365_days',
    'fraud_online_rolling_60_days_relative_to_2_years',
    'fraud_card_present_rolling_30_days_relative_to_365_days',
    'fraud_card_present_rolling_30_days_relative_to_2_years',
    'fraud_card_present_rolling_60_days_relative_to_365_days',
    'fraud_card_present_rolling_60_days_relative_to_2_years',
    # fraud_*_relative_to_all_frauds columns
    'fraud_online_rolling_30_days_relative_to_all_frauds',
    'fraud_card_present_rolling_30_days_relative_to_all_frauds',
    'fraud_online_rolling_60_days_relative_to_all_frauds',
    'fraud_card_present_rolling_60_days_relative_to_all_frauds',
    'fraud_online_rolling_365_days_relative_to_all_frauds',
    'fraud_card_present_rolling_365_days_relative_to_all_frauds',
    'fraud_online_rolling_2_years_relative_to_all_frauds',
    'fraud_card_present_rolling_2_years_relative_to_all_frauds',
    # *_sin / *_cos and calendar flag columns
    'hour_sin',
    'hour_cos',
    'month_sin',
    'month_cos',
    'day_of_week_sin',
    'day_of_week_cos',
    'is_holiday',
    'weekend',
    'is_2015_or_later',
    # mean_amount* columns
    'mean_amount',
    'mean_amount_last_year',
    'mean_amount_last_30_days',
    'mean_amount_last_7_days',
    'mean_amount_last_2_days',
    'mean_amount_last_1_days',
    'mean_amount_last_7_days_relative_to_last_year',
    'mean_amount_last_2_days_relative_to_last_year',
    'mean_amount_last_1_days_relative_to_last_year',
    'mean_amount_last_7_days_relative_to_last_30_days',
    'mean_amount_last_2_days_relative_to_last_30_days',
    'mean_amount_last_1_days_relative_to_last_30_days',
    # transaction count / frequency columns
    'transaction_count',
    'days_since_first_transaction',
    'transaction_frequency_all',
    'transaction_frequency_last_year',
    'transaction_frequency_last_30_days',
    'transaction_frequency_last_7_days',
    'transaction_frequency_last_2_days',
    'transaction_frequency_last_1_days',
    '1_days_transaction_frequency_relative_to_last_30_days',
    '1_days_transaction_frequency_relative_to_last_year',
    '2_days_transaction_frequency_relative_to_last_30_days',
    '2_days_transaction_frequency_relative_to_last_year',
    '7_days_transaction_frequency_relative_to_last_30_days',
    '7_days_transaction_frequency_relative_to_last_year',
]

# Name of the label column, kept as a one-element list.
target = ['Is Fraud?']

In [None]:
# Clean the feature columns in one pass: missing values become zero first,
# then positive and negative infinities become zero as well.
data[features] = (
    data[features]
    .fillna(0)
    .replace([np.inf, -np.inf], 0)
)

In [None]:
# Feature matrix: one column per entry of `features`, in list order.
feature_frame = data[features]
X = np.array(feature_frame)
print(f'X: shape={X.shape}')
display(X[:5])

# Label vector: the single target column, squeezed down from its column shape.
target_frame = data[target]
y = np.array(target_frame).squeeze()
print(f'y: shape={y.shape}')
display(y[:10])

# TIME SERIES CROSS-VALIDATION DATASETS

In [None]:
# Time series cross-validation with four splits. The generator returned by
# split() is materialised into a list so the same splits can be reused by
# every cell below.
tscv = TimeSeriesSplit(n_splits=4)
all_splits = [split for split in tscv.split(X, y)]
display(all_splits)

In [None]:
# Visualise the time span covered by the training and test part of every
# cross-validation fold. Only every `sampling_interval`-th sample is plotted
# to keep the figure light.
sampling_interval = 1000
fig = go.Figure()

for fold, (fold_train_idx, fold_test_idx) in enumerate(all_splits):
    # training part first, then test part, each drawn as a thick horizontal line
    for trace_name, trace_color, split_idx in (
        ('training', 'blue', fold_train_idx),
        ('test', 'red', fold_test_idx),
    ):
        sampled_idx = split_idx[::sampling_interval]
        fig.add_trace(
            go.Scatter(
                # The split indices are row positions (they were generated from
                # the NumPy array X), so rows must be selected with .iloc.
                # Label-based .loc only gives the same rows when the index of
                # `data` happens to be exactly 0..n-1, which is not guaranteed
                # for a frame read with index_col=0.
                x=data['Datetime'].iloc[sampled_idx],
                # each fold gets its own vertical level (0, -1, -2, ...)
                y=-fold * np.ones(len(sampled_idx)),
                mode='lines',
                line=dict(color=trace_color, width=10),
                name=trace_name,
                # only print the legend entry once
                showlegend=(fold == 0),
            )
        )

fig.update_traces(marker_size=10)
# the vertical position only separates the folds, so its tick labels are hidden
fig.update_yaxes(showticklabels=False)

fig.update_layout(
    title="Train and test set split with time series k-fold cross-validation",
    #template='plotly_dark',
    # NOTE(review): 'aika' is Finnish for 'time'; left unchanged here
    xaxis_title='aika',
    showlegend=True
)

fig.show()

In [None]:
# Sanity check of the standardization on every cross-validation split: print
# per-feature mean / standard deviation of the scaled train and test parts and
# the share of positive labels in each part.
scaler = StandardScaler()
for count, (train_index, test_index) in enumerate(all_splits):
    print(f"CV COUNT: {count+1}")

    y_train = np.squeeze(y[train_index])
    y_test = np.squeeze(y[test_index])

    # the scaler is fitted on the training part only and then applied to both
    X_train = scaler.fit_transform(X[train_index])
    X_test = scaler.transform(X[test_index])

    for mean_label, std_label, X_split in (
        ('X_train mean:', 'X_train standard deviation:', X_train),
        ('X_test mean:', 'X_test standard deviation:', X_test),
    ):
        print(mean_label)
        display(X_split.mean(axis=0))
        print(std_label)
        display(X_split.std(axis=0))

    print(f'y_train proportion of fraud: {np.sum(y_train)/y_train.shape[0]}')
    print(f'y_test proportion of fraud: {np.sum(y_test)/y_test.shape[0]}')

    # two blank lines between folds
    print('')
    print('')

# MODEL TRAINING

In [None]:
# Empty results table; every model section below adds its cross-validation
# results to it.
result_columns = [
    'model',
    'precision',
    'recall',
    'f1',
    'f2',
    'f0.5',
    'roc_auc',
    'average_precision',
    'precision_top_k',
]
all_results = pd.DataFrame(columns=result_columns)

## LOGISTIC REGRESSION

In [None]:
# Logistic regression: hand the estimator and all CV splits to train_model,
# requesting 'standard' data standardization and no data sampling.
classifier_logistic = LogisticRegression(random_state=42, max_iter=500)
all_classifier_logistic, all_performance_metrics, cross_validation_results = train_model(
    classifier_logistic, all_splits, X, y,
    data_standardization='standard', data_sampling='none',
)

# collect results
cross_validation_results['model'] = 'logistic regression'
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported way to add rows.
# NOTE(review): the return type of train_model is not visible here, so a
# mapping / Series is wrapped into a one-row frame, while a DataFrame is
# concatenated as it is.
new_rows = (
    cross_validation_results
    if isinstance(cross_validation_results, pd.DataFrame)
    else pd.DataFrame([cross_validation_results])
)
all_results = pd.concat([all_results, new_rows], ignore_index=True)

## SGDClassifier

In [None]:
# SGD classifier with modified Huber loss: hand the estimator and all CV splits
# to train_model, requesting 'standard' data standardization and no data sampling.
classifier_sgd = SGDClassifier(loss='modified_huber', random_state=42)
all_classifier_sgd, all_performance_metrics, cross_validation_results = train_model(
    classifier_sgd, all_splits, X, y,
    data_standardization='standard', data_sampling='none',
)

# collect results
cross_validation_results['model'] = 'SGDClassifier'
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported way to add rows.
# NOTE(review): the return type of train_model is not visible here, so a
# mapping / Series is wrapped into a one-row frame, while a DataFrame is
# concatenated as it is.
new_rows = (
    cross_validation_results
    if isinstance(cross_validation_results, pd.DataFrame)
    else pd.DataFrame([cross_validation_results])
)
all_results = pd.concat([all_results, new_rows], ignore_index=True)

## RANDOM FOREST

In [None]:
# Random forest (n_jobs=-1): hand the estimator and all CV splits to
# train_model, requesting 'standard' data standardization and no data sampling.
classifier_rf = RandomForestClassifier(random_state=42, n_jobs=-1)
all_classifier_rf, all_performance_metrics, cross_validation_results = train_model(
    classifier_rf, all_splits, X, y,
    data_standardization='standard', data_sampling='none',
)

# collect results
cross_validation_results['model'] = 'random forest'
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported way to add rows.
# NOTE(review): the return type of train_model is not visible here, so a
# mapping / Series is wrapped into a one-row frame, while a DataFrame is
# concatenated as it is.
new_rows = (
    cross_validation_results
    if isinstance(cross_validation_results, pd.DataFrame)
    else pd.DataFrame([cross_validation_results])
)
all_results = pd.concat([all_results, new_rows], ignore_index=True)

## XGBOOST

In [None]:
# XGBoost: hand the estimator and all CV splits to train_model, requesting
# no data standardization and no data sampling.
classifier_xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
all_classifiers_xgb, all_performance_metrics, cross_validation_results = train_model(
    classifier_xgb, all_splits, X, y,
    data_standardization='none', data_sampling='none',
)

# collect results
cross_validation_results['model'] = 'XGBoost'
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported way to add rows.
# NOTE(review): the return type of train_model is not visible here, so a
# mapping / Series is wrapped into a one-row frame, while a DataFrame is
# concatenated as it is.
new_rows = (
    cross_validation_results
    if isinstance(cross_validation_results, pd.DataFrame)
    else pd.DataFrame([cross_validation_results])
)
all_results = pd.concat([all_results, new_rows], ignore_index=True)

## LIGHTGBM

In [None]:
# LightGBM: hand the estimator and all CV splits to train_model, requesting
# no data standardization and no data sampling.
classifier_lgbm = LGBMClassifier(random_state=42)
all_classifier_lgbm, all_performance_metrics, cross_validation_results = train_model(
    classifier_lgbm, all_splits, X, y,
    data_standardization='none', data_sampling='none',
)

# collect results
# NOTE(review): the label is spelled 'LigthGBM'; it is left unchanged so rows
# still match any results that were saved earlier under this name.
cross_validation_results['model'] = 'LigthGBM'
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported way to add rows.
# NOTE(review): the return type of train_model is not visible here, so a
# mapping / Series is wrapped into a one-row frame, while a DataFrame is
# concatenated as it is.
new_rows = (
    cross_validation_results
    if isinstance(cross_validation_results, pd.DataFrame)
    else pd.DataFrame([cross_validation_results])
)
all_results = pd.concat([all_results, new_rows], ignore_index=True)

# RESULTS

In [None]:
# Show the results collected from all model sections, rounded to three decimals.
display(all_results.round(3))

In [None]:
# save results
#all_results.round(3).to_csv('../results/221116_random_oversampling_with_noise_minority_proportion_10.csv')

# SHAP

In [None]:
# `import shap` is commented out in the imports cell at the top, so on a fresh
# kernel this cell failed with NameError; import it here, where it is first used.
import shap

# Build an explainer for the XGBoost entry at index 3 of all_classifiers_xgb
# (presumably the model of the last of the 4 CV folds - confirm against
# train_model). The raw, unscaled feature columns are passed, in line with
# data_standardization='none' used when the XGBoost models were trained.
explainer = shap.Explainer(all_classifiers_xgb[3], data[features])

# Compute SHAP values on every 100th row only.
shap_values = explainer(data.iloc[::100][features])

In [None]:
# Bar-type SHAP summary plot, limited to 20 features (max_display=20).
shap.summary_plot(shap_values, max_display=20, plot_type='bar')

In [None]:
# SHAP summary plot with the default plot type, limited to 20 features (max_display=20).
shap.summary_plot(shap_values, max_display=20)

In [None]:
# Class probabilities for every row of X from the XGBoost entry at index 0
# (presumably the model of the first CV fold - confirm against train_model).
all_classifiers_xgb[0].predict_proba(X)

In [None]:
# Class probabilities for every row of X from the XGBoost entry at index 3
# (presumably the model of the last of the 4 CV folds - confirm against train_model).
all_classifiers_xgb[3].predict_proba(X)