In [None]:
# # Update scikit-learn to prevent version mismatches
# # (the PyPI package is "scikit-learn"; "sklearn" is only the import name)
# %pip install --upgrade scikit-learn

# Install joblib. This is used to save the model.
# # Restart your kernel after installing
# %pip install joblib

# %pip install xgboost

In [None]:
# Import the dependencies
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings
warnings.simplefilter('ignore', FutureWarning)
pd.set_option('mode.chained_assignment', None)

In [None]:
# Import the dependencies
from sklearn.metrics import plot_confusion_matrix

## Load the dataset

In [None]:
# Load the CSV into a DataFrame (the path is relative to the notebook's working directory)
data_read = pd.read_csv("../Resources/fraudTrain.csv")
# Preview the first rows
data_read.head()

In [None]:
# Summarise the columns of the loaded frame
data_read.info()

## Exploratory Data Analysis

In [None]:
# True if at least one cell anywhere in the frame is null
data_read.isnull().values.any()

In [None]:
# Descriptive statistics for the loaded frame
data_read.describe()

#### Fraud/non-fraud transaction distribution

In [None]:
# Number of rows per value of the is_fraud label
data_read['is_fraud'].value_counts()

In [None]:
### Visualize the fraud/non-fraud transaction distribution

# Pass the column as x=: recent seaborn releases accept only `data` positionally,
# so a positional column name collides with the data= keyword.
fig = sns.catplot(x='is_fraud', data=data_read, kind='count', height=4.5, aspect=1)
plt.title("Fraud vs Non-Fraud Transaction Distribution\n")
plt.ylabel("Number of transactions")
plt.xlabel("Status")
fig.set_xticklabels(['Non-fraud', 'Fraud'])
# Save only after all labels are in place so the exported image matches the display
plt.savefig("../static/images/distribution.jpg", bbox_inches='tight')
plt.show()

The dataset is highly imbalanced: 1,289,169 legitimate transactions versus 7,506 fraudulent transactions (about 0.58% fraud).

In [None]:
# Split the transactions into legitimate (is_fraud == 0) and fraudulent (is_fraud == 1) subsets
data_notfraud = data_read.loc[data_read['is_fraud'] == 0]
data_fraud = data_read.loc[data_read['is_fraud'] == 1]

In [None]:
# (rows, columns) of the non-fraud and fraud subsets
data_notfraud.shape, data_fraud.shape

In [None]:
# Summary statistics of the transaction amount for non-fraud rows
data_notfraud['amt'].describe()

In [None]:
# Summary statistics of the transaction amount for fraud rows
data_fraud['amt'].describe()

# Balancing the data

### Upsampling the minority class

In [None]:
from sklearn.utils import resample

In [None]:
# Separate the majority (is_fraud == 0) and minority (is_fraud == 1) classes
df_majority = data_read[data_read.is_fraud==0]
df_minority = data_read[data_read.is_fraud==1]
# Report the number of rows in each class; DataFrame.size would give
# rows x columns, which is not a transaction count.
print(len(df_majority))
print(len(df_minority))
data_read.is_fraud.value_counts()

In [None]:
# Upsample the minority class to half the size of the majority class
# (use df_majority.is_fraud.count() instead for a 1:1 balance)

# int(round(...)) works whether round() returns a Python int or a NumPy scalar;
# calling .astype on the result fails when it is a plain int.
nsamples = int(round(df_majority.is_fraud.count() / 2))

df_minority_upsampled = resample(df_minority,
                                 replace=True,          # sample with replacement
                                 n_samples=nsamples,    # half the majority row count
                                 random_state=123)      # reproducible results
df_minority_upsampled.is_fraud.value_counts()

In [None]:
# Combine majority class with upsampled minority class
# (majority rows first, then the resampled minority rows; the original index labels are kept)
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
df_upsampled

In [None]:
# Class counts after combining the majority rows with the upsampled minority rows
df_upsampled.is_fraud.value_counts()

In [None]:
# Visualize the distribution with the balanced dataset

# Pass the column as x=: recent seaborn releases accept only `data` positionally,
# so a positional column name collides with the data= keyword.
fig = sns.catplot(x='is_fraud', data=df_upsampled, kind='count', height=4.5, aspect=1)
plt.title("Fraud vs Non-Fraud Transaction Distribution\n(Balanced Data)")
plt.ylabel("Number of transactions")
plt.xlabel("Status")
fig.set_xticklabels(['Non-fraud', 'Fraud'])
# Save only after all labels are in place so the exported image matches the display
plt.savefig("../static/images/distribution_balanced.jpg", bbox_inches='tight')
plt.show()

In [None]:
# Histogram of transaction amounts, fraud (top) vs non-fraud (bottom)
bins = 50
f, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
f.suptitle('Amount per transaction by class')

ax1.hist(data_fraud.amt, bins=bins)
ax1.set_title('Fraud')

ax2.hist(data_notfraud.amt, bins=bins)
ax2.set_title('Not Fraud')

# Label, limits and scale are set on the bottom panel. sharex carries the
# x-limits to the top panel, but the log y-scale applies to the bottom panel only.
ax2.set_xlabel('Amount ($)')
f.text(.01, .5, 'Number of transactions', ha='center', va='center', rotation='vertical')
ax2.set_xlim((0, 20000))
ax2.set_yscale('log')

f.savefig("../static/images/histogram_balanced.jpg", bbox_inches='tight')
plt.show()

In [None]:
# Scatter of transaction amount per category, fraud (top) vs non-fraud (bottom)
f, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
# The x-axis variable is the category column, so the title says so
f.suptitle('Amount per transaction by category')
ax1.scatter(data_fraud.category, data_fraud.amt)
ax1.set_title('Fraud')
ax2.scatter(data_notfraud.category, data_notfraud.amt)
ax2.set_title('Not Fraud')
plt.xlabel('Category', fontsize=11)
plt.xticks(rotation=45)
f.text(.01, .5, 'Transaction Amount', ha='center', va='center', rotation='vertical')
plt.savefig("../static/images/category-plot.jpg",  bbox_inches='tight')
plt.show();

In [None]:
# Scatter of transaction amount over time, fraud (top) vs non-fraud (bottom).
# The figure size is set when the figure is created: opening a new figure just
# before saving would make savefig export that new, empty canvas.
f, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(4, 10))
f.suptitle('Amount per transaction by time')

# The CSV was read without date parsing, so convert the timestamps here;
# this gives a real time axis instead of one category per timestamp string.
ax1.scatter(pd.to_datetime(data_fraud.trans_date_trans_time), data_fraud.amt)
ax1.set_title('Fraud')
ax2.scatter(pd.to_datetime(data_notfraud.trans_date_trans_time), data_notfraud.amt)
ax2.set_title('Not Fraud')
plt.xlabel('Time', fontsize=11)
f.text(.01, .5, 'Transaction Amount', ha='center', va='center', rotation='vertical')
plt.xticks(rotation=45)
# Save through the figure handle so the plotted figure is the one written to disk
f.savefig("../static/images/unixtime-plot.jpg",  bbox_inches='tight')
plt.show()

In [None]:
# Draw a 10% random sample of the raw data; random_state=1 makes the draw repeatable
data_sample = data_read.sample(frac=0.1, random_state=1)
data_sample.shape

### Data Transformation

Convert categorical data to numeric

In [None]:
# Convert the date of birth into a numeric age in whole years
from datetime import datetime

# Work on a copy so the balanced frame `df_upsampled` is not modified in place
data_train = df_upsampled.copy()
data_train['dob'] = pd.to_datetime(data_train['dob'])

# NOTE(review): age is measured against the current date, so the values shift
# over time between runs — confirm whether a fixed reference date is wanted.
today = datetime.today()
# Vectorised date arithmetic; floor-dividing the day count by 365.25 already
# yields whole years, so no extra rounding is needed.
data_train['age'] = (today - data_train['dob']).dt.days // 365.25
data_train['age']

In [None]:
# Display the dataframe column names
# Display the column names of the working frame
data_train.columns

In [None]:
# Select the columns which are required for the further processing.
# Take an explicit copy: later cells assign encoded values back into this
# frame, and the chained-assignment warning is switched off globally.
data_train = data_train[["category", "cc_num", "amt", "lat","long", "job", "age", "trans_num",
                         "unix_time", "merch_lat","merch_long", "is_fraud"]].copy()
data_train.head()

## Label Encoding

In [None]:
# Replace each transaction number with an integer label
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
get_transnum = data_train['trans_num']
# Learn the label mapping and apply it in one step
encoded_transnum = label_encoder.fit_transform(get_transnum)
data_train['trans_num'] = encoded_transnum

In [None]:
# Replace each category with an integer label
label_encoder = LabelEncoder()
get_category = data_train['category']
# Learn the label mapping and apply it in one step
encoded_category = label_encoder.fit_transform(get_category)
data_train['category'] = encoded_category

In [None]:
# Replace each job with an integer label
label_encoder = LabelEncoder()
get_job = data_train['job']
# Learn the label mapping and apply it in one step
encoded_job = label_encoder.fit_transform(get_job)
data_train['job'] = encoded_job

In [None]:
# Target: the is_fraud label reshaped into a single-column 2-D array
target = data_train["is_fraud"].to_numpy().reshape(-1, 1)

# Features: every column except the label
selected_features = data_train.drop(columns='is_fraud')

In [None]:
# Display the feature frame (all columns except is_fraud)
selected_features

## Split and Scale the data

In [None]:
# Import the dependencies
from sklearn.model_selection import train_test_split

# Split the selected dataset into train and test datasets
# (random_state fixes the shuffle so the split is repeatable).
# NOTE(review): the minority class was resampled with replacement before this
# split, so copies of the same fraud row can land in both the train and the
# test set — confirm whether upsampling should happen after splitting instead.
X_train, X_test, y_train, y_test = train_test_split(selected_features, target, random_state=42)

In [None]:
# Standardise the train & test feature sets
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training split only
X_scaler = StandardScaler()
X_scaler.fit(X_train)

# Apply the fitted scaler to both splits
X_train_scaled, X_test_scaled = (X_scaler.transform(split) for split in (X_train, X_test))

## Random Forest Classifier

In [None]:
# Use the Random Forest Classifier Model to get the feature importance/weightage
from sklearn.ensemble import RandomForestClassifier

# A fixed seed keeps the importances, and the ranking exported from them,
# reproducible across runs.
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the random forest model to X and y. y_train is a single-column 2-D array;
# flatten it to the 1-D label vector that classifiers expect.
rf = rf.fit(X_train_scaled, np.ravel(y_train))

# Return the importance of each column to predicting the outcomes
importances = rf.feature_importances_
importances

In [None]:
# Pair each importance with its column name, then order from most to least important
rf_weights = list(zip(importances, selected_features.columns))
rf_weights.sort(reverse=True)
rf_weights

In [None]:
# Convert the weighted feature list into a dataframe
# (no column names are given, so the columns are labelled 0 = importance, 1 = feature name)
rf_weighted_df = pd.DataFrame(rf_weights)
rf_weighted_df

In [None]:
# Save the dataframe into a csv file for plotting
# (the row index is written as the first column because index=False is not passed)
rf_weighted_df.to_csv("../exporteddata/random_forest_balanced.csv")

In [None]:
### Plot the random forest features

# Blue palette from light to dark, applied to the bars in ascending order of importance
colors = [
    '#c6dbef', '#9ecae1', '#9ecae1', '#9ecae1', '#6baed6', '#6baed6',
    '#4292c6', '#2171b5', '#2171b5', '#08519c', '#08306b', '#023858',
]
sorted_idx = importances.argsort()

rf_fig, rf_ax = plt.subplots()
rf_ax.barh(selected_features.columns[sorted_idx], importances[sorted_idx], color=colors)
rf_ax.set_title("RandomForest feature importance on the outcome")
rf_ax.set_ylabel("Features")
rf_ax.set_xlabel("Weight")

rf_fig.savefig("../static/images/randomforest_balanced.jpg",  bbox_inches='tight')
plt.show()

### XGBoost Classifier Model

In [None]:
from xgboost import XGBRegressor

In [None]:
# is_fraud is a 0/1 class label, so fit a classifier rather than a regressor
from xgboost import XGBClassifier

# A fixed seed keeps the importances reproducible; the single-column y_train is
# flattened to the 1-D label vector that classifiers expect.
xgb = XGBClassifier(n_estimators=20, random_state=42)
xgb.fit(X_train_scaled, np.ravel(y_train))
xgb.feature_importances_

In [None]:
# Pair each XGBoost importance with its column name, then order from most to least important
xgb_weights = list(zip(xgb.feature_importances_, selected_features.columns))
xgb_weights.sort(reverse=True)

In [None]:
# Put the ranked (importance, feature name) pairs into a dataframe and export it as CSV
# (columns are labelled 0 and 1, and the row index is written as the first CSV column)
xgb_df = pd.DataFrame(xgb_weights)
xgb_df.to_csv("../exporteddata/xgboost_features_balanced.csv")

In [None]:
# Horizontal bar chart of the XGBoost feature importances, in ascending order
sorted_idx = xgb.feature_importances_.argsort()
plt.barh(selected_features.columns[sorted_idx], xgb.feature_importances_[sorted_idx])
plt.title("XGBoost feature importance")
plt.xlabel("Weight")
plt.ylabel("Features")
plt.savefig("../static/images/xgb_balanced.jpg", bbox_inches='tight')
# Call show(): a bare `plt.show` only references the function and never runs it
plt.show()

### Correlation Diagram

In [None]:
import seaborn as sns
def correlation_heatmap(train, save_path="../static/images/correlation_balanced.jpg"):
    """Draw, save and display an annotated heatmap of pairwise column correlations.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame whose columns are correlated with ``DataFrame.corr()``.
    save_path : str, optional
        File the figure is written to. Defaults to the path this notebook
        has always used, so existing calls are unaffected.

    Returns
    -------
    None
        The figure is saved to ``save_path`` and shown.
    """
    correlations = train.corr()

    fig, ax = plt.subplots(figsize=(10, 10))
    # Draw on the axes created above and label that same axes explicitly,
    # rather than depending on which axes pyplot currently treats as active.
    sns.heatmap(correlations, vmax=1.0, center=0, fmt='.2f', cmap="YlGnBu",
                square=True, linewidths=.5, annot=True, cbar_kws={"shrink": .70},
                ax=ax)
    ax.set_title("Feature correlation diagram")
    ax.set_xlabel("Features")
    ax.set_ylabel("Features")
    fig.savefig(save_path, bbox_inches='tight')
    plt.show()

# Plot correlations between the training features. Columns are ordered by
# `sorted_idx` as last assigned above (the XGBoost importance order when the
# cells are run top to bottom).
correlation_heatmap(X_train[selected_features.columns[sorted_idx]])
