In [1]:
import sys
# Install the notebook's third-party dependencies with the same interpreter
# that runs this kernel (sys.executable), rather than whichever `pip` happens
# to be first on PATH. Versions are not pinned here.
!{sys.executable} -m pip install numpy
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install matplotlib
!{sys.executable} -m pip install scikit-learn
!{sys.executable} -m pip install ipywidgets

# Enable the ipywidgets notebook extension for that same interpreter's Jupyter.
# NOTE(review): presumably only needed by the classic Notebook front end - confirm.
!{sys.executable} -m jupyter nbextension enable --py widgetsnbextension



Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: ok


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import ipywidgets as widgets

In [3]:
# Load the raw transactions from a CSV file addressed by a relative path
# (resolved against the kernel's working directory).
# Later cells read the 'Time', 'Amount' and 'Class' columns and treat the
# last column as the target.
datasource_loc = "creditcard.csv"
df = pd.read_csv(datasource_loc)

def pca_transform(df, feat_to_trans):
    """Standardize selected feature columns and return the re-assembled frame.

    Despite its name, this helper does not run PCA: the chosen columns are
    passed through ``StandardScaler().fit_transform`` and every other column
    is carried over untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. The last column is treated as the target and is excluded
        from the feature list; a column named ``'Class'`` must exist.
    feat_to_trans : list of str
        Names of the columns to standardize.

    Returns
    -------
    pandas.DataFrame
        A new frame with the standardized columns first, then the remaining
        feature columns in their original order, then ``'Class'``. The row
        index of ``df`` is preserved.
    """
    features = df.columns[:-1]
    untouched = [name for name in features if name not in feat_to_trans]

    # fit_transform returns a bare ndarray, so the row index has to be put
    # back explicitly. Without it the new frame gets a fresh 0..n-1 index and
    # the axis=1 concat below aligns on labels, producing NaN-padded rows
    # whenever ``df`` carries a non-default index (filtered, shuffled, ...).
    scaled = pd.DataFrame(
        data=StandardScaler().fit_transform(df.loc[:, feat_to_trans]),
        columns=feat_to_trans,
        index=df.index,
    )

    return pd.concat([scaled, df.loc[:, untouched], df.loc[:, 'Class']], axis=1)

# Standardize Time and Amount for the descriptive plots.
# (pca_transform applies StandardScaler to the listed columns; it does not
# perform a PCA, despite its name.)
df_trans = pca_transform(df, ['Time', 'Amount'])

In [4]:
def simple_box_plot(data, xticks, xlabel, ylabel, title):
    """Draw box plots for ``data`` on a fresh figure and show it.

    Parameters
    ----------
    data : sequence
        Passed straight to ``Axes.boxplot``; callers in this notebook pass a
        list with one series per box.
    xticks : sequence of str
        Tick labels, placed at x positions ``1 .. len(xticks)``.
    xlabel, ylabel, title : str
        Axis labels and figure title.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    fig, ax = plt.subplots()
    ax.set_xlabel(xlabel, fontsize=12)
    ax.set_ylabel(ylabel, fontsize=12)
    ax.set_title(title, fontsize=14)
    ax.grid(axis='y')

    # Outliers ("fliers") are drawn as green diamonds.
    flier_style = dict(markerfacecolor='g', marker='D')
    # Draw on the Axes created above instead of relying on pyplot's
    # "current axes" state.
    ax.boxplot(data, flierprops=flier_style)

    ax.set_xticks(list(range(1, len(xticks) + 1)))
    ax.set_xticklabels(xticks, fontsize=12)
    ax.tick_params(axis='y', labelsize=12)
    plt.show()

In [5]:
def multi_plot_linear_regression(df, x_col, y_col, target_col, targets, colors, legend, x_label, y_label, title, reg_line=True):
    """Scatter ``y_col`` against ``x_col`` per target value, optionally with fit lines.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    x_col, y_col : str
        Columns plotted on the x and y axes.
    target_col : str
        Column whose value selects the group a row belongs to.
    targets : sequence
        Values of ``target_col`` to plot, one group each.
    colors : sequence
        One matplotlib colour per target (RGB tuple or any colour spec that
        ``matplotlib.colors.to_rgb`` understands).
    legend : sequence of str
        Legend label for each target, in the same order as ``targets``.
    x_label, y_label, title : str
        Axis labels and figure title.
    reg_line : bool, default True
        When true, a degree-1 least-squares line is drawn for every group
        that has at least two points, in a darker shade of the group colour.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    # matplotlib is already a dependency of this notebook (pyplot); imported
    # locally so this cell does not rely on an extra top-level import.
    from matplotlib.colors import to_rgb

    fig, ax = plt.subplots()
    ax.set_xlabel(x_label, fontsize=12)
    ax.set_ylabel(y_label, fontsize=12)
    ax.set_title(title, fontsize=14)

    for position, (target, color) in enumerate(zip(targets, colors)):
        group_label = legend[position] if position < len(legend) else None

        rows = df[target_col] == target
        X = df.loc[rows, x_col]
        Y = df.loc[rows, y_col]

        # Labels are attached to the artists themselves so the legend cannot
        # drift out of sync with the draw order (scatter vs. line artists).
        ax.scatter(X, Y, color=color, s=10, label=group_label)

        # A straight-line fit needs at least two points; skip empty or
        # single-point groups instead of letting np.polyfit fail.
        if reg_line and len(X) >= 2:
            slope, intercept = np.polyfit(X, Y, deg=1)
            r, g, b = to_rgb(color)
            line_label = None if group_label is None else group_label + " Regression Line"
            ax.plot(X, slope * X + intercept,
                    color=(r * 0.5, g * 0.5, b * 0.5),
                    label=line_label)

    ax.legend()
    ax.grid()

    plt.show()

In [6]:
# Every column except the last one (the target) is selectable as a feature.
features = df_trans.columns[:-1]

# Feature shown on the X-axis; defaults to the first feature.
x = widgets.Dropdown(
    options=features,
    value=features[0],
    description='X-axis: ',
    disabled=False)

# Feature shown on the Y-axis; defaults to the second feature.
y = widgets.Dropdown(
    options=features,
    value=features[1],
    description='Y-axis: ',
    disabled=False)

# Kind of chart to draw; these strings are compared verbatim in plot_it.
plot_type = widgets.Dropdown(
    options=['Box Plot', 'Scatterplot'],
    value='Box Plot',
    description='Plot Type: ',
    disabled=False
)

# Toggle for per-class regression lines (only honoured for scatterplots).
reg_line = widgets.Checkbox(
    value=False,
    description="Add Regression Lines",
    disabled=False,
    indent=True
)

# Descriptive Analysis

In [7]:
def plot_it(x, y, plot_type, reg_line):
    """Render the descriptive-analysis chart selected through the widgets.

    Parameters
    ----------
    x, y : str
        Feature names of ``df_trans`` for the X and Y axes. A box plot only
        uses ``y``.
    plot_type : str
        ``'Box Plot'`` or ``'Scatterplot'``; any other value draws nothing.
    reg_line : bool
        Whether scatterplots should include regression lines.
    """
    display(widgets.HTML(value='<h1>Descriptive Analysis</h1>'))

    # Figure defaults for everything drawn from here on.
    plt.rcParams.update({'figure.dpi': 100, 'figure.figsize': (10, 10)})

    if plot_type == 'Box Plot':
        print("Box Plots will only graph the feature chosen for the Y-axis.")
        if reg_line:
            print('Regression Lines only show on Scatterplots.')

        # One series of the chosen feature per class: 0 = genuine, 1 = fraud.
        y_with_class = df_trans.loc[:, [y, 'Class']]
        per_class = [
            y_with_class[y_with_class['Class'] == class_value].loc[:, y]
            for class_value in (0, 1)
        ]
        simple_box_plot(per_class, ['Genuine', 'Fraud'], 'Target', f'{y}', f'{y} Effect on Fraud')
        return

    if plot_type != 'Scatterplot':
        return

    if x == y:
        print('Features on X-axis and Y-axis should be different.')
        return

    class_values = [0, 1]
    class_colors = [(1.0, 0.0, 0.0), (0.0, 0.0, 1.0)]
    class_labels = ['Genuine', 'Fraud']
    multi_plot_linear_regression(
        df_trans,
        x,
        y,
        'Class',
        class_values,
        class_colors,
        class_labels,
        x,
        y,
        f'Effects of {x} and {y} on Fraud',
        reg_line=reg_line,
    )
        

# Bind the four widgets to plot_it's parameters; as the cell's last
# expression, the resulting interactive container is what gets displayed.
widgets.interactive(plot_it, x=x, y=y, plot_type=plot_type, reg_line=reg_line)

interactive(children=(Dropdown(description='X-axis: ', options=('Time', 'Amount', 'V1', 'V2', 'V3', 'V4', 'V5'…

# Non-Descriptive Analysis
## Training

In [8]:
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Single seed shared by the shuffle and the forest so that the split, the
# trained model and every metric below are reproducible across kernel restarts.
RANDOM_SEED = 42

# Get final dataset, set list of features to remove.
# 'Amount' is left unscaled and excluded from the model inputs; the testing
# cell uses it to total the dollar value of fraud.
# NOTE(review): 'Time' is standardized on the full dataset before splitting,
# so validation/test rows influence the scaling - confirm this is acceptable.
df_final = pca_transform(df, ['Time'])
feat_to_remove = ['Amount', 'Class']

# Split the data into Training (50%), Validation (25%), and Testing (rest) Datasets
sampling_df = shuffle(df_final, random_state=RANDOM_SEED)

total_samples = sampling_df.shape[0]
training_size = int(total_samples * 0.5)
validation_size = int(total_samples * 0.25)
testing_size = int(total_samples * 0.25)

training_df = sampling_df.iloc[:training_size, :]
validation_df = sampling_df.iloc[training_size:training_size + validation_size, :]
# Everything after the validation slice, including any rounding leftovers.
testing_df = sampling_df.iloc[training_size + validation_size:, :]

# Train the machine learning model
clf = RandomForestClassifier(n_estimators=23, max_features=12, n_jobs=-1, random_state=RANDOM_SEED)
X_train = training_df.loc[:, [f for f in training_df.columns if f not in feat_to_remove]].values
Y_train = training_df.loc[:, 'Class'].values
clf = clf.fit(X_train, Y_train)

## Validation

In [9]:
# Score the trained forest on the validation slice, using the same feature
# columns it was trained on (everything not listed in feat_to_remove).
X_validate = validation_df.loc[:, ~validation_df.columns.isin(feat_to_remove)].values
Y_validate = validation_df['Class'].values

Y_predict = clf.predict(X_validate)

print('Validation Results for Random Forest Classifer:')
# Recall and precision for the fraud class, then precision for the genuine class.
for metric_name, metric_value in (
    ('recall', metrics.recall_score(Y_validate, Y_predict)),
    ('precision positive', metrics.precision_score(Y_validate, Y_predict)),
    ('precision negative', metrics.precision_score(Y_validate, Y_predict, pos_label=0)),
):
    print(metric_name, metric_value)

Validation Results for Random Forest Classifer:
recall 0.8548387096774194
precision positive 0.8548387096774194
precision negative 0.9997467535208295


## Testing

In [10]:
# Testing interface
# Month names in calendar order.
_MONTH_NAMES = (
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
)

# (name, number) pairs, numbered from 1; used as the options of the month dropdowns.
MONTHS = [(name, number) for number, name in enumerate(_MONTH_NAMES, start=1)]

# Reverse lookup from month number to month name.
MONTH_NUM_TO_NAME = {number: name for name, number in MONTHS}

# Static heading for the performance-analysis demo.
title = widgets.HTML(
    value="<h1>Fraud Detect - Demo Version</h1><h2>Performance Analysis</h2><h3>Maximum Timeframe is 1 year</h3>",
    placeholder='',
    description=''
)

# First month of the reporting window (value is the month number, 1-12).
month_from = widgets.Dropdown(
    options = MONTHS,
    value = 1,
    description = 'Month from:'
)

# Last month of the reporting window, inclusive (value is the month number, 1-12).
month_to = widgets.Dropdown(
    options = MONTHS,
    value = 1,
    description = 'Month to:'
)

# Triggers the report; the click handler is registered further down.
btn = widgets.Button(
    description='Run',
    disabled=False,
    button_style='',
    tooltip='Run report',
    icon='check'
)

# Output area that the click handler clears and redraws on every run.
out = widgets.Output()

display(title)
display(month_from)
display(month_to)
display(btn)
display(out)

# Number of test rows that make up one "month": the test split is cut into
# 12 equal positional slices. Integer truncation means 12 * month_width can
# be smaller than the number of test rows.
month_width = int(testing_df.shape[0] / 12)

def run(month_from, month_to):
    """Evaluate the classifier on a month window of the test split and plot the results.

    The test split is treated as 12 consecutive positional slices of
    ``month_width`` rows each. The window runs from ``month_from`` through
    ``month_to`` inclusive; when ``month_from > month_to`` it wraps around the
    end of the split back to the start.

    Parameters
    ----------
    month_from, month_to : int
        Month numbers in the range 1-12.

    Returns
    -------
    tuple
        ``(identified, missed)``: the summed ``'Amount'`` of fraudulent rows
        that were predicted as fraud and of those predicted as genuine.

    Notes
    -----
    Reads the module-level ``testing_df``, ``month_width``, ``clf`` and
    ``feat_to_remove``. Draws three pie charts (recall, precision for the
    fraud class, precision for the genuine class) with ``plt.show()``.
    """
    months_per_year = 12
    first_interval = (month_from - 1) * month_width
    # month_width is truncated, so 12 * month_width can stop short of the
    # last test rows. Let the final month run to the end of the split, which
    # is what the wrap-around branch below already does via iloc[first:].
    if month_to == months_per_year:
        second_interval = testing_df.shape[0]
    else:
        second_interval = month_to * month_width

    if month_from > month_to:
        window = pd.concat([testing_df.iloc[first_interval:], testing_df.iloc[:second_interval]])
        print('NOTE: Timeframe spans across two different calendar years.')
    else:
        window = testing_df.iloc[first_interval:second_interval]

    # The target is the last column; everything before it is transaction data.
    X_test = window.iloc[:, :-1]
    Y_test = window.iloc[:, -1].values

    # Predict fraud with the trained classifier
    Y_predict = clf.predict(X_test.loc[:, [f for f in X_test.columns if f not in feat_to_remove]].values)

    # Calculate performance metrics
    recall = metrics.recall_score(Y_test, Y_predict)
    precision_pos = metrics.precision_score(Y_test, Y_predict)
    precision_neg = metrics.precision_score(Y_test, Y_predict, pos_label=0)

    # Build pie charts: the green wedge is the score, the black wedge the remainder.
    colors = ['g', 'k']
    panels = (
        (recall, 'Recall {:.2f}%', 'Percent of Fraudulent Transactions Identified'),
        (precision_pos, 'Precision Positive {:.2f}%', 'Percent of Fraud Predictions that were Correct'),
        (precision_neg, 'Precision Negative {:.2f}%', 'Percent of Genuine Predictions that were Correct'),
    )

    fig, axis = plt.subplots(3, 1)
    for ax, (score, label_format, heading) in zip(axis, panels):
        ax.pie(np.array([score, 1 - score]), labels=[label_format.format(score * 100), ''], colors=colors)
        ax.set_title(heading)

    plt.show()

    # Create a single dataframe containing transaction info, the actual values, and predicted values
    results_df = X_test.copy()
    results_df['Class'] = Y_test
    results_df['Predict'] = Y_predict

    # Sum the dollar amounts of identified fraud and missed fraud
    is_fraud = results_df['Class'] == 1
    identified = results_df.loc[is_fraud & (results_df['Predict'] == 1), 'Amount'].sum(axis=0)
    missed = results_df.loc[is_fraud & (results_df['Predict'] == 0), 'Amount'].sum(axis=0)
    return (identified, missed)
    
def on_btn_clicked(b):
    """Click handler for the Run button: redraw the report inside ``out``.

    Parameters
    ----------
    b : object
        The argument supplied by the button callback; not used.
    """
    with out:
        out.clear_output()

        # Heading naming the selected month range.
        display(widgets.HTML(
            value=f'<h3>Performance Analysis - {MONTH_NUM_TO_NAME[month_from.value]} through {MONTH_NUM_TO_NAME[month_to.value]}</h3>'
        ))

        # run() draws the charts and reports fraud dollars caught vs. missed.
        caught, slipped = run(month_from.value, month_to.value)
        total_fraud = caught + slipped

        # Display the dollar amount of identified fraud.
        summary_html = '<p style="font-size:18px"><strong>${:,.02f} of ${:,.02f}</strong> possible fraud identified</p>'.format(caught, total_fraud)
        display(widgets.HTML(value=summary_html, placeholder='', description=''))
        
# Register the handler so that each click on Run regenerates the report.
btn.on_click(on_btn_clicked)

HTML(value='<h1>Fraud Detect - Demo Version</h1><h2>Performance Analysis</h2><h3>Maximum Timeframe is 1 year</…

Dropdown(description='Month from:', options=(('January', 1), ('February', 2), ('March', 3), ('April', 4), ('Ma…

Dropdown(description='Month to:', options=(('January', 1), ('February', 2), ('March', 3), ('April', 4), ('May'…

Button(description='Run', icon='check', style=ButtonStyle(), tooltip='Run report')

Output()