<img src="https://github.com/rjpost20/Anomalous-Bank-Transactions-Detection-Project/blob/main/data/AdobeStock_319163865.jpeg?raw=true">
Image by <a href="https://stock.adobe.com/contributor/200768506/andsus?load_type=author&prev_url=detail" >AndSus</a> on Adobe Stock

# Phase 5 Project: *Detecting Anomalous Financial Transactions*

## Notebook 3: Modeling, Analysis and Results

### By Ryan Posternak

Flatiron School, Full-Time Live NYC<br>
Project Presentation Date: August 25th, 2022<br>
Instructor: Joseph Mata

<br>

# Imports and Reading in Data

### Google colab compatibility downloads

In [1]:
!sudo apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://dlcdn.apache.org/spark/spark-3.3.0/spark-3.3.0-bin-hadoop3.tgz 
!tar xf spark-3.3.0-bin-hadoop3.tgz
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.3.0-bin-hadoop3"
!pip install pyspark==3.3.0
!pip install -q findspark
import findspark
findspark.init()

[33m0% [Working][0m            Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Get:3 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Hit:4 http://archive.ubuntu.com/ubuntu bionic InRelease
Ign:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease [1,581 B]
Get:7 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:8 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:9 http://security.ubuntu.com/ubuntu bionic-security/main amd64 Packages [2,937 kB]
Hit:10 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Get:11 http://security.ubuntu.com/ubuntu bionic-security/universe amd64 Packages [1,533 kB]
Get:12 http://archive.ubuntu.c

In [2]:
# Connect to Google drive (Colab only); the CSV inputs read later live under /content/drive
from google.colab import drive, files
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import numpy as np
import pandas as pd
from itertools import chain
import os

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, IntegerType, DoubleType
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report

import seaborn as sns
from IPython.display import HTML, display
%matplotlib inline
%config InlineBackend.figure_format='retina'

In [4]:
# Check Colab GPU info
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
    print('Not connected to a GPU')
else:
    print(gpu_info)

Not connected to a GPU


In [5]:
# Report the runtime's total RAM (in GB) and whether it counts as a high-RAM Colab runtime
from psutil import virtual_memory

ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the cut-off used here between a standard and a high-RAM runtime
print('You are using a high-RAM runtime!' if ram_gb >= 20 else 'Not using a high-RAM runtime')

Your runtime has 54.8 gigabytes of available RAM

You are using a high-RAM runtime!


In [6]:
# Set text to wrap in Google colab notebook
def set_css(info=None):
    """Display a <style> block that makes <pre> output wrap instead of overflowing.

    Registered below as a `pre_run_cell` callback, so the style is re-displayed
    before each cell runs.

    Parameters
    ----------
    info : optional
        Argument IPython may pass to `pre_run_cell` callbacks. It is accepted so
        the callback works whether or not IPython supplies it, and is ignored.
    """
    display(HTML("""
    <style>
      pre {
          white-space: pre-wrap;
      }
    </style>
    """))
get_ipython().events.register('pre_run_cell', set_css)

In [7]:
# Start (or reuse) a local Spark session that uses every available core
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Colab")
    .config("spark.ui.port", "4050")
    .config("spark.driver.memory", "15g")
    .getOrCreate()
)

spark

In [8]:
# Load the two training sets (class-weighted and resampled) and the preprocessed test set
# from Google Drive as Spark DataFrames; the first row is the header and types are inferred
train_df_weighted = spark.read.csv(
    '/content/drive/MyDrive/Colab Notebooks/train_df_weighted.csv',
    header=True,
    inferSchema=True,
)
train_df_resampled = spark.read.csv(
    '/content/drive/MyDrive/Colab Notebooks/train_df_resampled.csv',
    header=True,
    inferSchema=True,
)

test_df_preprocessed = spark.read.csv(
    '/content/drive/MyDrive/Colab Notebooks/test_df_preprocessed.csv',
    header=True,
    inferSchema=True,
)

In [10]:
# Report row and column counts for each dataframe (count() triggers a Spark job per frame)
for frame_name, frame in (
    ("train_df_weighted", train_df_weighted),
    ("train_df_resampled", train_df_resampled),
    ("test_df_preprocessed", test_df_preprocessed),
):
    print(f"{frame_name}:  {frame.count():,} Rows, {len(frame.columns)} Columns")

train_df_weighted:  4,691,725 Rows, 15 Columns
train_df_resampled:  4,690,479 Rows, 14 Columns
test_df_preprocessed:  705,108 Rows, 14 Columns


In [17]:
# Print schema of training dataframes.
# train_df_weighted carries one extra column (ClassWeight) compared with train_df_resampled.
print('train_df_weighted:')
train_df_weighted.printSchema()
print('\ntrain_df_resampled:')
train_df_resampled.printSchema()

train_df_weighted:
root
 |-- MessageId: string (nullable = true)
 |-- Label: integer (nullable = true)
 |-- InstructedAmountUSD: integer (nullable = true)
 |-- IntermediaryTransactions: integer (nullable = true)
 |-- Flagged: integer (nullable = true)
 |-- OrderingCountryFreq: integer (nullable = true)
 |-- BeneficiaryCountryFreq: integer (nullable = true)
 |-- Hour: integer (nullable = true)
 |-- SenderHourFreq: integer (nullable = true)
 |-- SenderCurrencyFreq: integer (nullable = true)
 |-- SenderCurrencyAmtAvg: integer (nullable = true)
 |-- SenderFreq: integer (nullable = true)
 |-- ReceiverFreq: integer (nullable = true)
 |-- SenderReceiverFreq: integer (nullable = true)
 |-- ClassWeight: double (nullable = true)


train_df_resampled:
root
 |-- MessageId: string (nullable = true)
 |-- Label: integer (nullable = true)
 |-- InstructedAmountUSD: integer (nullable = true)
 |-- IntermediaryTransactions: integer (nullable = true)
 |-- Flagged: integer (nullable = true)
 |-- OrderingCou

In [18]:
# Print schema of test dataframe (expected to match train_df_resampled column for column)
print('test_df_preprocessed:')
test_df_preprocessed.printSchema()

test_df_preprocessed:
root
 |-- MessageId: string (nullable = true)
 |-- Label: integer (nullable = true)
 |-- InstructedAmountUSD: integer (nullable = true)
 |-- IntermediaryTransactions: integer (nullable = true)
 |-- Flagged: integer (nullable = true)
 |-- OrderingCountryFreq: integer (nullable = true)
 |-- BeneficiaryCountryFreq: integer (nullable = true)
 |-- Hour: integer (nullable = true)
 |-- SenderHourFreq: integer (nullable = true)
 |-- SenderCurrencyFreq: integer (nullable = true)
 |-- SenderCurrencyAmtAvg: integer (nullable = true)
 |-- SenderFreq: integer (nullable = true)
 |-- ReceiverFreq: integer (nullable = true)
 |-- SenderReceiverFreq: integer (nullable = true)



In [19]:
# For each dataframe:
#   * drop 'MessageId' - an individual transaction identifier that will not be used in modeling
#   * rename the target column 'Label' to the more descriptive 'Anomalous'
train_df_weighted = (
    train_df_weighted
    .drop('MessageId')
    .withColumnRenamed('Label', 'Anomalous')
)
train_df_resampled = (
    train_df_resampled
    .drop('MessageId')
    .withColumnRenamed('Label', 'Anomalous')
)
test_df_preprocessed = (
    test_df_preprocessed
    .drop('MessageId')
    .withColumnRenamed('Label', 'Anomalous')
)

In [22]:
# Display first row of train_df_weighted dataframe (one field per line, values untruncated)
train_df_weighted.show(n=1, truncate=False, vertical=True)

-RECORD 0-----------------------------------------
 Anomalous                | 0                     
 InstructedAmountUSD      | 519149                
 IntermediaryTransactions | 0                     
 Flagged                  | 0                     
 OrderingCountryFreq      | 73733                 
 BeneficiaryCountryFreq   | 40984                 
 Hour                     | 8                     
 SenderHourFreq           | 47559                 
 SenderCurrencyFreq       | 73733                 
 SenderCurrencyAmtAvg     | 1906631               
 SenderFreq               | 73733                 
 ReceiverFreq             | 41012                 
 SenderReceiverFreq       | 40984                 
 ClassWeight              | 0.0010443919880214632 
only showing top 1 row



In [21]:
# Display first row of train_df_resampled dataframe (one field per line, values untruncated)
train_df_resampled.show(n=1, truncate=False, vertical=True)

-RECORD 0---------------------------
 Anomalous                | 0       
 InstructedAmountUSD      | 809115  
 IntermediaryTransactions | 0       
 Flagged                  | 0       
 OrderingCountryFreq      | 49092   
 BeneficiaryCountryFreq   | 116098  
 Hour                     | 5       
 SenderHourFreq           | 11206   
 SenderCurrencyFreq       | 49092   
 SenderCurrencyAmtAvg     | 1907059 
 SenderFreq               | 49092   
 ReceiverFreq             | 116135  
 SenderReceiverFreq       | 49092   
only showing top 1 row



In [23]:
# Display first row of test_df_preprocessed dataframe (one field per line, values untruncated)
test_df_preprocessed.show(n=1, truncate=False, vertical=True)

-RECORD 0---------------------------
 Anomalous                | 0       
 InstructedAmountUSD      | 653958  
 IntermediaryTransactions | 0       
 Flagged                  | 0       
 OrderingCountryFreq      | 73733   
 BeneficiaryCountryFreq   | 40984   
 Hour                     | 8       
 SenderHourFreq           | 47559   
 SenderCurrencyFreq       | 73733   
 SenderCurrencyAmtAvg     | 1906631 
 SenderFreq               | 73733   
 ReceiverFreq             | 41012   
 SenderReceiverFreq       | 40984   
only showing top 1 row



<br>

# Create Pipeline to Preprocess and Model Data

### Index string columns with StringIndexer

In [None]:
stages = []

# Every column whose Spark dtype starts with 'string' is treated as a nominal categorical
categoricalCols = [
    name for name, dtype in train_df_resampled.dtypes if dtype.startswith('string')
]

# One StringIndexer per categorical column, writing to '<column>_index'
indexers = [
    StringIndexer(inputCol=name, outputCol=name + '_index', handleInvalid='keep')
    for name in categoricalCols
]

# Names of the index columns the indexers will produce
indexed_features = [indexer.getOutputCol() for indexer in indexers]

print(f"Indexed nominal categorical features: \n{indexed_features}")

### Create a OneHotEncoder to encode the indexed string features

In [None]:
# Encode each indexed column into a '<column>_ohe' vector column
encoder = OneHotEncoder(
    inputCols=indexed_features,
    outputCols=[name + '_ohe' for name in indexed_features],
    dropLast=True,
)

print(f"One hot encoded nominal categorical features: {len(encoder.getOutputCols())}\n{encoder.getOutputCols()}")

### Compile numeric features, not including target column or class weight column of train_df_weighted

In [28]:
# Every non-string column other than the 'Anomalous' target is a numeric feature
numeric_features = [
    name
    for name, dtype in train_df_resampled.dtypes
    if not (dtype == 'string' or name == 'Anomalous')
]

# Sanity check: indexed + numeric features must account for every non-target column
assert len(train_df_resampled.drop('Anomalous').columns) == (len(indexed_features) + len(numeric_features))
print(f"Numeric features: {len(numeric_features)}\n{numeric_features}")

Numeric features: 12
['InstructedAmountUSD', 'IntermediaryTransactions', 'Flagged', 'OrderingCountryFreq', 'BeneficiaryCountryFreq', 'Hour', 'SenderHourFreq', 'SenderCurrencyFreq', 'SenderCurrencyAmtAvg', 'SenderFreq', 'ReceiverFreq', 'SenderReceiverFreq']


In [29]:
# Print names of final features going into the model
features = numeric_features
print(f"Final features: {len(features)}\n{features}")

Final features: 12
['InstructedAmountUSD', 'IntermediaryTransactions', 'Flagged', 'OrderingCountryFreq', 'BeneficiaryCountryFreq', 'Hour', 'SenderHourFreq', 'SenderCurrencyFreq', 'SenderCurrencyAmtAvg', 'SenderFreq', 'ReceiverFreq', 'SenderReceiverFreq']


### Create a VectorAssembler to combine all features

In [None]:
# Combine the selected feature columns into a single vector column
assembler = VectorAssembler(inputCols=features, outputCol='vectorized_features')

# Scale the assembled vector; the scaled vector is written to 'scaled_features'
scaler = StandardScaler(inputCol='vectorized_features', outputCol='scaled_features')

# Stage order: all indexers, then the one hot encoder, the vector assembler,
# and finally the standard scaler
stages = [*indexers, encoder, assembler, scaler]
print("Stages:", stages)

### Create modeling pipeline

In [None]:
# Build the preprocessing pipeline from the stages assembled above
pipeline = Pipeline(stages=stages)

# Fit every preprocessing stage on the resampled training data
pipeline_model = pipeline.fit(train_df_resampled)

# Apply the fitted stages to the same training data to inspect the engineered columns
pipeline_df = pipeline_model.transform(train_df_resampled)

In [None]:
# Display first row of train_df_resampled after running through the fitted pipeline
pipeline_df.show(1, vertical=True, truncate=False)

In [None]:
# The test set is transformed with the pipeline fitted on the TRAINING data. The pipeline
# is deliberately not re-fitted on the test set: fitting the scaler on test rows would leak
# test-set statistics into preprocessing (and costs extra passes over the data).
# `pipeline_test` is kept as a name in case later cells reference it.
pipeline_test = pipeline_model

pipeline_df_test = pipeline_model.transform(test_df_preprocessed)

# Length of the final feature vector. The column name is read from the scaler because the
# pipeline's last stage writes there; no stage produces a column literally named 'features'.
pipeline_df_test.head()[scaler.getOutputCol()].size

In [None]:
# Display first row of test_df_preprocessed after running through the train-fitted pipeline
pipeline_df_test.show(1, vertical=True, truncate=False)

<br>

In [None]:
# Exploratory check: dump the attributes of the evaluator's `metricName` Param
# (presumably to look up the metric names it documents - e.g. areaUnderPR used below)
print(vars(BinaryClassificationEvaluator.metricName))

In [None]:
def score_model(classifier, train_df, test_df, preprocessing_stages=stages):
    """Fit the preprocessing stages plus `classifier` and print train/test metrics.

    Parameters
    ----------
    classifier
        Estimator appended as the last pipeline stage. Its output must contain the
        'prediction' and 'rawPrediction' columns that the evaluators read.
    train_df
        DataFrame with an 'Anomalous' label column; used to fit the whole pipeline.
    test_df
        DataFrame scored with the fitted pipeline; must also contain 'Anomalous'.
    preprocessing_stages
        Sequence of pipeline stages run before the classifier. Defaults to the
        notebook-level `stages` list as bound when this function was defined.

    Returns
    -------
    None. The function prints accuracy and AUPRC for both sets, and per-label
    precision and recall for the test set.
    """
    # Fit the preprocessing stages and the classifier as one pipeline
    # (a new list is built, so the caller's stage list is never modified)
    stages_with_classifier = list(preprocessing_stages) + [classifier]
    model = Pipeline(stages=stages_with_classifier).fit(train_df)

    # Evaluator for the area under the precision-recall curve
    auprc_evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',
                                                    labelCol='Anomalous',
                                                    metricName="areaUnderPR")

    # Evaluator for accuracy and per-label precision / recall
    label_evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                                        labelCol='Anomalous')

    def test_metric_by_label(metric_name, label):
        """Evaluate a per-label metric on the test predictions."""
        # metricLabel is a float-valued Param, so the label is passed as a float
        return label_evaluator.evaluate(test_results,
                                        {label_evaluator.metricName: metric_name,
                                         label_evaluator.metricLabel: float(label)})

    # Every evaluate() call below is a separate Spark job; caching the predictions keeps
    # the pipeline transformations from being recomputed for each metric
    train_results = model.transform(train_df).cache()
    test_results = model.transform(test_df).cache()
    try:
        # Accuracy on train and test data
        train_acc = label_evaluator.evaluate(train_results, {label_evaluator.metricName: 'accuracy'})
        test_acc = label_evaluator.evaluate(test_results, {label_evaluator.metricName: 'accuracy'})
        # AUPRC on train and test data
        train_auprc = auprc_evaluator.evaluate(train_results)
        test_auprc = auprc_evaluator.evaluate(test_results)
        # Per-label precision and recall on test data
        test_precision_0 = test_metric_by_label('precisionByLabel', 0)
        test_precision_1 = test_metric_by_label('precisionByLabel', 1)
        test_recall_0 = test_metric_by_label('recallByLabel', 0)
        test_recall_1 = test_metric_by_label('recallByLabel', 1)
    finally:
        # Release the cached predictions even if an evaluation fails
        train_results.unpersist()
        test_results.unpersist()

    print(classifier, '\n')
    print(f"Training set accuracy: {round(train_acc, 3)}", end='\t')
    print(f"Test set accuracy: {round(test_acc, 3)}")
    print('-'*60)
    print(f"Training set AUPRC: {round(train_auprc, 3)}", end='\t')
    print(f"Test set AUPRC: {round(test_auprc, 3)}")
    print('-'*60)
    print("Test set precision")
    print(f"Non-anomalous: {round(test_precision_0, 3)}", end='\t')
    print(f"Anomalous: {round(test_precision_1, 3)}")
    print('-'*60)
    print("Test set recall")
    print(f"Non-anomalous: {round(test_recall_0, 3)}", end='\t')
    print(f"Anomalous (label 1): {round(test_recall_1, 3)}")

In [None]:
# Baseline logistic regression.
lr_1 = LogisticRegression(
    # Train on the final vector produced by the preprocessing stages (the scaler's output
    # column); those stages do not create a column named 'features'
    featuresCol=scaler.getOutputCol(),
    labelCol='Anomalous',
    predictionCol='prediction',
    rawPredictionCol='rawPrediction',
    # The features are already scaled by the pipeline's StandardScaler stage
    standardization=False,
    # Non-default decision threshold applied to the predicted probability of class 1
    threshold=0.9
)

In [None]:
# Fit lr_1 (with the preprocessing stages) on the resampled training data and print
# its train/test metrics
score_model(classifier=lr_1, train_df=train_df_resampled, test_df=test_df_preprocessed)

In [None]:
# NOTE(review): `plot_confusion_matrix` is not defined or imported in the visible part of
# this notebook; on a fresh kernel this cell raises NameError unless it is defined
# elsewhere - confirm and add the definition above this cell.
plot_confusion_matrix(classifier=lr_1, train_df=train_df_resampled, test_df=test_df_preprocessed)