In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Background of This Project
The dataset is designed to offer insights into various aspects of financial movements often associated with illicit activities. Financial institutions are ethically obliged to identify black money laundering activities, because the inflow of black money into the economy leads to the corruption of society.

This is because black money comes either from illegal activities or from legal income that is deliberately kept untaxed. When such money is laundered and then spent publicly, it undermines the structure of society and contributes to rising crime rates. Moreover, black money cannot be included in a country's gross national product (GNP), so the country's economic situation is substantially underestimated. These inaccuracies affect the country's policy making and, eventually, the welfare of the population.

Considering such background knowledge along with comprehensive attributes ranging from transaction amounts to risk scores, I will be attempting to:

* identify patterns in financial crime in order to classify whether a new transaction is legal or illegal, helping to ease the problems that black money causes -> **Source of Money Classification**  

**Columns:**

* Transaction ID: Unique identifier for each transaction. (e.g., TX0000001)
* Country: Country where the transaction occurred. (e.g., USA, China)
* Amount (USD): Transaction amount in US Dollars. (e.g., 150000.00)
* Transaction Type: Type of transaction. (e.g., Offshore Transfer, Property Purchase)
* Date of Transaction: The date and time of the transaction. (e.g., 2022-03-15 14:32:00)
* Person Involved: Name or identifier of the person/entity involved. (e.g., Person_1234)
* Industry: Industry associated with the transaction. (e.g., Real Estate, Finance)
* Destination Country: Country where the money was sent. (e.g., Switzerland)
* Reported by Authority: Whether the transaction was reported to authorities. (e.g., True/False)
* **Source of Money**: Origin of the money. (e.g., Legal, Illegal) **This will be our target label.**
* Money Laundering Risk Score: Risk score indicating the likelihood of money laundering (1-10). (e.g., 8)
* Shell Companies Involved: Number of shell companies used in the transaction. (e.g., 3)
* Financial Institution: Bank or financial institution involved in the transaction. (e.g., Bank_567)
* Tax Haven Country: Country where the money was transferred to a tax haven. (e.g., Cayman Islands)

**Key Business Goals**
* Minimize risk of money laundering through identifying source of money from the dataset and eventually minimize risk of societal corruption

## Import Relevant Packages

In [None]:
pd.set_option('display.max_columns', None)
# General Modules
import datetime

# Model Preprocessing
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from imblearn.over_sampling import SMOTE

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Model Construction
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import SVC

# Model Evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, roc_auc_score
from sklearn.metrics import classification_report

In [None]:
## Read in CSV file
black_money = pd.read_csv('/kaggle/input/global-black-money-transactions-dataset/Big_Black_Money_Dataset.csv')
black_money.head()

## EDA

In [None]:
## Access general information about the dataset
black_money.info()

**It seems that we have 14 columns including the target label and 10K entries**
* There are some numeric (float or int) columns such as **Amount (USD)**, **Money Laundering Risk Score**, and **Shell Companies Involved**

* Some variables need to be modified into datetime like **Date of Transaction** and some variables like **Shell Companies Involved** need to be converted into categorical variables

* There exists categorical variables like **Transaction Type**, **Tax Haven Country**, etc

In [None]:
## Convert necessary variables into optimum data types
black_money['Shell Companies Involved'] = black_money['Shell Companies Involved'].astype('str')

In [None]:
# Parse the timestamp strings in one vectorised call instead of a row-wise
# strptime. pd.to_datetime also accepts values that are already datetimes,
# so re-running this cell does not raise.
black_money['Date of Transaction'] = pd.to_datetime(
    black_money['Date of Transaction'], format='%Y-%m-%d %H:%M:%S'
)

In [None]:
## Access statistical information about the dataset
black_money.describe(include = 'all')

There seems to be no evidences of abnormal behaviour among the variables

In [None]:
## Check for null values 
black_money.isna().sum()

Luckily there are no null values!

In [None]:
## Check for duplicated values
black_money.duplicated().sum()

Even for the duplicating values, we cannot find any of them!

In [None]:
black_money.head()

In [None]:
## Check for proportion of the target label
black_money['Source of Money'].value_counts(normalize = True)

Although the target label is imbalanced, the extent at which it is imbalanced is not too extreme.

In [None]:
# Drop the two identifier columns. Reassigning (rather than inplace=True) with
# errors='ignore' keeps the cell safe to re-run once the columns are gone.
black_money = black_money.drop(columns = ['Transaction ID', 'Person Involved'], errors = 'ignore')

Considering data from two variables possibly include unnecessary personal information, I decided to drop the two of them

In [None]:
## Collect the names of all numeric columns.
## select_dtypes(include='number') matches every integer/float width, whereas
## comparing dtypes against the builtin int/float only matches the platform defaults.
numerical_features = black_money.select_dtypes(include = 'number').columns
numerical_features

In [None]:
## Collect the object-typed columns, then leave out the bank identifier and the target label
object_columns = black_money.select_dtypes(include = 'object').columns
categorical_features = object_columns.drop(['Financial Institution', 'Source of Money'])
categorical_features

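Here I excluded `Financial Institution` and `Source of Money` from the categorical features: the former is an identifier-like column that is handled separately during feature engineering, and the latter is the target label.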

In [None]:
## Plot Histograms of numerical features (one panel per feature)
# The panel count follows the feature list instead of being hard-coded to 2;
# squeeze=False always returns a 2-D grid, which is flattened for easy pairing.
fig, ax = plt.subplots(1, len(numerical_features), figsize = (20, 5), squeeze = False)
ax = ax.flatten()

for panel, feature in zip(ax, numerical_features):
    sns.histplot(data = black_money, x = feature, ax = panel)
    panel.set_title(f'{feature} Histogram')

plt.tight_layout()
plt.show()

Both numerical features are roughly evenly distributed over their ranges, so they will be normalised with MinMaxScaler.

**How can Normalizing help in this scenario?**
* Improve Convergence : Algorithms trained with gradient-based optimisation (e.g. logistic regression) converge faster when features are on a similar scale

* Reduce the Noisiness coming from Bias : Especially for algorithms that rely on distance metrics

* Better Interpretation of Feature Importance : Normalising helps in interpreting feature importances

Now let's check for the presence of outliers in the dataset.

In [None]:
## Check if outliers are present in the variables (one boxplot per numerical feature)
# The panel count follows the feature list instead of being hard-coded to 2;
# squeeze=False always returns a 2-D grid, which is flattened for easy pairing.
fig, ax = plt.subplots(1, len(numerical_features), figsize = (20, 5), squeeze = False)
ax = ax.flatten()

for panel, feature in zip(ax, numerical_features):
    sns.boxplot(data = black_money, x = feature, ax = panel)
    panel.set_title(f'Boxplot of {feature}')

plt.tight_layout()
plt.show()

Luckily there seem to be no outliers in the numerical features, but we should confirm this numerically.

In [None]:
## Check for outliers
def check_outliers(data, features):
    """Print, for each feature, how many rows lie outside the 1.5 * IQR fences.

    Parameters
    ----------
    data : DataFrame holding the columns named in `features`.
    features : iterable of column names to inspect.

    Nothing is returned; one line per feature is written to stdout.
    """
    for column in features:
        values = data[column]
        first_quartile = values.quantile(0.25)
        third_quartile = values.quantile(0.75)
        spread = third_quartile - first_quartile

        # Tukey fences: anything further than 1.5 * IQR from the quartiles
        low_fence = first_quartile - 1.5 * spread
        high_fence = third_quartile + 1.5 * spread

        outside_fences = (values < low_fence) | (values > high_fence)
        num_of_outliers = outside_fences.sum()
        print(f'Number of Outliers for {column} : {num_of_outliers}')

check_outliers(black_money, numerical_features)

We have finally proved both visually and mathematically that there are no outliers among the numerical features!

It's now time to check for value counts of categorical features.

In [None]:
# Lay the value-count bar charts of the categorical features out on a grid
plots_per_row = 2

# Ceiling division: rows needed to fit every feature
num_of_rows = (len(categorical_features) + plots_per_row - 1) // plots_per_row

# Build the grid and flatten it so panels can be paired with features in order
fig, axes = plt.subplots(num_of_rows, plots_per_row, figsize=(20, 4 * num_of_rows))
axes = axes.flatten()

# One bar chart per categorical feature
for panel, feature in zip(axes, categorical_features):
    value_counts = black_money[feature].value_counts()
    sns.barplot(x=value_counts.index, y=value_counts.values, ax=panel)
    panel.set_title(f'Value Counts of {feature}')
    panel.set_xlabel(feature)
    panel.set_ylabel('Count')

# Hide the panels left over when the feature count does not fill the grid
for spare_panel in axes[len(categorical_features):]:
    spare_panel.set_visible(False)

# For better visualization
plt.tight_layout()
plt.show()

None of the categorical features is imbalanced enough to require upsampling or downsampling, so we will keep the data as it is.

In [None]:
numerical_features

In [None]:
## Compute median amount of transaction for each risk score where source of money is illegal
illegal = (
    black_money[black_money['Source of Money'] == 'Illegal']
    .groupby('Money Laundering Risk Score')['Amount (USD)']
    .agg('median')
    .reset_index()
)

## Compute median amount of transaction for each risk score where source of money is legal
legal = (
    black_money[black_money['Source of Money'] == 'Legal']
    .groupby('Money Laundering Risk Score')['Amount (USD)']
    .agg('median')
    .reset_index()
)

## Align the two tables on the risk score itself.
## DataFrame.join(on=...) would match the caller's column against the *index*
## of `legal` (a plain 0..n-1 RangeIndex after reset_index), pairing each risk
## score with the wrong row; merge(on=...) matches column to column.
merged = illegal.merge(
    legal,
    how = 'inner',
    on = 'Money Laundering Risk Score',
    suffixes = ('_illegal', '_legal'),
)

## Reshape to long format (one row per risk score and source) for plotting
melted = merged.melt(id_vars='Money Laundering Risk Score',
                     value_vars=['Amount (USD)_illegal', 'Amount (USD)_legal'],
                     var_name='Source',
                     value_name='Median Amount (USD)')

## Show the table
melted

In [None]:
# Create the bar plot using the melted table
plt.figure(figsize=(10, 6))
sns.barplot(data=melted, x='Money Laundering Risk Score', y='Median Amount (USD)', hue='Source')
plt.title('Median Amount (USD) by Money Laundering Risk Score')
plt.xlabel('Money Laundering Risk Score')
plt.ylabel('Median Amount (USD)')
plt.legend(title='Source of Money', loc = 'lower right')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

* The trend shows that majority of transaction amount is lower in the legal money transaction than that of illegal money transaction

* There exists no particular trend in the case of money laundering risk score

Let's explore more about the categorical features and source of money

In [None]:
categorical_features

Let's start with Country variable

In [None]:
## Explore the grouped table using country variable
grped = black_money.groupby('Source of Money')['Country'].value_counts().unstack()
grped

In [None]:
## Count transactions per country, with one column per source of money
country_grouped = black_money.groupby('Country')['Source of Money'].value_counts().unstack()

## Add each source's share of the country's transactions for easier comparison
country_total = country_grouped['Illegal'] + country_grouped['Legal']
country_grouped['Illegal_ratio'] = country_grouped['Illegal'] / country_total
country_grouped['Legal_ratio'] = country_grouped['Legal'] / country_total

## Order the countries from the lowest to the highest illegal share
country_grouped = country_grouped.sort_values(by = 'Illegal_ratio')
country_grouped

In [None]:
## Barplot using counts of Source of Money 
fig = px.bar(country_grouped[['Illegal','Legal']])
fig.update_layout(xaxis_title='Country',
                  yaxis_title='Count',
                  title='Country Distribution by Source of Money', height=500, width=1000)


fig.show()

In [None]:
## Barplot using counts of Source of Money Ratio
fig = px.bar(country_grouped[['Illegal_ratio','Legal_ratio']])
fig.update_layout(xaxis_title='Country',
                  yaxis_title='Ratio',
                  title='Country Distribution by Source of Money Ratio', height=500, width=1000)


fig.show()

* It seems that Country is not a strong influencer in determining source of money as distribution seem similar across different countries

* Nevertheless, Brazil did contain the most proportion of Illegal money transaction, followed by South Africa

Now we will look at Transaction Type

In [None]:
black_money.groupby('Source of Money')['Transaction Type'].value_counts().unstack()

In [None]:
## Count transactions per transaction type, with one column per source of money
transaction_type_grouped = black_money.groupby('Transaction Type')['Source of Money'].value_counts().unstack()

## Add each source's share within the transaction type for easier comparison
transaction_type_total = transaction_type_grouped['Illegal'] + transaction_type_grouped['Legal']
transaction_type_grouped['Illegal_ratio'] = transaction_type_grouped['Illegal'] / transaction_type_total
transaction_type_grouped['Legal_ratio'] = transaction_type_grouped['Legal'] / transaction_type_total

## Order the transaction types from the lowest to the highest illegal share
transaction_type_grouped = transaction_type_grouped.sort_values(by = 'Illegal_ratio')
transaction_type_grouped

In [None]:
## Plot counts for source of money for each transaction type
fig = px.bar(transaction_type_grouped[['Illegal', 'Legal']])
fig.update_layout(xaxis_title = 'Transaction Type',
                 yaxis_title = 'Counts',
                 title = 'Transaction Type Distribution By Source of Money')
fig.show()

In [None]:
## Plot counts for source of money ratio for each transaction type
fig = px.bar(transaction_type_grouped[['Illegal_ratio', 'Legal_ratio']])
fig.update_layout(xaxis_title = 'Transaction Type',
                 yaxis_title = 'Ratio',
                 title = 'Transaction Type Distribution by Source of Money Ratio')
fig.show()

* Cash Withdraw had the highest proportion of illegal money transactions, followed by stocks transfer

* Even with the transaction type, there seems to be no particular trend across different transaction type, indicating that transaction type is not a noticeable variable that influences the source of money

In [None]:
black_money.groupby('Source of Money')['Industry'].value_counts().unstack()

In [None]:
## Count transactions per industry, with one column per source of money
industry_grouped = black_money.groupby('Industry')['Source of Money'].value_counts().unstack()

## Add each source's share within the industry for easier comparison
industry_total = industry_grouped['Illegal'] + industry_grouped['Legal']
industry_grouped['Illegal_ratio'] = industry_grouped['Illegal'] / industry_total
industry_grouped['Legal_ratio'] = industry_grouped['Legal'] / industry_total

## Order the industries from the lowest to the highest illegal share
industry_grouped = industry_grouped.sort_values(by = 'Illegal_ratio')
industry_grouped

In [None]:
## Plot the Illegal / Legal transaction counts for each industry
fig = px.bar(industry_grouped[['Illegal', 'Legal']])
fig.update_layout(
    title = 'Industry Distribution by Source of Money',
    xaxis_title = 'Industry',
    yaxis_title = 'Counts',
)
fig.show()

In [None]:
## Plot the Illegal / Legal transaction shares for each industry
fig = px.bar(industry_grouped[['Illegal_ratio', 'Legal_ratio']])
fig.update_layout(
    title = 'Industry Distribution by Source of Money Ratio',
    xaxis_title = 'Industry',
    yaxis_title = 'Ratio',
)
fig.show()

* In terms of different industries, the proportion of illegal money transaction had been the highest for Casinos while that of Construction being the lowest

In [None]:
black_money.groupby('Source of Money')['Destination Country'].value_counts().unstack()

In [None]:
## Count transactions per destination country, with one column per source of money
dest_country_grouped = black_money.groupby('Destination Country')['Source of Money'].value_counts().unstack()

## Add each source's share within the destination country for easier comparison
dest_country_total = dest_country_grouped['Illegal'] + dest_country_grouped['Legal']
dest_country_grouped['Illegal_ratio'] = dest_country_grouped['Illegal'] / dest_country_total
dest_country_grouped['Legal_ratio'] = dest_country_grouped['Legal'] / dest_country_total

## Order the destination countries from the lowest to the highest illegal share
dest_country_grouped = dest_country_grouped.sort_values(by = 'Illegal_ratio')
dest_country_grouped

In [None]:
## Plot the Illegal / Legal transaction counts for each destination country
fig = px.bar(dest_country_grouped[['Illegal', 'Legal']])
fig.update_layout(
    title = 'Destination Country Distribution by Source of Money',
    xaxis_title = 'Destination Country',
    yaxis_title = 'Counts',
)
fig.show()

In [None]:
## Plot the Illegal / Legal transaction shares for each destination country
fig = px.bar(dest_country_grouped[['Illegal_ratio', 'Legal_ratio']])
fig.update_layout(
    title = 'Destination Country Distribution by Source of Money Ratio',
    xaxis_title = 'Destination Country',
    yaxis_title = 'Ratio',
)
fig.show()

* USA had the highest proportion of illegal money transaction among the other destination countries, followed by South Africa

* UK contained the smallest proportion of illegal money transaction 

In [None]:
black_money.groupby('Source of Money')['Shell Companies Involved'].value_counts().unstack()

In [None]:
## Count transactions per number of shell companies, with one column per source of money
sh_comp_grouped = black_money.groupby('Shell Companies Involved')['Source of Money'].value_counts().unstack()

## Add each source's share within the group for easier comparison
sh_comp_total = sh_comp_grouped['Illegal'] + sh_comp_grouped['Legal']
sh_comp_grouped['Illegal_ratio'] = sh_comp_grouped['Illegal'] / sh_comp_total
sh_comp_grouped['Legal_ratio'] = sh_comp_grouped['Legal'] / sh_comp_total

## Order the groups from the lowest to the highest illegal share
sh_comp_grouped = sh_comp_grouped.sort_values(by = 'Illegal_ratio')
sh_comp_grouped

In [None]:
## Plot the Illegal / Legal transaction counts for each number of shell companies
fig = px.bar(sh_comp_grouped[['Illegal', 'Legal']])
fig.update_layout(xaxis_title = 'Number of Shell Companies Involved',
                 yaxis_title = 'Counts',
                 title = 'Number of Shell Companies Involved Distribution by Source of Money')
fig.show()

In [None]:
## Plot the Illegal / Legal transaction shares for each number of shell companies
# The title now says "Ratio" so this figure is distinguishable from the counts plot.
fig = px.bar(sh_comp_grouped[['Illegal_ratio', 'Legal_ratio']])
fig.update_layout(xaxis_title = 'Number of Shell Companies Involved',
                 yaxis_title = 'Ratio',
                 title = 'Number of Shell Companies Involved Distribution by Source of Money Ratio')
fig.show()

* In terms of number of Shell Companies Involved, it did not show any particular trend towards the source of money

In [None]:
black_money.groupby('Source of Money')['Tax Haven Country'].value_counts().unstack()

In [None]:
## Count transactions per tax haven country, with one column per source of money
tax_haven_country_grouped = black_money.groupby('Tax Haven Country')['Source of Money'].value_counts().unstack()

## Add each source's share within the tax haven country for easier comparison
tax_haven_country_total = tax_haven_country_grouped['Illegal'] + tax_haven_country_grouped['Legal']
tax_haven_country_grouped['Illegal_ratio'] = tax_haven_country_grouped['Illegal'] / tax_haven_country_total
tax_haven_country_grouped['Legal_ratio'] = tax_haven_country_grouped['Legal'] / tax_haven_country_total

## Order the tax haven countries from the lowest to the highest illegal share
tax_haven_country_grouped = tax_haven_country_grouped.sort_values(by = 'Illegal_ratio')
tax_haven_country_grouped

In [None]:
## Plot the Illegal / Legal transaction counts for each tax haven country
fig = px.bar(tax_haven_country_grouped[['Illegal', 'Legal']])
fig.update_layout(xaxis_title = 'Tax Haven Country',
                 yaxis_title = 'Counts',
                 title = 'Tax Haven Country Involved Distribution by Source of Money')
fig.show()

In [None]:
## Plot the Illegal / Legal transaction shares for each tax haven country
fig = px.bar(tax_haven_country_grouped[['Illegal_ratio', 'Legal_ratio']])
fig.update_layout(xaxis_title = 'Tax Haven Country',
                 yaxis_title = 'Ratio',
                 title = 'Tax Haven Country Involved Distribution by Source of Money Ratio')
fig.show()

Tax Haven Country did not show any particular trend with source of money either.

In [None]:
def plot_amount_distribution(data, facet_column, facet_label=None):
    """Show a faceted histogram of transaction amounts split by source of money.

    Parameters
    ----------
    data : DataFrame containing 'Amount (USD)', 'Source of Money' and `facet_column`.
    facet_column : column whose distinct values each get their own facet.
    facet_label : wording used for the facet in the figure title;
        defaults to `facet_column` itself.

    The figure is displayed with fig.show(); nothing is returned.
    """
    label = facet_label or facet_column
    fig = px.histogram(
        data,
        x='Amount (USD)',
        facet_col=facet_column,
        color='Source of Money',
        pattern_shape='Source of Money',
        title=f'Amount of Money Transaction Distribution by {label} and Source of Money',
        nbins=20
    )

    # Font sizes for the overall title and the axis titles
    fig.update_layout(
        title_font_size=20,
        xaxis_title_font_size=14,
        yaxis_title_font_size=14,
    )

    # Tie each trace's legend group to its own name
    for trace in fig.select_traces():
        trace.update(name=trace.name, legendgroup=trace.name)

    # Rotate x-axis labels so the amounts stay readable in narrow facets
    fig.update_xaxes(tickangle=45)

    fig.show()


# One figure per categorical feature, in the order they were originally explored.
# The second element overrides the wording used in the title where it differs
# from the column name.
amount_facets = [
    ('Destination Country', None),
    ('Industry', None),
    ('Transaction Type', None),
    ('Shell Companies Involved', 'Number of Shell Companies Involved'),
    ('Country', None),
    ('Tax Haven Country', None),
]

for facet_column, facet_label in amount_facets:
    plot_amount_distribution(black_money, facet_column, facet_label)

* In terms of relationship of categorical features with the numerical feature of amount of transaction, almost every plot has shown normal distribution indicating lesser number of cases of illegal money transaction.

* Transaction amount that are less than 250K or greater than 4.75M should be explored on such trend for all categorical features. 

In [None]:
black_money[(black_money['Amount (USD)'] < 250000) | (black_money['Amount (USD)'] > 4750000)]

In [None]:
# One panel per numerical feature, comparing its distribution across sources of money.
# The panel count follows the feature list instead of being hard-coded to 2, and the
# flattened grid is actually assigned (the bare axes.flatten() call discarded its result).
fig, axes = plt.subplots(1, len(numerical_features), figsize = (18, 8), squeeze = False)
axes = axes.flatten()

for panel, feature in zip(axes, numerical_features):
    sns.boxplot(data = black_money, x = 'Source of Money', y = feature, ax = panel)
    panel.set_xlabel('Source of Money')
    panel.set_ylabel(feature)
    panel.set_title(f'Boxplot on Source of Money and {feature}')

plt.tight_layout()
plt.show()

For those illegal money transactions, the amount of money transaction is slightly lower than that of legal money transactions and risk score tends to be the same for both.

## Data Preprocessing / Feature Engineering

Considering the dataset with a lot of categorical features, it would be better to encode categorical features and perform feature engineering for better performance of the model

In [None]:
black_money.head()

In [None]:
black_money_enc = black_money.copy()

In [None]:
## Convert the financial instituition information into numeric values
black_money_enc['Financial Institution'] = black_money_enc['Financial Institution'].str.replace('Bank_', '', regex = False).astype('int')

In [None]:
# Break the transaction timestamp into calendar components, then drop the raw column
transaction_time = black_money_enc['Date of Transaction']
date_parts = {
    'Year': 'year',
    'Month': 'month',
    'Day': 'day',
    'DayOfWeek': 'dayofweek',
    'Hour': 'hour',
}
for new_column, attribute in date_parts.items():
    black_money_enc[new_column] = getattr(transaction_time.dt, attribute)

black_money_enc = black_money_enc.drop(columns = ['Date of Transaction'])

In [None]:
# Bucket the transaction amount into five labelled ranges
# NOTE(review): the EDA narration mentions amounts up to roughly 4.75M, so most
# rows may fall into 'Extreme' (> 500K) -- confirm these bin edges suit the data.
amount_bin_edges = [0, 10000, 50000, 100000, 500000, float('inf')]
amount_bin_labels = ['Low', 'Medium', 'High', 'Very High', 'Extreme']
black_money_enc['Amount Category'] = pd.cut(
    black_money_enc['Amount (USD)'],
    bins=amount_bin_edges,
    labels=amount_bin_labels,
)

# Interaction term: transaction amount multiplied by the risk score
black_money_enc['Amount_Risk_Interaction'] = (
    black_money_enc['Amount (USD)'] * black_money_enc['Money Laundering Risk Score']
)

# The two raw columns are removed once the derived features exist
black_money_enc = black_money_enc.drop(columns = ['Amount (USD)', 'Money Laundering Risk Score'])



In [None]:
black_money_enc.info()

In [None]:
encoding_features = black_money_enc.select_dtypes(include = ['object', 'bool', 'category'])

In [None]:
encoding_features

In [None]:
# Encode every non-numeric column as integers.
#  * The target gets an explicit mapping so that 'Illegal' is the positive class (1).
#    An alphabetical LabelEncoder would make 'Legal' the positive class, so the
#    default precision / recall / F1 would describe legal transactions instead.
#  * Categorical columns (the binned amount) use their category codes, which keep
#    the Low < Medium < High < Very High < Extreme order; LabelEncoder would sort
#    the labels alphabetically.
#  * Every other column gets its own LabelEncoder, kept in `label_encoders` so the
#    integer codes can be mapped back later with inverse_transform.
target_mapping = {'Legal': 0, 'Illegal': 1}
label_encoders = {}

for col in encoding_features.columns:
    column = black_money_enc[col]

    if pd.api.types.is_integer_dtype(column):
        # Already encoded by a previous run of this cell
        continue

    if col == 'Source of Money':
        encoded_target = column.map(target_mapping)
        assert encoded_target.notna().all(), 'Unexpected value in Source of Money'
        black_money_enc[col] = encoded_target.astype(int)
    elif isinstance(column.dtype, pd.CategoricalDtype):
        black_money_enc[col] = column.cat.codes
    else:
        label_encoders[col] = LabelEncoder()
        black_money_enc[col] = label_encoders[col].fit_transform(column)

In [None]:
black_money_enc.head()

I have created a new interaction term for model to better grasp the nonlinear relationship

Let's look at the correlation matrix of the encoded features

In [None]:
correlation_matrix = black_money_enc.corr()
plt.figure(figsize = (20, 10))
sns.heatmap(correlation_matrix, annot = True, cmap = 'coolwarm', fmt = '.2f')
plt.title('Correlation Matrix of features')
plt.show()

Considering the large number of variables, it would be better to limit only some variables to be visible

In [None]:
## Establish heatmap size
plt.figure(figsize = (20, 10))

## Create correlation matrix with absolute values (strength only, sign ignored)
corr_matrix = black_money_enc.corr().abs()

## Only keep pairs whose absolute correlation is at least 0.25
threshold = 0.25

## Hide weak correlations and the diagonal (each feature against itself).
## The diagonal is masked by position rather than by comparing floats with 1.0.
on_diagonal = np.eye(len(corr_matrix), dtype = bool)
filtered_corr_df = corr_matrix[(corr_matrix >= threshold) & ~on_diagonal]

## Create Heatmap; absolute correlations lie in [0, 1], so the colour scale starts at 0
heatmap = sns.heatmap(filtered_corr_df, vmin = 0, vmax = 1, annot = True, xticklabels = True, yticklabels = True, cmap = sns.color_palette('flare', as_cmap = True), linewidths = 0.1)
heatmap.set_title('Correlation on Heatmap', fontdict = {'fontsize' : 14})
plt.show()

In [None]:
black_money_enc.isna().any(axis = 1).sum()

The check above counts the rows that contain missing values after feature engineering; it should be zero.

Now before I prepare for data splitting, checking target label proportion is important

In [None]:
black_money_enc['Source of Money'].value_counts(normalize = True)

The data is imbalanced, so it is better to upsample the minority class, considering the volume of our data entries.

In [None]:
copied_data = black_money_enc.copy()

## Split X and y
X = copied_data.drop('Source of Money', axis = 1)
y = copied_data['Source of Money']

## Hold out 25% of the rows as the test set (stratified to keep the class ratio)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 11, stratify = y)

## Upsample the minority class on the training portion only
smote = SMOTE(random_state = 11)
X_train_upsampled, y_train_upsampled = smote.fit_resample(X_train, y_train)

## Normalize features: fit on the training data, apply the same transform to the test set
scaler = MinMaxScaler()
X_train_upsampled = scaler.fit_transform(X_train_upsampled)
X_test = scaler.transform(X_test)

# Stratified 5-fold splitter, kept available for cross-validation.
# The folds are deliberately NOT assigned to X_train / X_test / y_train / y_test:
# doing so replaced the scaled hold-out set with an unscaled fold drawn from the
# full data, most of which the models had already been trained on.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
X_train_upsampled.shape, y_train_upsampled.shape

## Model Construction

In [None]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit `model`, print its test-set metrics and draw its ROC curve.

    Parameters
    ----------
    model : estimator exposing fit, predict and predict_proba.
    X_train, y_train : data the model is fitted on.
    X_test, y_test : held-out data used for every reported metric.

    Recall, precision and F1 use scikit-learn's default positive label (1).
    Nothing is returned; the results are printed and plotted.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Score of the positive class: second column of predict_proba
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # AUC is computed from the scores. Hard 0/1 predictions would collapse the
    # ROC curve to a single operating point and disagree with the plotted curve.
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    print('classification_report : \n', classification_report(y_test, y_pred))
    print(f'Accuracy : {accuracy_score(y_test, y_pred)}')
    print(f'Recall : {recall_score(y_test, y_pred)}')
    print(f'Precision : {precision_score(y_test, y_pred)}')
    print(f'F1 score : {f1_score(y_test, y_pred)}')
    print(f'AUC : {roc_auc}')
    print('')

    # ROC Curve
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)

    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    # Diagonal reference line: performance of a random classifier
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()

## Logistic Regression

In [None]:
print('Logistic Regression : \n')
log_clf = LogisticRegression(random_state = 42)
evaluate_model(log_clf, X_train_upsampled, y_train_upsampled, X_test, y_test)

## Support Vector Machine

In [None]:
# Support Vector Machine (SVM) Model
print("SVM Model : \n")
svm_model = SVC(probability=True, random_state=42)  # Enable probability estimates for ROC curve
evaluate_model(svm_model, X_train_upsampled, y_train_upsampled, X_test, y_test)

## Decision Tree

In [None]:
print('Decision Tree : \n')
dt_clf = DecisionTreeClassifier(random_state = 21)
evaluate_model(dt_clf, X_train_upsampled, y_train_upsampled, X_test, y_test)

## Random Forest 

In [None]:
print('Random Forest : \n')
rf_clf = RandomForestClassifier(random_state = 11)
evaluate_model(rf_clf, X_train_upsampled, y_train_upsampled, X_test, y_test)

## XGBoost

In [None]:
print('XGBoost : \n')
xgb_clf = XGBClassifier(random_state = 11)
evaluate_model(xgb_clf, X_train_upsampled, y_train_upsampled, X_test, y_test)