In [14]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, count, to_timestamp, monotonically_increasing_id, desc, when, sum as _sum, monotonically_increasing_id
from pyspark.sql.functions import dayofmonth, weekofyear, month, year
from pyspark.sql.window import Window
from pyspark.sql.types import *
from pyspark.ml.feature import StandardScaler, VectorAssembler

import os 

import numpy as np

import matplotlib.pyplot as plt
from math import isnan

import multiprocessing

In [15]:
# Size the local Spark session from the machine's CPU count.
cores = multiprocessing.cpu_count()
instances = cores  # one executor per core, so each executor gets cores // instances == 1 core

spark = (
    SparkSession.builder
    .appName("MoneyLaundering")
    .config("spark.driver.memory", "3g")
    .config("spark.executor.memory", "4g")
    .config("spark.executor.instances", cores)
    .config("spark.executor.cores", cores // instances)
    # One shuffle partition per core.
    .config("spark.sql.shuffle.partitions", cores)
    # Both Arrow keys are kept exactly as before.
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.sql.execution.arrow.enabled", "true")
    .getOrCreate()
)

# Silence Spark logging for the rest of the notebook.
spark.sparkContext.setLogLevel("OFF")

# Full transaction table with a synthetic row id.
# `header` is a CSV reader option and has no meaning for Parquet (the schema is
# stored in the file), so it is no longer passed to the reader.
dataframe = (
    spark.read.parquet("src/datasets/my_HI-Small_Trans.parquet")
    .withColumn('id', monotonically_increasing_id())
)

# Labelled laundering patterns (pandas), used to look up examples by pattern type.
ds = pd.read_csv('src/pattern/HI-Small_Patterns.csv')

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/07/28 10:35:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/07/28 10:35:13 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
                                                                                

In [194]:
# Transaction-level columns kept from the pattern file.
transaction_columns = [
    'timestamp', 'from_bank', 'from_account', 'to_bank', 'to_account',
    'amount_received', 'receiving_currency', 'amount_paid', 'payment_currency', 'payment_format',
]
reduced_ds = ds[transaction_columns]

# Distinct pattern-file transactions vs. rows flagged as laundering in the full table.
pattern_len = len(reduced_ds.drop_duplicates())
df_len = dataframe.filter('is_laundering==1').count()

print("Laundering in pattern file: {}\nLaundering in the dataframe: {}".format(pattern_len, df_len))
print("Missing launderings:", df_len - pattern_len)

Laundering in pattern file: 3209
Laundering in the dataframe: 5177
Missing launderings: 1968


This means that some laundering transactions in the dataset are not covered by the pattern file.

Types of Laundering transactions

<img src="src/images/patterns.png" style="width: 600px">


In [None]:
# Distinct pattern labels present in the pattern file.
set(ds["type"])

In [45]:
def filter_dataframe(receiving_currency, payment_currency, payment_format="ACH", from_bank=None, from_account=None, to_bank=None, to_account=None):
    """Show the rows of the global ``dataframe`` that match the given attributes.

    The currency and payment-format conditions are always applied. Each of the
    four endpoint arguments (``from_bank``, ``from_account``, ``to_bank``,
    ``to_account``) adds a condition only when it is not ``None``, so the
    sender side, the receiver side, both, or any partial combination can be
    filtered. Matching rows are sorted by ``timestamp`` and printed with
    ``.show()``.

    Parameters
    ----------
    receiving_currency, payment_currency : str
        Values compared against the columns of the same name.
    payment_format : str, default "ACH"
        Value compared against the ``payment_format`` column.
    from_bank, to_bank : optional
        Inserted unquoted into the filter expression (the notebook passes ints).
    from_account, to_account : str, optional
        Inserted as double-quoted string literals.

    Returns
    -------
    None
        The result is only printed.

    Notes
    -----
    Values are interpolated verbatim into a SQL filter string; this helper is
    meant for interactive exploration with trusted literals, not untrusted input.
    """
    # (column, value, template) for the optional endpoint filters, in display order.
    endpoint_filters = [
        ('from_bank', from_bank, '{} == {}'),
        ('from_account', from_account, '{} == "{}"'),
        ('to_bank', to_bank, '{} == {}'),
        ('to_account', to_account, '{} == "{}"'),
    ]
    # Skip endpoints that were not supplied instead of formatting "None" into the SQL.
    conditions = [
        template.format(column, value)
        for column, value, template in endpoint_filters
        if value is not None
    ]
    conditions.append('receiving_currency == "{}"'.format(receiving_currency))
    conditions.append('payment_currency == "{}"'.format(payment_currency))
    conditions.append('payment_format == "{}"'.format(payment_format))

    dataframe.filter(' and '.join(conditions)).sort('timestamp').show()

# FAN-OUT (fig. a)

### More than 1-degree

##### Example 1

In [51]:
# Pattern-file rows labelled FAN-OUT, then the pattern instance with id 10.
fan_out = ds.loc[ds["type"] == 'FAN-OUT']
fan_out0 = fan_out.loc[fan_out["id"] == 10]
fan_out0

Unnamed: 0,id,type,timestamp,from_bank,from_account,to_bank,to_account,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering,category
87,10,FAN-OUT,2022/09/02 06:37,150240,812D22980,48309,811C599A0,60056.52,Saudi Riyal,60056.52,Saudi Riyal,ACH,1,Max 2-degree Fan-Out
88,10,FAN-OUT,2022/09/02 09:09,150240,812D22980,222,811B83280,59890.28,Saudi Riyal,59890.28,Saudi Riyal,ACH,1,Max 2-degree Fan-Out


All transactions are:
- sent from account 812D22980 at bank 150240
- receiving_currency = payment_currency = Saudi Riyal
- payment_format = ACH
Let's try to see how many launderings there are in the dataset that reflect these characteristics

In [47]:
# Every transaction sent by the fan-out source account in Saudi Riyal (ACH by default).
filter_dataframe(
    receiving_currency="Saudi Riyal",
    payment_currency="Saudi Riyal",
    from_bank=150240,
    from_account="812D22980",
)

[Stage 19:>                                                         (0 + 8) / 8]

+-------------------+---------+------------+-------+----------+---------------+------------------+-----------+----------------+--------------+-------------+-----------+
|          timestamp|from_bank|from_account|to_bank|to_account|amount_received|receiving_currency|amount_paid|payment_currency|payment_format|is_laundering|         id|
+-------------------+---------+------------+-------+----------+---------------+------------------+-----------+----------------+--------------+-------------+-----------+
|2022-09-01 06:47:00|   150240|   812D22980|  48309| 811C599A0|       27247.23|       Saudi Riyal|   27247.23|     Saudi Riyal|           ACH|            1|51539675261|
|2022-09-01 19:27:00|   150240|   812D22980|  48309| 811C599A0|        5971.98|       Saudi Riyal|    5971.98|     Saudi Riyal|           ACH|            1|      40064|
|2022-09-02 06:37:00|   150240|   812D22980|  48309| 811C599A0|       60056.52|       Saudi Riyal|   60056.52|     Saudi Riyal|           ACH|            1

                                                                                

All transactions with those characteristics are laundering

##### Example 2

In [74]:
# FAN-OUT pattern instance with id 0, followed by the sender's US Dollar transactions.
# NOTE(review): the saved Spark output under this cell lists from_bank 225734 /
# account 8095AF170, not the account filtered here; it looks stale, re-run to refresh.
fan_out = ds.loc[ds["type"] == 'FAN-OUT']
fan_out1 = fan_out.loc[fan_out["id"] == 0]
display(fan_out1)
filter_dataframe(
    receiving_currency="US Dollar",
    payment_currency="US Dollar",
    from_bank=21174,
    from_account="800737690",
)

Unnamed: 0,id,type,timestamp,from_bank,from_account,to_bank,to_account,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering,category
0,0,FAN-OUT,2022/09/01 00:06,21174,800737690,12,80011F990,2848.96,Euro,2848.96,Euro,ACH,1,Max 16-degree Fan-Out
1,0,FAN-OUT,2022/09/01 04:33,21174,800737690,20,80020C5B0,8630.4,Euro,8630.4,Euro,ACH,1,Max 16-degree Fan-Out
2,0,FAN-OUT,2022/09/01 09:14,21174,800737690,20,80006A5E0,35642.49,Yuan,35642.49,Yuan,ACH,1,Max 16-degree Fan-Out
3,0,FAN-OUT,2022/09/01 09:56,21174,800737690,220,8007A5B70,5738987.96,US Dollar,5738987.96,US Dollar,ACH,1,Max 16-degree Fan-Out
4,0,FAN-OUT,2022/09/01 11:28,21174,800737690,1244,80093C0D0,7254.53,US Dollar,7254.53,US Dollar,ACH,1,Max 16-degree Fan-Out
5,0,FAN-OUT,2022/09/01 13:13,21174,800737690,513,80078E200,6990.87,US Dollar,6990.87,US Dollar,ACH,1,Max 16-degree Fan-Out
6,0,FAN-OUT,2022/09/01 14:11,21174,800737690,20,80066B990,12536.92,Euro,12536.92,Euro,ACH,1,Max 16-degree Fan-Out
7,0,FAN-OUT,2022/09/02 15:40,21174,800737690,410,8002CC310,3511.82,Euro,3511.82,Euro,ACH,1,Max 16-degree Fan-Out
8,0,FAN-OUT,2022/09/02 21:23,21174,800737690,1292,8004030A0,16135.09,US Dollar,16135.09,US Dollar,ACH,1,Max 16-degree Fan-Out
9,0,FAN-OUT,2022/09/02 23:10,21174,800737690,1601,800578800,12183.28,US Dollar,12183.28,US Dollar,ACH,1,Max 16-degree Fan-Out


                                                                                

+-------------------+---------+------------+-------+----------+---------------+------------------+-----------+----------------+--------------+-------------+-----------+
|          timestamp|from_bank|from_account|to_bank|to_account|amount_received|receiving_currency|amount_paid|payment_currency|payment_format|is_laundering|         id|
+-------------------+---------+------------+-------+----------+---------------+------------------+-----------+----------------+--------------+-------------+-----------+
|2022-09-01 09:46:00|   225734|   8095AF170|   1362| 802783AA0|       16737.85|         US Dollar|   16737.85|       US Dollar|           ACH|            1|34359818776|
|2022-09-01 11:11:00|   225734|   8095AF170|  28694| 804ED8800|       17321.44|         US Dollar|   17321.44|       US Dollar|           ACH|            1|25769884739|
|2022-09-01 17:54:00|   225734|   8095AF170|  11904| 800E9BC40|        4382.74|         US Dollar|    4382.74|       US Dollar|           ACH|            1

### 1-degree

In [84]:
# FAN-OUT rows whose category marks them as 1-degree.
fan_out = ds.loc[ds["type"] == 'FAN-OUT']
fan_out1degree = fan_out.loc[fan_out["category"] == "Max 1-degree Fan-Out"]
fan_out1degree

Unnamed: 0,id,type,timestamp,from_bank,from_account,to_bank,to_account,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering,category
73,7,FAN-OUT,2022/09/02 03:04,15231,80266F880,23691,8021353D0,15536.84,Euro,15536.84,Euro,ACH,1,Max 1-degree Fan-Out
113,15,FAN-OUT,2022/09/03 22:15,15231,80266F880,23691,8021353D0,12592.29,Euro,12592.29,Euro,ACH,1,Max 1-degree Fan-Out
132,19,FAN-OUT,2022/09/04 15:30,15231,80266F880,23691,8021353D0,8092.93,Euro,8092.93,Euro,ACH,1,Max 1-degree Fan-Out
154,24,FAN-OUT,2022/09/06 10:04,49365,812A09D40,119,812A09CF0,32736.73,Saudi Riyal,32736.73,Saudi Riyal,ACH,1,Max 1-degree Fan-Out
204,29,FAN-OUT,2022/09/07 00:21,119,812A09CF0,49365,812A09D40,12260.49,Saudi Riyal,12260.49,Saudi Riyal,ACH,1,Max 1-degree Fan-Out
205,30,FAN-OUT,2022/09/07 00:53,23691,8021353D0,15231,80266F880,6149.56,Euro,6149.56,Euro,ACH,1,Max 1-degree Fan-Out
280,40,FAN-OUT,2022/09/09 15:30,249176,812A70ED0,49508,812A70E80,17579.07,Saudi Riyal,17579.07,Saudi Riyal,ACH,1,Max 1-degree Fan-Out
295,43,FAN-OUT,2022/09/09 18:42,50202,812D129C0,222,812D127D0,70077.51,Saudi Riyal,70077.51,Saudi Riyal,ACH,1,Max 1-degree Fan-Out


Taking into consideration for example the transaction with id 24 with Max 1-degree, looking in the complete dataset we find that all the transactions having the same characteristics are laundering.

In [None]:
# Pattern instance 24 and every dataset transaction between the same two accounts.
display(fan_out1degree.loc[fan_out1degree["id"] == 24])
filter_dataframe(
    receiving_currency="Saudi Riyal",
    payment_currency="Saudi Riyal",
    from_bank=49365,
    from_account="812A09D40",
    to_bank=119,
    to_account="812A09CF0",
)

In [None]:
# Pattern instance 29: the same account pair as instance 24, in the opposite direction.
display(fan_out1degree.loc[fan_out1degree["id"] == 29])
filter_dataframe(
    receiving_currency="Saudi Riyal",
    payment_currency="Saudi Riyal",
    from_bank=119,
    from_account="812A09CF0",
    to_bank=49365,
    to_account="812A09D40",
)

### Conclusions

The following features might be useful for fan-out:
- for each from_account, the number of outgoing nodes from the same bank, with the same receiving_currency and payment_currency
- some fan-outs are 1-degree: there are other transactions that are the same (different amounts and timestamps)
- check if there is an identical transaction with different amounts at the same time

# FAN-IN (fig. b)

It is the opposite of FAN-OUT

### More than 1-degree

#### Example 1

In [167]:
# Pattern-file rows labelled FAN-IN, then the pattern instance with id 1.
fan_in = ds.loc[ds["type"] == 'FAN-IN']
fan_in0 = fan_in.loc[fan_in["id"] == 1]
fan_in0

Unnamed: 0,id,type,timestamp,from_bank,from_account,to_bank,to_account,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering,category
2267,1,FAN-IN,2022/09/01 06:23,149345,812D1AC00,48308,811ED7DF0,45346.12,Saudi Riyal,45346.12,Saudi Riyal,ACH,1,Max 16-degree Fan-In
2268,1,FAN-IN,2022/09/01 15:27,223,8119F8CC0,48308,811ED7DF0,30299.15,Saudi Riyal,30299.15,Saudi Riyal,ACH,1,Max 16-degree Fan-In
2269,1,FAN-IN,2022/09/02 01:39,48309,81235EAA0,48308,811ED7DF0,8117.8,Saudi Riyal,8117.8,Saudi Riyal,ACH,1,Max 16-degree Fan-In
2270,1,FAN-IN,2022/09/02 06:36,48910,8120D49E0,48308,811ED7DF0,72635.55,Saudi Riyal,72635.55,Saudi Riyal,ACH,1,Max 16-degree Fan-In
2271,1,FAN-IN,2022/09/02 09:57,249176,81235EC40,48308,811ED7DF0,14363.22,Saudi Riyal,14363.22,Saudi Riyal,ACH,1,Max 16-degree Fan-In
2272,1,FAN-IN,2022/09/02 13:13,148348,81235EE30,48308,811ED7DF0,11495.93,Saudi Riyal,11495.93,Saudi Riyal,ACH,1,Max 16-degree Fan-In
2273,1,FAN-IN,2022/09/02 18:31,118,811A67510,48308,811ED7DF0,47915.92,Saudi Riyal,47915.92,Saudi Riyal,ACH,1,Max 16-degree Fan-In
2274,1,FAN-IN,2022/09/03 07:57,148016,811FCA7B0,48308,811ED7DF0,18841.21,Saudi Riyal,18841.21,Saudi Riyal,ACH,1,Max 16-degree Fan-In
2275,1,FAN-IN,2022/09/03 09:14,119,8000DA7D0,48308,811ED7DF0,65196.57,Saudi Riyal,65196.57,Saudi Riyal,ACH,1,Max 16-degree Fan-In
2276,1,FAN-IN,2022/09/03 16:24,223,811EDA940,48308,811ED7DF0,71981.58,Saudi Riyal,71981.58,Saudi Riyal,ACH,1,Max 16-degree Fan-In


All transactions are:
- received by account 811ED7DF0 at bank 48308
- receiving_currency = payment_currency = Saudi Riyal
- payment_format = ACH
Let's try to see how many launderings there are in the dataset that reflect these characteristics

In [56]:
# Every transaction received by the fan-in target account in Saudi Riyal (ACH by default).
filter_dataframe(
    receiving_currency="Saudi Riyal",
    payment_currency="Saudi Riyal",
    to_bank=48308,
    to_account="811ED7DF0",
)

+-------------------+---------+------------+-------+----------+---------------+------------------+-----------+----------------+--------------+-------------+-----------+
|          timestamp|from_bank|from_account|to_bank|to_account|amount_received|receiving_currency|amount_paid|payment_currency|payment_format|is_laundering|         id|
+-------------------+---------+------------+-------+----------+---------------+------------------+-----------+----------------+--------------+-------------+-----------+
|2022-09-01 06:23:00|   149345|   812D1AC00|  48308| 811ED7DF0|       45346.12|       Saudi Riyal|   45346.12|     Saudi Riyal|           ACH|            1|17179935492|
|2022-09-01 15:27:00|      223|   8119F8CC0|  48308| 811ED7DF0|       30299.15|       Saudi Riyal|   30299.15|     Saudi Riyal|           ACH|            1| 8590000904|
|2022-09-02 01:39:00|    48309|   81235EAA0|  48308| 811ED7DF0|         8117.8|       Saudi Riyal|     8117.8|     Saudi Riyal|           ACH|            1

                                                                                

Here we see that there is only one transaction that is not laundering. Note that this transaction happens at the same time as the transaction above it, which is laundering.

#### Example 2

In [75]:
# FAN-IN pattern instance with id 2, followed by the receiver's US Dollar transactions.
fan_in = ds.loc[ds["type"] == 'FAN-IN']
fan_in1 = fan_in.loc[fan_in["id"] == 2]
display(fan_in1)
filter_dataframe(
    receiving_currency="US Dollar",
    payment_currency="US Dollar",
    to_bank=1362,
    to_account="8001ECD70",
)

Unnamed: 0,id,type,timestamp,from_bank,from_account,to_bank,to_account,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering,category
2283,2,FAN-IN,2022/09/01 08:21,11,8001027C0,1362,8001ECD70,8340.74,US Dollar,8340.74,US Dollar,ACH,1,Max 10-degree Fan-In
2284,2,FAN-IN,2022/09/01 11:34,20,800246FC0,1362,8001ECD70,15524.16,US Dollar,15524.16,US Dollar,ACH,1,Max 10-degree Fan-In
2285,2,FAN-IN,2022/09/01 12:19,20,800060420,1362,8001ECD70,16367.46,US Dollar,16367.46,US Dollar,ACH,1,Max 10-degree Fan-In
2286,2,FAN-IN,2022/09/02 04:06,20,800283B60,1362,8001ECD70,1869.72,US Dollar,1869.72,US Dollar,ACH,1,Max 10-degree Fan-In
2287,2,FAN-IN,2022/09/02 12:55,12,800137450,1362,8001ECD70,5073.01,US Dollar,5073.01,US Dollar,ACH,1,Max 10-degree Fan-In
2288,2,FAN-IN,2022/09/02 14:13,12,800117590,1362,8001ECD70,16050.64,US Dollar,16050.64,US Dollar,ACH,1,Max 10-degree Fan-In
2289,2,FAN-IN,2022/09/03 14:57,1490,800A180A0,1362,8001ECD70,161.35,US Dollar,161.35,US Dollar,ACH,1,Max 10-degree Fan-In
2290,2,FAN-IN,2022/09/04 09:59,1,800056160,1362,8001ECD70,4383.81,US Dollar,4383.81,US Dollar,ACH,1,Max 10-degree Fan-In
2291,2,FAN-IN,2022/09/04 14:46,23,8001D7610,1362,8001ECD70,3757.42,US Dollar,3757.42,US Dollar,ACH,1,Max 10-degree Fan-In
2292,2,FAN-IN,2022/09/04 17:08,12,800205030,1362,8001ECD70,11896790.65,US Dollar,11896790.65,US Dollar,ACH,1,Max 10-degree Fan-In


+-------------------+---------+------------+-------+----------+---------------+------------------+-------------+----------------+--------------+-------------+-----------+
|          timestamp|from_bank|from_account|to_bank|to_account|amount_received|receiving_currency|  amount_paid|payment_currency|payment_format|is_laundering|         id|
+-------------------+---------+------------+-------+----------+---------------+------------------+-------------+----------------+--------------+-------------+-----------+
|2022-09-01 02:29:00|       10|   8001882D0|   1362| 8001ECD70|          44.43|         US Dollar|        44.43|       US Dollar|           ACH|            0|51539654555|
|2022-09-01 08:21:00|       11|   8001027C0|   1362| 8001ECD70|        8340.74|         US Dollar|      8340.74|       US Dollar|           ACH|            1|42949746096|
|2022-09-01 11:34:00|       20|   800246FC0|   1362| 8001ECD70|       15524.16|         US Dollar|     15524.16|       US Dollar|           ACH| 

                                                                                

Here we can see that all transactions between 8001882D0 and 8001ECD70 are not laundering

### 1-degree

In [81]:
# FAN-IN rows whose category marks them as 1-degree.
fan_in = ds.loc[ds["type"] == 'FAN-IN']
fan_in1degree = fan_in.loc[fan_in["category"] == "Max 1-degree Fan-In"]
fan_in1degree

Unnamed: 0,id,type,timestamp,from_bank,from_account,to_bank,to_account,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering,category
2295,4,FAN-IN,2022/09/01 15:06,119,812A09CF0,49365,812A09D40,36537.53,Saudi Riyal,36537.53,Saudi Riyal,ACH,1,Max 1-degree Fan-In
2416,18,FAN-IN,2022/09/05 07:32,49365,812A09D40,119,812A09CF0,70400.86,Saudi Riyal,70400.86,Saudi Riyal,ACH,1,Max 1-degree Fan-In
2537,33,FAN-IN,2022/09/09 14:15,249176,812A70ED0,49508,812A70E80,71496.43,Saudi Riyal,71496.43,Saudi Riyal,ACH,1,Max 1-degree Fan-In
2538,34,FAN-IN,2022/09/09 15:15,15231,80266F880,23691,8021353D0,1549.19,Euro,1549.19,Euro,ACH,1,Max 1-degree Fan-In


Comparing these with all the 1-degree fan-outs (shown below), we see that the transactions involve the same accounts, only with different amounts and timestamps

In [217]:
# Re-display the 1-degree fan-outs (built in an earlier cell) to compare with the 1-degree fan-ins above.
fan_out1degree

Unnamed: 0,id,type,timestamp,from_bank,from_account,to_bank,to_account,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering,category
73,7,FAN-OUT,2022/09/02 03:04,15231,80266F880,23691,8021353D0,15536.84,Euro,15536.84,Euro,ACH,1,Max 1-degree Fan-Out
113,15,FAN-OUT,2022/09/03 22:15,15231,80266F880,23691,8021353D0,12592.29,Euro,12592.29,Euro,ACH,1,Max 1-degree Fan-Out
132,19,FAN-OUT,2022/09/04 15:30,15231,80266F880,23691,8021353D0,8092.93,Euro,8092.93,Euro,ACH,1,Max 1-degree Fan-Out
154,24,FAN-OUT,2022/09/06 10:04,49365,812A09D40,119,812A09CF0,32736.73,Saudi Riyal,32736.73,Saudi Riyal,ACH,1,Max 1-degree Fan-Out
204,29,FAN-OUT,2022/09/07 00:21,119,812A09CF0,49365,812A09D40,12260.49,Saudi Riyal,12260.49,Saudi Riyal,ACH,1,Max 1-degree Fan-Out
205,30,FAN-OUT,2022/09/07 00:53,23691,8021353D0,15231,80266F880,6149.56,Euro,6149.56,Euro,ACH,1,Max 1-degree Fan-Out
280,40,FAN-OUT,2022/09/09 15:30,249176,812A70ED0,49508,812A70E80,17579.07,Saudi Riyal,17579.07,Saudi Riyal,ACH,1,Max 1-degree Fan-Out
295,43,FAN-OUT,2022/09/09 18:42,50202,812D129C0,222,812D127D0,70077.51,Saudi Riyal,70077.51,Saudi Riyal,ACH,1,Max 1-degree Fan-Out


### Conclusions

The following features might be useful for fan-in:
- for each to_account, the number of incoming nodes to the same bank, with the same receiving_currency and payment_currency
- some fan-ins are 1-degree: there are other transactions that are the same (different amounts and timestamps)
- some recurring transactions between two accounts are not laundering

# GATHER-SCATTER & SCATTER-GATHER
These two patterns essentially combine fan-in and fan-out: gather-scatter is a fan-in followed by a fan-out, while scatter-gather is the converse (a fan-out followed by a fan-in)

In [142]:
# Split the pattern file into the two combined pattern types.
pattern_type = ds["type"]
gather_scatter = ds.loc[pattern_type == "GATHER-SCATTER"]
scatter_gather = ds.loc[pattern_type == "SCATTER-GATHER"]

In [150]:
# Gather-scatter rows arriving at account 811C599A0 that belong to a 3-degree fan-in.
to_target_account = gather_scatter["to_account"] == "811C599A0"
is_3_degree_fan_in = gather_scatter["category"] == "Max 3-degree Fan-In"
gather_scatter[to_target_account & is_3_degree_fan_in]

Unnamed: 0,id,type,timestamp,from_bank,from_account,to_bank,to_account,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering,category
629,0,GATHER-SCATTER,2022/09/01 00:04,119,811C597B0,48309,811C599A0,34254.65,Saudi Riyal,34254.65,Saudi Riyal,ACH,1,Max 3-degree Fan-In
630,0,GATHER-SCATTER,2022/09/01 19:27,150240,812D22980,48309,811C599A0,5971.98,Saudi Riyal,5971.98,Saudi Riyal,ACH,1,Max 3-degree Fan-In
631,0,GATHER-SCATTER,2022/09/04 05:06,222,811B83280,48309,811C599A0,50445.58,Saudi Riyal,50445.58,Saudi Riyal,ACH,1,Max 3-degree Fan-In
632,0,GATHER-SCATTER,2022/09/04 05:03,48309,811C599A0,48309,811C599A0,48649.42,Saudi Riyal,48649.42,Saudi Riyal,ACH,1,Max 3-degree Fan-In
1057,32,GATHER-SCATTER,2022/09/07 23:48,150240,812D22980,48309,811C599A0,55801.94,Saudi Riyal,55801.94,Saudi Riyal,ACH,1,Max 3-degree Fan-In
1058,32,GATHER-SCATTER,2022/09/09 04:55,222,811B83280,48309,811C599A0,70986.04,Saudi Riyal,70986.04,Saudi Riyal,ACH,1,Max 3-degree Fan-In
1059,32,GATHER-SCATTER,2022/09/11 16:43,119,811C597B0,48309,811C599A0,7043.12,Saudi Riyal,7043.12,Saudi Riyal,ACH,1,Max 3-degree Fan-In
1060,32,GATHER-SCATTER,2022/09/11 05:36,48309,811C599A0,48309,811C599A0,52436.24,Saudi Riyal,52436.24,Saudi Riyal,ACH,1,Max 3-degree Fan-In
1332,49,GATHER-SCATTER,2022/09/10 18:55,222,811B83280,48309,811C599A0,9267.0,Saudi Riyal,9267.0,Saudi Riyal,ACH,1,Max 3-degree Fan-In
1333,49,GATHER-SCATTER,2022/09/13 22:14,119,811C597B0,48309,811C599A0,55691.86,Saudi Riyal,55691.86,Saudi Riyal,ACH,1,Max 3-degree Fan-In


# BIPARTITE

In [176]:
# Pattern-file rows labelled BIPARTITE, then the pattern instance with id 0.
bipartite = ds.loc[ds["type"] == 'BIPARTITE']
bipartite0 = bipartite.loc[bipartite["id"] == 0]
bipartite0

Unnamed: 0,id,type,timestamp,from_bank,from_account,to_bank,to_account,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering,category
2002,0,BIPARTITE,2022/09/01 11:03,11904,80103A420,14077,802533E40,12347.2,US Dollar,12347.2,US Dollar,ACH,1,
2003,0,BIPARTITE,2022/09/02 16:43,3489,80211F620,13265,8019C6830,15645.62,Euro,15645.62,Euro,ACH,1,
2004,0,BIPARTITE,2022/09/02 12:00,21745,800A75B90,1,80162B4B0,7687.95,US Dollar,7687.95,US Dollar,ACH,1,
2005,0,BIPARTITE,2022/09/01 04:39,1,8010AA4F0,12719,8015B5F50,9495.61,US Dollar,9495.61,US Dollar,ACH,1,
2006,0,BIPARTITE,2022/09/02 08:17,1,800DD9900,11157,8006F0580,488.77,US Dollar,488.77,US Dollar,ACH,1,
2007,0,BIPARTITE,2022/09/01 17:05,1522,800587B60,2454,802679190,5949.73,US Dollar,5949.73,US Dollar,ACH,1,
2008,0,BIPARTITE,2022/09/01 16:19,513,801947FF0,1292,800639010,13556.28,US Dollar,13556.28,US Dollar,ACH,1,
2009,0,BIPARTITE,2022/09/02 08:58,1244,800ACE160,21831,800AEC6D0,12580.81,Euro,12580.81,Euro,ACH,1,
2010,0,BIPARTITE,2022/09/01 11:32,1547,8020543A0,23525,8018DC000,4666.4,Euro,4666.4,Euro,ACH,1,
2011,0,BIPARTITE,2022/09/01 07:39,21575,801A58F80,11,80048D890,5019.15,Euro,5019.15,Euro,ACH,1,


In [198]:
# Laundering transactions in the full dataset sent by the first bipartite sender.
laundering_from_account = dataframe.filter('from_account=="80103A420" and is_laundering==1')
laundering_from_account.show()

+-------------------+---------+------------+-------+----------+---------------+------------------+-----------+----------------+--------------+-------------+-----------+
|          timestamp|from_bank|from_account|to_bank|to_account|amount_received|receiving_currency|amount_paid|payment_currency|payment_format|is_laundering|         id|
+-------------------+---------+------------+-------+----------+---------------+------------------+-----------+----------------+--------------+-------------+-----------+
|2022-09-01 11:03:00|    11904|   80103A420|  14077| 802533E40|        12347.2|         US Dollar|    12347.2|       US Dollar|           ACH|            1|42949720883|
+-------------------+---------+------------+-------+----------+---------------+------------------+-----------+----------------+--------------+-------------+-----------+



In [214]:
# Move the bipartite pattern rows into Spark and list senders that appear more than once.
bipartite_df = spark.createDataFrame(bipartite)
(
    bipartite_df
    .select('type', 'from_account')
    .groupBy('type', 'from_account')
    .count()
    .filter('count>1')
    .show()
)

+---------+------------+-----+
|     type|from_account|count|
+---------+------------+-----+
|BIPARTITE|   812A09D40|    3|
|BIPARTITE|   812A09CF0|    2|
|BIPARTITE|   811C4EDD0|    2|
|BIPARTITE|   8021353D0|    2|
|BIPARTITE|   811EDA940|    2|
|BIPARTITE|   80266F880|    4|
|BIPARTITE|   811FFF630|    2|
|BIPARTITE|   812A70ED0|    2|
|BIPARTITE|   800F1D640|    2|
|BIPARTITE|   812D0C3C0|    2|
+---------+------------+-----+



In [216]:
# Receivers that appear more than once among the bipartite pattern rows.
(
    bipartite_df
    .select('type', 'to_account')
    .groupBy('type', 'to_account')
    .count()
    .filter('count>1')
    .show()
)

+---------+----------+-----+
|     type|to_account|count|
+---------+----------+-----+
|BIPARTITE| 812A09D40|    2|
|BIPARTITE| 812A09CF0|    3|
|BIPARTITE| 80266F880|    2|
|BIPARTITE| 811FCA7B0|    3|
|BIPARTITE| 8021353D0|    4|
|BIPARTITE| 811ED7DF0|    2|
|BIPARTITE| 811A65E30|    2|
|BIPARTITE| 812A70E80|    2|
|BIPARTITE| 812D0C600|    2|
|BIPARTITE| 80FF51600|    2|
+---------+----------+-----+



The bipartite graph shows that bipartite laundering occurs when an account splits money into multiple destinations that in turn receive money from other sources.
The tables above show the accounts that send money more than once and the accounts that receive money more than once.

In conclusion, the bipartite laundering set in the pattern file does not reflect the image of the graph

# STACK

In [199]:
# Pattern-file rows labelled STACK.
stack = ds.loc[ds["type"] == 'STACK']
# End on the frame itself so the cell actually renders the table shown in its saved
# output; a bare assignment displays nothing.
stack

Unnamed: 0,id,type,timestamp,from_bank,from_account,to_bank,to_account,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering,category
1345,0,STACK,2022/09/02 12:36,223,8000DD890,40312,80F52D550,14153.46,Swiss Franc,14153.46,Swiss Franc,ACH,1,
1346,0,STACK,2022/09/03 19:52,40312,80F52D550,23370,80BEA98A0,11975.53,UK Pound,11975.53,UK Pound,ACH,1,
1347,0,STACK,2022/09/01 10:19,18617,8038D3520,24482,801C0F2B0,13712.96,Euro,13712.96,Euro,ACH,1,
1348,0,STACK,2022/09/03 05:06,24482,801C0F2B0,214,80C1B9E20,333126.09,Mexican Peso,333126.09,Mexican Peso,ACH,1,
1349,0,STACK,2022/09/01 09:22,40836,80F6B88B0,16606,8064545E0,11800.69,US Dollar,11800.69,US Dollar,ACH,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1806,41,STACK,2022/09/11 11:17,2991,8025D1EB0,27444,803C925B0,13658.31,US Dollar,13658.31,US Dollar,ACH,1,
1807,41,STACK,2022/09/12 11:02,21142,809F26890,1729,8083CF250,392562.58,US Dollar,392562.58,US Dollar,ACH,1,
1808,41,STACK,2022/09/12 17:43,1729,8083CF250,20846,809406BF0,335012.90,Euro,335012.90,Euro,ACH,1,
1809,42,STACK,2022/09/12 15:41,23691,8021353D0,15231,80266F880,4409.63,Euro,4409.63,Euro,ACH,1,


In [201]:
# Stack rows sent by the account that received the first stack transaction.
stack.loc[stack["from_account"] == "80F52D550"]

Unnamed: 0,id,type,timestamp,from_bank,from_account,to_bank,to_account,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering,category
1346,0,STACK,2022/09/03 19:52,40312,80F52D550,23370,80BEA98A0,11975.53,UK Pound,11975.53,UK Pound,ACH,1,


Stack laundering has a similar problem to bipartite