In [7]:
import numpy as np
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, to_timestamp, sum as _sum, count, monotonically_increasing_id
from pyspark.sql.types import *

# Pattern reading
Patterns are read from the txt file and the result is stored in a CSV file.

In [8]:
# Read the raw pattern file once; every element keeps its trailing newline.
with open('datasets/HI-Small_Patterns.txt') as f:
    small_patterns = f.readlines()

# Collect the header line of every laundering attempt and the pattern name it
# announces ("BEGIN LAUNDERING ATTEMPT - <NAME>[:  <category>]").
laundering = []
laundering_with_categories = []
for row in small_patterns:
    if 'BEGIN LAUNDERING ATTEMPT' in row:
        row = row.replace('\n','')
        laundering_with_categories.append(row)
        laundering.append(row.split(':')[0].split('- ')[1])

print(set(laundering))

# pattern name -> {attempt id -> list of raw transaction rows}
dict_laundering = {laund: {} for laund in laundering}

# One flat record per transaction: [attempt id, pattern name, *fields, category]
array_patterns = []
for key in dict_laundering:  # one pass over the file per pattern name
    # NOTE: deliberately NOT called `count`, which would shadow the
    # pyspark.sql.functions.count imported at the top of the notebook and
    # break the later `count('*')` aggregations.
    attempt_id = 0
    inside_attempt = False
    type_laundering = None
    begin_marker = 'BEGIN LAUNDERING ATTEMPT - ' + key
    end_marker = 'END LAUNDERING ATTEMPT - ' + key

    for row in small_patterns:
        if begin_marker in row:  # start of an attempt of this pattern
            inside_attempt = True
            header_parts = row.split(':  ')
            if len(header_parts) == 2:  # header carries a category (e.g. Max 13-degree Fan-In)
                type_laundering = header_parts[1].replace('\n', '')
            dict_laundering[key][attempt_id] = []
        elif end_marker in row:
            attempt_id += 1
            type_laundering = None
            inside_attempt = False
        elif inside_attempt:
            array_row = row.replace('\n','').split(',')
            dict_laundering[key][attempt_id].append(array_row)
            # category is None when the header did not declare one
            array_patterns.append([attempt_id, key, *array_row, type_laundering])

columns = ['id','type','timestamp','from_bank','from_account','to_bank','to_account','amount_received','receiving_currency','amount_paid','payment_currency','payment_format','is_laundering','category']
pd.DataFrame(array_patterns, columns=columns).to_csv('datasets/HI-Small_Patterns.csv',index=False)

{'SCATTER-GATHER', 'CYCLE', 'BIPARTITE', 'RANDOM', 'STACK', 'GATHER-SCATTER', 'FAN-IN', 'FAN-OUT'}


The features calculated in the previous code are: 
- id 
- type
- category

Transactions that share the same values for these 3 features belong to the same type of illicit transaction.

# Pattern understanding 

In [10]:
# Build (or reuse) the local Spark session used for the whole analysis.
spark = (
    SparkSession.builder
    .appName("AMD-SM2L Joint Project")
    .config("spark.driver.memory", "3g")
    .config("spark.executor.memory", "4g")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.sql.execution.arrow.enabled", "true")
    .master("local[*]")
    .getOrCreate()
)
spark.sparkContext.setLogLevel("OFF")

# Cleaned transactions, tagged with a generated `id` column.
# NOTE(review): `header` looks like a CSV reader option; presumably it has no
# effect on a parquet read - confirm before removing.
dataframe = (
    spark.read
    .parquet("datasets/1_cleaned_HI-Small.parquet", header=True)
    .withColumn('id', monotonically_increasing_id())
)

# Laundering patterns exported by the pattern-reading cell.
ds = pd.read_csv('datasets/HI-Small_Patterns.csv')

                                                                                

In [11]:
# Compare how many distinct laundering transactions the pattern file lists
# against how many rows the dataframe flags as laundering.
transaction_columns = [
    'timestamp','from_bank','from_account','to_bank','to_account',
    'amount_received','receiving_currency','amount_paid','payment_currency','payment_format',
]
reduced_ds = ds[transaction_columns]
pattern_len = len(reduced_ds.drop_duplicates())
df_len = dataframe.filter('is_laundering==1').count()
print("Laundering in pattern file: {}\nLaundering in the dataframe: {}".format(pattern_len, df_len))
print("Missing launderings:", df_len - pattern_len)

[Stage 1:>                                                          (0 + 8) / 9]

Laundering in pattern file: 3209
Laundering in the dataframe: 5177
Missing launderings: 1968


                                                                                

Not all laundering transactions in the dataframe downloaded from Kaggle were reported in the pattern txt file

Types of Laundering Patterns

<img src="images/patterns.png" style="width: 600px">

In [None]:
# Distinct laundering pattern names present in the pattern file.
set(ds["type"])

{'BIPARTITE',
 'CYCLE',
 'FAN-IN',
 'FAN-OUT',
 'GATHER-SCATTER',
 'RANDOM',
 'SCATTER-GATHER',
 'STACK'}

In [12]:
def filter_dataframe(receiving_currency, payment_currency, payment_format="ACH", from_bank=None, from_account=None, to_bank=None, to_account=None, df=None):
    """Show the transactions matching the given endpoints, currencies and payment format.

    Only the endpoint arguments that are not ``None`` are used as filters, so the
    function can select by sender, by receiver, or by both. The matching rows are
    sorted by ``timestamp`` and printed with ``show()``; nothing is returned.

    Parameters
    ----------
    receiving_currency, payment_currency : str
        Exact values required in the columns of the same name.
    payment_format : str, default "ACH"
        Exact value required in the ``payment_format`` column.
    from_bank, to_bank : int, optional
        Sender / receiver bank identifiers (compared unquoted).
    from_account, to_account : str, optional
        Sender / receiver account identifiers (compared as quoted strings).
    df : optional
        Object exposing ``filter(expr).sort(column).show()``; defaults to the
        notebook-level ``dataframe``.

    Raises
    ------
    ValueError
        If no endpoint argument is provided. Previously this produced a
        ``from_bank==None`` expression instead of a clear error.
    """
    if df is None:
        df = dataframe  # notebook-level transactions dataframe

    endpoint_conditions = []
    if from_bank is not None:
        endpoint_conditions.append('from_bank=={}'.format(from_bank))
    if from_account is not None:
        endpoint_conditions.append('from_account=="{}"'.format(from_account))
    if to_bank is not None:
        endpoint_conditions.append('to_bank=={}'.format(to_bank))
    if to_account is not None:
        endpoint_conditions.append('to_account=="{}"'.format(to_account))

    if not endpoint_conditions:
        raise ValueError(
            "filter_dataframe needs at least one of from_bank, from_account, to_bank, to_account"
        )

    conditions = endpoint_conditions + [
        'receiving_currency=="{}"'.format(receiving_currency),
        'payment_currency=="{}"'.format(payment_currency),
        'payment_format=="{}"'.format(payment_format),
    ]
    df.filter(' and '.join(conditions)).sort('timestamp').show()

# FAN-OUT (fig. a)

In [13]:
# Keep only the rows of the pattern file that belong to FAN-OUT attempts.
fan_out = ds.loc[ds["type"] == "FAN-OUT"]

### More than 1-degree

##### Example 1

In [6]:
# Transactions of the FAN-OUT attempt with id 10.
fan_out0 = fan_out.loc[fan_out["id"] == 10]
fan_out0

Unnamed: 0,id,type,timestamp,from_bank,from_account,to_bank,to_account,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering,category
87,10,FAN-OUT,2022/09/02 06:37,150240,812D22980,48309,811C599A0,60056.52,Saudi Riyal,60056.52,Saudi Riyal,ACH,1,Max 2-degree Fan-Out
88,10,FAN-OUT,2022/09/02 09:09,150240,812D22980,222,811B83280,59890.28,Saudi Riyal,59890.28,Saudi Riyal,ACH,1,Max 2-degree Fan-Out


All transactions are:
- sent from account 812D22980 at bank 150240
- receiving_currency = payment_currency = Saudi Riyal
- payment_format = ACH
Let's try to see how many launderings there are in the dataset that reflect these characteristics

In [47]:
# Every ACH / Saudi Riyal transaction sent by account 812D22980 at bank 150240.
filter_dataframe(
    receiving_currency="Saudi Riyal",
    payment_currency="Saudi Riyal",
    from_bank=150240,
    from_account="812D22980",
)

[Stage 19:>                                                         (0 + 8) / 8]

+-------------------+---------+------------+-------+----------+---------------+------------------+-----------+----------------+--------------+-------------+-----------+
|          timestamp|from_bank|from_account|to_bank|to_account|amount_received|receiving_currency|amount_paid|payment_currency|payment_format|is_laundering|         id|
+-------------------+---------+------------+-------+----------+---------------+------------------+-----------+----------------+--------------+-------------+-----------+
|2022-09-01 06:47:00|   150240|   812D22980|  48309| 811C599A0|       27247.23|       Saudi Riyal|   27247.23|     Saudi Riyal|           ACH|            1|51539675261|
|2022-09-01 19:27:00|   150240|   812D22980|  48309| 811C599A0|        5971.98|       Saudi Riyal|    5971.98|     Saudi Riyal|           ACH|            1|      40064|
|2022-09-02 06:37:00|   150240|   812D22980|  48309| 811C599A0|       60056.52|       Saudi Riyal|   60056.52|     Saudi Riyal|           ACH|            1

                                                                                

All transactions with those characteristics are laundering

##### Example 2

In [74]:
# Transactions of the FAN-OUT attempt with id 0, followed by every ACH / US Dollar
# transaction sent by the same account in the full dataset.
fan_out1 = fan_out.loc[fan_out["id"] == 0]
display(fan_out1)
# NOTE(review): the saved Spark output for this cell lists from_bank 225734 /
# from_account 8095AF170, which does not match these arguments - the output
# looks stale; re-run to confirm.
filter_dataframe(
    receiving_currency="US Dollar",
    payment_currency="US Dollar",
    from_bank=21174,
    from_account="800737690",
)

Unnamed: 0,id,type,timestamp,from_bank,from_account,to_bank,to_account,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering,category
0,0,FAN-OUT,2022/09/01 00:06,21174,800737690,12,80011F990,2848.96,Euro,2848.96,Euro,ACH,1,Max 16-degree Fan-Out
1,0,FAN-OUT,2022/09/01 04:33,21174,800737690,20,80020C5B0,8630.4,Euro,8630.4,Euro,ACH,1,Max 16-degree Fan-Out
2,0,FAN-OUT,2022/09/01 09:14,21174,800737690,20,80006A5E0,35642.49,Yuan,35642.49,Yuan,ACH,1,Max 16-degree Fan-Out
3,0,FAN-OUT,2022/09/01 09:56,21174,800737690,220,8007A5B70,5738987.96,US Dollar,5738987.96,US Dollar,ACH,1,Max 16-degree Fan-Out
4,0,FAN-OUT,2022/09/01 11:28,21174,800737690,1244,80093C0D0,7254.53,US Dollar,7254.53,US Dollar,ACH,1,Max 16-degree Fan-Out
5,0,FAN-OUT,2022/09/01 13:13,21174,800737690,513,80078E200,6990.87,US Dollar,6990.87,US Dollar,ACH,1,Max 16-degree Fan-Out
6,0,FAN-OUT,2022/09/01 14:11,21174,800737690,20,80066B990,12536.92,Euro,12536.92,Euro,ACH,1,Max 16-degree Fan-Out
7,0,FAN-OUT,2022/09/02 15:40,21174,800737690,410,8002CC310,3511.82,Euro,3511.82,Euro,ACH,1,Max 16-degree Fan-Out
8,0,FAN-OUT,2022/09/02 21:23,21174,800737690,1292,8004030A0,16135.09,US Dollar,16135.09,US Dollar,ACH,1,Max 16-degree Fan-Out
9,0,FAN-OUT,2022/09/02 23:10,21174,800737690,1601,800578800,12183.28,US Dollar,12183.28,US Dollar,ACH,1,Max 16-degree Fan-Out


                                                                                

+-------------------+---------+------------+-------+----------+---------------+------------------+-----------+----------------+--------------+-------------+-----------+
|          timestamp|from_bank|from_account|to_bank|to_account|amount_received|receiving_currency|amount_paid|payment_currency|payment_format|is_laundering|         id|
+-------------------+---------+------------+-------+----------+---------------+------------------+-----------+----------------+--------------+-------------+-----------+
|2022-09-01 09:46:00|   225734|   8095AF170|   1362| 802783AA0|       16737.85|         US Dollar|   16737.85|       US Dollar|           ACH|            1|34359818776|
|2022-09-01 11:11:00|   225734|   8095AF170|  28694| 804ED8800|       17321.44|         US Dollar|   17321.44|       US Dollar|           ACH|            1|25769884739|
|2022-09-01 17:54:00|   225734|   8095AF170|  11904| 800E9BC40|        4382.74|         US Dollar|    4382.74|       US Dollar|           ACH|            1

In [111]:
# All transactions sent by account 8001C97D0 at bank 20, whatever the currency or format.
(
    dataframe
    .filter('from_bank==20 and from_account=="8001C97D0"')
    .show()
)

                                                                                

+-------------------+---------+------------+-------+----------+---------------+------------------+-----------+----------------+--------------+-------------+----------+
|          timestamp|from_bank|from_account|to_bank|to_account|amount_received|receiving_currency|amount_paid|payment_currency|payment_format|is_laundering|        id|
+-------------------+---------+------------+-------+----------+---------------+------------------+-----------+----------------+--------------+-------------+----------+
|2022-09-01 00:21:00|       20|   8001C97D0| 122094| 8088E2210|        6331.16|              Euro|    6331.16|            Euro|        Cheque|            0|     20249|
|2022-09-01 00:25:00|       20|   8001C97D0| 122094| 8088E2210|        3044.58|              Euro|    3044.58|            Euro|   Credit Card|            0|     20250|
|2022-09-01 00:26:00|       20|   8001C97D0| 122094| 8088E2210|         517.35|              Euro|     517.35|            Euro|          Wire|            0|    

### 1-degree

In [84]:
# FAN-OUT attempts whose declared category is a 1-degree fan-out.
fan_out1degree = fan_out.loc[fan_out["category"] == "Max 1-degree Fan-Out"]
fan_out1degree

Unnamed: 0,id,type,timestamp,from_bank,from_account,to_bank,to_account,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering,category
73,7,FAN-OUT,2022/09/02 03:04,15231,80266F880,23691,8021353D0,15536.84,Euro,15536.84,Euro,ACH,1,Max 1-degree Fan-Out
113,15,FAN-OUT,2022/09/03 22:15,15231,80266F880,23691,8021353D0,12592.29,Euro,12592.29,Euro,ACH,1,Max 1-degree Fan-Out
132,19,FAN-OUT,2022/09/04 15:30,15231,80266F880,23691,8021353D0,8092.93,Euro,8092.93,Euro,ACH,1,Max 1-degree Fan-Out
154,24,FAN-OUT,2022/09/06 10:04,49365,812A09D40,119,812A09CF0,32736.73,Saudi Riyal,32736.73,Saudi Riyal,ACH,1,Max 1-degree Fan-Out
204,29,FAN-OUT,2022/09/07 00:21,119,812A09CF0,49365,812A09D40,12260.49,Saudi Riyal,12260.49,Saudi Riyal,ACH,1,Max 1-degree Fan-Out
205,30,FAN-OUT,2022/09/07 00:53,23691,8021353D0,15231,80266F880,6149.56,Euro,6149.56,Euro,ACH,1,Max 1-degree Fan-Out
280,40,FAN-OUT,2022/09/09 15:30,249176,812A70ED0,49508,812A70E80,17579.07,Saudi Riyal,17579.07,Saudi Riyal,ACH,1,Max 1-degree Fan-Out
295,43,FAN-OUT,2022/09/09 18:42,50202,812D129C0,222,812D127D0,70077.51,Saudi Riyal,70077.51,Saudi Riyal,ACH,1,Max 1-degree Fan-Out


Taking into consideration for example the transaction with id 24 with Max 1-degree, looking in the complete dataset we find that all the transactions having the same characteristics are laundering.

In [None]:
# 1-degree fan-out with id 24, then every matching transaction between the same two accounts.
display(fan_out1degree.loc[fan_out1degree["id"] == 24])
filter_dataframe(
    receiving_currency="Saudi Riyal",
    payment_currency="Saudi Riyal",
    from_bank=49365,
    from_account="812A09D40",
    to_bank=119,
    to_account="812A09CF0",
)

In [None]:
# 1-degree fan-out with id 29: same two accounts as id 24, opposite direction.
display(fan_out1degree.loc[fan_out1degree["id"] == 29])
filter_dataframe(
    receiving_currency="Saudi Riyal",
    payment_currency="Saudi Riyal",
    from_bank=119,
    from_account="812A09CF0",
    to_bank=49365,
    to_account="812A09D40",
)

### Final understanding

In [114]:
# Per sender account / category / payment format: number of fan-out transactions
# and the first and last timestamp at which they occur.
# The aggregations go through the `F` namespace on purpose: the bare names
# `min` / `max` are Python builtins here, `datediff` was never imported, and
# `count` is rebound to an integer by the pattern-reading cell.
features_fan_out = ['timestamp','from_account','from_bank','category','payment_format']
fan_outDF = spark.createDataFrame(fan_out)
fan_outDF = fan_outDF.withColumn("timestamp", F.to_timestamp(F.col("timestamp"), "yyyy/MM/dd HH:mm"))
fan_outDF = (
    fan_outDF.select(features_fan_out)
    .groupBy('from_account','from_bank','category','payment_format')
    .agg(
        F.count('*').alias('count'),
        F.min('timestamp').alias('min_ts'),
        F.max('timestamp').alias('max_ts'),
    )
    .select('from_account','from_bank','category','count','payment_format','min_ts','max_ts')
)

# timestamp_range = whole days between the first and the last transaction of the group
fan_outDF.withColumn('timestamp_range', F.datediff(F.col('max_ts'), F.col('min_ts'))).orderBy('category').show(truncate=False)

+------------+---------+---------------------+-----+--------------+-------------------+-------------------+---------------+
|from_account|from_bank|category             |count|payment_format|min_ts             |max_ts             |timestamp_range|
+------------+---------+---------------------+-----+--------------+-------------------+-------------------+---------------+
|80266F880   |15231    |Max 1-degree Fan-Out |3    |ACH           |2022-09-02 03:04:00|2022-09-04 15:30:00|2              |
|812A09D40   |49365    |Max 1-degree Fan-Out |1    |ACH           |2022-09-06 10:04:00|2022-09-06 10:04:00|0              |
|812A09CF0   |119      |Max 1-degree Fan-Out |1    |ACH           |2022-09-07 00:21:00|2022-09-07 00:21:00|0              |
|8021353D0   |23691    |Max 1-degree Fan-Out |1    |ACH           |2022-09-07 00:53:00|2022-09-07 00:53:00|0              |
|812A70ED0   |249176   |Max 1-degree Fan-Out |1    |ACH           |2022-09-09 15:30:00|2022-09-09 15:30:00|0              |
|812D129

We can see that the difference between the minimum and maximum timestamp is 4 days.

#### Particular case

How is it possible that a Max 1-degree Fan-Out has a count of 3?

In [121]:
# All fan-out rows sent by account 80266F880 at bank 15231.
fan_out.loc[fan_out["from_account"].eq("80266F880") & fan_out["from_bank"].eq(15231)]

Unnamed: 0,id,type,timestamp,from_bank,from_account,to_bank,to_account,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering,category
73,7,FAN-OUT,2022/09/02 03:04,15231,80266F880,23691,8021353D0,15536.84,Euro,15536.84,Euro,ACH,1,Max 1-degree Fan-Out
113,15,FAN-OUT,2022/09/03 22:15,15231,80266F880,23691,8021353D0,12592.29,Euro,12592.29,Euro,ACH,1,Max 1-degree Fan-Out
132,19,FAN-OUT,2022/09/04 15:30,15231,80266F880,23691,8021353D0,8092.93,Euro,8092.93,Euro,ACH,1,Max 1-degree Fan-Out


All of the transactions above are directed to the same account with same currencies. Maybe similar transactions can't be considered the same fan-out (transactions having same to_bank and to_account)

How many cases like this are there in the dataset? 

In [122]:
# Sender/receiver/currency combinations that appear more than once among the fan-out rows.
fan_outDF2 = spark.createDataFrame(fan_out)
(
    fan_outDF2
    .groupBy('from_bank','from_account','to_bank','to_account', 'receiving_currency','payment_currency')
    .count()
    .filter('count>1')
    .show()
)

+---------+------------+-------+----------+------------------+----------------+-----+
|from_bank|from_account|to_bank|to_account|receiving_currency|payment_currency|count|
+---------+------------+-------+----------+------------------+----------------+-----+
|    15231|   80266F880|  23691| 8021353D0|              Euro|            Euro|    3|
|   150240|   812D22980|    222| 811B83280|       Saudi Riyal|     Saudi Riyal|    3|
|   150240|   812D22980|  48309| 811C599A0|       Saudi Riyal|     Saudi Riyal|    2|
|   148350|   812D0C3C0| 148350| 811FFF630|       Saudi Riyal|     Saudi Riyal|    2|
|   148350|   812D0C3C0|  50528| 812D0C600|       Saudi Riyal|     Saudi Riyal|    2|
|   150240|   812D22980|    119| 811C597B0|       Saudi Riyal|     Saudi Riyal|    2|
+---------+------------+-------+----------+------------------+----------------+-----+



For example the transactions above have similar values

In [120]:
# Fan-out rows from account 812D22980 (bank 150240) to account 811B83280.
fan_out.loc[
    fan_out["from_bank"].eq(150240)
    & fan_out["from_account"].eq("812D22980")
    & fan_out["to_account"].eq("811B83280")
]

Unnamed: 0,id,type,timestamp,from_bank,from_account,to_bank,to_account,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering,category
88,10,FAN-OUT,2022/09/02 09:09,150240,812D22980,222,811B83280,59890.28,Saudi Riyal,59890.28,Saudi Riyal,ACH,1,Max 2-degree Fan-Out
213,32,FAN-OUT,2022/09/10 00:18,150240,812D22980,222,811B83280,6046.83,Saudi Riyal,6046.83,Saudi Riyal,ACH,1,Max 2-degree Fan-Out
282,41,FAN-OUT,2022/09/10 07:41,150240,812D22980,222,811B83280,28341.71,Saudi Riyal,28341.71,Saudi Riyal,ACH,1,Max 3-degree Fan-Out


The two transactions that occurred on 2022/09/10 (at 00:18 and 07:41) belong to two different degree categories (Max 2-degree and Max 3-degree).


In [132]:
# Saudi Riyal / ACH rows of the 3-degree fan-out, shown next to the attempt with id 32.
display(
    fan_out.loc[
        fan_out["receiving_currency"].eq("Saudi Riyal")
        & fan_out["payment_format"].eq("ACH")
        & fan_out["category"].eq("Max 3-degree Fan-Out")
    ]
)
fan_out.loc[fan_out["id"] == 32]

Unnamed: 0,id,type,timestamp,from_bank,from_account,to_bank,to_account,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering,category
281,41,FAN-OUT,2022/09/09 15:59,150240,812D22980,48309,811C599A0,74116.0,Saudi Riyal,74116.0,Saudi Riyal,ACH,1,Max 3-degree Fan-Out
282,41,FAN-OUT,2022/09/10 07:41,150240,812D22980,222,811B83280,28341.71,Saudi Riyal,28341.71,Saudi Riyal,ACH,1,Max 3-degree Fan-Out
283,41,FAN-OUT,2022/09/11 18:02,150240,812D22980,119,811C597B0,27043.63,Saudi Riyal,27043.63,Saudi Riyal,ACH,1,Max 3-degree Fan-Out


Unnamed: 0,id,type,timestamp,from_bank,from_account,to_bank,to_account,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering,category
212,32,FAN-OUT,2022/09/07 10:23,150240,812D22980,119,811C597B0,64806.02,Saudi Riyal,64806.02,Saudi Riyal,ACH,1,Max 2-degree Fan-Out
213,32,FAN-OUT,2022/09/10 00:18,150240,812D22980,222,811B83280,6046.83,Saudi Riyal,6046.83,Saudi Riyal,ACH,1,Max 2-degree Fan-Out


In a single fan-out pattern, are the currencies all different?

In [28]:
fan_outDF = spark.createDataFrame(fan_out)

# First collapse to one row per (attempt id, receiving currency), then count the
# currencies per attempt and keep the attempts that use a single currency.
(
    fan_outDF
    .select('id','receiving_currency')
    .groupBy('id','receiving_currency')
    .count()
    .select('id','receiving_currency')
    .groupBy('id')
    .count()
    .filter('count<=1')
    .show()
)

+---+-----+
| id|count|
+---+-----+
| 29|    1|
| 30|    1|
|  9|    1|
| 11|    1|
| 32|    1|
| 40|    1|
| 43|    1|
| 35|    1|
|  7|    1|
| 15|    1|
| 19|    1|
| 10|    1|
| 24|    1|
| 13|    1|
| 41|    1|
+---+-----+



In [33]:
# Transactions of the FAN-OUT attempt with id 32 (single currency).
fan_out.loc[fan_out["id"] == 32]

Unnamed: 0,id,type,timestamp,from_bank,from_account,to_bank,to_account,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering,category
212,32,FAN-OUT,2022/09/07 10:23,150240,812D22980,119,811C597B0,64806.02,Saudi Riyal,64806.02,Saudi Riyal,ACH,1,Max 2-degree Fan-Out
213,32,FAN-OUT,2022/09/10 00:18,150240,812D22980,222,811B83280,6046.83,Saudi Riyal,6046.83,Saudi Riyal,ACH,1,Max 2-degree Fan-Out


In [35]:
# Transactions of the FAN-OUT attempt with id 1 (several currencies).
fan_out.loc[fan_out["id"] == 1]

Unnamed: 0,id,type,timestamp,from_bank,from_account,to_bank,to_account,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering,category
16,1,FAN-OUT,2022/09/01 01:34,116,80F1AF380,127593,80A817660,16502.02,Australian Dollar,16502.02,Australian Dollar,ACH,1,Max 12-degree Fan-Out
17,1,FAN-OUT,2022/09/01 05:44,116,80F1AF380,13145,80D210C20,17193.07,US Dollar,17193.07,US Dollar,ACH,1,Max 12-degree Fan-Out
18,1,FAN-OUT,2022/09/01 19:38,116,80F1AF380,9679,809ACB660,278872.19,Yen,278872.19,Yen,ACH,1,Max 12-degree Fan-Out
19,1,FAN-OUT,2022/09/02 05:02,116,80F1AF380,1655,811F9C580,3620.93,Euro,3620.93,Euro,ACH,1,Max 12-degree Fan-Out
20,1,FAN-OUT,2022/09/02 16:17,116,80F1AF380,222819,808CE0050,5430.89,UK Pound,5430.89,UK Pound,ACH,1,Max 12-degree Fan-Out
21,1,FAN-OUT,2022/09/02 17:29,116,80F1AF380,21387,803158350,284.56,Euro,284.56,Euro,ACH,1,Max 12-degree Fan-Out
22,1,FAN-OUT,2022/09/02 19:56,116,80F1AF380,7042,810C0C4D0,17454.74,US Dollar,17454.74,US Dollar,ACH,1,Max 12-degree Fan-Out
23,1,FAN-OUT,2022/09/03 06:57,116,80F1AF380,243614,8113C5790,53268.16,Shekel,53268.16,Shekel,ACH,1,Max 12-degree Fan-Out
24,1,FAN-OUT,2022/09/03 18:53,116,80F1AF380,1124,805F9C820,6986.5,US Dollar,6986.5,US Dollar,ACH,1,Max 12-degree Fan-Out
25,1,FAN-OUT,2022/09/04 16:18,116,80F1AF380,12,8023ED8B0,19725.23,US Dollar,19725.23,US Dollar,ACH,1,Max 12-degree Fan-Out


Why does the pattern with id 32 not include the account 811C599A0, as the pattern with id 41 does?

### Conclusions

The following features might be useful for fan-out:
- for each from_account, the number of outgoing nodes from the same bank; the out-nodes may have different:
    * receiving_currency (same as payment_currency in the same transaction)
    * payment_currency
    * payment_format
- transactions to the same account (& same bank) can't be considered same fan-out
- some fan-outs are 1-degree: there are other transactions that are the same (different amounts and timestamps)
- check if there is an identical transaction with different amounts at the same time
- there must be at most 4 days between the first transaction and the last in the series

# FAN-IN (fig. b)

It is the opposite of FAN-OUT

In [8]:
# Keep only the rows of the pattern file that belong to FAN-IN attempts.
fan_in = ds.loc[ds["type"] == "FAN-IN"]

### More than 1-degree

#### Example 1

In [40]:
# Transactions of the FAN-IN attempt with id 0.
fan_in0 = fan_in.loc[fan_in["id"] == 0]
fan_in0

Unnamed: 0,id,type,timestamp,from_bank,from_account,to_bank,to_account,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering,category
2265,0,FAN-IN,2022/09/01 04:47,222,811B83280,119,811C597B0,13148.66,Saudi Riyal,13148.66,Saudi Riyal,ACH,1,Max 2-degree Fan-In
2266,0,FAN-IN,2022/09/04 16:56,48309,811C599A0,119,811C597B0,24189.19,Saudi Riyal,24189.19,Saudi Riyal,ACH,1,Max 2-degree Fan-In


All transactions are:
- received by account 811ED7DF0 at bank 48308
- receiving_currency = payment_currency = Saudi Riyal
- payment_format = ACH
Let's try to see how many launderings there are in the dataset that reflect these characteristics

In [36]:
# Every ACH / Saudi Riyal transaction received by account 811ED7DF0 at bank 48308.
# NOTE(review): the saved `fan_in0` output (id 0) shows to_bank 119 /
# to_account 811C597B0, not the receiver used here - confirm which fan-in
# this example is meant to inspect.
filter_dataframe(
    receiving_currency="Saudi Riyal",
    payment_currency="Saudi Riyal",
    to_bank=48308,
    to_account="811ED7DF0",
)

                                                                                

+-------------------+---------+------------+-------+----------+---------------+------------------+-----------+----------------+--------------+-------------+-----------+
|          timestamp|from_bank|from_account|to_bank|to_account|amount_received|receiving_currency|amount_paid|payment_currency|payment_format|is_laundering|         id|
+-------------------+---------+------------+-------+----------+---------------+------------------+-----------+----------------+--------------+-------------+-----------+
|2022-09-01 06:23:00|   149345|   812D1AC00|  48308| 811ED7DF0|       45346.12|       Saudi Riyal|   45346.12|     Saudi Riyal|           ACH|            1|17179935492|
|2022-09-01 15:27:00|      223|   8119F8CC0|  48308| 811ED7DF0|       30299.15|       Saudi Riyal|   30299.15|     Saudi Riyal|           ACH|            1| 8590000904|
|2022-09-02 01:39:00|    48309|   81235EAA0|  48308| 811ED7DF0|         8117.8|       Saudi Riyal|     8117.8|     Saudi Riyal|           ACH|            1

In [22]:
# List the column names available in the FAN-IN subset.
fan_in.columns

Index(['id', 'type', 'timestamp', 'from_bank', 'from_account', 'to_bank',
       'to_account', 'amount_received', 'receiving_currency', 'amount_paid',
       'payment_currency', 'payment_format', 'is_laundering', 'category'],
      dtype='object')

Here we see that there is only one transaction that is not laundering. If we notice, this transaction happens at the same time as the transaction above, which is laundering

#### Example 2

In [75]:
# Transactions of the FAN-IN attempt with id 2, followed by every ACH / US Dollar
# transaction received by the same account in the full dataset.
fan_in1 = fan_in.loc[fan_in["id"] == 2]
display(fan_in1)
filter_dataframe(
    receiving_currency="US Dollar",
    payment_currency="US Dollar",
    to_bank=1362,
    to_account="8001ECD70",
)

Unnamed: 0,id,type,timestamp,from_bank,from_account,to_bank,to_account,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering,category
2283,2,FAN-IN,2022/09/01 08:21,11,8001027C0,1362,8001ECD70,8340.74,US Dollar,8340.74,US Dollar,ACH,1,Max 10-degree Fan-In
2284,2,FAN-IN,2022/09/01 11:34,20,800246FC0,1362,8001ECD70,15524.16,US Dollar,15524.16,US Dollar,ACH,1,Max 10-degree Fan-In
2285,2,FAN-IN,2022/09/01 12:19,20,800060420,1362,8001ECD70,16367.46,US Dollar,16367.46,US Dollar,ACH,1,Max 10-degree Fan-In
2286,2,FAN-IN,2022/09/02 04:06,20,800283B60,1362,8001ECD70,1869.72,US Dollar,1869.72,US Dollar,ACH,1,Max 10-degree Fan-In
2287,2,FAN-IN,2022/09/02 12:55,12,800137450,1362,8001ECD70,5073.01,US Dollar,5073.01,US Dollar,ACH,1,Max 10-degree Fan-In
2288,2,FAN-IN,2022/09/02 14:13,12,800117590,1362,8001ECD70,16050.64,US Dollar,16050.64,US Dollar,ACH,1,Max 10-degree Fan-In
2289,2,FAN-IN,2022/09/03 14:57,1490,800A180A0,1362,8001ECD70,161.35,US Dollar,161.35,US Dollar,ACH,1,Max 10-degree Fan-In
2290,2,FAN-IN,2022/09/04 09:59,1,800056160,1362,8001ECD70,4383.81,US Dollar,4383.81,US Dollar,ACH,1,Max 10-degree Fan-In
2291,2,FAN-IN,2022/09/04 14:46,23,8001D7610,1362,8001ECD70,3757.42,US Dollar,3757.42,US Dollar,ACH,1,Max 10-degree Fan-In
2292,2,FAN-IN,2022/09/04 17:08,12,800205030,1362,8001ECD70,11896790.65,US Dollar,11896790.65,US Dollar,ACH,1,Max 10-degree Fan-In


+-------------------+---------+------------+-------+----------+---------------+------------------+-------------+----------------+--------------+-------------+-----------+
|          timestamp|from_bank|from_account|to_bank|to_account|amount_received|receiving_currency|  amount_paid|payment_currency|payment_format|is_laundering|         id|
+-------------------+---------+------------+-------+----------+---------------+------------------+-------------+----------------+--------------+-------------+-----------+
|2022-09-01 02:29:00|       10|   8001882D0|   1362| 8001ECD70|          44.43|         US Dollar|        44.43|       US Dollar|           ACH|            0|51539654555|
|2022-09-01 08:21:00|       11|   8001027C0|   1362| 8001ECD70|        8340.74|         US Dollar|      8340.74|       US Dollar|           ACH|            1|42949746096|
|2022-09-01 11:34:00|       20|   800246FC0|   1362| 8001ECD70|       15524.16|         US Dollar|     15524.16|       US Dollar|           ACH| 

                                                                                

Here we can see that all transactions between 8001882D0 and 8001ECD70 are not laundering

### 1-degree

In [81]:
# Rebuild the FAN-IN subset and keep the attempts declared as 1-degree fan-ins.
fan_in = ds.loc[ds["type"] == "FAN-IN"]
fan_in1degree = fan_in.loc[fan_in["category"] == "Max 1-degree Fan-In"]
fan_in1degree

Unnamed: 0,id,type,timestamp,from_bank,from_account,to_bank,to_account,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering,category
2295,4,FAN-IN,2022/09/01 15:06,119,812A09CF0,49365,812A09D40,36537.53,Saudi Riyal,36537.53,Saudi Riyal,ACH,1,Max 1-degree Fan-In
2416,18,FAN-IN,2022/09/05 07:32,49365,812A09D40,119,812A09CF0,70400.86,Saudi Riyal,70400.86,Saudi Riyal,ACH,1,Max 1-degree Fan-In
2537,33,FAN-IN,2022/09/09 14:15,249176,812A70ED0,49508,812A70E80,71496.43,Saudi Riyal,71496.43,Saudi Riyal,ACH,1,Max 1-degree Fan-In
2538,34,FAN-IN,2022/09/09 15:15,15231,80266F880,23691,8021353D0,1549.19,Euro,1549.19,Euro,ACH,1,Max 1-degree Fan-In


Taking into account all fan outs with 1 degree, we see that these transactions are the same, but with different amounts

In [217]:
# Show the 1-degree fan-outs again, for comparison with the 1-degree fan-ins above.
fan_out1degree

Unnamed: 0,id,type,timestamp,from_bank,from_account,to_bank,to_account,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering,category
73,7,FAN-OUT,2022/09/02 03:04,15231,80266F880,23691,8021353D0,15536.84,Euro,15536.84,Euro,ACH,1,Max 1-degree Fan-Out
113,15,FAN-OUT,2022/09/03 22:15,15231,80266F880,23691,8021353D0,12592.29,Euro,12592.29,Euro,ACH,1,Max 1-degree Fan-Out
132,19,FAN-OUT,2022/09/04 15:30,15231,80266F880,23691,8021353D0,8092.93,Euro,8092.93,Euro,ACH,1,Max 1-degree Fan-Out
154,24,FAN-OUT,2022/09/06 10:04,49365,812A09D40,119,812A09CF0,32736.73,Saudi Riyal,32736.73,Saudi Riyal,ACH,1,Max 1-degree Fan-Out
204,29,FAN-OUT,2022/09/07 00:21,119,812A09CF0,49365,812A09D40,12260.49,Saudi Riyal,12260.49,Saudi Riyal,ACH,1,Max 1-degree Fan-Out
205,30,FAN-OUT,2022/09/07 00:53,23691,8021353D0,15231,80266F880,6149.56,Euro,6149.56,Euro,ACH,1,Max 1-degree Fan-Out
280,40,FAN-OUT,2022/09/09 15:30,249176,812A70ED0,49508,812A70E80,17579.07,Saudi Riyal,17579.07,Saudi Riyal,ACH,1,Max 1-degree Fan-Out
295,43,FAN-OUT,2022/09/09 18:42,50202,812D129C0,222,812D127D0,70077.51,Saudi Riyal,70077.51,Saudi Riyal,ACH,1,Max 1-degree Fan-Out


### Final understanding

In [85]:
# Per receiver account / currencies / category / payment format: number of fan-in
# transactions and the first and last timestamp at which they occur.
# The aggregations go through the `F` namespace on purpose: the bare names
# `min` / `max` are Python builtins here, `datediff` was never imported, and
# `count` is rebound to an integer by the pattern-reading cell.
features_fan_in = ['timestamp','to_account','to_bank','receiving_currency','payment_currency','category','payment_format']
fan_inDF = spark.createDataFrame(fan_in)
fan_inDF = fan_inDF.withColumn("timestamp", F.to_timestamp(F.col("timestamp"), "yyyy/MM/dd HH:mm"))
fan_inDF = (
    fan_inDF.select(features_fan_in)
    .groupBy('to_account','to_bank','receiving_currency','payment_currency','category','payment_format')
    .agg(
        F.count('*').alias('count'),
        F.min('timestamp').alias('min_ts'),
        F.max('timestamp').alias('max_ts'),
    )
    .select('to_account','to_bank','receiving_currency','payment_currency','category','count','payment_format','min_ts','max_ts')
)

# timestamp_range = whole days between the first and the last transaction of the group
fan_inDF.withColumn('timestamp_range', F.datediff(F.col('max_ts'), F.col('min_ts'))).orderBy('category').show(truncate=False)

+----------+-------+------------------+-----------------+--------------------+-----+--------------+-------------------+-------------------+---------------+
|to_account|to_bank|receiving_currency|payment_currency |category            |count|payment_format|min_ts             |max_ts             |timestamp_range|
+----------+-------+------------------+-----------------+--------------------+-----+--------------+-------------------+-------------------+---------------+
|812A09D40 |49365  |Saudi Riyal       |Saudi Riyal      |Max 1-degree Fan-In |1    |ACH           |2022-09-01 15:06:00|2022-09-01 15:06:00|0              |
|812A09CF0 |119    |Saudi Riyal       |Saudi Riyal      |Max 1-degree Fan-In |1    |ACH           |2022-09-05 07:32:00|2022-09-05 07:32:00|0              |
|812A70E80 |49508  |Saudi Riyal       |Saudi Riyal      |Max 1-degree Fan-In |1    |ACH           |2022-09-09 14:15:00|2022-09-09 14:15:00|0              |
|8021353D0 |23691  |Euro              |Euro             |Max 1-d

### Conclusions

The following features might be useful for fan-in:
- for each to_account, the number of incoming nodes to the same bank; all in-nodes must have the same:
    * receiving_currency 
    * payment_currency
    * payment_format
- some fan-ins are 1-degree: there are other transactions that are the same (different amounts and timestamps)
- some recurring transactions between two accounts are not laundering
- there must be at most 4 days between the first transaction and the last in the series

# GATHER-SCATTER & SCATTER-GATHER
These two essentially take fan-in and fan-out and put them together: gather-scatter arises from a fan-in followed by a fan-out; conversely, scatter-gather arises from a fan-out followed by a fan-in.

In [86]:
# Split the labelled pattern rows into the two composite laundering types.
gather_scatter, scatter_gather = (
    ds.loc[ds["type"] == label] for label in ("GATHER-SCATTER", "SCATTER-GATHER")
)

## GATHER-SCATTER

Gather scatter exists if there are fan-ins followed by fan-outs.

We can see the 2 examples below.

#### Example 1
Example with a 13 degree fan-in followed by a 4 degree fan-out

In [87]:
# Gather-scatter attempt with id 1, in chronological order.
gather_scatter.loc[gather_scatter["id"] == 1].sort_values(by="timestamp")

Unnamed: 0,id,type,timestamp,from_bank,from_account,to_bank,to_account,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering,category
634,1,GATHER-SCATTER,2022/09/01 00:43,12963,80296CA20,241869,80F390FC0,12562.68,Swiss Franc,12562.68,Swiss Franc,ACH,1,Max 13-degree Fan-In
635,1,GATHER-SCATTER,2022/09/01 07:15,1588,801ABC430,241869,80F390FC0,10475.35,Swiss Franc,10475.35,Swiss Franc,ACH,1,Max 13-degree Fan-In
636,1,GATHER-SCATTER,2022/09/01 15:25,3051,8012E62B0,241869,80F390FC0,13315.35,Swiss Franc,13315.35,Swiss Franc,ACH,1,Max 13-degree Fan-In
637,1,GATHER-SCATTER,2022/09/01 15:32,2843,8043B07C0,241869,80F390FC0,7999.49,Swiss Franc,7999.49,Swiss Franc,ACH,1,Max 13-degree Fan-In
638,1,GATHER-SCATTER,2022/09/01 17:25,29788,80399CED0,241869,80F390FC0,8430.98,Swiss Franc,8430.98,Swiss Franc,ACH,1,Max 13-degree Fan-In
639,1,GATHER-SCATTER,2022/09/02 07:29,28183,80BF4C160,241869,80F390FC0,295.9,Swiss Franc,295.9,Swiss Franc,ACH,1,Max 13-degree Fan-In
640,1,GATHER-SCATTER,2022/09/02 15:44,131167,80C21E690,241869,80F390FC0,3354.5,Swiss Franc,3354.5,Swiss Franc,ACH,1,Max 13-degree Fan-In
641,1,GATHER-SCATTER,2022/09/03 03:54,129974,80FE25690,241869,80F390FC0,9547.03,Swiss Franc,9547.03,Swiss Franc,ACH,1,Max 13-degree Fan-In
642,1,GATHER-SCATTER,2022/09/03 07:12,3051,806A3D750,241869,80F390FC0,2024.65,Swiss Franc,2024.65,Swiss Franc,ACH,1,Max 13-degree Fan-In
643,1,GATHER-SCATTER,2022/09/03 11:07,13157,805CA5A30,241869,80F390FC0,4513.56,Swiss Franc,4513.56,Swiss Franc,ACH,1,Max 13-degree Fan-In


In the dataset, the gather-scatter transactions referring to an account are marked by the same id. For example, taking into account the id equal to 1, we have 13 fan-in transactions to 80F390FC0, which then transfers the money to 4 other accounts. 

All of the fan-in transactions share the same currencies, whereas each outgoing transaction uses the same currency for payment and receipt, but that currency differs from one outgoing transaction to another. 

**Fan-in and fan-out degrees can be different.**

#### Example 2

In [52]:
# Gather-scatter attempt with id 10.
gather_scatter.loc[gather_scatter["id"] == 10]

Unnamed: 0,id,type,timestamp,from_bank,from_account,to_bank,to_account,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering,category
779,10,GATHER-SCATTER,2022/09/02 23:18,23537,801BE1590,12,8010D4440,11002.46,US Dollar,11002.46,US Dollar,ACH,1,Max 14-degree Fan-In
780,10,GATHER-SCATTER,2022/09/03 00:57,8074,80803C930,12,8010D4440,13705.3,US Dollar,13705.3,US Dollar,ACH,1,Max 14-degree Fan-In
781,10,GATHER-SCATTER,2022/09/03 11:32,2991,806CF1310,12,8010D4440,9786.57,US Dollar,9786.57,US Dollar,ACH,1,Max 14-degree Fan-In
782,10,GATHER-SCATTER,2022/09/04 17:03,11,80028E6D0,12,8010D4440,13764.67,US Dollar,13764.67,US Dollar,ACH,1,Max 14-degree Fan-In
783,10,GATHER-SCATTER,2022/09/04 18:24,15,80A2DE560,12,8010D4440,17065.68,US Dollar,17065.68,US Dollar,ACH,1,Max 14-degree Fan-In
784,10,GATHER-SCATTER,2022/09/05 11:20,19329,8048E30F0,12,8010D4440,17498.02,US Dollar,17498.02,US Dollar,ACH,1,Max 14-degree Fan-In
785,10,GATHER-SCATTER,2022/09/05 13:10,26624,80BAEB110,12,8010D4440,1291.27,US Dollar,1291.27,US Dollar,ACH,1,Max 14-degree Fan-In
786,10,GATHER-SCATTER,2022/09/05 15:30,29279,8066C7E10,12,8010D4440,16614.28,US Dollar,16614.28,US Dollar,ACH,1,Max 14-degree Fan-In
787,10,GATHER-SCATTER,2022/09/05 18:44,116214,80DDCA6A0,12,8010D4440,1231.08,US Dollar,1231.08,US Dollar,ACH,1,Max 14-degree Fan-In
788,10,GATHER-SCATTER,2022/09/05 23:55,33872,80C73E400,12,8010D4440,9016.37,US Dollar,9016.37,US Dollar,ACH,1,Max 14-degree Fan-In


In [21]:
# Outgoing transfers of account 811C599A0, leaving out payments it makes to itself.
gather_scatter.loc[
    gather_scatter["from_account"].eq("811C599A0")
    & gather_scatter["to_account"].ne("811C599A0")
]

Unnamed: 0,id,type,timestamp,from_bank,from_account,to_bank,to_account,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering,category
633,0,GATHER-SCATTER,2022/09/04 14:59,48309,811C599A0,119,811C597B0,64379.45,Saudi Riyal,64379.45,Saudi Riyal,ACH,1,Max 3-degree Fan-In
871,15,GATHER-SCATTER,2022/09/04 16:03,48309,811C599A0,150240,812D22980,145391200.0,Saudi Riyal,145391200.0,Saudi Riyal,ACH,1,Max 3-degree Fan-In
993,26,GATHER-SCATTER,2022/09/08 19:12,48309,811C599A0,119,811C597B0,48259.06,Saudi Riyal,48259.06,Saudi Riyal,ACH,1,Max 2-degree Fan-In
1040,30,GATHER-SCATTER,2022/09/06 23:05,48309,811C599A0,119,811C597B0,11878.48,Saudi Riyal,11878.48,Saudi Riyal,ACH,1,Max 2-degree Fan-In
1061,32,GATHER-SCATTER,2022/09/12 15:24,48309,811C599A0,222,811B83280,19530.36,Saudi Riyal,19530.36,Saudi Riyal,ACH,1,Max 3-degree Fan-In
1062,32,GATHER-SCATTER,2022/09/13 18:47,48309,811C599A0,150240,812D22980,51201.63,Saudi Riyal,51201.63,Saudi Riyal,ACH,1,Max 3-degree Fan-In
1335,49,GATHER-SCATTER,2022/09/14 13:40,48309,811C599A0,119,811C597B0,30360.54,Saudi Riyal,30360.54,Saudi Riyal,ACH,1,Max 3-degree Fan-In
1337,49,GATHER-SCATTER,2022/09/17 02:45,48309,811C599A0,222,811B83280,34822.31,Saudi Riyal,34822.31,Saudi Riyal,ACH,1,Max 3-degree Fan-In


In [14]:
# Every gather-scatter transaction sent by account 811C597B0.
gather_scatter.loc[gather_scatter["from_account"] == "811C597B0"]

Unnamed: 0,id,type,timestamp,from_bank,from_account,to_bank,to_account,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering,category
629,0,GATHER-SCATTER,2022/09/01 00:04,119,811C597B0,48309,811C599A0,34254.65,Saudi Riyal,34254.65,Saudi Riyal,ACH,1,Max 3-degree Fan-In
869,15,GATHER-SCATTER,2022/09/03 12:22,119,811C597B0,150240,812D22980,22910.8,Saudi Riyal,22910.8,Saudi Riyal,ACH,1,Max 3-degree Fan-In
919,19,GATHER-SCATTER,2022/09/08 14:50,119,811C597B0,222,811B83280,5905.43,Saudi Riyal,5905.43,Saudi Riyal,ACH,1,Max 2-degree Fan-In
992,26,GATHER-SCATTER,2022/09/08 06:38,119,811C597B0,48309,811C599A0,15786.73,Saudi Riyal,15786.73,Saudi Riyal,ACH,1,Max 2-degree Fan-In
1042,30,GATHER-SCATTER,2022/09/07 09:06,119,811C597B0,48309,811C599A0,657.36,Saudi Riyal,657.36,Saudi Riyal,ACH,1,Max 2-degree Fan-In
1043,30,GATHER-SCATTER,2022/09/10 23:17,119,811C597B0,119,811C597B0,14144807.18,Saudi Riyal,14144807.18,Saudi Riyal,ACH,1,Max 2-degree Fan-In
1044,30,GATHER-SCATTER,2022/09/11 07:32,119,811C597B0,222,811B83280,31324.25,Saudi Riyal,31324.25,Saudi Riyal,ACH,1,Max 2-degree Fan-In
1059,32,GATHER-SCATTER,2022/09/11 16:43,119,811C597B0,48309,811C599A0,7043.12,Saudi Riyal,7043.12,Saudi Riyal,ACH,1,Max 3-degree Fan-In
1333,49,GATHER-SCATTER,2022/09/13 22:14,119,811C597B0,48309,811C599A0,55691.86,Saudi Riyal,55691.86,Saudi Riyal,ACH,1,Max 3-degree Fan-In


In [11]:
# Transactions received by 811C599A0 that belong to a "Max 3-degree Fan-In" attempt.
gather_scatter.loc[
    gather_scatter["category"].eq("Max 3-degree Fan-In")
    & gather_scatter["to_account"].eq("811C599A0")
]

Unnamed: 0,id,type,timestamp,from_bank,from_account,to_bank,to_account,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering,category
629,0,GATHER-SCATTER,2022/09/01 00:04,119,811C597B0,48309,811C599A0,34254.65,Saudi Riyal,34254.65,Saudi Riyal,ACH,1,Max 3-degree Fan-In
630,0,GATHER-SCATTER,2022/09/01 19:27,150240,812D22980,48309,811C599A0,5971.98,Saudi Riyal,5971.98,Saudi Riyal,ACH,1,Max 3-degree Fan-In
631,0,GATHER-SCATTER,2022/09/04 05:06,222,811B83280,48309,811C599A0,50445.58,Saudi Riyal,50445.58,Saudi Riyal,ACH,1,Max 3-degree Fan-In
632,0,GATHER-SCATTER,2022/09/04 05:03,48309,811C599A0,48309,811C599A0,48649.42,Saudi Riyal,48649.42,Saudi Riyal,ACH,1,Max 3-degree Fan-In
1057,32,GATHER-SCATTER,2022/09/07 23:48,150240,812D22980,48309,811C599A0,55801.94,Saudi Riyal,55801.94,Saudi Riyal,ACH,1,Max 3-degree Fan-In
1058,32,GATHER-SCATTER,2022/09/09 04:55,222,811B83280,48309,811C599A0,70986.04,Saudi Riyal,70986.04,Saudi Riyal,ACH,1,Max 3-degree Fan-In
1059,32,GATHER-SCATTER,2022/09/11 16:43,119,811C597B0,48309,811C599A0,7043.12,Saudi Riyal,7043.12,Saudi Riyal,ACH,1,Max 3-degree Fan-In
1060,32,GATHER-SCATTER,2022/09/11 05:36,48309,811C599A0,48309,811C599A0,52436.24,Saudi Riyal,52436.24,Saudi Riyal,ACH,1,Max 3-degree Fan-In
1332,49,GATHER-SCATTER,2022/09/10 18:55,222,811B83280,48309,811C599A0,9267.0,Saudi Riyal,9267.0,Saudi Riyal,ACH,1,Max 3-degree Fan-In
1333,49,GATHER-SCATTER,2022/09/13 22:14,119,811C597B0,48309,811C599A0,55691.86,Saudi Riyal,55691.86,Saudi Riyal,ACH,1,Max 3-degree Fan-In


#### Conclusions
For each transaction, compute how many transactions are directed to the same to_account with the same currencies and then count how many transactions are sent by that account (the to_account, which now acts as from_account) to other accounts: 
- currencies are not relevant
- consider also self payment

So we need 2 features:
- fan-in (check if I have already calculated them)
- fan-out (check if I have already calculated them)


# BIPARTITE

In [176]:
# Rows labelled BIPARTITE, plus the first attempt (id 0) for inspection.
bipartite = ds.loc[ds["type"] == 'BIPARTITE']
bipartite0 = bipartite.loc[bipartite["id"] == 0]
bipartite0

Unnamed: 0,id,type,timestamp,from_bank,from_account,to_bank,to_account,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering,category
2002,0,BIPARTITE,2022/09/01 11:03,11904,80103A420,14077,802533E40,12347.2,US Dollar,12347.2,US Dollar,ACH,1,
2003,0,BIPARTITE,2022/09/02 16:43,3489,80211F620,13265,8019C6830,15645.62,Euro,15645.62,Euro,ACH,1,
2004,0,BIPARTITE,2022/09/02 12:00,21745,800A75B90,1,80162B4B0,7687.95,US Dollar,7687.95,US Dollar,ACH,1,
2005,0,BIPARTITE,2022/09/01 04:39,1,8010AA4F0,12719,8015B5F50,9495.61,US Dollar,9495.61,US Dollar,ACH,1,
2006,0,BIPARTITE,2022/09/02 08:17,1,800DD9900,11157,8006F0580,488.77,US Dollar,488.77,US Dollar,ACH,1,
2007,0,BIPARTITE,2022/09/01 17:05,1522,800587B60,2454,802679190,5949.73,US Dollar,5949.73,US Dollar,ACH,1,
2008,0,BIPARTITE,2022/09/01 16:19,513,801947FF0,1292,800639010,13556.28,US Dollar,13556.28,US Dollar,ACH,1,
2009,0,BIPARTITE,2022/09/02 08:58,1244,800ACE160,21831,800AEC6D0,12580.81,Euro,12580.81,Euro,ACH,1,
2010,0,BIPARTITE,2022/09/01 11:32,1547,8020543A0,23525,8018DC000,4666.4,Euro,4666.4,Euro,ACH,1,
2011,0,BIPARTITE,2022/09/01 07:39,21575,801A58F80,11,80048D890,5019.15,Euro,5019.15,Euro,ACH,1,


In [198]:
# Laundering transactions sent by account 80103A420 in the full Spark dataset.
dataframe.where('from_account=="80103A420" and is_laundering==1').show()

+-------------------+---------+------------+-------+----------+---------------+------------------+-----------+----------------+--------------+-------------+-----------+
|          timestamp|from_bank|from_account|to_bank|to_account|amount_received|receiving_currency|amount_paid|payment_currency|payment_format|is_laundering|         id|
+-------------------+---------+------------+-------+----------+---------------+------------------+-----------+----------------+--------------+-------------+-----------+
|2022-09-01 11:03:00|    11904|   80103A420|  14077| 802533E40|        12347.2|         US Dollar|    12347.2|       US Dollar|           ACH|            1|42949720883|
+-------------------+---------+------------+-------+----------+---------------+------------------+-----------+----------------+--------------+-------------+-----------+



In [214]:
# Senders that appear in more than one BIPARTITE transaction.
bipartite_df = spark.createDataFrame(bipartite)
(
    bipartite_df
    .select('type', 'from_account')
    .groupBy('type', 'from_account')
    .count()
    .filter('count>1')
    .show()
)

+---------+------------+-----+
|     type|from_account|count|
+---------+------------+-----+
|BIPARTITE|   812A09D40|    3|
|BIPARTITE|   812A09CF0|    2|
|BIPARTITE|   811C4EDD0|    2|
|BIPARTITE|   8021353D0|    2|
|BIPARTITE|   811EDA940|    2|
|BIPARTITE|   80266F880|    4|
|BIPARTITE|   811FFF630|    2|
|BIPARTITE|   812A70ED0|    2|
|BIPARTITE|   800F1D640|    2|
|BIPARTITE|   812D0C3C0|    2|
+---------+------------+-----+



In [216]:
# Receivers that appear in more than one BIPARTITE transaction.
(
    bipartite_df
    .select('type', 'to_account')
    .groupBy('type', 'to_account')
    .count()
    .filter('count>1')
    .show()
)

+---------+----------+-----+
|     type|to_account|count|
+---------+----------+-----+
|BIPARTITE| 812A09D40|    2|
|BIPARTITE| 812A09CF0|    3|
|BIPARTITE| 80266F880|    2|
|BIPARTITE| 811FCA7B0|    3|
|BIPARTITE| 8021353D0|    4|
|BIPARTITE| 811ED7DF0|    2|
|BIPARTITE| 811A65E30|    2|
|BIPARTITE| 812A70E80|    2|
|BIPARTITE| 812D0C600|    2|
|BIPARTITE| 80FF51600|    2|
+---------+----------+-----+



The bipartite graph shows that bipartite laundering occurs when an account splits money into multiple destinations that in turn receive money from other sources.
The tables above show the accounts that send money more than once and the accounts that receive money more than once.

In conclusion, the bipartite laundering set does not reflect the structure shown in the image of the graph.

# STACK

In [199]:
# All transactions labelled as part of a STACK laundering pattern.
stack = ds.loc[ds["type"] == 'STACK']
# Bare last expression so that re-running the cell actually renders the table
# shown as its output (an assignment alone displays nothing).
stack

Unnamed: 0,id,type,timestamp,from_bank,from_account,to_bank,to_account,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering,category
1345,0,STACK,2022/09/02 12:36,223,8000DD890,40312,80F52D550,14153.46,Swiss Franc,14153.46,Swiss Franc,ACH,1,
1346,0,STACK,2022/09/03 19:52,40312,80F52D550,23370,80BEA98A0,11975.53,UK Pound,11975.53,UK Pound,ACH,1,
1347,0,STACK,2022/09/01 10:19,18617,8038D3520,24482,801C0F2B0,13712.96,Euro,13712.96,Euro,ACH,1,
1348,0,STACK,2022/09/03 05:06,24482,801C0F2B0,214,80C1B9E20,333126.09,Mexican Peso,333126.09,Mexican Peso,ACH,1,
1349,0,STACK,2022/09/01 09:22,40836,80F6B88B0,16606,8064545E0,11800.69,US Dollar,11800.69,US Dollar,ACH,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1806,41,STACK,2022/09/11 11:17,2991,8025D1EB0,27444,803C925B0,13658.31,US Dollar,13658.31,US Dollar,ACH,1,
1807,41,STACK,2022/09/12 11:02,21142,809F26890,1729,8083CF250,392562.58,US Dollar,392562.58,US Dollar,ACH,1,
1808,41,STACK,2022/09/12 17:43,1729,8083CF250,20846,809406BF0,335012.90,Euro,335012.90,Euro,ACH,1,
1809,42,STACK,2022/09/12 15:41,23691,8021353D0,15231,80266F880,4409.63,Euro,4409.63,Euro,ACH,1,


In [201]:
# STACK transactions sent by account 80F52D550.
stack.loc[stack["from_account"] == "80F52D550"]

Unnamed: 0,id,type,timestamp,from_bank,from_account,to_bank,to_account,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering,category
1346,0,STACK,2022/09/03 19:52,40312,80F52D550,23370,80BEA98A0,11975.53,UK Pound,11975.53,UK Pound,ACH,1,


Stack laundering has a similar problem to bipartite

# CYCLE
Cycle is a pattern in which an account starts a transaction and, after a specific number of hops, the initial source account becomes the recipient.

In [4]:
# All transactions labelled as part of a CYCLE laundering pattern.
cycle = ds.loc[ds["type"] == "CYCLE"]

In [7]:
# First cycle attempt (id 0).
cycle.loc[cycle["id"] == 0]

Unnamed: 0,id,type,timestamp,from_bank,from_account,to_bank,to_account,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering,category
342,0,CYCLE,2022/09/01 00:03,1467,8013C4030,20,80BC62F10,58702.1,Yuan,58702.1,Yuan,ACH,1,Max 10 hops
343,0,CYCLE,2022/09/01 02:52,20,80BC62F10,240229,80F025640,7332.87,Swiss Franc,7332.87,Swiss Franc,ACH,1,Max 10 hops
344,0,CYCLE,2022/09/02 08:44,240229,80F025640,217,80FD27570,26443.7,Shekel,26443.7,Shekel,ACH,1,Max 10 hops
345,0,CYCLE,2022/09/02 12:29,217,80FD27570,24856,8090E8EB0,10621.24,Canadian Dollar,10621.24,Canadian Dollar,ACH,1,Max 10 hops
346,0,CYCLE,2022/09/03 10:20,24856,8090E8EB0,71,804ABCE90,637140.6,Rupee,637140.6,Rupee,ACH,1,Max 10 hops
347,0,CYCLE,2022/09/03 12:08,71,804ABCE90,213737,805494C30,621578.18,Rupee,621578.18,Rupee,ACH,1,Max 10 hops
348,0,CYCLE,2022/09/03 13:24,213737,805494C30,14290,801B949C0,7222.58,Euro,7222.58,Euro,ACH,1,Max 10 hops
349,0,CYCLE,2022/09/04 03:24,14290,801B949C0,10057,803DE1580,892031.21,Yen,892031.21,Yen,ACH,1,Max 10 hops
350,0,CYCLE,2022/09/04 09:44,10057,803DE1580,28628,80ACEE280,11364.12,Australian Dollar,11364.12,Australian Dollar,ACH,1,Max 10 hops
351,0,CYCLE,2022/09/04 15:51,28628,80ACEE280,1467,8013C4030,7945.55,US Dollar,7945.55,US Dollar,ACH,1,Max 10 hops


That's an example of a 10 hops cycle:
- the account 8013C4030 starts the transaction
- after 10 hops it receives money
- for each transaction the timestamp is later than that of the previous one 

In [8]:
# Cycle transactions that were NOT paid through ACH.
cycle.loc[cycle["payment_format"].ne("ACH")]

Unnamed: 0,id,type,timestamp,from_bank,from_account,to_bank,to_account,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering,category


In [13]:
# Sanity check on cycle transactions: list every row that is either a
# self-payment or converts between currencies. Both conditions are combined into
# one mask because a notebook cell only renders its last expression; written as
# two separate statements, the result of the first check would be silently
# discarded. An empty table means neither situation occurs.
cycle[
    (cycle["from_account"] == cycle["to_account"])
    | (cycle["payment_currency"] != cycle["receiving_currency"])
]

Unnamed: 0,id,type,timestamp,from_bank,from_account,to_bank,to_account,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering,category


In [23]:
# Distinct category labels found among the cycle transactions.
categories = cycle.loc[:, ['category']].drop_duplicates()
set(categories.category)

{'Max 10 hops',
 'Max 11 hops',
 'Max 12 hops',
 'Max 2 hops',
 'Max 3 hops',
 'Max 4 hops',
 'Max 5 hops',
 'Max 6 hops',
 'Max 7 hops',
 'Max 8 hops'}

In [6]:
# Cycle transactions sent by account 811DCA9B0.
cycle.loc[cycle["from_account"] == "811DCA9B0"]

Unnamed: 0,id,type,timestamp,from_bank,from_account,to_bank,to_account,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering,category
590,47,CYCLE,2022/09/12 17:52,119,811DCA9B0,222,811D80C30,19428.85,Saudi Riyal,19428.85,Saudi Riyal,ACH,1,Max 7 hops


In [5]:
# Cycle transactions whose category is "Max 10 hops".
cycle.loc[cycle["category"] == "Max 10 hops"]

Unnamed: 0,id,type,timestamp,from_bank,from_account,to_bank,to_account,amount_received,receiving_currency,amount_paid,payment_currency,payment_format,is_laundering,category
342,0,CYCLE,2022/09/01 00:03,1467,8013C4030,20,80BC62F10,58702.1,Yuan,58702.1,Yuan,ACH,1,Max 10 hops
343,0,CYCLE,2022/09/01 02:52,20,80BC62F10,240229,80F025640,7332.87,Swiss Franc,7332.87,Swiss Franc,ACH,1,Max 10 hops
344,0,CYCLE,2022/09/02 08:44,240229,80F025640,217,80FD27570,26443.7,Shekel,26443.7,Shekel,ACH,1,Max 10 hops
345,0,CYCLE,2022/09/02 12:29,217,80FD27570,24856,8090E8EB0,10621.24,Canadian Dollar,10621.24,Canadian Dollar,ACH,1,Max 10 hops
346,0,CYCLE,2022/09/03 10:20,24856,8090E8EB0,71,804ABCE90,637140.6,Rupee,637140.6,Rupee,ACH,1,Max 10 hops
347,0,CYCLE,2022/09/03 12:08,71,804ABCE90,213737,805494C30,621578.18,Rupee,621578.18,Rupee,ACH,1,Max 10 hops
348,0,CYCLE,2022/09/03 13:24,213737,805494C30,14290,801B949C0,7222.58,Euro,7222.58,Euro,ACH,1,Max 10 hops
349,0,CYCLE,2022/09/04 03:24,14290,801B949C0,10057,803DE1580,892031.21,Yen,892031.21,Yen,ACH,1,Max 10 hops
350,0,CYCLE,2022/09/04 09:44,10057,803DE1580,28628,80ACEE280,11364.12,Australian Dollar,11364.12,Australian Dollar,ACH,1,Max 10 hops
351,0,CYCLE,2022/09/04 15:51,28628,80ACEE280,1467,8013C4030,7945.55,US Dollar,7945.55,US Dollar,ACH,1,Max 10 hops
