In [1]:
from imports import *

# Stop any session left over from a previous run of this cell: getOrCreate()
# returns an already-running session as-is, so the configuration below would
# otherwise not be applied.
if 'spark' in vars():
  spark.stop()

# Count available cores
cores = multiprocessing.cpu_count()
# One executor per available core, so each executor gets cores // instances == 1 core
instances = cores

# "spark.sql.execution.arrow.enabled" is the legacy (pre-Spark 3.0) name of
# "spark.sql.execution.arrow.pyspark.enabled"; both are set so Arrow-based
# conversion is enabled regardless of the Spark version in use.
spark = SparkSession.builder \
          .appName("MoneyLaundering") \
          .config("spark.driver.memory", "3g") \
          .config("spark.executor.memory", "4g") \
          .config("spark.executor.instances", instances) \
          .config("spark.executor.cores", cores//instances) \
          .config("spark.sql.shuffle.partitions", cores) \
          .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
          .config("spark.sql.execution.arrow.enabled", "true") \
          .getOrCreate()

# Silence Spark's own logging in the notebook output
spark.sparkContext.setLogLevel("OFF")

# Parquet files carry their own schema, so no CSV-style `header` option is needed
dataframe = spark.read.parquet("src/datasets/my_HI-Small_Trans.parquet")
# Attach a unique row id. The generated ids are increasing and unique but NOT
# consecutive (the value depends on the partition the row lives in).
dataframe = dataframe.withColumn('id', monotonically_increasing_id())

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/03 08:08:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

# Feature computing

## Compute features of the whole dataframe

In [2]:
# Wrap the loaded transactions in a FeatureManager and compute the
# dataframe-level features (FeatureManager is defined outside this notebook).
manager = FeatureManager(dataframe)
manager.compute_features_of_whole_df()
# Optional split by label, kept for reference (currently disabled):
# laundering = manager.dataframe.filter('is_laundering==1')
# non_laundering = manager.dataframe.filter('is_laundering==0')

# Keep a handle on the manager's ACH mapping for later use.
# NOTE(review): the structure of `ach_mapping` is not visible here — confirm in FeatureManager.
ach_mapping = manager.ach_mapping

                                                                                

In [3]:
# List the columns of the manager's dataframe after feature computation
manager.dataframe.columns

['id',
 'timestamp',
 'from_account',
 'to_account',
 'same_account',
 'from_bank',
 'to_bank',
 'same_bank',
 'amount_received',
 'amount_paid',
 'same_amounts',
 'receiving_currency',
 'payment_currency',
 'same_currency',
 'payment_format',
 'is_laundering']

# Intro: Compute features of the graph
The next step is to understand the structure of the different laundering patterns so that further features can be identified.


<img src="src/images/patterns.png" style="width: 600px">


To do that, I decided to process the dataset with GraphFrames, a package for Apache Spark that provides DataFrame-based graphs.

Looking at the image above, it would be beneficial to compute certain features for each node in the graph to gain valuable insights into the transactions:
1. **Compute the number of in-out edges (fan-in, fan-out)** <br>
    A transaction involves an exchange between two accounts, and it would be valuable to calculate the connection degrees for each account:
    * In-out degrees for the sender account
    * In-out degrees for the receiver account
    <br><br>
2. **Identify intermediary transactions (scatter-gather)** <br>
    By analyzing the flow of transactions, we can identify intermediary transactions, i.e. transactions that facilitate the movement of funds between multiple accounts.
    <br><br>
3. **Detect forwarding transactions** <br>
    An account receives a sum of money and then forwards it to another account
    <br><br>
4. **Check for intermediate transactions between two transactions** <br>
    We can check if certain transactions act as intermediaries between two other transactions
    <br><br>

____

# Computing

In [6]:
# Build the transaction graph from the raw dataframe and run the graph
# feature passes on it. MyGraph is defined outside this notebook.
# NOTE(review): judging by the method names these correspond to the features
# listed in the intro (forwarding, same/similar transactions, fan-in/fan-out,
# cycles) — confirm against the MyGraph implementation.
my_graph = MyGraph(dataframe)
my_graph.get_forwards()
my_graph.same_or_similar()
my_graph.compute_fan()
# 8 is the cycle-degree bound passed to find_cycles.
# NOTE(review): the recorded output for this cell reports degrees 2..9 —
# confirm whether the bound is off by one or the stored output is stale.
my_graph.find_cycles(8)



adding cycles of degree 2...
adding cycles of degree 3...
adding cycles of degree 4...
adding cycles of degree 5...
adding cycles of degree 6...
adding cycles of degree 7...
adding cycles of degree 8...
adding cycles of degree 9...


In [7]:
# NOTE(review): presumably joins the computed graph features back onto the
# transaction ids — confirm in MyGraph.join_ids.
my_graph.join_ids()

In [16]:
# Rebuild the graph from the raw dataframe and search for cycles with a
# larger bound (12). This rebinds `my_graph`, so anything computed on the
# previously created instance is no longer reachable through this name.
my_graph = MyGraph(dataframe)
my_graph.find_cycles(12)
# Preview the cycle table (show() prints the first 20 rows by default)
my_graph.cycles.show()

adding cycles of degree 2...
adding cycles of degree 3...
adding cycles of degree 4...
adding cycles of degree 5...
adding cycles of degree 6...
adding cycles of degree 7...
adding cycles of degree 8...
adding cycles of degree 9...
adding cycles of degree 10...
adding cycles of degree 11...
adding cycles of degree 12...


[Stage 4317:>                                                       (0 + 1) / 1]

+-----+---------+---------+--------+
|   id|min_cycle|max_cycle|involved|
+-----+---------+---------+--------+
|22937|        2|        2|       1|
|33710|        2|        2|       1|
|40064|        2|        4|       1|
|40065|        2|        4|       1|
|41892|        2|        2|       1|
|41955|        2|        2|       1|
|42963|        2|        2|       1|
|45546|        2|        2|       1|
|46498|        2|        3|       1|
|47454|     null|     null|       1|
|47455|     null|     null|       1|
|47456|     null|     null|       1|
|48067|        2|        2|       1|
|49417|        2|        2|       1|
|49430|        2|        2|       1|
|51703|        2|        2|       1|
|51863|        2|        2|       1|
|51874|        2|        2|       1|
|53161|        2|        2|       1|
|53650|        2|        2|       1|
+-----+---------+---------+--------+
only showing top 20 rows



                                                                                

In [25]:
# Show only the rows whose min_cycle value is greater than 5
my_graph.cycles.filter('min_cycle>5').show()

                                                                                

+-----------+---------+---------+--------+
|         id|min_cycle|max_cycle|involved|
+-----------+---------+---------+--------+
| 8590127893|        6|        6|       1|
|     399300|        7|        7|       1|
|17180380682|        8|        8|       1|
|17180336093|        8|        8|       1|
|51540230598|        8|        8|       1|
|42950030184|        8|        8|       1|
|60129581425|       10|       10|       1|
|34360334979|       10|       10|       1|
|51539950677|       10|       10|       1|
|34359785209|       10|       10|       1|
|42949745997|       10|       10|       1|
+-----------+---------+---------+--------+



In [None]:
# Example of a "similar" transaction:
# dataframe.filter('from_bank==10 and from_account=="800043990" and to_bank==10 and to_account=="800043990" and receiving_currency=="US Dollar"').show()
# Example of a "same" transaction:
# dataframe.filter('timestamp=="2022-09-01 00:07:00" and from_bank==1601 and from_account=="8005D0700"').show()

## Fan-in and Fan-out

In [None]:
# Look up a single transaction by its generated id
dataframe.filter('id==17478').show()
# ACH transactions sent by account 800737690 to other accounts (self-transfers
# and account 80020C5B0 excluded) whose date is within 4 days of 2022-09-01 04:33:00
dataframe.filter('from_account=="800737690" and to_account!="800737690" and abs(datediff("2022-09-01 04:33:00",timestamp)) <= 4 and payment_format=="ACH" and to_account!="80020C5B0"').show()

In [None]:
# Cheque transactions received in Euro by account 812BD4500 whose date is
# within 4 days of 2022-09-01 00:45:00
dataframe.filter('to_account=="812BD4500" and abs(datediff("2022-09-01 00:45:00",timestamp))<=4 and\
                  receiving_currency=="Euro" and payment_format=="Cheque"').show()

In [None]:
# Cheque transactions received in US Dollar by account 8001275E0 whose date is
# within 4 days of 2022-09-05 11:00:00
dataframe.filter('to_account=="8001275E0" and abs(datediff("2022-09-05 11:00:00",timestamp))<=4 and\
                  receiving_currency=="US Dollar" and payment_format=="Cheque"').show()