### Visual analysis of transaction logs for fraud detection.
**The NetworkX and pyvis `Network` modules are used to analyse the data graphically.**

In [2]:
#import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from pyvis.network import Network
# Default matplotlib figure size, given as (width, height) in inches.
# NOTE(review): no matplotlib figure is drawn in the cells visible here — confirm this is still needed.
plt.rcParams["figure.figsize"] = (20,10)

In [3]:
# Read the pre-processed transaction log into a DataFrame and preview it.
df = pd.read_csv(filepath_or_buffer='processedData1.csv')
df.head(5)

Unnamed: 0,gT,sId,rId,sAcc,rAcc,TranAmount,TranType,TranStatus,sBalbefore,sBalAfter,rBalBefore,rBalAfter,sf1,sf2,sTD,rTD,TranTS,sType,rType
0,N-RegDep,PN_Ret2,PN_EU_0_261,RAcc2,EUAcc0_261,131926.49,Dt,SU,1000000000.0,999868100.0,100131900.0,100000000.0,True,True,1/6/2011 0:11:22,1/6/2011 0:11:22,1/6/2011 0:11:22,RET,EU
1,N_Reg_RC,PN_EU_1_502,operator,EUAcc1_502,A0,2054.46,ArRC,SU,100000000.0,99997950.0,99180040.0,99177980.0,True,True,1/6/2011 0:16:8,1/6/2011 0:16:8,1/6/2011 0:16:8,EU,operator
2,N-RegDep,PN_Ret2,PN_EU_1_362,RAcc2,EUAcc1_362,182548.74,Dt,SU,999868100.0,999685500.0,100182500.0,100000000.0,True,True,1/6/2011 0:36:47,1/6/2011 0:36:47,1/6/2011 0:36:47,RET,EU
3,N_Reg_RC,PN_EU_1_26,operator,EUAcc1_26,A0,5507.28,ArRC,SU,100000000.0,99994490.0,99185540.0,99180040.0,True,True,1/6/2011 0:36:59,1/6/2011 0:36:59,1/6/2011 0:36:59,EU,operator
4,N_Reg_RC,PN_EU_1_18,operator,EUAcc1_18,A0,2102.6,ArRC,SU,100000000.0,99997900.0,99187650.0,99185540.0,True,True,1/6/2011 1:1:26,1/6/2011 1:1:26,1/6/2011 1:1:26,EU,operator


#### Step 1: Visualizing the graph network for all transactions
Make separate lists holding the senders' account IDs, the receivers' account IDs, and the transaction amounts.

In [14]:
# Extract the three columns used for graph building as plain Python lists
# (one entry per transaction row).
senders = df['sAcc'].tolist()
receivers = df['rAcc'].tolist()
Amount = df['TranAmount'].tolist()

l = len(senders)             # number of rows, i.e. total transactions
un = df['sAcc'].nunique()    # distinct sender accounts
un2 = df['rAcc'].nunique()   # distinct receiver accounts

print('No of total senders:', l)
print('No of unique senders:', un)
print('No if unique receivers:', un2)

No of total senders: 54222
No of unique senders: 1864
No if unique receivers: 1562


#### Now, using the Network library, create a graph and visualize the graph network representing the transactions.

In [15]:
# Build a directed graph from a small sample of transactions and render it
# with pyvis for a first visual inspection.
net=Network('1500px', '1500px')
G=nx.DiGraph()

# Only the first 1000 transactions are drawn, to keep the visualization light.
# Slicing caps the sample at 1000 rows without raising IndexError when the
# dataset holds fewer rows than that (indexing range(1000) would).
for src, dst, amt in zip(senders[:1000], receivers[:1000], Amount[:1000]):
    # Sender and receiver accounts are the nodes; the transaction amount is
    # stored as the edge's 'weight' attribute. A DiGraph keeps a single edge
    # per (sender, receiver) pair, so a repeated pair overwrites the weight.
    G.add_edge(src, dst, weight=amt)
net.from_nx(G)
net.show('transactionsViz.html')

#### Step 2: Segregate the possible frauds

In [18]:
# Second graph, G2: undirected this time, and built from every transaction
# rather than a sample. Each (sender, receiver, amount) triple becomes an edge
# whose 'weight' attribute is the amount.
net2 = Network()
G2 = nx.Graph()
G2.add_weighted_edges_from(zip(senders, receivers, Amount))
net2.from_nx(G2)

#### Get the nodes whose degree is high: if the number of transactions is high, the account can be a possible fraud.

In [27]:
nds = list(G2.nodes())  # every node of G2, in the graph's iteration order
print(type(nds))
degrees = [deg for _node, deg in G2.degree()]  # degree of each node, same order as nds
l2 = len(degrees)

# Keep the nodes whose degree is at least 10.
# G2 is a simple nx.Graph, so repeated transactions between the same two
# accounts share one edge; the degree therefore reflects the number of
# distinct counterparties rather than a raw transaction count.
repeated_nodes = [node for node, deg in zip(nds, degrees) if deg >= 10]
print(len(repeated_nodes))

<class 'list'>
206


#### Select only those rows whose sender appears in the repeated nodes and make a separate dataframe.

In [28]:
# Collect the row labels of every transaction whose sender is NOT one of the
# high-degree accounts in repeated_nodes, so those rows can be dropped later.
sAccount=df['sAcc']  # All sender accounts, one per transaction
numSamples=len(sAccount)

# Vectorised membership test: avoids a Python-level loop with a list lookup per
# row, and takes the labels straight from df.index instead of assuming that
# positions 0..n-1 are valid row labels.
is_repeated_sender=sAccount.isin(repeated_nodes)
drop_indices=df.index[~is_repeated_sender].tolist()
count=len(drop_indices)

# numSamples-count is the number of transactions (rows) whose sender is in
# repeated_nodes, i.e. the rows that will be kept.
print('No of accounts out of fradulent activities:',numSamples-count)

No of accounts out of fradulent activities: 20014


In [31]:
# Remove the rows flagged in drop_indices so that only transactions sent by a
# high-degree account remain. reset_index() is called without drop=True, so
# the previous row labels are preserved in a new 'index' column.
df2 = df.drop(index=drop_indices).reset_index()
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20014 entries, 0 to 20013
Data columns (total 20 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   index       20014 non-null  int64  
 1   gT          20014 non-null  object 
 2   sId         20014 non-null  object 
 3   rId         20014 non-null  object 
 4   sAcc        20014 non-null  object 
 5   rAcc        20014 non-null  object 
 6   TranAmount  20014 non-null  float64
 7   TranType    20014 non-null  object 
 8   TranStatus  20014 non-null  object 
 9   sBalbefore  20014 non-null  float64
 10  sBalAfter   20014 non-null  float64
 11  rBalBefore  20014 non-null  float64
 12  rBalAfter   20014 non-null  float64
 13  sf1         20014 non-null  bool   
 14  sf2         20014 non-null  bool   
 15  sTD         20014 non-null  object 
 16  rTD         20014 non-null  object 
 17  TranTS      20014 non-null  object 
 18  sType       20014 non-null  object 
 19  rType       20014 non-nul

#### Step 3: Segregate the senders/receivers according to the criteria set for fraudulent transactions

In [32]:
# Filter by transaction type: rows whose TranType is 'ArRC' or 'Dt' are
# removed; every other type is kept.
tType = df2['TranType']

# Row labels of the 'ArRC' / 'Dt' transactions (df2 carries a fresh 0..n-1 index).
dind2 = tType.index[tType.isin(['ArRC', 'Dt'])].tolist()

# reset_index() without drop=True keeps df2's labels as an extra column.
df3 = df2.drop(index=dind2).reset_index()
df3

Unnamed: 0,level_0,index,gT,sId,rId,sAcc,rAcc,TranAmount,TranType,TranStatus,...,sBalAfter,rBalBefore,rBalAfter,sf1,sf2,sTD,rTD,TranTS,sType,rType
0,3,7,N-RegC2C,PN_EU_2_5,PN_EU_1_121,EUAcc2_5,EUAcc1_121,66197.31,Ind,SU,...,9.993314e+07,1.000662e+08,1.000000e+08,True,True,1/6/2011 1:17:5,1/6/2011 1:17:5,1/6/2011 1:17:5,EU,EU
1,6,11,N-RegC2C,PN_EU_1_91,PN_EU_0_213,EUAcc1_91,EUAcc0_213,157137.75,Ind,SU,...,9.984129e+07,1.001571e+08,1.000000e+08,True,True,1/6/2011 2:1:41,1/6/2011 2:1:41,1/6/2011 2:1:41,EU,EU
2,15,37,N-RegC2C,PN_EU_3_10,PN_EU_0_1180,EUAcc3_10,EUAcc0_1180,48162.83,Ind,SU,...,9.995136e+07,1.000482e+08,1.000000e+08,True,True,1/6/2011 5:18:19,1/6/2011 5:18:19,1/6/2011 5:18:19,EU,EU
3,16,41,N-RegC2C,PN_EU_1_138,PN_EU_0_500,EUAcc1_138,EUAcc0_500,7203.14,Ind,SU,...,9.999272e+07,1.000034e+08,9.999623e+07,True,True,1/6/2011 5:56:6,1/6/2011 5:56:6,1/6/2011 5:56:6,EU,EU
4,25,69,N-RegC2C,PN_EU_2_26,PN_EU_0_883,EUAcc2_26,EUAcc0_883,203128.87,Ind,SU,...,9.979484e+07,1.002031e+08,1.000000e+08,True,True,1/6/2011 9:43:8,1/6/2011 9:43:8,1/6/2011 9:43:8,EU,EU
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5715,19994,54153,N-RegC2C,PN_EU_1_371,PN_EU_2_126,EUAcc1_371,EUAcc2_126,3369.15,Ind,SU,...,1.022480e+08,1.029042e+08,1.029008e+08,True,True,30/9/2011 20:16:29,30/9/2011 20:16:29,30/9/2011 20:16:29,EU,EU
5716,20003,54188,N-RegC2C,PN_EU_1_382,PN_EU_0_405,EUAcc1_382,EUAcc0_405,323.91,Ind,SU,...,9.901628e+07,1.002065e+08,1.002062e+08,True,True,30/9/2011 22:11:43,30/9/2011 22:11:43,30/9/2011 22:11:43,EU,EU
5717,20006,54204,N_RegWith,PN_EU_3_33,PN_Ret3,EUAcc3_33,RAcc3,231559.46,Wl,SU,...,1.000289e+08,8.619040e+08,8.616712e+08,True,True,30/9/2011 22:54:3,30/9/2011 22:54:3,30/9/2011 22:54:3,EU,RET
5718,20007,54205,N-RegC2C,PN_EU_1_368,PN_EU_3_3,EUAcc1_368,EUAcc3_3,226206.39,Ind,SU,...,9.691174e+07,9.849312e+07,9.826692e+07,True,True,30/9/2011 22:57:3,30/9/2011 22:57:3,30/9/2011 22:57:3,EU,EU


In [37]:
# Keep only transactions below the average amount: compute the mean of
# TranAmount over df3 and record the labels of the rows at or above it.
tAmt = df3['TranAmount']
l3 = len(tAmt)
avg = np.mean(tAmt)
print('Average:',avg)

# Labels of the rows to remove (amount >= average).
dind3 = tAmt.index[tAmt >= avg].tolist()
len(dind3)

Average: 77041.1407290211


2106

#### Drop all other samples that do not meet any of the criteria for fraudulent transactions

In [38]:
# Remove the at-or-above-average rows and renumber the remainder from 0,
# discarding the old labels (drop=True).
df4 = df3.drop(index=dind3).reset_index(drop=True)
df4

Unnamed: 0,level_0,index,gT,sId,rId,sAcc,rAcc,TranAmount,TranType,TranStatus,...,sBalAfter,rBalBefore,rBalAfter,sf1,sf2,sTD,rTD,TranTS,sType,rType
0,3,7,N-RegC2C,PN_EU_2_5,PN_EU_1_121,EUAcc2_5,EUAcc1_121,66197.31,Ind,SU,...,9.993314e+07,1.000662e+08,1.000000e+08,True,True,1/6/2011 1:17:5,1/6/2011 1:17:5,1/6/2011 1:17:5,EU,EU
1,15,37,N-RegC2C,PN_EU_3_10,PN_EU_0_1180,EUAcc3_10,EUAcc0_1180,48162.83,Ind,SU,...,9.995136e+07,1.000482e+08,1.000000e+08,True,True,1/6/2011 5:18:19,1/6/2011 5:18:19,1/6/2011 5:18:19,EU,EU
2,16,41,N-RegC2C,PN_EU_1_138,PN_EU_0_500,EUAcc1_138,EUAcc0_500,7203.14,Ind,SU,...,9.999272e+07,1.000034e+08,9.999623e+07,True,True,1/6/2011 5:56:6,1/6/2011 5:56:6,1/6/2011 5:56:6,EU,EU
3,28,77,N-RegC2C,PN_EU_1_112,PN_EU_0_397,EUAcc1_112,EUAcc0_397,66492.79,Ind,SU,...,9.993284e+07,1.000665e+08,1.000000e+08,True,True,1/6/2011 11:5:51,1/6/2011 11:5:51,1/6/2011 11:5:51,EU,EU
4,31,80,N-RegC2C,PN_EU_2_105,PN_EU_1_494,EUAcc2_105,EUAcc1_494,47020.50,Ind,SU,...,9.995251e+07,1.000470e+08,1.000000e+08,True,True,1/6/2011 11:19:0,1/6/2011 11:19:0,1/6/2011 11:19:0,EU,EU
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3609,19983,54121,N-RegC2C,PN_EU_1_144,PN_EU_0_504,EUAcc1_144,EUAcc0_504,66531.27,Ind,SU,...,9.826441e+07,1.010993e+08,1.010328e+08,True,True,30/9/2011 18:15:27,30/9/2011 18:15:27,30/9/2011 18:15:27,EU,EU
3610,19988,54135,N-RegC2C,PN_EU_1_48,PN_EU_0_1119,EUAcc1_48,EUAcc0_1119,35068.36,Ind,SU,...,8.780167e+07,1.000998e+08,1.000647e+08,True,True,30/9/2011 19:1:48,30/9/2011 19:1:48,30/9/2011 19:1:48,EU,EU
3611,19994,54153,N-RegC2C,PN_EU_1_371,PN_EU_2_126,EUAcc1_371,EUAcc2_126,3369.15,Ind,SU,...,1.022480e+08,1.029042e+08,1.029008e+08,True,True,30/9/2011 20:16:29,30/9/2011 20:16:29,30/9/2011 20:16:29,EU,EU
3612,20003,54188,N-RegC2C,PN_EU_1_382,PN_EU_0_405,EUAcc1_382,EUAcc0_405,323.91,Ind,SU,...,9.901628e+07,1.002065e+08,1.002062e+08,True,True,30/9/2011 22:11:43,30/9/2011 22:11:43,30/9/2011 22:11:43,EU,EU


#### df4 is the dataset created by dropping the samples that do not meet any of the criteria for being involved in a fraud transaction. So the samples in df4 are possible fraud transactions.

In [41]:
# Row count of df4, i.e. how many transactions remain after all filters.
print('Number of possible transaction', df4.shape[0])

Number of possible transaction 3614


#### Step 4: Visualize all the transactions along with fraud transactions

In [40]:
# Visualize a sample of all transactions together with a sample of the
# possible-fraud transactions (drawn in red).

# Columns of the possible-fraud frame as plain lists.
fse=list(df4['sAcc'])
fre=list(df4['rAcc'])
Amnt=list(df4['TranAmount'])

# Define a graph to visualise the fraud transactions.
# NOTE: this rebinds net2 and G2, replacing the undirected full-transaction
# graph built in Step 2; re-running the Step 2 degree cell after this one
# would operate on this sampled DiGraph instead.
net2=Network('1800px', '1000px')
G2=nx.DiGraph()

# Add normal transactions (at most the first 1000). Slicing caps the sample
# without raising IndexError when fewer rows are available.
for src, dst, amt in zip(senders[:1000], receivers[:1000], Amount[:1000]):
    G2.add_edge(src, dst, weight=amt)

# Add possible-fraud transactions (at most the first 200), coloured red.
# If the same (sender, receiver) edge already exists, add_edge updates its
# attributes, so the edge takes this weight and the red colour.
for src, dst, amt in zip(fse[:200], fre[:200], Amnt[:200]):
    G2.add_edge(src, dst, weight=amt, color='red')

net2.from_nx(G2)
net2.show('AnalysedGraph.html')