# Fraud detection

In [1]:
#importing necessary librarires
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report
import seaborn as sns

In [2]:
#loading the dataset
url= "E:/Python projects/Insaid/task 2/Fraud.csv"
df= pd.read_csv(url)
df.head() #first 5 rows

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [3]:
#last 5 rows
df.tail()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.0,C776919290,0.0,339682.13,1,0
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.0,C1881841831,0.0,0.0,1,0
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.0,C1365125890,68488.84,6379898.11,1,0
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.0,C2080388513,0.0,0.0,1,0
6362619,743,CASH_OUT,850002.52,C1280323807,850002.52,0.0,C873221189,6510099.11,7360101.63,1,0


In [4]:
#checking for null values or missing values
df.isnull().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [5]:
#checking for outliers
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
step,6362620.0,243.3972,142.332,1.0,156.0,239.0,335.0,743.0
amount,6362620.0,179861.9,603858.2,0.0,13389.57,74871.94,208721.5,92445520.0
oldbalanceOrg,6362620.0,833883.1,2888243.0,0.0,0.0,14208.0,107315.2,59585040.0
newbalanceOrig,6362620.0,855113.7,2924049.0,0.0,0.0,0.0,144258.4,49585040.0
oldbalanceDest,6362620.0,1100702.0,3399180.0,0.0,0.0,132705.665,943036.7,356015900.0
newbalanceDest,6362620.0,1224996.0,3674129.0,0.0,0.0,214661.44,1111909.0,356179300.0
isFraud,6362620.0,0.00129082,0.0359048,0.0,0.0,0.0,0.0,1.0
isFlaggedFraud,6362620.0,2.514687e-06,0.001585775,0.0,0.0,0.0,0.0,1.0


In [6]:
#info of dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [7]:
#distribution of legit transactions and fraud transactions
df['isFraud'].value_counts()

0    6354407
1       8213
Name: isFraud, dtype: int64

In [8]:
df['isFlaggedFraud'].value_counts()

0    6362604
1         16
Name: isFlaggedFraud, dtype: int64

In [9]:
#removing the unnecessary columns
df.drop(['nameOrig','nameDest','step','type'],axis=1,inplace=True)
df.head()

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,9839.64,170136.0,160296.36,0.0,0.0,0,0
1,1864.28,21249.0,19384.72,0.0,0.0,0,0
2,181.0,181.0,0.0,0.0,0.0,1,0
3,181.0,181.0,0.0,21182.0,0.0,1,0
4,11668.14,41554.0,29885.86,0.0,0.0,0,0


In [10]:
#checking for multicollinearity
df.corr()

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
amount,1.0,-0.002762,-0.007861,0.294137,0.459304,0.076688,0.012295
oldbalanceOrg,-0.002762,1.0,0.998803,0.066243,0.042029,0.010154,0.003835
newbalanceOrig,-0.007861,0.998803,1.0,0.067812,0.041837,-0.008148,0.003776
oldbalanceDest,0.294137,0.066243,0.067812,1.0,0.976569,-0.005885,-0.000513
newbalanceDest,0.459304,0.042029,0.041837,0.976569,1.0,0.000535,-0.000529
isFraud,0.076688,0.010154,-0.008148,-0.005885,0.000535,1.0,0.044109
isFlaggedFraud,0.012295,0.003835,0.003776,-0.000513,-0.000529,0.044109,1.0


### This dataset is highly unbalanced

In [11]:
#separating the data for analysis
legit= df[df.isFraud==0]
fraud= df[df.isFraud==1]

In [12]:
print(legit.shape)
fraud.shape

(6354407, 7)


(8213, 7)

In [13]:
legit.amount.describe()

count    6.354407e+06
mean     1.781970e+05
std      5.962370e+05
min      1.000000e-02
25%      1.336840e+04
50%      7.468472e+04
75%      2.083648e+05
max      9.244552e+07
Name: amount, dtype: float64

In [14]:
fraud.amount.describe()

count    8.213000e+03
mean     1.467967e+06
std      2.404253e+06
min      0.000000e+00
25%      1.270913e+05
50%      4.414234e+05
75%      1.517771e+06
max      1.000000e+07
Name: amount, dtype: float64

In [15]:
#comparing the values for voth transaction
df.groupby('isFraud').mean()

Unnamed: 0_level_0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFlaggedFraud
isFraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,178197.0,832828.7,855970.228109,1101421.0,1224926.0,0.0
1,1467967.0,1649668.0,192392.631836,544249.6,1279708.0,0.001948


# Under sampling
Building a dataset contaning similar distribution  of normal and Fraudalent transaction

Number of fraudalent transaction-- 8213

In [16]:
legit_sample= legit.sample(n=8213)

### Concatenating 2 dataframes

In [17]:
new_dataset= pd.concat([legit_sample,fraud],axis=0)

In [18]:
new_dataset.head()

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
5255254,24238.26,143.0,0.0,0.0,0.0,0,0
2919936,10746.73,41302.0,30555.27,0.0,0.0,0,0
1379746,28082.45,8509.0,0.0,0.0,0.0,0,0
1295922,2954.57,0.0,0.0,0.0,0.0,0,0
1756491,30287.66,10953286.95,10983574.61,62798.89,32511.22,0,0


In [19]:
new_dataset.tail()

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
6362615,339682.13,339682.13,0.0,0.0,339682.13,1,0
6362616,6311409.28,6311409.28,0.0,0.0,0.0,1,0
6362617,6311409.28,6311409.28,0.0,68488.84,6379898.11,1,0
6362618,850002.52,850002.52,0.0,0.0,0.0,1,0
6362619,850002.52,850002.52,0.0,6510099.11,7360101.63,1,0


In [20]:
new_dataset.groupby('isFraud').mean()

Unnamed: 0_level_0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFlaggedFraud
isFraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,182555.3,769940.2,791457.579269,1081762.0,1214421.0,0.0
1,1467967.0,1649668.0,192392.631836,544249.6,1279708.0,0.001948


### Splitting the data into features and target

In [21]:
X= new_dataset.drop(columns='isFraud', axis=1)
y=new_dataset['isFraud']

In [22]:
X

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFlaggedFraud
5255254,24238.26,143.00,0.00,0.00,0.00,0
2919936,10746.73,41302.00,30555.27,0.00,0.00,0
1379746,28082.45,8509.00,0.00,0.00,0.00,0
1295922,2954.57,0.00,0.00,0.00,0.00,0
1756491,30287.66,10953286.95,10983574.61,62798.89,32511.22,0
...,...,...,...,...,...,...
6362615,339682.13,339682.13,0.00,0.00,339682.13,0
6362616,6311409.28,6311409.28,0.00,0.00,0.00,0
6362617,6311409.28,6311409.28,0.00,68488.84,6379898.11,0
6362618,850002.52,850002.52,0.00,0.00,0.00,0


In [23]:
y

5255254    0
2919936    0
1379746    0
1295922    0
1756491    0
          ..
6362615    1
6362616    1
6362617    1
6362618    1
6362619    1
Name: isFraud, Length: 16426, dtype: int64

### Split the data into train and test data

In [24]:
X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=0.30,random_state=2,stratify=y)

In [25]:
X.shape

(16426, 6)

In [26]:
X_train.shape

(11498, 6)

In [27]:
X_test.shape

(4928, 6)

### Model training

Logistic Reg

In [28]:
model= LogisticRegression()

In [29]:
#trainiing the model with training data
model.fit(X_train,y_train)

LogisticRegression()

### Model evaluation

In [30]:
#accuracy check in test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction,y_test)

In [31]:
print('accuracy_score:',test_data_accuracy)

accuracy_score: 0.8120941558441559


In [34]:
#Confusion matrix
test_data_confusion_matrix= confusion_matrix(X_test_prediction,y_test)
print(test_data_confusion_matrix) #TP- 1611, FP- 853, FN- 73, TN- 2391

[[1611   73]
 [ 853 2391]]


In [33]:
# Classification report
test_data_classification= classification_report(X_test_prediction,y_test)
print(test_data_classification)

              precision    recall  f1-score   support

           0       0.65      0.96      0.78      1684
           1       0.97      0.74      0.84      3244

    accuracy                           0.81      4928
   macro avg       0.81      0.85      0.81      4928
weighted avg       0.86      0.81      0.82      4928



# Insights

1 -There were no missing values.

2- I have removed the columns like step,type and name columns because they were not impacting the target. Have checked for outliers and multicollinearity.And since the dataset is highly unbalanced, I did undersampling where I took the fraudalent transaction and normal transaction as equal number and divided the data set into train and test splits. and applied `logistic regression` as my classification model. And got accuracy score as '81%' which is quite good and got the precision and recall for fraud transaction as '97%' and '74%' which is also pretty descent.

3- I selected the numerical variables and removed the object types because they were irrelevant to the target. For example step is a factor of time.So ,its no where related to the fraudalent activity.

4- I demonstarted the performance by accuracy score, confusion matrix, and classification report.

5- All the numerical variables were key factors in determining fraud activity.

6- Yes these factors do make sense because the amount of transaction, initial and closing balance play the most important role in detection.

7- The company should take care about OTP infra and also on the quick message infrastructure,so that customer would get the debited or credited info as quick as possible.

8- If the actions are implemented, then precision and recall are the best methods to detect fraud detection.

### Suggestion- The model can be still improved if tested with more models like random forests and also by using some boosting techniques.
