In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score


In [2]:
df = pd.read_csv("fraud_data.csv")

In [3]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [4]:
df = df[["nameOrig", "oldbalanceOrg", "newbalanceOrig", "nameDest", "oldbalanceDest", "newbalanceDest", "isFraud"]]
df.head()

Unnamed: 0,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud
0,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0
1,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0
2,C1305486145,181.0,0.0,C553264065,0.0,0.0,1
3,C840083671,181.0,0.0,C38997010,21182.0,0.0,1
4,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0


In [5]:
df = df.sample(n=100000, random_state=42)

In [6]:
df["isFraud"].value_counts()

isFraud
0    99859
1      141
Name: count, dtype: int64

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, 3737323 to 6142173
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   nameOrig        100000 non-null  object 
 1   oldbalanceOrg   100000 non-null  float64
 2   newbalanceOrig  100000 non-null  float64
 3   nameDest        100000 non-null  object 
 4   oldbalanceDest  100000 non-null  float64
 5   newbalanceDest  100000 non-null  float64
 6   isFraud         100000 non-null  int64  
dtypes: float64(4), int64(1), object(2)
memory usage: 6.1+ MB


In [8]:
df = df.dropna()
df.isnull().sum()

nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
dtype: int64

In [9]:
# Feature Engineering
df['transactionAmount'] = df['oldbalanceOrg'] - df['newbalanceOrig']
df['balanceChangeRatioOrig'] = (df['oldbalanceOrg'] - df['newbalanceOrig']) / (df['oldbalanceOrg'] + 1)
df['balanceChangeRatioDest'] = (df['oldbalanceDest'] - df['newbalanceDest']) / (df['oldbalanceDest'] + 1)


In [10]:
df[df['isFraud'] == 1]

Unnamed: 0,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,transactionAmount,balanceChangeRatioOrig,balanceChangeRatioDest
6303116,C1773649560,93483.23,0.00,C2085820540,86882.30,180365.53,1,93483.23,0.999989,-1.075963
3960303,C747251675,806863.30,0.00,C1519845212,2344186.16,3151049.45,1,806863.30,0.999999,-0.344197
2736446,C728984460,4953893.08,4953893.08,C639921569,0.00,0.00,1,0.00,0.000000,0.000000
1030272,C555990868,1170282.92,0.00,C1318550066,173264.14,1343547.06,1,1170282.92,0.999999,-6.754290
1212611,C1724151856,1159010.44,0.00,C1229046559,175085.52,1334095.97,1,1159010.44,0.999999,-6.619644
...,...,...,...,...,...,...,...,...,...,...
6122464,C633635438,3707342.38,0.00,C1640217168,558618.20,4265960.58,1,3707342.38,1.000000,-6.636618
6273168,C2008189700,125107.12,0.00,C1820507846,0.00,0.00,1,125107.12,0.999992,0.000000
6008702,C1184464536,1648547.69,0.00,C919670437,0.00,0.00,1,1648547.69,0.999999,0.000000
6168680,C458467003,3571807.73,0.00,C837210602,527694.86,4099502.59,1,3571807.73,1.000000,-6.768686


In [11]:
df = df[df['transactionAmount'] >= 0]
df

Unnamed: 0,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,transactionAmount,balanceChangeRatioOrig,balanceChangeRatioDest
264914,C1264712553,30370.0,18722.92,M215391829,0.00,0.00,0,11647.08,0.383493,0.000000
5899326,C333676753,0.0,0.00,C1564353608,3198359.45,4750120.08,0,0.00,0.000000,-0.485174
3494160,C2002954533,0.0,0.00,M290849763,0.00,0.00,0,0.00,0.000000,0.000000
2331654,C813757373,0.0,0.00,C823291717,558068.66,578672.53,0,0.00,0.000000,-0.036920
1414955,C1850864812,0.0,0.00,C618657299,585494.94,644100.66,0,0.00,0.000000,-0.100096
...,...,...,...,...,...,...,...,...,...,...
5258158,C616254594,0.0,0.00,M67170937,0.00,0.00,0,0.00,0.000000,0.000000
1956908,C1918194668,689392.0,280990.55,C907111507,147126.43,555527.88,0,408401.45,0.592407,-2.775835
3249900,C1408706065,0.0,0.00,C131307582,4149925.81,5294358.35,0,0.00,0.000000,-0.275772
5189870,C387888280,0.0,0.00,C999640037,12853274.23,12994793.47,0,0.00,0.000000,-0.011010


In [12]:
# Encode categorical variables
label_encoder_orig = LabelEncoder()
label_encoder_dest = LabelEncoder()
df['nameOrig'] = label_encoder_orig.fit_transform(df['nameOrig'])
df['nameDest'] = label_encoder_dest.fit_transform(df['nameDest'])

In [13]:
df['nameOrig'].unique()
df['nameDest'].unique()

array([60714, 11925, 62030, ..., 38914, 40893, 51338])

In [14]:
# Define features and target
X = df.drop('isFraud', axis=1)
y = df['isFraud']

In [15]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [16]:
# Scale numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [17]:
# Model Selection
model = RandomForestClassifier(n_estimators=100, random_state=42)

In [18]:
# Train the model
model.fit(X_train, y_train)

In [19]:
# Predict on the test set
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

In [20]:
# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nAccuracy Score:")
print(roc_auc_score(y_test, y_pred_proba))

Confusion Matrix:
[[23312     2]
 [   13    31]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     23314
           1       0.94      0.70      0.81        44

    accuracy                           1.00     23358
   macro avg       0.97      0.85      0.90     23358
weighted avg       1.00      1.00      1.00     23358


Accuracy Score:
0.9066416394363123


In [21]:
import pickle

# Assuming 'model' is your trained RandomForestClassifier
filename = 'detection.pkl'

# Open a file for writing in binary mode
with open(filename, 'wb') as file:
    # Use pickle.dump to serialize the model to the file
    pickle.dump(model, file)

In [22]:
# Open the file for reading in binary mode and load the model
with open('detection.pkl', 'rb') as file:
    detection_model = pickle.load(file)

# Verify the loaded model
print(detection_model)

RandomForestClassifier(random_state=42)
