In [1]:
#importing necessary modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import itertools

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, recall_score, f1_score

In [3]:
#Loading the csv file

dataframe = pd.read_csv('Fraud.csv')

#Dropping CustomerID and RecipientID and is.FlaggedFraud

dataframe = dataframe.drop(['nameOrig','nameDest','isFlaggedFraud'],axis=1)


In [4]:
# Now making a new column by 

dataframe['isFlagged'] = dataframe['amount'].apply(lambda x: 1 if x > 200 else 0)


In [16]:
# columns to check VIF for
X = dataframe[['step', 'type', 'amount', 'oldbalanceOrg','newbalanceOrig','oldbalanceDest','newbalanceDest','isFraud','isFlagged']]


In [9]:
# create an empty dataframe

from statsmodels.stats.outliers_influence import variance_inflation_factor



In [18]:
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Selecting the columns for VIF calculation
X = dataframe[['step', 'type', 'amount', 'oldbalanceOrg','newbalanceOrig','oldbalanceDest','newbalanceDest','isFraud','isFlagged']]

# If 'type' is categorical, encode it first
X = pd.get_dummies(X, columns=['type'], drop_first=True)

# Create an empty dataframe for VIF
vif = pd.DataFrame()

# Copy all the features of X into the vif dataframe
vif["features"] = X.columns

# Calculate VIF for all the variables
vif["VIF Factor"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

# Display the VIF dataframe
print(vif)


          features  VIF Factor
0             step    3.966431
1           amount    2.655244
2    oldbalanceOrg  922.144871
3   newbalanceOrig  949.807607
4   oldbalanceDest   38.984900
5   newbalanceDest   42.138381
6          isFraud    1.155769
7        isFlagged   13.301964
8    type_CASH_OUT    4.830865
9       type_DEBIT    1.068764
10    type_PAYMENT    4.526775
11   type_TRANSFER    2.297614


In [19]:
df=dataframe.drop(['oldbalanceOrg','newbalanceOrig','oldbalanceDest','newbalanceDest'],axis=1)
df.head()

Unnamed: 0,step,type,amount,isFraud,isFlagged
0,1,PAYMENT,9839.64,0,1
1,1,PAYMENT,1864.28,0,1
2,1,TRANSFER,181.0,1,0
3,1,CASH_OUT,181.0,1,0
4,1,PAYMENT,11668.14,0,1


In [21]:
#import label encoder

from sklearn import preprocessing 

#make an instance of Label Encoder

label_encoder = preprocessing.LabelEncoder()
df['type'] = label_encoder.fit_transform(df['type'])
df.head()

Unnamed: 0,step,type,amount,isFraud,isFlagged
0,1,3,9839.64,0,1
1,1,3,1864.28,0,1
2,1,4,181.0,1,0
3,1,1,181.0,1,0
4,1,3,11668.14,0,1


In [22]:
Y = df["isFraud"]
X = df.drop(["isFraud"], axis= 1)

In [40]:
train_X,test_X,train_Y,test_Y=train_test_split(X, Y,test_size=0.2, random_state=42)

# Decision Tree Classifier
decision_tree = DecisionTreeClassifier()
decision_tree.fit(train_X, train_Y)

predictions_dt = decision_tree.predict(test_X)
decision_tree_score = decision_tree.score(test_X, test_Y) * 100

In [41]:
# Random Forest
random_forest = RandomForestClassifier(n_estimators= 100)
random_forest.fit(train_X, train_Y)

predictions_rf = random_forest.predict(test_X)
random_forest_score = random_forest.score(test_X, test_Y) * 100

In [42]:
# Print scores of our classifiers

print("Random Forest Score: ", random_forest_score)
print("Decision Tree Score: ", decision_tree_score)

Random Forest Score:  99.93467324702573
Decision Tree Score:  99.91083136637819


In [43]:
X_train, X_test, Y_train,Y_test=train_test_split(X, Y,test_size=0.2, random_state=42)

# Initialize and train the XGBoost model
import xgboost as xgb
from sklearn.metrics import accuracy_score

model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, Y_train)

Parameters: { "use_label_encoder" } are not used.



In [26]:
# Make predictions on the test set
y_pred = model.predict(X_test)

# Print the predictions
print("Predicted labels:", y_pred)

Predicted labels: [0 0 0 ... 0 0 0]


In [27]:
#Number of predicted frauds

y_pred[y_pred==1]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [28]:
print("Predicted labels:", y_pred)

Predicted labels: [0 0 0 ... 0 0 0]


In [29]:
y_pred.shape

(209715,)

In [30]:
# Calculate accuracy
accuracy = accuracy_score(Y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# For classification, you can also print a confusion matrix and a classification report
from sklearn.metrics import confusion_matrix, classification_report

conf_matrix = confusion_matrix(Y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

class_report = classification_report(Y_test, y_pred)
print("Classification Report:")
print(class_report)

Accuracy: 1.00
Confusion Matrix:
[[209487      4]
 [    98    126]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    209491
           1       0.97      0.56      0.71       224

    accuracy                           1.00    209715
   macro avg       0.98      0.78      0.86    209715
weighted avg       1.00      1.00      1.00    209715



In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report




# Apply SMOTE to the training data due to class imbalance as the number of frauds are relatively less

smote = SMOTE(random_state=42)
X_train_resampled, Y_train_resampled = smote.fit_resample(X_train, Y_train)

# Check the distribution of the resampled data
print("Original training set class distribution:\n", Y_train.value_counts())
print("Resampled training set class distribution:\n", Y_train_resampled.value_counts())

# Train a model on the resampled data
model = RandomForestClassifier(random_state=42)
model.fit(X_train_resampled, Y_train_resampled)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print(classification_report(Y_test, y_pred))

Original training set class distribution:
 0    837942
1       918
Name: isFraud, dtype: int64
Resampled training set class distribution:
 0    837942
1    837942
Name: isFraud, dtype: int64
              precision    recall  f1-score   support

           0       1.00      0.98      0.99    209491
           1       0.03      0.67      0.07       224

    accuracy                           0.98    209715
   macro avg       0.52      0.82      0.53    209715
weighted avg       1.00      0.98      0.99    209715



In [34]:
accuracy = accuracy_score(Y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Display the confusion matrix
conf_matrix = confusion_matrix(Y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Display a full classification report
class_report = classification_report(Y_test, y_pred)
print("Classification Report:")
print(class_report)

Accuracy: 0.98
Confusion Matrix:
[[205285   4206]
 [    75    149]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99    209491
           1       0.03      0.67      0.07       224

    accuracy                           0.98    209715
   macro avg       0.52      0.82      0.53    209715
weighted avg       1.00      0.98      0.99    209715



In [38]:
#PREDICTING FRAUD FOR NEW DATA 

new_data = pd.DataFrame({'step': [1],'type':[2],'amount':[189],'isFraud':[1],'isFlagged':[0]})

# Predict class labels for the new data
new_predictions = model.predict(new_data.drop(columns='isFraud'))

# Output the predictions
print(new_predictions)

# Predict probability of each class
new_probabilities = model.predict_proba(new_data.drop(columns='isFraud'))

# Output the probabilities for the positive class
print(new_probabilities[:, 1])

[1]
[0.53]
