In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from scipy.sparse import csr_matrix, hstack

In [2]:
# Load the dataset
# I had to reduce size of the dataset coz while encoding it would require a lot of space, in TiB ,which my system cant afford
data = pd.read_csv('Fraud.csv')

In [3]:
# Data Cleaning
# Handle Missing Values
data = data.dropna()

In [4]:
# Split the dataset into features and target variable
X = data.drop(['isFraud', 'isFlaggedFraud'], axis=1)  # Excluding 'isFraud' and 'isFlaggedFraud' from features
y = data['isFraud']


In [5]:
# Label Encoding of Categorical Variables
categorical_columns = ['step', 'nameOrig', 'nameDest']

label_encoders = {}
X_encoded = []

for column in categorical_columns:
    label_encoder = LabelEncoder()
    label_encoder.fit(X[column])
    encoded_column = label_encoder.transform(X[column])
    X_encoded.append(encoded_column)
    label_encoders[column] = label_encoder

# One-Hot Encoding for 'type' column
onehot_encoder = OneHotEncoder(sparse=False)
encoded_type = onehot_encoder.fit_transform(X['type'].values.reshape(-1, 1))

# Combine categorical and numerical variables
X_final = csr_matrix(X_encoded[0].reshape(-1, 1))
for encoded_column in X_encoded[1:]:
    encoded_column = csr_matrix(encoded_column.reshape(-1, 1))
    X_final = hstack([X_final, encoded_column])

X_final = hstack([X_final, encoded_type, csr_matrix(X.drop(categorical_columns + ['type'], axis=1))])



In [6]:
# Split the data into training, validation, and test sets
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [7]:
# Train the model
model = RandomForestClassifier()
model.fit(X_train, y_train)

In [8]:
# Make predictions on validation set
y_pred = model.predict(X_val)

In [9]:
# Evaluate the model
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)

In [10]:
print("Validation Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Validation Metrics:
Accuracy: 0.9997019764919057
Precision: 0.984
Recall: 0.7192982456140351
F1-score: 0.831081081081081


In [11]:
# Make predictions on test set
y_pred_test = model.predict(X_test)

In [12]:
# Evaluate the model on test set
accuracy_test = accuracy_score(y_test, y_pred_test)
precision_test = precision_score(y_test, y_pred_test)
recall_test = recall_score(y_test, y_pred_test)
f1_test = f1_score(y_test, y_pred_test)

In [13]:
print("\nTest Metrics:")
print("Accuracy:", accuracy_test)
print("Precision:", precision_test)
print("Recall:", recall_test)
print("F1-score:", f1_test)


Test Metrics:
Accuracy: 0.9997711179457835
Precision: 0.9583333333333334
Recall: 0.8214285714285714
F1-score: 0.8846153846153847
