In [None]:
# Basic imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', None)
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE


In [None]:
# Load the fraud-detection dummy dataset CSV into `df`.
# The path is relative, so it resolves against the notebook's working directory.
df = pd.read_csv("../data/fraud_detection_dummy_dataset.csv")

In [None]:
# Print the frame's (rows, columns) shape, then show the first rows
# as the cell's rich output.
print(f'Shape: {df.shape}')
df.head()

In [None]:
# Print a summary of the frame (columns, non-null counts, dtypes)
df.info()

In [None]:
# Number of missing entries in each column
df.isna().sum()

In [None]:
# Keep only the columns that contain at least one null, smallest count first
missing = (
    df.isnull()
    .sum()
    .loc[lambda null_counts: null_counts > 0]
    .sort_values(ascending=True)
)

# Display
print(f'Missing value per column : {missing}')

In [None]:
# Heatmap of the boolean null mask (one row per record, one column per feature)
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(
    df.isnull(),
    cbar=False,
    yticklabels=False,
    cmap="viridis",
    ax=ax,
)
ax.set_title("Heatmap of Missing Values")
plt.show()

In [None]:
# Box plots of selected numeric features, laid out on a 2x2 grid, to spot outliers
numeric_cols = ["amount", "ip_risk_score", "mouse_movement_score", 'avg_transaction_amount']
fig, axes = plt.subplots(2, 2, figsize=(15, 8))
for ax, col in zip(axes.flat, numeric_cols):
    sns.boxplot(x=df[col], ax=ax)
    ax.set_title(f'Boxplot of {col}')
fig.tight_layout()
plt.show()

In [None]:
# IQR method on `amount`: rows further than 1.5 * IQR outside the quartiles
# are treated as outliers
Q1, Q3 = (df['amount'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Outlier detection
is_outlier = (df['amount'] < lower_fence) | (df['amount'] > upper_fence)
outliers = df[is_outlier]
print(f'Outliers in "amount" : {len(outliers)}')

In [None]:
# Class balance: number of transactions for each fraud_flag value
ax = sns.countplot(data=df, x='fraud_flag')
ax.set_title("Fraud vs Non-fraud Count")
ax.set_xticks([0, 1])
ax.set_xticklabels(['Legit', 'Fraud'])
ax.set_ylabel("Number of transactions")
plt.show()

# Share of each class, rounded to three decimals
class_share = df['fraud_flag'].value_counts(normalize=True).round(3)
print(class_share)

In [None]:
# Distribution of transaction amount, split by fraud flag
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(
    data=df,
    x='amount',
    hue='fraud_flag',
    bins=100,
    kde=True,
    palette='Set1',
    element='step',
    ax=ax,
)
ax.set_xlim(0, 500)  # x-axis is clipped to the 0-500 range
ax.set_title("Transaction Amount by Fraud Flag")
plt.show()

In [None]:
# Mean fraud_flag per payment method, highest rate first
fraud_by_method = (
    df.groupby("payment_method")["fraud_flag"]
    .mean()
    .sort_values(ascending=False)
)
fraud_by_method.plot.bar(title="Fraud Rate by Payment Method", ylabel="Fraud Rate")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Device type vs fraud rate: mean fraud_flag per device type, highest first.
# Stored under its own name so the payment-method series held in
# `fraud_by_method` is not overwritten by an unrelated breakdown.
fraud_by_device = df.groupby("device_type")["fraud_flag"].mean().sort_values(ascending = False)
fraud_by_device.plot(kind = "bar", title = "Fraud Rate by Device Type", ylabel = "Fraud Rate")
plt.xticks(rotation = 45)
plt.show()

In [None]:
# Geo location vs fraud rate: mean fraud_flag per location, highest first.
# Stored under its own name so the payment-method series held in
# `fraud_by_method` is not overwritten by an unrelated breakdown.
fraud_by_geo = df.groupby("geo_location")["fraud_flag"].mean().sort_values(ascending = False)
fraud_by_geo.plot(kind = "bar", title = "Fraud Rate by Geo Location", ylabel = "Fraud Rate")
plt.xticks(rotation = 45)
plt.show()

In [None]:
# VPN use vs fraud rate: mean fraud_flag per is_vpn value, highest first.
# Stored under its own name so the payment-method series held in
# `fraud_by_method` is not overwritten by an unrelated breakdown.
fraud_by_vpn = df.groupby("is_vpn")["fraud_flag"].mean().sort_values(ascending = False)
fraud_by_vpn.plot(kind = "bar", title = "Fraud Rate by VPN use", ylabel = "Fraud Rate")
plt.xticks(rotation = 45)
plt.show()

In [None]:
# Impute missing values in place on `df`.
#
# Each column is assigned back explicitly instead of calling
# `df[col].fillna(..., inplace=True)`: the in-place call acts on a column
# selection, which raises a FutureWarning on pandas 2.x and leaves `df`
# unchanged under Copy-on-Write, so the NaNs would survive silently.
#
# NOTE(review): medians/modes are computed on the full frame, before the
# train/test split further down, so test rows influence the fill values.
# Consider fitting the imputation on the training split only.

# Fill numerical NaNs with the column median
numeric_fill_cols = ['mouse_movement_score', 'ip_risk_score', 'avg_transaction_amount']
for col in numeric_fill_cols:
    df[col] = df[col].fillna(df[col].median())

# Fill categorical NaNs with the most frequent value (mode)
categorical_fill_cols = ['payment_method', 'device_type', 'day_of_week', 'geo_location']
for col in categorical_fill_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

In [None]:

# One-hot encode the categorical columns; drop_first=True omits the first
# level of each column from the generated dummy columns
categorical_cols = ['payment_method', 'device_type', 'geo_location', 'day_of_week']
df_encoded = pd.get_dummies(
    data=df,
    columns=categorical_cols,
    drop_first=True,
)

In [None]:
# Separate the target from the features; transaction_id is dropped together
# with the label column
y = df_encoded['fraud_flag']
X = df_encoded.drop(columns=['transaction_id', 'fraud_flag'])

# 80/20 split, stratified on the label, with a fixed seed
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print(f"Train size: {X_train.shape}, Test size: {X_test.shape}")

In [None]:
# Oversample with SMOTE using the training split only; X_test / y_test are
# not passed to the sampler
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

# Fraud counts before and after resampling
n_fraud_before = int((y_train == 1).sum())
n_fraud_after = int((y_train_bal == 1).sum())
print(f"Before SMOTE: {n_fraud_before} fraud / {len(y_train)} total")
print(f"After SMOTE:  {n_fraud_after} fraud / {len(y_train_bal)} total")

In [None]:
# Train a 100-tree random forest on the SMOTE-balanced training data
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_bal, y_train_bal
)

# Predictions on the test split (which was not resampled)
y_pred = rf_model.predict(X_test)

# Evaluation: classification report for the test split
report = classification_report(y_test, y_pred)
print(report)