In [4]:
import pandas as pd
import numpy as np
import random
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import pandas as pd


In [5]:
# Create a list of sample data for each feature
n = 2500  # Number of users
age = [random.randint(18, 65) for _ in range(n)]
gender = [random.choice(['Male', 'Female', 'Other']) for _ in range(n)]
location = [random.choice(['Urban', 'Suburban', 'Rural']) for _ in range(n)]
marital_status = [random.choice(['Single', 'Married', 'Divorced']) for _ in range(n)]
employment = [random.choice(['Employed', 'Unemployed', 'Self-Employed']) for _ in range(n)]
account_creation_date = pd.date_range(start='2019-01-01', end='2022-01-01', freq='M').to_list()
account_creation_date = [random.choice(account_creation_date) for _ in range(n)]
savings_amount = [random.uniform(1000.0, 100000.0) for _ in range(n)]
avg_transaction_value = [random.uniform(100.0, 10000.0) for _ in range(n)]
transaction_count_last_30_days = [random.randint(0, 30) for _ in range(n)]
avg_transaction_per_month = [random.uniform(0, 100) for _ in range(n)]
avg_transaction_amount = [random.uniform(50, 5000) for _ in range(n)]
device_type = [random.choice(['Mobile', 'Desktop', 'Tablet']) for _ in range(n)]
app_usage_time = [random.randint(0, 300) for _ in range(n)]  # in minutes
app_usage_duration = [random.randint(1, 365) for _ in range(n)]  # in days
browser = [random.choice(['Chrome', 'Firefox', 'Safari', 'Edge']) for _ in range(n)]
chatbot_usage = [random.choice([0, 1]) for _ in range(n)]
active_users = [random.randint(100, 1000) for _ in range(n)]
daily_new_users = [random.randint(0, 10) for _ in range(n)]
average_session_length = [random.uniform(0.5, 3.0) for _ in range(n)]  # in hours
monthly_revenue = [random.uniform(1000.0, 100000.0) for _ in range(n)]
fraud_label = [random.choice([0, 1]) for _ in range(n)]

# Create the DataFrame
df = pd.DataFrame({
    'Age': age,
    'Gender': gender,
    'Location': location,
    'Marital_Status': marital_status,
    'Employment': employment,
    'AccountCreationDate': account_creation_date,
    'SavingsAmount': savings_amount,
    'AvgTransactionValue': avg_transaction_value,
    'TransactionCountLast30Days': transaction_count_last_30_days,
    'AvgTransactionPerMonth': avg_transaction_per_month,
    'AvgTransactionAmount': avg_transaction_amount,
    'DeviceType': device_type,
    'AppUsageTime': app_usage_time,
    'AppUsageDuration': app_usage_duration,
    'Browser': browser,
    'ChatbotUsage': chatbot_usage,
    'ActiveUsers': active_users,
    'DailyNewUsers': daily_new_users,
    'AverageSessionLength': average_session_length,
    'MonthlyRevenue': monthly_revenue,
    'IsFraud': fraud_label
})

df.head(2).T

Unnamed: 0,0,1
Age,40,20
Gender,Other,Male
Location,Suburban,Suburban
Marital_Status,Married,Single
Employment,Self-Employed,Employed
AccountCreationDate,2020-04-30 00:00:00,2021-07-31 00:00:00
SavingsAmount,84459.753485,45781.05417
AvgTransactionValue,7468.947246,8496.428246
TransactionCountLast30Days,29,11
AvgTransactionPerMonth,52.95782,74.752222


In [6]:
# Features to use for anomaly detection (only numerical features)
features_to_use = ['Age', 'SavingsAmount', 'AvgTransactionValue', 
                   'TransactionCountLast30Days', 'AvgTransactionPerMonth', 
                   'AvgTransactionAmount', 'AppUsageTime', 'AppUsageDuration']

# Extract the feature set from the DataFrame
X = df[features_to_use]

# Initialize the LOF model
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.1)  # You can adjust these parameters

# Fit the model and get the prediction labels and scores
y_pred = lof.fit_predict(X)
scores = lof.negative_outlier_factor_

# Transform y_pred to match the label in your original DataFrame ('IsFraud')
# Inliers are labeled 1, while anomalies are labeled -1. Let's change anomalies to 1 and inliers to 0.
y_pred[y_pred == 1] = 0
y_pred[y_pred == -1] = 1

# Add to DataFrame
df['IsAnomaly_LOF'] = y_pred

# True labels
y_true = df['IsFraud']

# Evaluation metrics
# Calculate Precision
precision = precision_score(y_true, y_pred)
print(f"Precision: {precision}")

# Calculate Recall
recall = recall_score(y_true, y_pred)
print(f"Recall: {recall}")

# Calculate F1-score
f1 = f1_score(y_true, y_pred)
print(f"F1-Score: {f1}")

# Calculate ROC AUC
roc_auc = roc_auc_score(y_true, y_pred)
print(f"ROC AUC: {roc_auc}")

# Calculate the confusion matrix to evaluate the accuracy of the classification
conf_matrix = confusion_matrix(y_true, y_pred)
print(f"Confusion Matrix: \n{conf_matrix}")


Precision: 0.476
Recall: 0.0954290296712109
F1-Score: 0.15898463593854376
ROC AUC: 0.49543997373424864
Confusion Matrix: 
[[1122  131]
 [1128  119]]
