In [2]:
import pandas as pd
import numpy as np
import random
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [3]:
# Create a list of sample data for each feature
n = 2500  # Number of users
age = [random.randint(18, 65) for _ in range(n)]
gender = [random.choice(['Male', 'Female', 'Other']) for _ in range(n)]
location = [random.choice(['Urban', 'Suburban', 'Rural']) for _ in range(n)]
marital_status = [random.choice(['Single', 'Married', 'Divorced']) for _ in range(n)]
employment = [random.choice(['Employed', 'Unemployed', 'Self-Employed']) for _ in range(n)]
account_creation_date = pd.date_range(start='2019-01-01', end='2022-01-01', freq='M').to_list()
account_creation_date = [random.choice(account_creation_date) for _ in range(n)]
savings_amount = [random.uniform(1000.0, 100000.0) for _ in range(n)]
avg_transaction_value = [random.uniform(100.0, 10000.0) for _ in range(n)]
transaction_count_last_30_days = [random.randint(0, 30) for _ in range(n)]
avg_transaction_per_month = [random.uniform(0, 100) for _ in range(n)]
avg_transaction_amount = [random.uniform(50, 5000) for _ in range(n)]
device_type = [random.choice(['Mobile', 'Desktop', 'Tablet']) for _ in range(n)]
app_usage_time = [random.randint(0, 300) for _ in range(n)]  # in minutes
app_usage_duration = [random.randint(1, 365) for _ in range(n)]  # in days
browser = [random.choice(['Chrome', 'Firefox', 'Safari', 'Edge']) for _ in range(n)]
chatbot_usage = [random.choice([0, 1]) for _ in range(n)]
active_users = [random.randint(100, 1000) for _ in range(n)]
daily_new_users = [random.randint(0, 10) for _ in range(n)]
average_session_length = [random.uniform(0.5, 3.0) for _ in range(n)]  # in hours
monthly_revenue = [random.uniform(1000.0, 100000.0) for _ in range(n)]
fraud_label = [random.choice([0, 1]) for _ in range(n)]

# Create the DataFrame
df = pd.DataFrame({
    'Age': age,
    'Gender': gender,
    'Location': location,
    'Marital_Status': marital_status,
    'Employment': employment,
    'AccountCreationDate': account_creation_date,
    'SavingsAmount': savings_amount,
    'AvgTransactionValue': avg_transaction_value,
    'TransactionCountLast30Days': transaction_count_last_30_days,
    'AvgTransactionPerMonth': avg_transaction_per_month,
    'AvgTransactionAmount': avg_transaction_amount,
    'DeviceType': device_type,
    'AppUsageTime': app_usage_time,
    'AppUsageDuration': app_usage_duration,
    'Browser': browser,
    'ChatbotUsage': chatbot_usage,
    'ActiveUsers': active_users,
    'DailyNewUsers': daily_new_users,
    'AverageSessionLength': average_session_length,
    'MonthlyRevenue': monthly_revenue,
    'IsFraud': fraud_label
})

df.head(2).T

Unnamed: 0,0,1
Age,25,54
Gender,Male,Female
Location,Suburban,Suburban
Marital_Status,Married,Married
Employment,Employed,Employed
AccountCreationDate,2021-08-31 00:00:00,2020-03-31 00:00:00
SavingsAmount,1655.445789,75708.622678
AvgTransactionValue,3650.163127,3735.992691
TransactionCountLast30Days,17,19
AvgTransactionPerMonth,30.903039,82.113993


In [7]:
from sklearn.cluster import DBSCAN
import pandas as pd
import numpy as np

# Generate some sample data or use your dataframe 'df'
# For demonstration, let's assume df is your dataframe with only numerical features.
# df = pd.DataFrame({
#    'Feature1': np.random.randn(100),
#    'Feature2': np.random.randn(100),
#    'Feature3': np.random.randn(100),
#    ...
# })

# Select the numerical features you want to use for DBSCAN
features_to_check = ['Age', 'SavingsAmount', 'AvgTransactionValue', 
                     'TransactionCountLast30Days', 'AvgTransactionPerMonth', 
                     'AvgTransactionAmount', 'AppUsageTime', 'AppUsageDuration'] # Replace with your actual numerical features

# Prepare data for clustering
X = df[features_to_check].values

# Perform DBSCAN clustering
dbscan = DBSCAN(eps=0.5, min_samples=5)
labels = dbscan.fit_predict(X)

# Labeling outliers (-1 indicates an anomaly)
df['DBSCAN_Label'] = labels

# Flagging the anomalies
df['IsAnomaly_DBSCAN'] = df['DBSCAN_Label'] == -1

# See how many we detected as anomalies compared to actual anomalies
print("Detected as Anomalies: ", df['IsAnomaly_DBSCAN'].sum())
# If you have actual fraud labels
print("Actual Anomalies: ", df['IsFraud'].sum())

# You can then proceed to evaluate the model using metrics like precision, recall, etc.


Detected as Anomalies:  2500
Actual Anomalies:  1261


In [9]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# True labels
y_true = df['IsFraud']

# Predicted labels based on the anomaly detection model
y_pred = df['IsAnomaly_DBSCAN'].astype(int)

# Calculate Precision
precision = precision_score(y_true, y_pred)
print(f"Precision: {precision}")

# Calculate Recall
recall = recall_score(y_true, y_pred)
print(f"Recall: {recall}")

# Calculate F1-score
f1 = f1_score(y_true, y_pred)
print(f"F1-Score: {f1}")

# Calculate ROC AUC
roc_auc = roc_auc_score(y_true, y_pred)
print(f"ROC AUC: {roc_auc}")

# Calculate the confusion matrix to evaluate the accuracy of the classification
conf_matrix = confusion_matrix(y_true, y_pred)
print(f"Confusion Matrix: \n{conf_matrix}")


Precision: 0.5044
Recall: 1.0
F1-Score: 0.6705663387396968
ROC AUC: 0.5
Confusion Matrix: 
[[   0 1239]
 [   0 1261]]
