In [5]:
import warnings
warnings.filterwarnings("ignore")


In [6]:
import pandas as pd
import numpy as np
import random
from scipy import stats

from sklearn.metrics import classification_report
from sklearn.ensemble import IsolationForest

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix






In [7]:
# Create a list of sample data for each feature
n = 2500  # Number of users
age = [random.randint(18, 65) for _ in range(n)]
gender = [random.choice(['Male', 'Female', 'Other']) for _ in range(n)]
location = [random.choice(['Urban', 'Suburban', 'Rural']) for _ in range(n)]
marital_status = [random.choice(['Single', 'Married', 'Divorced']) for _ in range(n)]
employment = [random.choice(['Employed', 'Unemployed', 'Self-Employed']) for _ in range(n)]
account_creation_date = pd.date_range(start='2019-01-01', end='2022-01-01', freq='M').to_list()
account_creation_date = [random.choice(account_creation_date) for _ in range(n)]
savings_amount = [random.uniform(1000.0, 100000.0) for _ in range(n)]
avg_transaction_value = [random.uniform(100.0, 10000.0) for _ in range(n)]
transaction_count_last_30_days = [random.randint(0, 30) for _ in range(n)]
avg_transaction_per_month = [random.uniform(0, 100) for _ in range(n)]
avg_transaction_amount = [random.uniform(50, 5000) for _ in range(n)]
device_type = [random.choice(['Mobile', 'Desktop', 'Tablet']) for _ in range(n)]
app_usage_time = [random.randint(0, 300) for _ in range(n)]  # in minutes
app_usage_duration = [random.randint(1, 365) for _ in range(n)]  # in days
browser = [random.choice(['Chrome', 'Firefox', 'Safari', 'Edge']) for _ in range(n)]
chatbot_usage = [random.choice([0, 1]) for _ in range(n)]
active_users = [random.randint(100, 1000) for _ in range(n)]
daily_new_users = [random.randint(0, 10) for _ in range(n)]
average_session_length = [random.uniform(0.5, 3.0) for _ in range(n)]  # in hours
monthly_revenue = [random.uniform(1000.0, 100000.0) for _ in range(n)]
fraud_label = [random.choice([0, 1]) for _ in range(n)]

# Create the DataFrame
df = pd.DataFrame({
    'Age': age,
    'Gender': gender,
    'Location': location,
    'Marital_Status': marital_status,
    'Employment': employment,
    'AccountCreationDate': account_creation_date,
    'SavingsAmount': savings_amount,
    'AvgTransactionValue': avg_transaction_value,
    'TransactionCountLast30Days': transaction_count_last_30_days,
    'AvgTransactionPerMonth': avg_transaction_per_month,
    'AvgTransactionAmount': avg_transaction_amount,
    'DeviceType': device_type,
    'AppUsageTime': app_usage_time,
    'AppUsageDuration': app_usage_duration,
    'Browser': browser,
    'ChatbotUsage': chatbot_usage,
    'ActiveUsers': active_users,
    'DailyNewUsers': daily_new_users,
    'AverageSessionLength': average_session_length,
    'MonthlyRevenue': monthly_revenue,
    'IsFraud': fraud_label
})

df.head(2).T

Unnamed: 0,0,1
Age,47,41
Gender,Female,Male
Location,Suburban,Suburban
Marital_Status,Single,Single
Employment,Unemployed,Self-Employed
AccountCreationDate,2020-12-31 00:00:00,2019-12-31 00:00:00
SavingsAmount,97513.057229,97394.020289
AvgTransactionValue,9005.221964,2028.524843
TransactionCountLast30Days,9,25
AvgTransactionPerMonth,32.633458,83.214358


In [8]:
# Features for classification (assuming only numerical features are used)
features_to_use = ['Age', 'SavingsAmount', 'AvgTransactionValue', 
                   'TransactionCountLast30Days', 'AvgTransactionPerMonth', 
                   'AvgTransactionAmount', 'AppUsageTime', 'AppUsageDuration']

# Extract feature set
X = df[features_to_use]
y = df['IsFraud']

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize Isolation Forest model
iso_forest = IsolationForest(n_estimators=100, contamination=0.05, random_state=42)

# Train the model on the training set
iso_forest.fit(X_train)

# Make predictions: 1 for normal, -1 for anomaly
y_pred = iso_forest.predict(X_test)

# Convert the prediction to match our labels: 0 for normal, 1 for fraud
y_pred = np.where(y_pred == 1, 0, 1)

# Evaluation Metrics
# Calculate Precision
precision = precision_score(y_test, y_pred)
print(f"Precision: {precision}")

# Calculate Recall
recall = recall_score(y_test, y_pred)
print(f"Recall: {recall}")

# Calculate F1-score
f1 = f1_score(y_test, y_pred)
print(f"F1-Score: {f1}")

# Calculate ROC AUC
roc_auc = roc_auc_score(y_test, y_pred)
print(f"ROC AUC: {roc_auc}")

# Calculate Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print(f"Confusion Matrix: \n{conf_matrix}")


Precision: 0.5172413793103449
Recall: 0.05813953488372093
F1-Score: 0.10452961672473868
ROC AUC: 0.5001441476071498
Confusion Matrix: 
[[228  14]
 [243  15]]
