In [1]:
import pandas as pd
import numpy as np
import random
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
# Create a list of sample data for each feature
n = 2500  # Number of users
age = [random.randint(18, 65) for _ in range(n)]
gender = [random.choice(['Male', 'Female', 'Other']) for _ in range(n)]
location = [random.choice(['Urban', 'Suburban', 'Rural']) for _ in range(n)]
marital_status = [random.choice(['Single', 'Married', 'Divorced']) for _ in range(n)]
employment = [random.choice(['Employed', 'Unemployed', 'Self-Employed']) for _ in range(n)]
account_creation_date = pd.date_range(start='2019-01-01', end='2022-01-01', freq='M').to_list()
account_creation_date = [random.choice(account_creation_date) for _ in range(n)]
savings_amount = [random.uniform(1000.0, 100000.0) for _ in range(n)]
avg_transaction_value = [random.uniform(100.0, 10000.0) for _ in range(n)]
transaction_count_last_30_days = [random.randint(0, 30) for _ in range(n)]
avg_transaction_per_month = [random.uniform(0, 100) for _ in range(n)]
avg_transaction_amount = [random.uniform(50, 5000) for _ in range(n)]
device_type = [random.choice(['Mobile', 'Desktop', 'Tablet']) for _ in range(n)]
app_usage_time = [random.randint(0, 300) for _ in range(n)]  # in minutes
app_usage_duration = [random.randint(1, 365) for _ in range(n)]  # in days
browser = [random.choice(['Chrome', 'Firefox', 'Safari', 'Edge']) for _ in range(n)]
chatbot_usage = [random.choice([0, 1]) for _ in range(n)]
active_users = [random.randint(100, 1000) for _ in range(n)]
daily_new_users = [random.randint(0, 10) for _ in range(n)]
average_session_length = [random.uniform(0.5, 3.0) for _ in range(n)]  # in hours
monthly_revenue = [random.uniform(1000.0, 100000.0) for _ in range(n)]
fraud_label = [random.choice([0, 1]) for _ in range(n)]

# Create the DataFrame
df = pd.DataFrame({
    'Age': age,
    'Gender': gender,
    'Location': location,
    'Marital_Status': marital_status,
    'Employment': employment,
    'AccountCreationDate': account_creation_date,
    'SavingsAmount': savings_amount,
    'AvgTransactionValue': avg_transaction_value,
    'TransactionCountLast30Days': transaction_count_last_30_days,
    'AvgTransactionPerMonth': avg_transaction_per_month,
    'AvgTransactionAmount': avg_transaction_amount,
    'DeviceType': device_type,
    'AppUsageTime': app_usage_time,
    'AppUsageDuration': app_usage_duration,
    'Browser': browser,
    'ChatbotUsage': chatbot_usage,
    'ActiveUsers': active_users,
    'DailyNewUsers': daily_new_users,
    'AverageSessionLength': average_session_length,
    'MonthlyRevenue': monthly_revenue,
    'IsFraud': fraud_label
})

df.head(2).T

Unnamed: 0,0,1
Age,19,54
Gender,Male,Other
Location,Urban,Rural
Marital_Status,Single,Divorced
Employment,Unemployed,Unemployed
AccountCreationDate,2019-08-31 00:00:00,2020-01-31 00:00:00
SavingsAmount,47660.802683,75365.977503
AvgTransactionValue,9249.113347,9978.785383
TransactionCountLast30Days,24,11
AvgTransactionPerMonth,78.42488,2.720406


In [3]:
from sklearn.neighbors import NearestNeighbors
import numpy as np
import pandas as pd

# Assuming 'df' is your DataFrame and has been loaded
# Select features to include in the KNN model
features_to_check = ['Age', 'SavingsAmount', 'AvgTransactionValue', 
                     'TransactionCountLast30Days', 'AvgTransactionPerMonth', 
                     'AvgTransactionAmount', 'AppUsageTime', 'AppUsageDuration']

# Prepare the feature matrix
X = df[features_to_check].values

# Initialize the k-NN model
knn = NearestNeighbors(n_neighbors=5, algorithm='auto', metric='euclidean')
knn.fit(X)

# Find the k-neighbors of each point in object X
distances, indices = knn.kneighbors(X)

# Calculate the mean distance of the k-neighbors for each sample
mean_distances = np.mean(distances, axis=1)

# Define a threshold (commonly a distance that is higher than a particular percentile could be considered as an anomaly)
threshold = np.percentile(mean_distances, 95)

# Identify anomalies (observations whose mean distance to k neighbors is above the threshold)
anomalies_knn = np.where(mean_distances > threshold)[0]

# Label as fraudulent if any of the observations are anomalies
df['DetectedAsFraud_KNN'] = 0
df.loc[df.index.isin(anomalies_knn), 'DetectedAsFraud_KNN'] = 1

# See how many we detected as fraud compared to actual frauds
print("Detected as Fraud with KNN: ", df['DetectedAsFraud_KNN'].sum())
print("Actual Fraud: ", df['IsFraud'].sum())

# Your evaluation metrics code here to see how well we did (precision, recall, etc.)


Detected as Fraud with KNN:  125
Actual Fraud:  1231


In [5]:
# Create a new column in the DataFrame to keep track of detected anomalies by KNN
# Initialize with zeros
df['IsAnomaly_KNN'] = 0

# Set the detected anomalies to 1
df.loc[anomalies_knn, 'IsAnomaly_KNN'] = 1

# True labels
y_true = df['IsFraud']

# Predicted labels
y_pred = df['IsAnomaly_KNN']

# Evaluation metrics
# Calculate Precision
precision = precision_score(y_true, y_pred)
print(f"Precision: {precision}")

# Calculate Recall
recall = recall_score(y_true, y_pred)
print(f"Recall: {recall}")

# Calculate F1-score
f1 = f1_score(y_true, y_pred)
print(f"F1-Score: {f1}")

# Calculate ROC AUC
roc_auc = roc_auc_score(y_true, y_pred)
print(f"ROC AUC: {roc_auc}")

# Calculate the confusion matrix to evaluate the accuracy of the classification
conf_matrix = confusion_matrix(y_true, y_pred)
print(f"Confusion Matrix: \n{conf_matrix}")


Precision: 0.464
Recall: 0.047116165718927704
F1-Score: 0.0855457227138643
ROC AUC: 0.4971593436947672
Confusion Matrix: 
[[1202   67]
 [1173   58]]
