# 1. Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# 2. Data Loading

In [2]:
# Generate synthetic data
n = 1000  # Number of transactions

# Generate transaction IDs
transaction_ids = np.arange(1, n + 1)

# Generate random amounts between 1 and 1000
amounts = np.random.randint(1, 1001, n)

# Generate random sender and receiver IDs
senders = np.random.randint(1, 101, n)
receivers = np.random.randint(1, 101, n)

# Generate random timestamps
timestamps = pd.date_range(start='2021-01-01', end='2021-12-31', periods=n)

# Generate account creation dates (for the sake of this example, we'll assume the same for sender and receiver)
account_creation_dates = pd.date_range(start='2020-01-01', end='2020-12-31', periods=n)


In [3]:
# Generate user profile data: Age (20-60), Location (1-5 as different regions), Employment (0-unemployed, 1-employed)
ages = np.random.randint(20, 61, n)
locations = np.random.randint(1, 6, n)
employments = np.random.randint(0, 2, n)

# Create the data frame
df = pd.DataFrame({
    'Transaction_ID': transaction_ids,
    'Amount': amounts,
    'Sender': senders,
    'Receiver': receivers,
    'Timestamp': timestamps,
    'Account_Creation_Date': account_creation_dates,
    'Age': ages,
    'Location': locations,
    'Employment': employments,
})



In [4]:
# Introduce some fraudulent transactions (for the sake of this example, we'll flag transactions over 900 as fraudulent)
df['Is_Fraud'] = (df['Amount'] > 900).astype(int)

In [5]:
df

Unnamed: 0,Transaction_ID,Amount,Sender,Receiver,Timestamp,Account_Creation_Date,Age,Location,Employment,Is_Fraud
0,1,237,69,38,2021-01-01 00:00:00.000000000,2020-01-01 00:00:00.000000000,27,5,0,0
1,2,617,42,85,2021-01-01 08:44:41.081081081,2020-01-01 08:46:07.567567567,34,2,1,0
2,3,170,14,66,2021-01-01 17:29:22.162162162,2020-01-01 17:32:15.135135135,49,5,1,0
3,4,470,8,57,2021-01-02 02:14:03.243243243,2020-01-02 02:18:22.702702702,41,2,0,0
4,5,408,58,93,2021-01-02 10:58:44.324324324,2020-01-02 11:04:30.270270270,28,3,0,0
...,...,...,...,...,...,...,...,...,...,...
995,996,802,32,11,2021-12-29 13:01:15.675675676,2020-12-29 12:55:29.729729728,44,3,0,0
996,997,249,83,74,2021-12-29 21:45:56.756756756,2020-12-29 21:41:37.297297296,50,1,0,0
997,998,272,60,94,2021-12-30 06:30:37.837837840,2020-12-30 06:27:44.864864864,45,4,0,0
998,999,387,31,90,2021-12-30 15:15:18.918918920,2020-12-30 15:13:52.432432432,58,3,0,0


In [6]:
df.Is_Fraud.value_counts()

0    892
1    108
Name: Is_Fraud, dtype: int64

# 3. Data Pre - Processoing

In [7]:
# Feature engineering: Extract time-based features from the timestamp
df['Hour'] = df['Timestamp'].dt.hour
df['Day'] = df['Timestamp'].dt.day
df['Weekday'] = df['Timestamp'].dt.weekday

# Let's use these engineered features plus the Amount for anomaly detection
features = ['Amount', 'Hour', 'Day', 'Weekday']


In [9]:
features

['Amount', 'Hour', 'Day', 'Weekday']

In [10]:
X = df[features]
y = df['Is_Fraud']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
model = IsolationForest(contamination=0.1)
model.fit(X_train)

# Get predictions (1 for inliers, -1 for outliers)
y_pred = model.predict(X_test)
y_pred = [1 if x == 1 else 0 for x in y_pred]  # Convert to 0 and 1

# Evaluate the model
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.79      0.15      0.25       181
           1       0.07      0.63      0.13        19

    accuracy                           0.20       200
   macro avg       0.43      0.39      0.19       200
weighted avg       0.73      0.20      0.24       200



