In [34]:
# Simplest Possible Fraud Detection

import os

import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split


In [35]:
# Load the dataset
# The CSV location can be overridden by setting the FRAUD_DATA_CSV environment
# variable, so the notebook is not tied to one machine's directory layout.
# When the variable is unset or empty, the original local path is used.
DEFAULT_DATA_PATH = r"C:\Users\SHRILAKSHMI\OneDrive\Desktop\python projects\Fraud_Detection\fraud_detection\fraud_detection\data\csv_output\2018-04-01.csv"
DATA_PATH = os.environ.get("FRAUD_DATA_CSV") or DEFAULT_DATA_PATH
df = pd.read_csv(DATA_PATH)


In [36]:
# Quick sanity check on the loaded frame: its (rows, columns) shape,
# followed by a preview of the first rows.
print("Dataset shape:", df.shape)
print("\nFirst 5 rows:", df.head(), sep="\n")

Dataset shape: (9488, 9)

First 5 rows:
   TRANSACTION_ID          TX_DATETIME  CUSTOMER_ID  TERMINAL_ID  TX_AMOUNT  \
0               0  2018-04-01 00:00:31          596         3156      57.16   
1               1  2018-04-01 00:02:10         4961         3412      81.51   
2               2  2018-04-01 00:07:56            2         1365     146.00   
3               3  2018-04-01 00:09:29         4128         8737      64.49   
4               4  2018-04-01 00:10:34          927         9906      50.99   

   TX_TIME_SECONDS  TX_TIME_DAYS  TX_FRAUD  TX_FRAUD_SCENARIO  
0               31             0         0                  0  
1              130             0         0                  0  
2              476             0         0                  0  
3              569             0         0                  0  
4              634             0         0                  0  


In [37]:
# Build the model input: the transaction amount plus the two time columns,
# extracted from the frame as a plain NumPy array.
features = [
    'TX_AMOUNT',
    'TX_TIME_SECONDS',
    'TX_TIME_DAYS',
]
X = df.loc[:, features].to_numpy()

In [38]:
# Fit an Isolation Forest on the feature matrix. No labels are passed:
# the model is trained purely on X.
# random_state is fixed so repeated runs build the same forest.
forest_params = {
    "contamination": 0.01,
    "random_state": 42,
}
model = IsolationForest(**forest_params)
model.fit(X)


In [39]:
# Score the same rows the model was fitted on.
# decision_function yields one continuous score per row;
# predict yields a label per row: 1 for normal, -1 for anomalies.
anomaly_scores = model.decision_function(X)
predictions = model.predict(X)

In [40]:
# Store the model's verdict on the frame as a 0/1 flag column:
# 1 where the prediction was -1 (anomaly), 0 otherwise (normal).
is_outlier = predictions == -1
df['anomaly'] = is_outlier.astype(int)


In [41]:
# Summarise the flag column: how many rows were flagged and what share
# of all transactions that represents.
flag_col = df['anomaly']
flagged_count = flag_col.sum()
flagged_pct = flag_col.mean() * 100
print("\nDetected anomalies (potential frauds):", flagged_count)
print(f"Percentage of transactions flagged: {flagged_pct:.2f}%")


Detected anomalies (potential frauds): 95
Percentage of transactions flagged: 1.00%


In [42]:
# When the frame carries ground-truth labels (TX_FRAUD), compare the model's
# flags against them.
if 'TX_FRAUD' in df.columns:
    fraud_labels = df['TX_FRAUD']
    is_flagged = df['anomaly'] == 1
    fraud_total = fraud_labels.sum()
    fraud_pct = fraud_labels.mean() * 100

    print("\nActual fraud statistics:")
    print("Number of fraudulent transactions:", fraud_total)
    print(f"Percentage of fraudulent transactions: {fraud_pct:.2f}%")

    # Hit rate: the share of labelled frauds that the model also flagged.
    # The ratio is only computed when at least one labelled fraud exists,
    # which avoids dividing by zero.
    hits = (is_flagged & (fraud_labels == 1)).sum()
    if fraud_total > 0:
        hit_rate = hits / fraud_total
        print(f"\nPercentage of frauds detected: {hit_rate * 100:.2f}%")

    # Rows the model flagged although the label says non-fraud. Measured
    # against TX_FRAUD these are false positives; they are reported here as
    # candidates for frauds the labels might have missed.
    potential_new_frauds = (is_flagged & (fraud_labels == 0)).sum()
    print(f"Potential new frauds detected: {potential_new_frauds}")


Actual fraud statistics:
Number of fraudulent transactions: 3
Percentage of fraudulent transactions: 0.03%

Percentage of frauds detected: 100.00%
Potential new frauds detected: 92


In [43]:

# Conclusion: report whether the model flagged at least one transaction.
# NOTE(review): the model is built with a fixed contamination value, so some
# share of the rows is presumably always flagged and the first message is the
# expected outcome for any non-trivial dataset -- confirm this is intended.
any_flagged = df['anomaly'].sum() > 0
conclusion = (
    "\nCONCLUSION: This dataset CONTAINS potential fraudulent transactions."
    if any_flagged
    else "\nCONCLUSION: No potential fraudulent transactions detected in this dataset."
)
print(conclusion)



CONCLUSION: This dataset CONTAINS potential fraudulent transactions.
