In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

# ==========================================
# 1. LOAD DATA
# ==========================================
# Read the raw table into `df`; every later section mutates this frame in
# place. The relative path is resolved against the current working directory.
print("Loading data...")
df = pd.read_csv(filepath_or_buffer='Dataset.csv')

# ==========================================
# 2. FEATURE ENGINEERING (The "Secret Sauce")
# ==========================================
print("Creating new features...")

# Feature A: Risky_Combo
# 1 when Col 4 equals 0 AND Col 6 equals 1, otherwise 0.
# Per the original author's notes this pairing ("new place" + a specific
# approval type) was a very strong fraud indicator in an earlier deep dive
# -- NOTE(review): that analysis is not part of this file; confirm.
df['Risky_Combo'] = (df['Col 4'].eq(0) & df['Col 6'].eq(1)).astype(int)

# Feature B: Dist_Value_Interaction
# Element-wise product of Col 1 and Col 3, so rows that are large on both
# columns stand out (original intent: high value at high distance).
df['Dist_Value_Interaction'] = df['Col 1'].mul(df['Col 3'])

# Feature C: Impossible_Travel
# 1 when Col 2 is strictly greater than 50, otherwise 0.
# The cut-off is a tunable domain-knowledge threshold; the original notes
# describe Col 2 as distance from the last transaction in km -- TODO confirm.
df['Impossible_Travel'] = df['Col 2'].gt(50).astype(int)

# ==========================================
# 3. AI MODEL: ISOLATION FOREST
# ==========================================
print("Training AI Model (Isolation Forest)...")

# Model inputs: the seven raw columns plus ALL engineered features from
# section 2 (Risky_Combo, Dist_Value_Interaction and Impossible_Travel).
features_to_use = ['Col 1', 'Col 2', 'Col 3', 'Col 4', 'Col 5', 'Col 6', 'Col 7',
                   'Risky_Combo', 'Dist_Value_Interaction', 'Impossible_Travel']

X = df[features_to_use]

# Standardize each column to zero mean / unit variance.
# NOTE(review): Isolation Forest splits on one feature at a time, so
# per-column rescaling is not expected to materially change which rows are
# isolated; the step is kept so the pipeline matches earlier runs.
# Only X_scaled is standardized -- `df` itself keeps the raw values.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# contamination=0.05 sets the decision threshold so that roughly the most
# anomalous 5% of rows are flagged; contamination='auto' would instead let
# the library pick the threshold. random_state pins the tree construction
# for reproducible results; n_jobs=-1 uses all available cores.
model = IsolationForest(n_estimators=200,
                        contamination=0.05,
                        random_state=42,
                        n_jobs=-1)

# Fit on the full table and label every row:
# -1 = Anomaly (treated as potential fraud), 1 = Normal
df['Anomaly_Flag'] = model.fit_predict(X_scaled)

# Continuous anomaly score (lower = more anomalous). Useful for ranking
# transactions from "Most Suspicious" to "Least".
df['Anomaly_Score'] = model.decision_function(X_scaled)

# ==========================================
# 4. EXPLAINABILITY (For your Dashboard)
# ==========================================
print("Generating explanations for the dashboard...")

def get_fraud_reason(row, *, distance_threshold=100, value_threshold=4):
    """
    Return a plain-text reason for why a transaction was flagged.

    Intended for the 'Why' column of the dashboard. These are simple rule
    checks on the row's values, not an explanation extracted from the model.

    Parameters:
        row: Mapping-like record (e.g. a DataFrame row) providing the keys
            'Anomaly_Flag', 'Col 1', 'Col 2', 'Col 3', 'Col 4' and 'Col 6'.
        distance_threshold: 'Col 1' or 'Col 2' strictly above this value
            triggers the distance reason. Defaults to 100.
        value_threshold: 'Col 3' strictly above this value triggers the
            high-value reason. Defaults to 4.

    Returns:
        "Normal" when 'Anomaly_Flag' is 1. Otherwise the matching reasons
        joined with " + ", or "Statistical Anomaly (Complex Pattern)" when
        no rule matches.

    Notes:
        When applied to the main frame the thresholds are compared against
        raw column values: the standardized matrix used for training is
        never written back to the frame.
    """
    # Unflagged rows need no explanation.
    if row['Anomaly_Flag'] == 1:
        return "Normal"

    reasons = []

    # Same condition as the Risky_Combo feature.
    # NOTE(review): the label says "Approval Type 2" while the check is
    # Col 6 == 1 -- confirm the encoding of Col 6 before changing either.
    if row['Col 4'] == 0 and row['Col 6'] == 1:
        reasons.append("Suspicious Local Pattern (New Place + Approval Type 2)")

    # NOTE(review): the Impossible_Travel feature uses a cut-off of 50 on
    # Col 2, whereas the default here is 100 -- confirm which is intended.
    if row['Col 1'] > distance_threshold or row['Col 2'] > distance_threshold:
        reasons.append("High Distance / Impossible Travel")

    # Assumes value_threshold is well above a typical Col 3 value -- TODO confirm.
    if row['Col 3'] > value_threshold:
        reasons.append("Abnormal High Value")

    # Flagged by the model, but none of the hand-written rules fired.
    if not reasons:
        return "Statistical Anomaly (Complex Pattern)"

    return " + ".join(reasons)

# Build the Fraud_Reason column.
# get_fraud_reason returns "Normal" for every row whose Anomaly_Flag is 1, so
# that label is filled in directly and the slow row-by-row apply only runs
# on the flagged rows. The result is the same as applying the function to
# every row, with far fewer Python-level calls on large tables.
flagged_rows = df['Anomaly_Flag'] != 1
fraud_reasons = pd.Series("Normal", index=df.index, dtype=object)
if flagged_rows.any():
    # .to_numpy() assigns positionally, so the index does not need to be unique.
    fraud_reasons.loc[flagged_rows] = (
        df.loc[flagged_rows].apply(get_fraud_reason, axis=1).to_numpy()
    )
df['Fraud_Reason'] = fraud_reasons

# ==========================================
# 5. SAVE RESULTS
# ==========================================
output_filename = 'Fraud_Analysis_Results.csv'
print(f"Saving results to {output_filename}...")

# Write the whole frame (original columns + engineered features + model
# outputs) without the pandas index column.
df.to_csv(output_filename, index=False)

# Summary: count the rows the model labelled as anomalies (-1).
num_outliers = int((df['Anomaly_Flag'] == -1).sum())
print(f"\nSUCCESS! Found {num_outliers} potential fraudulent transactions.")
print(f"Data saved to {output_filename}")
print(
    "Columns added:",
    "- Anomaly_Flag: -1 for Fraud, 1 for Normal",
    "- Anomaly_Score: The lower the score, the more likely it is fraud",
    "- Fraud_Reason: Text explanation for the dashboard",
    sep="\n",
)

Loading data...
Creating new features...
Training AI Model (Isolation Forest)...
Generating explanations for the dashboard...
Saving results to Fraud_Analysis_Results.csv...

SUCCESS! Found 49996 potential fraudulent transactions.
Data saved to Fraud_Analysis_Results.csv
Columns added:
- Anomaly_Flag: -1 for Fraud, 1 for Normal
- Anomaly_Score: The lower the score, the more likely it is fraud
- Fraud_Reason: Text explanation for the dashboard
