In [5]:
# ================================
# CRYPTO TRANSACTION FRAUD ANALYSIS
# ================================

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
import networkx as nx

# --------------------------------
# 1. Load all exchange datasets
# --------------------------------

DATA_DIR = "/mnt/data"   # change if files are elsewhere
files = {
    "binance": r"C:\Users\risha\Downloads\archive (9)\data_2024\binance.csv",
    "coinbase": r"C:\Users\risha\Downloads\archive (9)\data_2024\coinbase.csv",
    "kraken": r"C:\Users\risha\Downloads\archive (9)\data_2024\kraken.csv",
    "kucoin": r"C:\Users\risha\Downloads\archive (9)\data_2024\kucoin.csv"
}


dataframes = {}

for name, filename in files.items():
    filepath = os.path.join(DATA_DIR, filename)
    df = pd.read_csv(filepath)

    print(f"\nLoaded {name} → shape: {df.shape}")
    print(df.head())

    dataframes[name] = df

# --------------------------------
# 2. Standardize column names
# --------------------------------

def normalize_columns(df):
    df = df.copy()
    df.columns = df.columns.str.lower().str.strip()

    rename_map = {
        "timestamp": "timestamp",
        "time": "timestamp",
        "date": "timestamp",
        "txid": "tx_id",
        "tx_hash": "tx_id",
        "transaction_id": "tx_id",
        "from": "from_address",
        "sender": "from_address",
        "to": "to_address",
        "receiver": "to_address",
        "amount": "amount",
        "qty": "amount",
        "value": "amount",
        "asset": "asset",
        "currency": "asset"
    }

    for col in df.columns:
        if col in rename_map:
            df.rename(columns={col: rename_map[col]}, inplace=True)

    # Handle missing columns safely
    if "asset" not in df.columns:
        df["asset"] = "UNKNOWN"

    return df

for name in dataframes:
    dataframes[name] = normalize_columns(dataframes[name])

# --------------------------------
# 3. Merge into one combined dataset
# --------------------------------

combined = pd.concat(dataframes.values(), ignore_index=True)
print("\nCombined dataset shape:", combined.shape)

# --------------------------------
# 4. Clean + Convert Data Types
# --------------------------------

# Convert timestamp
combined["timestamp"] = pd.to_datetime(combined["timestamp"], errors="coerce")

# Coerce amount to numeric
combined["amount"] = pd.to_numeric(combined["amount"], errors="coerce")

# Drop empty rows
combined = combined.dropna(subset=["timestamp", "amount"])

# Add derived columns
combined["date"] = combined["timestamp"].dt.date
combined["hour"] = combined["timestamp"].dt.hour

print("\nCleaned dataset preview:")
print(combined.head())

# --------------------------------
# 5. Basic Insights
# --------------------------------

print("\n===== BASIC INSIGHTS =====")
print("Total Transactions:", len(combined))
print("Total Volume:", combined["amount"].sum())
print("Unique Assets:", combined["asset"].nunique())

print("\nTop 10 assets by volume:")
print(combined.groupby("asset")["amount"].sum().sort_values(ascending=False).head(10))

# Daily transaction count
daily_counts = combined.groupby("date")["tx_id"].count()
print("\nDaily transaction counts:")
print(daily_counts.head())

# --------------------------------
# 6. Visualization
# --------------------------------

# A) Transaction Amount Distribution
plt.figure(figsize=(8,5))
combined["amount"].plot(kind="hist", bins=50)
plt.title("Amount Distribution")
plt.xlabel("Amount")
plt.savefig(os.path.join(DATA_DIR, "amount_distribution.png"))
plt.close()

# B) Daily Transaction Count
plt.figure(figsize=(10,5))
daily_counts.plot()
plt.title("Daily Transaction Count")
plt.xlabel("Date")
plt.ylabel("Transactions")
plt.savefig(os.path.join(DATA_DIR, "daily_transactions.png"))
plt.close()

# --------------------------------
# 7. Detect Suspicious Transactions (ANOMALY DETECTION)
# --------------------------------

print("\n===== RUNNING ANOMALY DETECTION =====")

clean_for_ml = combined.dropna(subset=["amount"]).copy()

clean_for_ml["log_amount"] = np.log1p(clean_for_ml["amount"])

model = IsolationForest(contamination=0.02, random_state=42)
clean_for_ml["anomaly_score"] = model.fit_predict(clean_for_ml[["amount", "log_amount"]])

# Anomalies = -1
suspicious = clean_for_ml[clean_for_ml["anomaly_score"] == -1]
print("\nSuspicious transactions detected:", len(suspicious))

# Save them
suspicious.to_csv(os.path.join(DATA_DIR, "suspicious_transactions.csv"), index=False)
print("Saved suspicious transactions → suspicious_transactions.csv")

# --------------------------------
# 8. Graph Analysis (Address Network)
# --------------------------------

if "from_address" in combined.columns and "to_address" in combined.columns:
    print("\n===== GRAPH ANALYSIS =====")

    graph_df = combined.dropna(subset=["from_address", "to_address"])
    G = nx.from_pandas_edgelist(
        graph_df,
        source="from_address",
        target="to_address",
        edge_attr="amount",
        create_using=nx.DiGraph()
    )

    pagerank = nx.pagerank(G)
    pr_df = pd.DataFrame.from_dict(pagerank, orient="index", columns=["pagerank"])
    pr_df = pr_df.sort_values(by="pagerank", ascending=False)

    print("\nTop 10 high-impact addresses (PageRank):")
    print(pr_df.head(10))

    pr_df.to_csv(os.path.join(DATA_DIR, "top_addresses_pagerank.csv"))
else:
    print("\nGraph analysis skipped (from/to columns missing).")

print("\nAnalysis Complete. All files saved to:", DATA_DIR)



Loaded binance → shape: (2000, 8)
                  time     open     high      low    close     volume  \
0  2024-10-18 15:00:00  0.03861  0.03872  0.03853  0.03861  2005.2370   
1  2024-10-18 16:00:00  0.03860  0.03862  0.03844  0.03855  1241.2893   
2  2024-10-18 17:00:00  0.03854  0.03855  0.03845  0.03852   798.4479   
3  2024-10-18 18:00:00  0.03853  0.03880  0.03851  0.03867  2854.0002   
4  2024-10-18 19:00:00  0.03867  0.03872  0.03860  0.03865   813.1840   

      pair exchange  
0  ETH-BTC  Binance  
1  ETH-BTC  Binance  
2  ETH-BTC  Binance  
3  ETH-BTC  Binance  
4  ETH-BTC  Binance  

Loaded coinbase → shape: (2000, 8)
                  time    open    high     low   close    volume     pair  \
0  2024-11-29 06:00:00  2.2789  2.2806  2.2789  2.2806      3.14  RLC-USD   
1  2024-11-29 05:00:00  2.2857  2.2858  2.2721  2.2721   7485.58  RLC-USD   
2  2024-11-29 04:00:00  2.2602  2.2854  2.2494  2.2854   2610.41  RLC-USD   
3  2024-11-29 03:00:00  2.2813  2.2813  2.2538  2.

KeyError: 'amount'