In [1]:
import os
import json
import yaml
import pickle
import pandas as pd
import numpy as np
import networkx as nx

from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import IsolationForest

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import save_model

# ============================================================
# PATHS
# ============================================================

BASE_DIR = r"C:\Users\NXTWAVE\Downloads\Carbon Credit Fraud Detection"
DATA_PATH = os.path.join(BASE_DIR, "Table_5.5_Carbon_Stock_in_India_Forests.csv")

MODEL_DIR = os.path.join(BASE_DIR, "model")
META_DIR = os.path.join(BASE_DIR, "metadata")
RESULT_DIR = os.path.join(BASE_DIR, "results")

os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(META_DIR, exist_ok=True)
os.makedirs(RESULT_DIR, exist_ok=True)

# ============================================================
# LOAD DATA
# ============================================================

df = pd.read_csv(DATA_PATH)
df.columns = df.columns.str.strip().str.replace(" ", "_")

print("Dataset Shape:", df.shape)
print("Columns:", df.columns.tolist())

# ============================================================
# BASIC CLEANING
# ============================================================

df.fillna(method="ffill", inplace=True)
df.reset_index(drop=True, inplace=True)

# Create a pseudo issuer / project id for graph modeling
df["issuer_id"] = df.index % 10
df["project_id"] = df.index

# ============================================================
# GRAPH CONSTRUCTION
# ============================================================

G = nx.Graph()

for _, row in df.iterrows():
    G.add_node(row["project_id"], carbon_stock=row[df.columns[1]])
    G.add_edge(row["issuer_id"], row["project_id"])

# Graph-based features
df["degree_centrality"] = df["project_id"].apply(
    lambda x: nx.degree(G, x)
)

df["issuer_connection_count"] = df["issuer_id"].apply(
    lambda x: nx.degree(G, x)
)

# ============================================================
# FEATURE SELECTION
# ============================================================

numeric_cols = df.select_dtypes(include=["int64", "float64"]).columns.tolist()
X = df[numeric_cols]

# ============================================================
# SCALING
# ============================================================

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

with open(os.path.join(BASE_DIR, "scaler.pkl"), "wb") as f:
    pickle.dump(scaler, f)

# ============================================================
# ANOMALY DETECTION MODEL
# ============================================================

iso_forest = IsolationForest(
    n_estimators=200,
    contamination=0.08,
    random_state=42
)

fraud_labels = iso_forest.fit_predict(X_scaled)
df["fraud_label"] = fraud_labels
df["fraud_flag"] = df["fraud_label"].apply(lambda x: "Fraud" if x == -1 else "Legit")

# ============================================================
# SAVE MODEL (.pkl)
# ============================================================

with open(os.path.join(MODEL_DIR, "carbon_fraud_model.pkl"), "wb") as f:
    pickle.dump(iso_forest, f)

# ============================================================
# OPTIONAL NEURAL MODEL (.h5)
# ============================================================

nn_model = Sequential([
    Dense(32, activation="relu", input_shape=(X_scaled.shape[1],)),
    Dense(16, activation="relu"),
    Dense(1, activation="sigmoid")
])

nn_model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

nn_model.fit(
    X_scaled,
    (fraud_labels == -1).astype(int),
    epochs=15,
    batch_size=8,
    verbose=1
)

save_model(nn_model, os.path.join(MODEL_DIR, "carbon_fraud_model.h5"))

# ============================================================
# SAVE RESULTS (.json)
# ============================================================

results_json = df[[
    "project_id",
    "issuer_id",
    "fraud_flag"
]].to_dict(orient="records")

with open(os.path.join(RESULT_DIR, "fraud_predictions.json"), "w") as f:
    json.dump(results_json, f, indent=4)

# ============================================================
# SAVE METADATA (.yaml)
# ============================================================

schema = {
    "project": "AI-Driven Carbon Credit Fraud Detection",
    "domain": "Web3 + ClimateTech",
    "ml_technique": [
        "Graph ML",
        "Isolation Forest",
        "Neural Anomaly Detection"
    ],
    "features_used": numeric_cols,
    "labels": {
        "Fraud": -1,
        "Legit": 1
    }
}

with open(os.path.join(META_DIR, "schema.yaml"), "w") as f:
    yaml.dump(schema, f)

# ============================================================
# README
# ============================================================

readme_text = """
AI-Driven Carbon Credit Fraud Detection

Problem:
Detect fake or duplicate carbon credits in blockchain-based carbon markets.

ML Stack:
- Graph-based Feature Engineering
- Isolation Forest (Anomaly Detection)
- Neural Network Validation

Outputs:
- Trained Models (.pkl, .h5)
- Fraud Predictions (.json)
- Metadata (.yaml)

Use-cases:
- Carbon Exchanges
- ESG Audits
- Climate Finance Risk Assessment
"""

with open(os.path.join(BASE_DIR, "README.txt"), "w") as f:
    f.write(readme_text)

print("✅ Carbon Credit Fraud Detection Pipeline Completed Successfully!")



Dataset Shape: (36, 6)
Columns: ['States/UTs', 'Geographical_-_Area_(sq_km)', 'RFA_-_(sq_km)', 'Volume_of_Growing_Stock_(m._cum)_-_In_Forest', 'Volume_of_Growing_Stock_(m._cum)_-_In_TOF', 'Volume_of_Growing_Stock_(m._cum)_-_Total']


  df.fillna(method="ffill", inplace=True)




Epoch 1/15


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
✅ Carbon Credit Fraud Detection Pipeline Completed Successfully!


  save_model(nn_model, os.path.join(MODEL_DIR, "carbon_fraud_model.h5"))
