In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ethereum-data/Eth_Txs.csv
/kaggle/input/ethereum-transaction-dataset-raw/ethereum.csv
/kaggle/input/ethereum-frauddetection-dataset/transaction_dataset.csv


In [2]:
# ==========================================
# CORRECTED FRAUD DETECTION TRAINING CELL
# ==========================================

# Install deps (usually preinstalled)
!pip install pandas scikit-learn joblib imbalanced-learn --quiet

import pandas as pd
import numpy as np
import os
import joblib

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

# -------------------------------
# 1Ô∏è‚É£ LOAD DATASET
# -------------------------------

path = "/kaggle/input/ethereum-frauddetection-dataset/transaction_dataset.csv"

df = pd.read_csv(path)

print("üìä Dataset Shape:", df.shape)
print("\nColumns:\n", df.columns.tolist())

# -------------------------------
# 2Ô∏è‚É£ CLEANING
# -------------------------------

# Drop ID / non-useful columns
drop_cols = ["Unnamed: 0", "Index", "Address"]
df = df.drop(columns=drop_cols, errors="ignore")

# Target column
y = df["FLAG"]

# Feature columns
X = df.drop(columns=["FLAG"])

# Keep only numeric features
X = X.select_dtypes(include=[np.number])

# Fill missing values
X = X.fillna(0)

print("\n‚úÖ Features Shape:", X.shape)
print("‚úÖ Labels Shape:", y.shape)

# -------------------------------
# 3Ô∏è‚É£ TRAIN / TEST SPLIT
# -------------------------------

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y   # Keeps fraud ratio balanced
)

# -------------------------------
# 4Ô∏è‚É£ HANDLE CLASS IMBALANCE
# -------------------------------

print("\n‚öñÔ∏è Applying SMOTE balancing...")

smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

print("Balanced Training Shape:", X_train_bal.shape)

# -------------------------------
# 5Ô∏è‚É£ MODEL TRAINING
# -------------------------------

print("\nüöÄ Training RandomForest Fraud Model...")

model = RandomForestClassifier(
    n_estimators=200,
    max_depth=12,
    random_state=42,
    n_jobs=-1,
    class_weight="balanced"
)

model.fit(X_train_bal, y_train_bal)

# -------------------------------
# 6Ô∏è‚É£ EVALUATION
# -------------------------------

y_pred = model.predict(X_test)

acc = accuracy_score(y_test, y_pred)

print(f"\n‚úÖ Accuracy: {acc:.4f}\n")

print("üìä Classification Report:\n")
print(classification_report(y_test, y_pred, zero_division=0))

# -------------------------------
# 7Ô∏è‚É£ SAVE MODEL
# -------------------------------

os.makedirs("/kaggle/working/models", exist_ok=True)

model_path = "/kaggle/working/models/fraud_model.pkl"
features_path = "/kaggle/working/models/fraud_features.pkl"

joblib.dump(model, model_path)
joblib.dump(X.columns.tolist(), features_path)

print("\nüíæ Model saved at:", model_path)
print("üì¶ Features saved at:", features_path)

print("\nüéâ FRAUD MODEL TRAINING COMPLETE ‚Äî Ready for VS Code.")


üìä Dataset Shape: (9841, 51)

Columns:
 ['Unnamed: 0', 'Index', 'Address', 'FLAG', 'Avg min between sent tnx', 'Avg min between received tnx', 'Time Diff between first and last (Mins)', 'Sent tnx', 'Received Tnx', 'Number of Created Contracts', 'Unique Received From Addresses', 'Unique Sent To Addresses', 'min value received', 'max value received ', 'avg val received', 'min val sent', 'max val sent', 'avg val sent', 'min value sent to contract', 'max val sent to contract', 'avg value sent to contract', 'total transactions (including tnx to create contract', 'total Ether sent', 'total ether received', 'total ether sent contracts', 'total ether balance', ' Total ERC20 tnxs', ' ERC20 total Ether received', ' ERC20 total ether sent', ' ERC20 total Ether sent contract', ' ERC20 uniq sent addr', ' ERC20 uniq rec addr', ' ERC20 uniq sent addr.1', ' ERC20 uniq rec contract addr', ' ERC20 avg time between sent tnx', ' ERC20 avg time between rec tnx', ' ERC20 avg time between rec 2 tnx', ' ERC

In [3]:
# ==========================================
# GAS FEE PREDICTION ‚Äî TRAINING CELL
# ==========================================

# Install deps (usually preinstalled)
!pip install pandas scikit-learn joblib --quiet

import pandas as pd
import numpy as np
import os
import joblib

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# -------------------------------
# 1Ô∏è‚É£ LOAD DATASET
# -------------------------------

path = "/kaggle/input/ethereum-transaction-dataset-raw/ethereum.csv"

df = pd.read_csv(path)

print("üìä Dataset Shape:", df.shape)
print("\nColumns:\n", df.columns.tolist())
print("\nSample Data:")
display(df.head())

# -------------------------------
# 2Ô∏è‚É£ CLEANING
# -------------------------------

# Drop non-ML useful columns
drop_cols = [
    "DateTime (UTC)"
]

df = df.drop(columns=drop_cols, errors="ignore")

# Rename columns for easier handling
df.columns = df.columns.str.replace(" ", "_")
df.columns = df.columns.str.replace("(", "")
df.columns = df.columns.str.replace(")", "")
df.columns = df.columns.str.replace("$", "")
df.columns = df.columns.str.replace("/", "_")

# Fill nulls
df = df.fillna(0)

# -------------------------------
# 3Ô∏è‚É£ FEATURE / TARGET SPLIT
# -------------------------------

target_col = "TxnFeeETH"

# Fix column name if needed
for col in df.columns:
    if "TxnFee" in col and "ETH" in col:
        target_col = col

y = df[target_col]

X = df.drop(columns=[target_col])

# Keep numeric only
X = X.select_dtypes(include=[np.number])

print("\n‚úÖ Features Shape:", X.shape)
print("‚úÖ Target Shape:", y.shape)

# -------------------------------
# 4Ô∏è‚É£ TRAIN / TEST SPLIT
# -------------------------------

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

# -------------------------------
# 5Ô∏è‚É£ MODEL TRAINING
# -------------------------------

print("\nüöÄ Training Gas Prediction Model...")

model = RandomForestRegressor(
    n_estimators=200,
    max_depth=12,
    random_state=42,
    n_jobs=-1
)

model.fit(X_train, y_train)

# -------------------------------
# 6Ô∏è‚É£ EVALUATION
# -------------------------------

y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("\nüìä Regression Metrics:\n")
print(f"MAE  : {mae:.6f}")
print(f"RMSE : {rmse:.6f}")
print(f"R¬≤   : {r2:.4f}")

# -------------------------------
# 7Ô∏è‚É£ SAVE MODEL
# -------------------------------

os.makedirs("/kaggle/working/models", exist_ok=True)

model_path = "/kaggle/working/models/gas_fee_model.pkl"
features_path = "/kaggle/working/models/gas_features.pkl"

joblib.dump(model, model_path)
joblib.dump(X.columns.tolist(), features_path)

print("\nüíæ Model saved at:", model_path)
print("üì¶ Features saved at:", features_path)

print("\nüéâ GAS MODEL TRAINING COMPLETE ‚Äî Ready for VS Code.")


üìä Dataset Shape: (5000, 9)

Columns:
 ['Blockno', 'UnixTimestamp', 'DateTime (UTC)', 'Value_IN(ETH)', 'Value_OUT(ETH)', 'CurrentValue @ $3083.38454496098/Eth', 'TxnFee(ETH)', 'TxnFee(USD)', 'Historical $Price/Eth']

Sample Data:


Unnamed: 0,Blockno,UnixTimestamp,DateTime (UTC),Value_IN(ETH),Value_OUT(ETH),CurrentValue @ $3083.38454496098/Eth,TxnFee(ETH),TxnFee(USD),Historical $Price/Eth
0,19557289,1711929611,2024-04-01 00:00:11,0.0,0,0.0,0.002915,8.988436,3505.52
1,19557290,1711929623,2024-04-01 00:00:23,0.0,0,0.0,0.004286,13.216714,3505.52
2,19557290,1711929623,2024-04-01 00:00:23,0.0,0,0.0,0.000817,2.520163,3505.52
3,19557291,1711929635,2024-04-01 00:00:35,0.0,0,0.0,0.002683,8.271781,3505.52
4,19557291,1711929635,2024-04-01 00:00:35,0.0,0,0.0,0.003421,10.549526,3505.52



‚úÖ Features Shape: (5000, 7)
‚úÖ Target Shape: (5000,)

üöÄ Training Gas Prediction Model...

üìä Regression Metrics:

MAE  : 0.000015
RMSE : 0.000213
R¬≤   : 0.9989

üíæ Model saved at: /kaggle/working/models/gas_fee_model.pkl
üì¶ Features saved at: /kaggle/working/models/gas_features.pkl

üéâ GAS MODEL TRAINING COMPLETE ‚Äî Ready for VS Code.


In [4]:
# ==========================================
# TRANSACTION CLASSIFICATION TRAINING CELL
# ==========================================

!pip install pandas scikit-learn joblib --quiet

import pandas as pd
import numpy as np
import os
import joblib

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# -------------------------------
# 1Ô∏è‚É£ LOAD DATASET
# -------------------------------

path = "/kaggle/input/ethereum-data/Eth_Txs.csv"

df = pd.read_csv(path)

print("üìä Dataset Shape:", df.shape)
print("\nColumns:\n", df.columns.tolist())
display(df.head())

# -------------------------------
# 2Ô∏è‚É£ CLEANING
# -------------------------------

# Clean Value column (remove 'Ether')
df["Value"] = df["Value"].str.replace(" Ether", "")
df["Value"] = pd.to_numeric(df["Value"], errors="coerce")

# Rename TxFee column
df = df.rename(columns={"[TxFee]": "TxFee"})

# Fill nulls
df = df.fillna(0)

# -------------------------------
# 3Ô∏è‚É£ CREATE LABELS (Heuristic)
# -------------------------------

def classify_tx(value, to_addr):
    
    if value < 1:
        return "Small Transfer"
    
    elif value < 10:
        return "Medium Transfer"
    
    elif value >= 10:
        return "High Value Transfer"
    
    else:
        return "Other"

df["TxType"] = df.apply(
    lambda x: classify_tx(x["Value"], str(x["To"])),
    axis=1
)

print("\nLabel Distribution:\n")
print(df["TxType"].value_counts())

# -------------------------------
# 4Ô∏è‚É£ FEATURE ENGINEERING
# -------------------------------

# Convert addresses to simple numeric proxies
df["From_Length"] = df["From"].astype(str).apply(len)
df["To_Length"] = df["To"].astype(str).apply(len)

features = [
    "Value",
    "TxFee",
    "Block",
    "From_Length",
    "To_Length"
]

X = df[features]
y = df["TxType"]

# -------------------------------
# 5Ô∏è‚É£ TRAIN / TEST SPLIT
# -------------------------------

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# -------------------------------
# 6Ô∏è‚É£ MODEL TRAINING
# -------------------------------

print("\nüöÄ Training Transaction Classifier...")

model = RandomForestClassifier(
    n_estimators=150,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)

model.fit(X_train, y_train)

# -------------------------------
# 7Ô∏è‚É£ EVALUATION
# -------------------------------

y_pred = model.predict(X_test)

acc = accuracy_score(y_test, y_pred)

print(f"\n‚úÖ Accuracy: {acc:.4f}\n")
print("üìä Classification Report:\n")
print(classification_report(y_test, y_pred))

# -------------------------------
# 8Ô∏è‚É£ SAVE MODEL
# -------------------------------

os.makedirs("/kaggle/working/models", exist_ok=True)

model_path = "/kaggle/working/models/tx_classifier.pkl"
features_path = "/kaggle/working/models/tx_features.pkl"

joblib.dump(model, model_path)
joblib.dump(features, features_path)

print("\nüíæ Model saved at:", model_path)
print("üì¶ Features saved at:", features_path)

print("\nüéâ TRANSACTION CLASSIFIER COMPLETE.")


üìä Dataset Shape: (499466, 8)

Columns:
 ['Record', 'TxHash', 'Block', 'Age', 'From', 'To', 'Value', '[TxFee]']


Unnamed: 0,Record,TxHash,Block,Age,From,To,Value,[TxFee]
0,0,0x7bbd62c3784e56ed55f2dfe1d52685d2412ac3f86fbb...,5184886.0,19 secs ago,0xdd487c027448d3364355707d91eefadc2dae9f88,0x3e1b1fe45cb2040b97cdb3191d4933ad1ff0928d,0.5 Ether,0.000399
1,1,0xbdc661846c450213cc2542a68b96c82f9cd611beb5d3...,5184886.0,19 secs ago,0xb66a63e5ba7a888450af2ede7a47fd99777b647a,BinanceWallet,0.79841 Ether,0.00042
2,2,0x6ae621e1311a56810f09cd8675b1fba2254ef8008732...,5184886.0,19 secs ago,0xb20f603635d3148fc197114ce7930ba2203c0b19,BinanceWallet,0.59958 Ether,0.00042
3,3,0x937015270f59d404ed48c1b08533e6b5a609cc1ceb38...,5184886.0,19 secs ago,0xad5268de391998acf4d8a60364f8f7237fb6a34f,BinanceWallet,0.70841989 Ether,0.00042
4,4,0x59f693671087ef67c68d3f00255d3dee17539cc4c219...,5184886.0,19 secs ago,0x525d09f85d1d65c1a52fe7cdfbea2c863a44938f,BinanceWallet,0.65480251 Ether,0.00042



Label Distribution:

TxType
Small Transfer         444182
Medium Transfer         45879
High Value Transfer      9405
Name: count, dtype: int64

üöÄ Training Transaction Classifier...

‚úÖ Accuracy: 1.0000

üìä Classification Report:

                     precision    recall  f1-score   support

High Value Transfer       1.00      1.00      1.00      1881
    Medium Transfer       1.00      1.00      1.00      9176
     Small Transfer       1.00      1.00      1.00     88837

           accuracy                           1.00     99894
          macro avg       1.00      1.00      1.00     99894
       weighted avg       1.00      1.00      1.00     99894


üíæ Model saved at: /kaggle/working/models/tx_classifier.pkl
üì¶ Features saved at: /kaggle/working/models/tx_features.pkl

üéâ TRANSACTION CLASSIFIER COMPLETE.
