# Elliptic Transactions Classification

In [None]:
import networkx as nx
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score, make_scorer
from imblearn.combine import SMOTEENN
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline


### SMOTEENN transformer to use in a pipeline

In [None]:
# Custom SMOTEENN transformer
class SMOTEENNTransformer(BaseEstimator, TransformerMixin):
    """Expose imblearn's ``SMOTEENN`` through a fit/transform interface.

    Parameters
    ----------
    sampling_strategy : default='auto'
        Passed straight through to ``SMOTEENN``.

    Notes
    -----
    ``transform`` resamples only when ``y`` is supplied, and then returns the
    resampled features *without* the matching resampled labels; when ``y`` is
    omitted the input is returned untouched.
    NOTE(review): a scikit-learn ``Pipeline`` presumably calls ``transform(X)``
    without ``y``, which would make this step a pass-through there -- confirm.
    Resampling during fit is normally done with a sampler step inside
    ``imblearn.pipeline.Pipeline``.
    """

    def __init__(self, sampling_strategy='auto'):
        self.sampling_strategy = sampling_strategy
        self.smoteenn = SMOTEENN(sampling_strategy=sampling_strategy)

    def fit(self, X, y=None):
        """Learn nothing and return ``self`` (resampling happens in ``transform``)."""
        return self

    def transform(self, X, y=None):
        """Return SMOTEENN-resampled features when ``y`` is given, else ``X``."""
        if y is None:
            return X
        # set_params()/hyper-parameter searches only update the attribute on
        # this wrapper, so push the current value into the wrapped sampler
        # before using it; otherwise it keeps the construction-time strategy.
        self.smoteenn.set_params(sampling_strategy=self.sampling_strategy)
        X_resampled, _ = self.smoteenn.fit_resample(X, y)
        return X_resampled

### Enhanced sampler (ADASYN + Tomek links)

In [None]:
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import TomekLinks
from imblearn.pipeline import Pipeline as ImbPipeline

class EnhancedSampler(BaseEstimator, TransformerMixin):
    """Chain ADASYN over-sampling and Tomek-link cleaning behind fit/transform.

    Parameters
    ----------
    sampling_strategy : default='auto'
        Passed to the ``ADASYN`` step; ``TomekLinks`` uses its own defaults.

    Notes
    -----
    ``transform`` returns an ``(X_resampled, y_resampled)`` tuple when ``y`` is
    supplied and plain ``X`` otherwise.
    NOTE(review): a scikit-learn ``Pipeline`` presumably calls ``transform(X)``
    without ``y``, which would make this step a pass-through there -- confirm.
    Resampling during fit is normally done with sampler steps placed directly
    in an ``imblearn.pipeline.Pipeline``.
    """

    def __init__(self, sampling_strategy='auto'):
        self.sampling_strategy = sampling_strategy
        self.pipeline = ImbPipeline([
            ('adasyn', ADASYN(sampling_strategy=sampling_strategy)),
            ('tomek', TomekLinks())
        ])

    def fit(self, X, y=None):
        """Learn nothing and return ``self`` (resampling happens in ``transform``)."""
        return self

    def transform(self, X, y=None):
        """Resample ``(X, y)`` when ``y`` is given; otherwise return ``X`` as is."""
        if y is None:
            return X
        # set_params()/hyper-parameter searches only update the attribute on
        # this wrapper, so forward the current value to the inner ADASYN step;
        # otherwise it keeps the construction-time strategy.
        self.pipeline.set_params(adasyn__sampling_strategy=self.sampling_strategy)
        X_resampled, y_resampled = self.pipeline.fit_resample(X, y)
        return X_resampled, y_resampled

### Data Load

In [None]:
# Read the raw edge list and the per-transaction class labels.
edge_df = pd.read_csv("elliptic_txs_edgelist.csv")
class_df = pd.read_csv("elliptic_txs_classes.csv")

# Give the two edge endpoints graph-style column names.
edge_df = edge_df.rename(columns={"txId1": "source", "txId2": "target"})

# Look up the class of both endpoints. The first join adds the source's
# txId/class; the second join adds the target's and, because the column names
# now collide, tags the pairs with the "_source" / "_target" suffixes.
merged_df = (
    edge_df
    .merge(class_df, how="left", left_on="source", right_on="txId")
    .merge(
        class_df,
        how="left",
        left_on="target",
        right_on="txId",
        suffixes=("_source", "_target"),
    )
)

### Create Digraph

In [None]:
# Build the directed transaction graph
def create_directed_graph(edge_df):
    """Return a ``networkx.DiGraph`` with one edge per row of ``edge_df``.

    Every id in the module-level ``class_df["txId"]`` is registered as a node
    first; then each ``(source, target)`` row of ``edge_df`` becomes a directed
    edge.
    """
    graph = nx.DiGraph()
    graph.add_nodes_from(class_df["txId"])
    endpoint_pairs = edge_df.loc[:, ["source", "target"]].to_numpy()
    graph.add_edges_from(endpoint_pairs)
    return graph

G_dir = create_directed_graph(edge_df)

### Feature Computation

In [None]:
def compute_enhanced_features(G, node, max_hops=3):
    """Compute structural features of ``node`` from its directed ego graph.

    Parameters
    ----------
    G : networkx.DiGraph
        Graph containing ``node``.
    node : hashable
        Node whose neighbourhood is described.
    max_hops : int, default=3
        Radius handed to ``nx.ego_graph`` (called with ``undirected=False``).

    Returns
    -------
    dict
        Feature name -> value. Sizes, degrees and centralities are measured on
        the ego subgraph, not on ``G``. Harmonic, eigenvector and betweenness
        centrality fall back to ``0`` when their computation fails.
    """
    subgraph = nx.ego_graph(G, node, radius=max_hops, undirected=False)

    # Values shared by several features: derive each one exactly once.
    undirected = subgraph.to_undirected()
    n_nodes = subgraph.number_of_nodes()
    n_edges = subgraph.number_of_edges()
    neighbours = list(subgraph.neighbors(node))
    scc_sizes = [len(c) for c in nx.strongly_connected_components(subgraph)]

    features = {
        "num_nodes": n_nodes,
        "num_edges": n_edges,
        "in_degree": subgraph.in_degree(node),
        "out_degree": subgraph.out_degree(node),
        "degree_centrality": nx.degree_centrality(subgraph)[node],
        "in_degree_centrality": nx.in_degree_centrality(subgraph)[node],
        "out_degree_centrality": nx.out_degree_centrality(subgraph)[node],
        "pagerank": nx.pagerank(subgraph)[node],
        "clustering_coeff": nx.clustering(undirected, node),
        # Despite the key name, this is the average clustering of the whole
        # undirected ego graph.
        "local_clustering_coeff": nx.average_clustering(undirected),
        "average_neighbor_degree": np.mean(
            [subgraph.degree(n) for n in neighbours]
        ) if neighbours else 0,
        # The "+ 1" keeps the denominator non-zero for a single-node subgraph.
        "connectivity_ratio": n_edges / (n_nodes * (n_nodes - 1) + 1),
        "strongly_connected_components": len(scc_sizes),
        "weakly_connected_components": len(list(nx.weakly_connected_components(subgraph))),
    }

    # Best-effort centralities. ``except Exception`` (rather than a bare
    # ``except``) keeps the zero fallback while letting KeyboardInterrupt and
    # SystemExit stop a long-running extraction.
    try:
        features["harmonic_centrality"] = nx.harmonic_centrality(subgraph)[node]
    except Exception:
        features["harmonic_centrality"] = 0

    try:
        features["eigenvector_centrality"] = nx.eigenvector_centrality(
            subgraph,
            max_iter=100,  # Reduced iterations
            tol=1e-2       # Increased tolerance
        ).get(node, 0)
    except nx.PowerIterationFailedConvergence:
        features["eigenvector_centrality"] = 0

    # Size of the largest strongly connected component (0 if there is none).
    features["max_strongly_connected_components"] = max(scc_sizes, default=0)

    try:
        features["betweenness_centrality"] = nx.betweenness_centrality(subgraph)[node]
    except Exception:
        features["betweenness_centrality"] = 0

    return features

def extract_features_for_classes(G, merged_df, node_class, max_hops=3):
    """Compute ego-graph features for every source node of one class.

    Parameters
    ----------
    G : networkx.DiGraph
        Graph passed on to ``compute_enhanced_features``.
    merged_df : pandas.DataFrame
        Needs a ``source`` column and a ``class_source`` column holding the
        class as a string.
    node_class : int or str
        Class to select; compared as ``str(node_class)``.
    max_hops : int, default=3
        Ego-graph radius passed on to ``compute_enhanced_features``.

    Returns
    -------
    dict
        ``source`` id -> feature dict, in order of first appearance.
    """
    wanted = merged_df["class_source"] == str(node_class)
    # The same source id may appear on many rows. Computing its features once
    # yields the same dict (keys already collapse duplicates) without repeating
    # the expensive graph work.
    nodes = merged_df.loc[wanted, "source"].drop_duplicates().tolist()
    return {node: compute_enhanced_features(G, node, max_hops) for node in nodes}

In [None]:
# Per-node feature dictionaries for the two labelled classes
class_1_features = extract_features_for_classes(G_dir, merged_df, node_class=1, max_hops=3)
class_2_features = extract_features_for_classes(G_dir, merged_df, node_class=2, max_hops=3)

# One row per node: class-1 nodes first, class-2 nodes after them
combined_features = dict(class_1_features)
combined_features.update(class_2_features)
all_features = pd.DataFrame.from_dict(combined_features, orient="index")

# Labels follow the same ordering as the rows above
class_labels = ["1"] * len(class_1_features)
class_labels.extend(["2"] * len(class_2_features))
all_features["class"] = class_labels

# Feature matrix and integer target
y = all_features["class"].astype(int)
X = all_features.drop(columns=["class"])

# Stratified 70/30 train/test split
split_parts = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

## Random Forest Classifier

In [None]:
# Search space for the pipeline's 'classifier' step; the "classifier__" prefix
# routes each entry to that step.
param_dist = dict(
    classifier__n_estimators=[100, 250, 500, 750, 1000],
    classifier__max_depth=[5, 10, 15, 20, 25, None],
    classifier__min_samples_split=[2, 4, 6, 8],
    classifier__min_samples_leaf=[1, 2, 3, 4],
    classifier__class_weight=[None, 'balanced', 'balanced_subsample'],
    classifier__criterion=['gini', 'entropy'],
    classifier__max_features=['sqrt', 'log2', None],
)

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier

# Resampling must change X and y together, which a transformer step in a
# scikit-learn Pipeline cannot do (its transform is called with X only, so a
# transformer-style sampler silently passes the data through). Use the
# imbalanced-learn pipeline with real sampler steps instead: they resample
# during fit and are skipped at predict time.
pipeline = ImbPipeline([
    ('scaler', StandardScaler()),
    ('feature_selector', SelectFromModel(ExtraTreesClassifier(n_estimators=50))),
    ('adasyn', ADASYN(sampling_strategy='auto', random_state=42)),
    ('tomek', TomekLinks()),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Randomised hyper-parameter search over the classifier settings in param_dist,
# scored by balanced accuracy with 5-fold cross-validation.
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=100,  # Increased iterations
    cv=5,
    scoring='balanced_accuracy',
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, y_train)

In [None]:
# Score the refit best pipeline on the held-out test split
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
test_balanced_accuracy = balanced_accuracy_score(y_test, y_pred)

# Report the winning configuration and both scores
print("Best Parameters:", random_search.best_params_)
print("\nBest Cross-Validated Score:", random_search.best_score_)
print("\nTest Set Balanced Accuracy:", test_balanced_accuracy)

## Logistic Regression

In [None]:
from imblearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# SMOTE over-sampling followed by logistic regression, wired together with the
# imbalanced-learn Pipeline imported above.
oversample = SMOTE(random_state=42)
lr = LogisticRegression(max_iter=1000, random_state=42)
pipeline = Pipeline(steps=[('smote', oversample), ('logreg', lr)])

# Exhaustive grid; the prefix of each key names the pipeline step it tunes.
param_grid = dict(
    smote__k_neighbors=[3, 5],
    logreg__C=[0.01, 0.1, 1, 10],
    logreg__class_weight=['balanced', None],
)

# 5-fold grid search optimising recall, then keep the refit winner.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='recall',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_


In [None]:
# Label the held-out test rows with the tuned model
y_pred = best_model.predict(X_test)

# Per-class precision, recall (share of each actual class that was predicted
# correctly) and F1-score
report_text = classification_report(y_test, y_pred)
print("Classification Report:")
print(report_text)

# Confusion matrix for the same predictions
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix (rows=actual, cols=predicted):")
print(cm)

## Opaque Features

In [None]:
# The first line of this CSV is a data record, not a header. Reading it with
# header=None keeps that record as an ordinary row with its real values and
# numeric dtypes, instead of parsing it as column names and pasting it back
# (which turns every column into object dtype and lets pandas rename any
# repeated values in that line).
df = pd.read_csv('elliptic_txs_features.csv', header=None)

# Number the columns sequentially from 1 to the number of columns
df.columns = range(1, len(df.columns) + 1)
df.head()

In [None]:
# Load the per-transaction class labels and preview the first rows.
classes = pd.read_csv('elliptic_txs_classes.csv')
classes.head()

In [None]:
# Attach the label column to the feature table. Assigning a Series aligns on
# the row index, not on the transaction id.
# NOTE(review): this assumes both CSVs list the transactions in the same row
# order -- confirm, or join on the id column instead.
df['class'] = classes['class']
df.head()

In [None]:
# Keep only the rows whose class is not the 'unknown' placeholder.
has_label = df['class'] != 'unknown'
labeld = df.loc[has_label]
labeld.head()

In [None]:
# Show how many labelled rows fall into each class.
labeld['class'].value_counts()

In [None]:
# Features: everything except column 1 and the label column.
X = labeld.drop([1, 'class'], axis=1)
# Cast the labels to int. The 'recall' scorer used later assumes the positive
# label is the integer 1, which the string labels '1'/'2' never match.
y = labeld['class'].astype(int)
X.head()

In [None]:
# Inspect the column dtypes before the float conversion in the next cell.
X.dtypes

In [None]:
# Convert every remaining feature column to float. The list is taken from X
# itself rather than from a hard-coded 2..167 range, so it stays correct if
# the number of feature columns differs.
cols_to_convert = list(X.columns)

# Convert these columns to float
X[cols_to_convert] = X[cols_to_convert].astype(float)
X.dtypes

 

In [None]:
# X = (X - X.mean())/X.std()
# X.head()

In [None]:
# Stratified 70/30 train/test split of the opaque-feature data
split_parts = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

# SMOTE over-sampling followed by logistic regression
oversample = SMOTE(random_state=42)
lr = LogisticRegression(max_iter=1000, random_state=42)
pipeline = Pipeline(steps=[('smote', oversample), ('logreg', lr)])

# Exhaustive grid; the prefix of each key names the pipeline step it tunes.
param_grid = dict(
    smote__k_neighbors=[3, 5],
    logreg__C=[0.01, 0.1, 1, 10],
    logreg__class_weight=['balanced', None],
)

# 5-fold grid search optimising recall, then keep the refit winner.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='recall',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

In [None]:
# Label the held-out test rows with the tuned model
y_pred = best_model.predict(X_test)

# Per-class precision, recall (share of each actual class that was predicted
# correctly) and F1-score
report_text = classification_report(y_test, y_pred)
print("Classification Report:")
print(report_text)

# Confusion matrix for the same predictions
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix (rows=actual, cols=predicted):")
print(cm)