# Elliptic Transactions Classification

In [1]:
import networkx as nx
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score, make_scorer
from imblearn.combine import SMOTEENN
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline


### SMOTE transformer to use in pipeline

In [2]:
# Custom SMOTEENN transformer
class SMOTEENNTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, sampling_strategy='auto'):
        self.sampling_strategy = sampling_strategy
        self.smoteenn = SMOTEENN(sampling_strategy=sampling_strategy)
    
    def fit(self, X, y):
        return self
    
    def transform(self, X, y=None):
        if y is not None:
            X_resampled, y_resampled = self.smoteenn.fit_resample(X, y)
            return X_resampled
        return X

### More enhanced sampler

In [3]:
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import TomekLinks
from imblearn.pipeline import Pipeline as ImbPipeline

class EnhancedSampler(BaseEstimator, TransformerMixin):
    def __init__(self, sampling_strategy='auto'):
        self.sampling_strategy = sampling_strategy
        self.pipeline = ImbPipeline([
            ('adasyn', ADASYN(sampling_strategy=sampling_strategy)),
            ('tomek', TomekLinks())
        ])
    
    def fit(self, X, y):
        return self
    
    def transform(self, X, y=None):
        if y is not None:
            X_resampled, y_resampled = self.pipeline.fit_resample(X, y)
            return X_resampled, y_resampled
        return X

### Data Load

In [4]:
# Load and preprocess data
edge_df = pd.read_csv("elliptic_txs_edgelist.csv")
class_df = pd.read_csv("elliptic_txs_classes.csv")

edge_df.rename(columns={"txId1": "source", "txId2": "target"}, inplace=True)
merged_df = edge_df.merge(class_df, left_on="source", right_on="txId", how="left")
merged_df = merged_df.merge(
    class_df,
    left_on="target",
    right_on="txId",
    how="left",
    suffixes=("_source", "_target"),
)

### Create Digraph

In [5]:
# Create Directed Graph
def create_directed_graph(edge_df):
    G_dir = nx.DiGraph()
    G_dir.add_nodes_from(class_df["txId"])
    G_dir.add_edges_from(edge_df[["source", "target"]].values)
    return G_dir

G_dir = create_directed_graph(edge_df)

### Feature Compute

In [6]:
def compute_enhanced_features(G, node, max_hops=3):
    subgraph = nx.ego_graph(G, node, radius=max_hops, undirected=False)
    
    features = {
        "num_nodes": subgraph.number_of_nodes(),
        "num_edges": subgraph.number_of_edges(),
        "in_degree": subgraph.in_degree(node),
        "out_degree": subgraph.out_degree(node),
        "degree_centrality": nx.degree_centrality(subgraph)[node],
        "in_degree_centrality": nx.in_degree_centrality(subgraph)[node],
        "out_degree_centrality": nx.out_degree_centrality(subgraph)[node],
        "pagerank": nx.pagerank(subgraph)[node],
        "clustering_coeff": nx.clustering(subgraph.to_undirected(), node),
        "local_clustering_coeff": nx.average_clustering(subgraph.to_undirected()),
        "average_neighbor_degree": np.mean(
            [subgraph.degree(n) for n in subgraph.neighbors(node)]
        ) if list(subgraph.neighbors(node)) else 0,
        "connectivity_ratio": subgraph.number_of_edges() / (subgraph.number_of_nodes() * (subgraph.number_of_nodes() - 1) + 1),
        "strongly_connected_components": len(list(nx.strongly_connected_components(subgraph))),
        "weakly_connected_components": len(list(nx.weakly_connected_components(subgraph))),
    }

    # Safe centrality calculations
    try:
        features["harmonic_centrality"] = nx.harmonic_centrality(subgraph)[node]
    except:
        features["harmonic_centrality"] = 0

    try:
        features["eigenvector_centrality"] = nx.eigenvector_centrality(
            subgraph, 
            max_iter=100,  # Reduced iterations
            tol=1e-2       # Increased tolerance
        ).get(node, 0)
    except nx.PowerIterationFailedConvergence:
        features["eigenvector_centrality"] = 0

    try:
        strongly_components = list(nx.strongly_connected_components(subgraph))
        features["max_strongly_connected_components"] = max(len(c) for c in strongly_components) if strongly_components else 0
    except:
        features["max_strongly_connected_components"] = 0

    try:
        features["betweenness_centrality"] = nx.betweenness_centrality(subgraph)[node]
    except:
        features["betweenness_centrality"] = 0

    return features

def extract_features_for_classes(G, merged_df, node_class, max_hops=3):
    nodes = merged_df[merged_df["class_source"] == str(node_class)]["source"].tolist()
    features = {node: compute_enhanced_features(G, node, max_hops) for node in nodes}
    return features

In [7]:
# Extract and prepare features
class_1_features = extract_features_for_classes(G_dir, merged_df, node_class=1, max_hops=3)
class_2_features = extract_features_for_classes(G_dir, merged_df, node_class=2, max_hops=3)

all_features = pd.DataFrame.from_dict(
    {**class_1_features, **class_2_features}, orient="index"
)
all_features["class"] = ["1"] * len(class_1_features) + ["2"] * len(class_2_features)

# Prepare the data
X = all_features.drop(columns=["class"])
y = all_features["class"].astype(int)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

## Random Forest Classifier

In [8]:
# Hyperparameter grid
param_dist = {
    'classifier__n_estimators': [100, 250, 500, 750, 1000],
    'classifier__max_depth': [5, 10, 15, 20, 25, None],
    'classifier__min_samples_split': [2, 4, 6, 8],
    'classifier__min_samples_leaf': [1, 2, 3, 4],
    'classifier__class_weight': [None, 'balanced', 'balanced_subsample'],
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__max_features': ['sqrt', 'log2', None]
}

In [9]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('feature_selector', SelectFromModel(ExtraTreesClassifier(n_estimators=50))),
    ('sampler', EnhancedSampler(sampling_strategy='auto')),
    ('classifier', RandomForestClassifier(random_state=42))
])

random_search = RandomizedSearchCV(
    pipeline, 
    param_distributions=param_dist, 
    n_iter=100,  # Increased iterations
    cv=5, 
    scoring='balanced_accuracy',
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# Best model evaluation
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

# Print results
print("Best Parameters:", random_search.best_params_)
print("\nBest Cross-Validated Score:", random_search.best_score_)
print("\nTest Set Balanced Accuracy:", balanced_accuracy_score(y_test, y_pred))

Best Parameters: {'classifier__n_estimators': 1000, 'classifier__min_samples_split': 4, 'classifier__min_samples_leaf': 2, 'classifier__max_features': 'sqrt', 'classifier__max_depth': 10, 'classifier__criterion': 'entropy', 'classifier__class_weight': 'balanced_subsample'}

Best Cross-Validated Score: 0.6452000252000252

Test Set Balanced Accuracy: 0.6332172953359352


## XGBoost

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score
from sklearn.pipeline import Pipeline

ModuleNotFoundError: No module named 'xgboost'

In [None]:
# Hyperparameter grid for XGBoost
param_dist = {
    'classifier__n_estimators': [100, 250, 500, 750, 1000],
    'classifier__max_depth': [3, 5, 7, 10],
    'classifier__learning_rate': [0.01, 0.1, 0.3],
    'classifier__subsample': [0.6, 0.8, 1.0],
    'classifier__colsample_bytree': [0.6, 0.8, 1.0],
    'classifier__min_child_weight': [1, 3, 5],
    'classifier__gamma': [0, 0.1, 0.3, 0.5]
}

In [None]:
# Create pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', xgb.XGBClassifier(
        eval_metric='auc', 
        random_state=42
    ))
])

In [None]:
# Randomized Search
random_search = RandomizedSearchCV(
    pipeline, 
    param_distributions=param_dist, 
    n_iter=50,
    cv=5, 
    scoring='balanced_accuracy',
    random_state=42,
    n_jobs=-1
)

In [None]:
# Remap labels to 0 and 1
y = (all_features["class"].astype(int) - 1).astype(int)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# Fit and evaluate
random_search.fit(X_train, y_train)
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

print("Best Parameters:", random_search.best_params_)
print("\nBest Cross-Validated Score:", random_search.best_score_)
print("\nTest Set Balanced Accuracy:", balanced_accuracy_score(y_test, y_pred))

# Feature importance
feature_importance = pd.Series(
    best_model.named_steps['classifier'].feature_importances_, 
    index=X.columns
).sort_values(ascending=False)
print("\nFeature Importance:\n", feature_importance)

KeyboardInterrupt: 

LOGISTIC REGRESSION

In [56]:
from imblearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [57]:
oversample = SMOTE(random_state=42)
lr = LogisticRegression(max_iter=1000, random_state=42)

pipeline = Pipeline([('smote', oversample), 
                     ('logreg', lr)])

param_grid = {
    'smote__k_neighbors': [3, 5],
    'logreg__C': [0.01, 0.1, 1, 10],
    'logreg__class_weight': ['balanced', None]
}

grid_search = GridSearchCV(
    pipeline, param_grid, cv=5, scoring='recall', n_jobs=-1
)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_


KeyboardInterrupt: 

In [None]:
# Predict the test set labels



y_pred = best_model.predict(X_test)

# Generate a classification report
# This shows precision, recall (which is the % of each category predicted correctly), and F1-score by class.
print("Classification Report:")
print(classification_report(y_test, y_pred))

# You can also print the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix (rows=actual, cols=predicted):")
print(cm)

Classification Report:
              precision    recall  f1-score   support

           1       0.12      0.82      0.21       920
           2       0.96      0.43      0.60      9514

    accuracy                           0.47     10434
   macro avg       0.54      0.63      0.41     10434
weighted avg       0.89      0.47      0.56     10434


Confusion Matrix (rows=actual, cols=predicted):
[[ 759  161]
 [5392 4122]]


Opaque Features

In [13]:
df = pd.read_csv('elliptic_txs_features.csv')
# Step 1: Move column names into the first row
df = pd.concat([pd.DataFrame([df.columns], columns=df.columns), df], ignore_index=True)

# Step 2: Rename columns sequentially from 1 to the number of columns
df.columns = range(1, len(df.columns) + 1)
df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,158,159,160,161,162,163,164,165,166,167
0,230425980,1,-0.1714692896288031,-0.1846675514329143,-1.2013688016765636,-0.1219695997591005,-0.0438745479173489,-0.1130020092847624,-0.0615837940730322,-0.1620967998165964,...,-0.5621534802884299,-0.6009988905192808,1.4613303209554889,1.4613689382001922,0.0182794000374458,-0.0874901561101501,-0.1311553038955873,-0.0975235937715251,-0.1206134067031157,-0.1197924596125166
1,5530458,1,-0.171484,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162112,...,0.947382,0.673103,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
2,232022460,1,-0.172107,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162749,...,0.670883,0.439728,-0.979074,-0.978556,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792
3,232438397,1,0.163054,1.96379,-0.646376,12.409294,-0.063725,9.782742,12.414558,-0.163645,...,-0.577099,-0.613614,0.241128,0.241406,1.072793,0.08553,-0.131155,0.677799,-0.120613,-0.119792
4,230460314,1,1.011523,-0.081127,-1.201369,1.153668,0.333276,1.312656,-0.061584,-0.163523,...,-0.511871,-0.400422,0.517257,0.579382,0.018279,0.277775,0.326394,1.29375,0.178136,0.179117


In [12]:
classes = pd.read_csv('elliptic_txs_classes.csv')
classes.head()

Unnamed: 0,txId,class
0,230425980,unknown
1,5530458,unknown
2,232022460,unknown
3,232438397,2
4,230460314,unknown


In [15]:
df['class'] = classes['class']
df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,159,160,161,162,163,164,165,166,167,class
0,230425980,1,-0.1714692896288031,-0.1846675514329143,-1.2013688016765636,-0.1219695997591005,-0.0438745479173489,-0.1130020092847624,-0.0615837940730322,-0.1620967998165964,...,-0.6009988905192808,1.4613303209554889,1.4613689382001922,0.0182794000374458,-0.0874901561101501,-0.1311553038955873,-0.0975235937715251,-0.1206134067031157,-0.1197924596125166,unknown
1,5530458,1,-0.171484,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162112,...,0.673103,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,unknown
2,232022460,1,-0.172107,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.162749,...,0.439728,-0.979074,-0.978556,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792,unknown
3,232438397,1,0.163054,1.96379,-0.646376,12.409294,-0.063725,9.782742,12.414558,-0.163645,...,-0.613614,0.241128,0.241406,1.072793,0.08553,-0.131155,0.677799,-0.120613,-0.119792,2
4,230460314,1,1.011523,-0.081127,-1.201369,1.153668,0.333276,1.312656,-0.061584,-0.163523,...,-0.400422,0.517257,0.579382,0.018279,0.277775,0.326394,1.29375,0.178136,0.179117,unknown


In [24]:
labeld = df[df['class'] != 'unknown']
labeld.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,159,160,161,162,163,164,165,166,167,class
3,232438397,1,0.163054,1.96379,-0.646376,12.409294,-0.063725,9.782742,12.414558,-0.163645,...,-0.613614,0.241128,0.241406,1.072793,0.08553,-0.131155,0.677799,-0.120613,-0.119792,2
9,232029206,1,-0.005027,0.578941,-0.091383,4.380281,-0.063725,4.667146,0.851305,-0.163645,...,-0.613614,0.241128,0.241406,0.60412,0.008632,-0.131155,0.333211,-0.120613,-0.119792,2
10,232344069,1,-0.147852,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.137933,...,-0.613614,0.241128,0.241406,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,2
11,27553029,1,-0.151357,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.141519,...,-0.582077,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,2
16,3881097,1,-0.172306,-0.184668,-1.201369,0.028105,-0.043875,-0.02914,0.242712,-0.16364,...,-0.600999,0.241128,0.241406,0.018279,-0.068266,-0.084674,-0.05445,-1.760926,-1.760984,2


In [25]:
labeld['class'].value_counts()

class
2    42019
1     4545
Name: count, dtype: int64

In [44]:
X = labeld.drop([1, 'class'], axis=1)
y = labeld['class']
X.head()

Unnamed: 0,2,3,4,5,6,7,8,9,10,11,...,158,159,160,161,162,163,164,165,166,167
3,1,0.163054,1.96379,-0.646376,12.409294,-0.063725,9.782742,12.414558,-0.163645,-0.115831,...,-0.577099,-0.613614,0.241128,0.241406,1.072793,0.08553,-0.131155,0.677799,-0.120613,-0.119792
9,1,-0.005027,0.578941,-0.091383,4.380281,-0.063725,4.667146,0.851305,-0.163645,-0.144554,...,-0.577099,-0.613614,0.241128,0.241406,0.60412,0.008632,-0.131155,0.333211,-0.120613,-0.119792
10,1,-0.147852,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.137933,-0.144108,...,-0.577099,-0.613614,0.241128,0.241406,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
11,1,-0.151357,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,-0.141519,-0.147643,...,-0.539735,-0.582077,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
16,1,-0.172306,-0.184668,-1.201369,0.028105,-0.043875,-0.02914,0.242712,-0.16364,-0.169115,...,-0.577099,-0.600999,0.241128,0.241406,0.018279,-0.068266,-0.084674,-0.05445,-1.760926,-1.760984


In [45]:
X.dtypes

2      object
3      object
4      object
5      object
6      object
        ...  
163    object
164    object
165    object
166    object
167    object
Length: 166, dtype: object

In [46]:
# Create a list of columns from "2" to "167" (inclusive)
cols_to_convert = [i for i in range(2, 168)]

# Convert these columns to float
X[cols_to_convert] = X[cols_to_convert].astype(float)
X.dtypes

 

2      float64
3      float64
4      float64
5      float64
6      float64
        ...   
163    float64
164    float64
165    float64
166    float64
167    float64
Length: 166, dtype: object

In [None]:
# X = (X - X.mean())/X.std()
# X.head()

Unnamed: 0,2,3,4,5,6,7,8,9,10,11,...,158,159,160,161,162,163,164,165,166,167
3,-1.556861,0.307022,1.168808,-0.748101,7.643725,-0.053405,6.038959,21.528249,-0.146332,-0.084346,...,-0.570329,-0.619011,0.155524,0.155842,2.115907,0.016322,-0.117598,0.593436,-0.083319,-0.082051
9,-1.556861,0.067779,0.271909,-0.226078,2.628511,-0.053405,2.820385,1.501072,-0.146332,-0.127351,...,-0.570329,-0.619011,0.155524,0.155842,1.200404,-0.02814,-0.117598,0.270017,-0.083319,-0.082051
10,-1.556861,-0.135516,-0.222644,-1.270124,-0.183759,-0.041929,-0.187135,-0.080021,-0.102662,-0.126684,...,-0.570329,-0.619011,0.155524,0.155842,0.056025,-0.083718,-0.117598,-0.134256,-0.083319,-0.082051
11,-1.556861,-0.140504,-0.222644,-1.270124,-0.183759,-0.041929,-0.187135,-0.080021,-0.108752,-0.131977,...,-0.529543,-0.586047,-1.091727,-1.090216,0.056025,-0.083718,-0.117598,-0.134256,-0.083319,-0.082051
16,-1.556861,-0.170322,-0.222644,-1.270124,-0.090017,-0.041929,-0.134371,0.44701,-0.146323,-0.164126,...,-0.570329,-0.605825,0.155524,0.155842,0.056025,-0.072603,-0.091256,-0.093829,-1.725661,-1.726388


In [53]:
oversample = SMOTE(random_state=42)
lr = LogisticRegression(max_iter=1000, random_state=42)

pipeline = Pipeline([('smote', oversample), 
                     ('logreg', lr)])

param_grid = {
    'smote__k_neighbors': [3, 5],
    'logreg__C': [0.01, 0.1, 1, 10],
    'logreg__class_weight': ['balanced', None]
}

grid_search = GridSearchCV(
    pipeline, param_grid, cv=5, scoring='recall', n_jobs=-1
)
# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_



In [55]:
y_pred = best_model.predict(X_test)

# Generate a classification report
# This shows precision, recall (which is the % of each category predicted correctly), and F1-score by class.
print("Classification Report:")
print(classification_report(y_test, y_pred))

# You can also print the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix (rows=actual, cols=predicted):")
print(cm)

Classification Report:
              precision    recall  f1-score   support

           1       0.37      0.90      0.53      1364
           2       0.99      0.84      0.91     12606

    accuracy                           0.84     13970
   macro avg       0.68      0.87      0.72     13970
weighted avg       0.93      0.84      0.87     13970


Confusion Matrix (rows=actual, cols=predicted):
[[ 1227   137]
 [ 2059 10547]]
