In [1]:
# Utility imports
import numpy as np
import pandas as pd 

# Graph theory imports
import networkx as nx

# Visualization imports
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots


In [5]:
# Read the three Elliptic CSV files from the current working directory.
# The classes and edge-list files are read with their header row; the features
# file is read with header=None, so pandas gives it integer column labels.
class_df, edge_df = (
    pd.read_csv(csv_name)
    for csv_name in ("elliptic_txs_classes.csv", "elliptic_txs_edgelist.csv")
)
feature_df = pd.read_csv("elliptic_txs_features.csv", header=None)

In [6]:
# Tally the rows per label. The sort arguments spell out the value_counts
# defaults: most frequent label first.
class_counts = class_df['class'].value_counts(sort=True, ascending=False)
print(class_counts)

class
unknown    157205
2           42019
1            4545
Name: count, dtype: int64


In [7]:
# Count the rows whose label equals the string "1" by summing the boolean mask
# instead of materialising the filtered frame.
class_1_count = int((class_df['class'] == "1").sum())
print(class_1_count)  # Expected: 4545

4545


In [8]:
# Print the schema summary of each frame, with a blank-line separator between
# consecutive summaries (none after the last one).
for _frame in (class_df, edge_df):
    _frame.info()
    print("\n\n")
feature_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 203769 entries, 0 to 203768
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   txId    203769 non-null  int64 
 1   class   203769 non-null  object
dtypes: int64(1), object(1)
memory usage: 3.1+ MB



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 234355 entries, 0 to 234354
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   txId1   234355 non-null  int64
 1   txId2   234355 non-null  int64
dtypes: int64(2)
memory usage: 3.6 MB



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 203769 entries, 0 to 203768
Columns: 167 entries, 0 to 166
dtypes: float64(165), int64(2)
memory usage: 259.6 MB


In [9]:
# Label the header-less feature columns: the first becomes 'txID', the rest are
# numbered feature_1 .. feature_(n_columns - 1).
feature_df.columns = ['txID', *(f'feature_{idx}' for idx in range(1, feature_df.shape[1]))]

# Give class_df the same id column name so both frames share a join key.
class_df.columns = ['txID', 'class']

# Coerce the join key to a numeric dtype in both frames (class_df first, then
# feature_df).
class_df['txID'], feature_df['txID'] = (
    pd.to_numeric(frame['txID']) for frame in (class_df, feature_df)
)

# Join labels and features on the transaction id (pandas' default inner join).
merged_df = class_df.merge(feature_df, on='txID')

In [10]:
# Feature correlation analysis.
# 'txID' is the identifier column assigned in the rename cell, not a feature,
# so it is dropped before computing pairwise correlations; otherwise the
# heatmap would carry a meaningless id-vs-feature row and column.
# NOTE: requires the rename cell to have run so that the 'txID' column exists.
corr_matrix = feature_df.drop(columns='txID').corr()

# Plot the correlation matrix with Plotly. Pinning the colour range to the
# theoretical bounds [-1, 1] keeps colours comparable between runs and subsets.
fig = go.Figure(
    data=go.Heatmap(
        z=corr_matrix.values,
        x=corr_matrix.columns,
        y=corr_matrix.index,
        colorscale='Viridis',
        zmin=-1,
        zmax=1,
    )
)
fig.update_layout(title='Feature Correlation Matrix')
fig.show()

In [15]:


# Network analysis on a small sample: an undirected graph built from the first
# 1000 rows of the edge list.
G = nx.Graph()
G.add_edges_from(edge_df.values[:1000])

# Per-node centrality scores; each call returns a {node: score} dict.
degree_centrality = nx.degree_centrality(G)
betweenness_centrality = nx.betweenness_centrality(G)
closeness_centrality = nx.closeness_centrality(G)

# One histogram per centrality measure, side by side.
fig = make_subplots(rows=1, cols=3, subplot_titles=['Degree Centrality', 'Betweenness Centrality', 'Closeness Centrality'])
fig.add_trace(go.Histogram(x=list(degree_centrality.values()), nbinsx=50), row=1, col=1)
fig.add_trace(go.Histogram(x=list(betweenness_centrality.values()), nbinsx=50), row=1, col=2)
fig.add_trace(go.Histogram(x=list(closeness_centrality.values()), nbinsx=50), row=1, col=3)
fig.update_layout(height=600, width=1200)
fig.show()

# Node positions for the structure plot. spring_layout starts from random
# positions, so a fixed seed is passed to make the figure reproducible.
pos = nx.spring_layout(G, seed=42)
node_x = [pos[node][0] for node in G.nodes()]
node_y = [pos[node][1] for node in G.nodes()]

# Each edge contributes (start, end, None) to the coordinate lists; the None
# breaks the line so all edges can be drawn in a single Scatter trace.
edge_x = []
edge_y = []
for source, target in G.edges():
    x0, y0 = pos[source]
    x1, y1 = pos[target]
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

fig = go.Figure(data=[
    go.Scatter(x=edge_x, y=edge_y, mode='lines', line_shape='spline', opacity=0.5, hoverinfo='none'),
    go.Scatter(x=node_x, y=node_y, mode='markers', hoverinfo='text',
               hovertext=[str(node) for node in G.nodes()]),
])
fig.update_layout(
    title='Network Structure',
    showlegend=False,
    # 'closest' picks the node nearest the cursor in 2-D; 'x' would match on the
    # x coordinate only, which is wrong for a scatter layout.
    hovermode='closest',
    margin=dict(b=20, l=5, r=5, t=40),
    xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
    yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
)
fig.show()

In [12]:
# Transaction ids whose label is the string "1".
class_1_nodes = set(merged_df.loc[merged_df['class'] == "1", 'txID'])

# Undirected graph over every row of the edge list.
G = nx.Graph()
G.add_edges_from(edge_df.to_numpy())

# Keep only the edges whose two endpoints are both class 1.
class_1_edges = [
    (u, v)
    for u, v in G.edges()
    if class_1_nodes.issuperset((u, v))
]

# Report sizes to show how sparse the class 1 subgraph is.
print(f"Total number of class 1 nodes: {len(class_1_nodes)}")
print(f"Total number of edges in full graph: {G.number_of_edges()}")
print(f"Number of edges between class 1 nodes: {len(class_1_edges)}")

Total number of class 1 nodes: 4545
Total number of edges in full graph: 234355
Number of edges between class 1 nodes: 998


In [13]:
# Transaction ids whose label is the string "1".
class_1_nodes = set(merged_df[merged_df['class'] == "1"]['txID'])

# Undirected graph over the complete edge list.
G = nx.Graph()
G.add_edges_from(edge_df.values)

# Subgraph made of edges whose two endpoints are both class 1. It is built from
# edges only, so class 1 nodes with no class 1 neighbour do not appear in it.
class_1_edges = [(source, target) for source, target in G.edges()
                 if source in class_1_nodes and target in class_1_nodes]
G_class_1 = nx.Graph()
G_class_1.add_edges_from(class_1_edges)

# Per-node centrality scores for the class 1 subgraph ({node: score} dicts).
degree_centrality = nx.degree_centrality(G_class_1)
betweenness_centrality = nx.betweenness_centrality(G_class_1)
closeness_centrality = nx.closeness_centrality(G_class_1)

# One histogram per centrality measure, side by side.
fig = make_subplots(rows=1, cols=3,
                    subplot_titles=['Degree Centrality (Class 1)',
                                    'Betweenness Centrality (Class 1)',
                                    'Closeness Centrality (Class 1)'])

fig.add_trace(go.Histogram(x=list(degree_centrality.values()), nbinsx=50), row=1, col=1)
fig.add_trace(go.Histogram(x=list(betweenness_centrality.values()), nbinsx=50), row=1, col=2)
fig.add_trace(go.Histogram(x=list(closeness_centrality.values()), nbinsx=50), row=1, col=3)
fig.update_layout(height=600, width=1200, title_text="Network Metrics for Class 1 Nodes")
fig.show()

# Node positions for the structure plot. spring_layout starts from random
# positions, so a fixed seed is passed to make the figure reproducible.
pos = nx.spring_layout(G_class_1, k=1, iterations=50, seed=42)
node_x = [pos[node][0] for node in G_class_1.nodes()]
node_y = [pos[node][1] for node in G_class_1.nodes()]

# Each edge contributes (start, end, None); the None breaks the line so all
# edges can be drawn in a single Scatter trace.
edge_x = []
edge_y = []
for source, target in G_class_1.edges():
    x0, y0 = pos[source]
    x1, y1 = pos[target]
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

# Create the network visualization
fig = go.Figure(data=[
    go.Scatter(x=edge_x, y=edge_y, mode='lines', line_shape='spline',
               opacity=0.3, hoverinfo='none', name='Edges',
               line=dict(color='gray', width=0.5)),
    go.Scatter(x=node_x, y=node_y, mode='markers',
               hoverinfo='text', hovertext=[str(node) for node in G_class_1.nodes()],
               marker=dict(size=5, color='red'), name='Nodes')
])

fig.update_layout(
    title='Network Structure - Class 1 Nodes Only',
    showlegend=True,
    hovermode='closest',
    margin=dict(b=20, l=5, r=5, t=40),
    xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
    yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
)
fig.show()

# Print summary statistics
print(f"Number of Class 1 nodes in network: {G_class_1.number_of_nodes()}")
print(f"Number of edges between Class 1 nodes: {G_class_1.number_of_edges()}")

# Connected components of the class 1 subgraph. The default keeps max() from
# raising if the subgraph has no nodes at all.
connected_components = list(nx.connected_components(G_class_1))
largest_component = max(connected_components, key=len, default=set())
print(f"Number of connected components: {len(connected_components)}")
print(f"Size of largest connected component: {len(largest_component)}")

Number of Class 1 nodes in network: 1364
Number of edges between Class 1 nodes: 998
Number of connected components: 366
Size of largest connected component: 183


In [14]:
# Number of leading edge-list rows used for this sampled analysis.
EDGE_SAMPLE_SIZE = 5000

# Transaction ids whose label is the string "1".
class_1_nodes = set(merged_df[merged_df['class'] == "1"]['txID'])

# Slice the edge list once and reuse it for both the graph and the filter.
edge_sample = edge_df.values[:EDGE_SAMPLE_SIZE]

# Undirected graph over the sampled edges. It is not read again in this cell;
# the assignment is kept so that G holds the same value as before for any
# other cell that reads it.
G = nx.Graph()
G.add_edges_from(edge_sample)

# Endpoints as plain Python ints. This is cosmetic for the membership test
# below: the same test without conversion already finds class 1 edges on the
# full edge list, so an empty result here reflects the sample, not the types.
edge_df_subset = [(int(source), int(target)) for source, target in edge_sample]

# Subgraph made of sampled edges whose two endpoints are both class 1.
class_1_edges = [(source, target) for source, target in edge_df_subset
                 if source in class_1_nodes and target in class_1_nodes]
G_class_1 = nx.Graph()
G_class_1.add_edges_from(class_1_edges)

# Nothing to analyse or plot when no sampled edge joins two class 1 nodes.
if G_class_1.number_of_nodes() == 0:
    print("No class 1 nodes found in the network or no connections between class 1 nodes.")
else:
    # Per-node centrality scores for the class 1 subgraph ({node: score} dicts).
    degree_centrality = nx.degree_centrality(G_class_1)
    betweenness_centrality = nx.betweenness_centrality(G_class_1)
    closeness_centrality = nx.closeness_centrality(G_class_1)

    # One histogram per centrality measure, side by side.
    fig = make_subplots(rows=1, cols=3,
                        subplot_titles=['Degree Centrality (Class 1)',
                                        'Betweenness Centrality (Class 1)',
                                        'Closeness Centrality (Class 1)'])

    fig.add_trace(go.Histogram(x=list(degree_centrality.values()), nbinsx=50), row=1, col=1)
    fig.add_trace(go.Histogram(x=list(betweenness_centrality.values()), nbinsx=50), row=1, col=2)
    fig.add_trace(go.Histogram(x=list(closeness_centrality.values()), nbinsx=50), row=1, col=3)
    fig.update_layout(height=600, width=1200, title_text="Network Metrics for Class 1 Nodes")
    fig.show()

    # Node positions for the structure plot. spring_layout starts from random
    # positions, so a fixed seed is passed to make the figure reproducible.
    pos = nx.spring_layout(G_class_1, seed=42)
    node_x = [pos[node][0] for node in G_class_1.nodes()]
    node_y = [pos[node][1] for node in G_class_1.nodes()]

    # Each edge contributes (start, end, None); the None breaks the line so
    # all edges can be drawn in a single Scatter trace.
    edge_x = []
    edge_y = []
    for source, target in G_class_1.edges():
        x0, y0 = pos[source]
        x1, y1 = pos[target]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])

    # Create the network visualization
    fig = go.Figure(data=[
        go.Scatter(x=edge_x, y=edge_y, mode='lines', line_shape='spline',
                   opacity=0.5, hoverinfo='none', name='Edges'),
        go.Scatter(x=node_x, y=node_y, mode='markers',
                   hoverinfo='text', hovertext=[str(node) for node in G_class_1.nodes()],
                   marker=dict(size=8, color='red'), name='Nodes')
    ])

    fig.update_layout(
        title='Network Structure - Class 1 Nodes Only',
        showlegend=True,
        hovermode='closest',
        margin=dict(b=20, l=5, r=5, t=40),
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
    )
    fig.show()

    # Print summary statistics
    print(f"Number of Class 1 nodes: {G_class_1.number_of_nodes()}")
    print(f"Number of edges between Class 1 nodes: {G_class_1.number_of_edges()}")

No class 1 nodes found in the network or no connections between class 1 nodes.
