In [None]:
pip install neo4j torch_geometric torch

In [3]:
import numpy as np
import pandas as pd
# from neo4j import GraphDatabase
from torch_geometric.data import Data
import torch
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
from tqdm import tqdm
from collections import defaultdict

In [None]:

########### TRANSACTION GRAPH ############
# Read datasets
transactions = pd.read_csv('dataset/user_transactions.csv')
contract_addresses = pd.read_csv('dataset/contract_addresses.csv').address.tolist()
transactions = transactions[:1000]

print("txs and contracts loaded")
# Neo4j connection setup
uri = "bolt://localhost:8092"  # default connection URI for local Neo4j
username = "neo4j"
password = "uWBOzDTQLXJLiFFF"
driver = GraphDatabase.driver(uri, auth=(username, password))

def sanitize_function_name(name):
    # Replace any non-alphanumeric character with an underscore
    sanitized_name = re.sub(r'\W+', '_', name)
    
    # Ensure it starts with a letter
    if not sanitized_name[0].isalpha():
        sanitized_name = 'F_' + sanitized_name

    return sanitized_name

def create_transaction(tx, row):
    # Check if 'from' and 'to' addresses are contracts or users
    from_type = 'Contract' if row['from'] in contract_addresses else 'User'
    to_type = 'Contract' if row['to'] in contract_addresses else 'User'

    # Check if functionName is empty or NaN
    raw_func_name = row['functionName'] if pd.notna(row['functionName']) and row['functionName'] != '' else 'UNKNOWN'
    func_name = sanitize_function_name(raw_func_name)
    
    # Cypher query
    #TODO: when creating contract nodes, find it's name and add as node feature, then going to contract level we can have custom generated tag from contract content
    query = (
        f"MERGE (a:{from_type} {{address: $from_address}}) "
        f"MERGE (b:{to_type} {{address: $to_address}}) "
        f"CREATE (a)-[r:{func_name} {{input: $input, timeStamp: $timeStamp}}]->(b)"
    )
    tx.run(query, from_address=row['from'], to_address=row['to'], input=row['input'], timeStamp=row['timeStamp'])

# Execute transaction for each row in the transactions dataframe
with driver.session() as session:
    for _, row in tqdm(transactions.iterrows()):
        session.write_transaction(create_transaction, row)

driver.close()

In [14]:
################# GRAPH CONVOLUTIONAL NETWORK ####################\
contract_names = pd.read_csv('dataset/contract_addresses_with_name.csv')
all_contracts = pd.read_csv('dataset/contract_addresses.csv')
contract_names = contract_names[contract_names['contract_name'].notna()]
user_transactions_df = pd.read_csv('dataset/user_transactions.csv')
# user_transactions_df = user_transactions_df[:10000]

In [15]:

user_contract_df = pd.read_parquet("dataset/user_contract_rating.parquet")
user_contract_df = user_contract_df[ user_contract_df['item'] != '']
user_contract_df = user_contract_df['item'].isin(contract_names['contract_name'])
all_contracts_set = set(all_contracts['address'])
print(len(user_contract_df))

# TODO: add node features into node embeddings

edge_index = []
edge_weights = defaultdict(int)
edge_features = []
node_to_id = {}
node_type_dict = {'user': 0, 'contract': 1}
node_features = []
current_id = 0


def get_node_name(address):
    # Check if address exists in contract_addresses_with_names
    name = contract_names[contract_names['contract_address'] == address]['contract_name']
    if name.empty:
        return ('unknown', 'contract') if address in all_contracts_set else (address, 'user')
    else:
        return (name.values[0], 'contract')

for idx, row in tqdm(user_contract_df.iterrows(), total = len(user_contract_df)):
    # from_name, from_type = get_node_name(row['from'])
    # to_name, to_type = get_node_name(row['to'])
    user_node, item_node = row['user'], row['item']

    # if (from_name == 'unknown' and from_type == 'contract') or (to_name == 'unknown' and to_type == 'contract'): continue # skip rows with unknown contract name (since we don't have their code too)

    if user_node not in node_to_id:
        node_to_id[user_node] = current_id
        node_features.append(node_type_dict['user'])
        current_id += 1
    
    if item_node not in node_to_id:
        node_to_id[item_node] = current_id
        # node_features.append([node_type_dict['contract'], item_node]) # Have contract name as another feature besides type
        node_features.append(node_type_dict['contract'])
        current_id += 1
    
    #TODO: defferentiate same from or to node but with diff function_name
    edge = [node_to_id[user_node], node_to_id[item_node]]
    if edge not in edge_index:
        edge_index.append(edge)
        edge_features.append(row['functionName'])
        edge_weights[(user_node, item_node)] += 1


x = torch.tensor(node_features, dtype=torch.long).unsqueeze(1) # Converting node features to tensor
edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
edge_weights = torch.tensor([edge_weights[tuple(edge)] for edge in edge_index.numpy().T], dtype=torch.float)


# print('1')

class SimpleGCN(torch.nn.Module):
    def __init__(self, num_features, hidden_channels):
        super(SimpleGCN, self).__init__()
        self.conv1 = GCNConv(num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, 1)

    def forward(self, x, edge_index, edge_weight):
        x = self.conv1(x, edge_index, edge_weight=edge_weight)
        x = F.relu(x)
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index, edge_weight=edge_weight)
        return x.squeeze(-1)

# def get_predictions(model, x, edge_index, edge_weights):
#     model.eval()
#     with torch.no_grad():
#         node_embeddings = model(x, edge_index, edge_weights)
#     source_embeddings = node_embeddings[edge_index[0]]
#     target_embeddings = node_embeddings[edge_index[1]]
#     return (source_embeddings * target_embeddings).sum(-1)

# model = SimpleGCN(x.size(1), 16)
# optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
# criterion = torch.nn.BCEWithLogitsLoss()

# def train(model, x, edge_index, edge_weights, epochs=100):
#     model.train()
#     losses = []
#     for epoch in range(epochs):
#         optimizer.zero_grad()
#         preds = get_predictions(model, x, edge_index, edge_weights)
#         print(preds.shape)
#         break
# #         loss = criterion(preds, torch.tensor(edge_weights, dtype=torch.float))
# #         loss.backward()
# #         optimizer.step()
        
# #         losses.append(loss.item())
        
# #         # Print every 10 epochs
# #         if epoch % 10 == 0:
# #             print(f"Epoch {epoch}/{epochs}, Loss: {loss.item()}")
    
# #     return losses

# # losses = train(model, x, edge_index, edge_weights)

# # # Plot the losses
# # plt.plot(losses)
# # plt.xlabel('Epoch')
# # plt.ylabel('Loss')
# # plt.title('Training Loss')
# # plt.show()

  0%|          | 1616/1738651 [00:19<5:45:44, 83.74it/s] 


KeyboardInterrupt: 

In [10]:
print(all_contracts[:5])

                                      address  tx_count
0  0x40a4294f8ea7cac3d93336b9a70c758c03535508         3
1  0x0ea17d0698cbf66b2cdda3eb27ef2b7c7d31135e         3
2  0x1997b40fea47d66cf2b48c5fab960c8dab80df7d         3
3  0xb2914f6db3ad77e5302d9b6b578c4119873e9b2e         3
4  0x1d025fd2547d04f1308757d43415ac628c402dcb         3
