In [None]:
########### TRANSACTION GRAPH ############
import os

# Read datasets
transactions = pd.read_csv('dataset/user_transactions.csv')
# Plain Python list of known contract addresses (used for membership tests).
contract_addresses = pd.read_csv('dataset/contract_addresses.csv').address.tolist()
# Only the first 1000 transactions are imported into the graph.
transactions = transactions[:1000]

print("txs and contracts loaded")
# Neo4j connection setup.
# Every setting can be overridden through the environment so credentials do
# not have to live in the notebook; the fallbacks are the original values.
# Note: 8092 is not Neo4j's stock Bolt port (that is 7687); it is whatever the
# local instance was configured to listen on.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:8092")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
# NOTE(review): the fallback keeps the previously hard-coded secret so existing
# runs keep working; rotate it and drop the default once NEO4J_PASSWORD is set.
password = os.environ.get("NEO4J_PASSWORD", "uWBOzDTQLXJLiFFF")
driver = GraphDatabase.driver(uri, auth=(username, password))

def sanitize_function_name(name):
    """Turn a raw function name into an identifier-like string.

    Every run of non-word characters (anything other than letters, digits
    and underscore) is collapsed into a single underscore.  If the result is
    empty or does not start with a letter, it is prefixed with ``'F_'`` so
    the returned value always begins with a letter.

    Args:
        name: Raw function name, e.g. ``"transfer(address,uint256)"``.

    Returns:
        The sanitized name, e.g. ``"transfer_address_uint256_"``.  An empty
        input yields ``"F_"``.
    """
    # Replace any run of non-alphanumeric characters with one underscore.
    sanitized_name = re.sub(r'\W+', '_', name)

    # Ensure it starts with a letter.  The emptiness check avoids an
    # IndexError on sanitized_name[0] when the input is the empty string.
    if not sanitized_name or not sanitized_name[0].isalpha():
        sanitized_name = 'F_' + sanitized_name

    return sanitized_name

def create_transaction(tx, row):
    """Write one transaction row to the graph as two nodes and a relationship.

    The sender and receiver are merged as ``Contract`` or ``User`` nodes
    (depending on whether their address appears in ``contract_addresses``)
    and connected by a new relationship whose type is the sanitized function
    name of the call.

    Args:
        tx: Transaction object exposing ``run(query, **parameters)``.
        row: Mapping-like record with the keys ``from``, ``to``,
            ``functionName``, ``input`` and ``timeStamp``.
    """
    # Label each endpoint; the membership test scans the contract_addresses list.
    if row['from'] in contract_addresses:
        from_type = 'Contract'
    else:
        from_type = 'User'
    if row['to'] in contract_addresses:
        to_type = 'Contract'
    else:
        to_type = 'User'

    # Missing (NaN) or empty function names fall back to a fixed placeholder.
    raw_func_name = row['functionName']
    if pd.isna(raw_func_name) or raw_func_name == '':
        raw_func_name = 'UNKNOWN'
    func_name = sanitize_function_name(raw_func_name)

    # TODO: when creating contract nodes, find its name and add it as a node
    # feature; at contract level we could then attach a custom tag generated
    # from the contract content.
    # Labels and the relationship type are interpolated into the query text,
    # while the property values are passed as query parameters.
    merge_sender = f"MERGE (a:{from_type} {{address: $from_address}}) "
    merge_receiver = f"MERGE (b:{to_type} {{address: $to_address}}) "
    create_edge = f"CREATE (a)-[r:{func_name} {{input: $input, timeStamp: $timeStamp}}]->(b)"
    query = merge_sender + merge_receiver + create_edge

    parameters = {
        'from_address': row['from'],
        'to_address': row['to'],
        'input': row['input'],
        'timeStamp': row['timeStamp'],
    }
    tx.run(query, **parameters)

# Execute one managed write transaction per row of the transactions dataframe.
# NOTE(review): newer Neo4j Python drivers expose this call as
# session.execute_write — confirm against the installed driver version.
try:
    with driver.session() as session:
        # total= lets tqdm show real progress; iterrows() has no length.
        for _, row in tqdm(transactions.iterrows(), total=len(transactions)):
            session.write_transaction(create_transaction, row)
finally:
    # Always release the driver, even when a write fails part-way through.
    driver.close()

In [None]:
################# GRAPH CONVOLUTIONAL NETWORK ####################
# user_transactions = pd.read_csv('dataset/user_transactions.csv')
# contract_addresses_with_names = pd.read_csv('dataset/contract_addresses_with_name.csv')
# user_transactions = user_transactions[:100]

# print('data has been loaded')

# Accumulators filled while scanning the transaction table.
node_to_id, current_id = {}, 0       # node name/address -> integer id, and the next free id
edge_index, edge_features = [], []   # one [source_id, target_id] pair and one function name per row
edge_weights = defaultdict(int)      # per-edge transaction counter, starts at 0 for unseen keys

def get_node_name(address):
    """Return the contract name registered for ``address``, else ``address``.

    The lookup filters ``contract_addresses_with_names`` on its
    ``contract_address`` column and takes the first matching
    ``contract_name``; addresses without a match are returned unchanged.
    """
    is_match = contract_addresses_with_names['contract_address'] == address
    matching_names = contract_addresses_with_names.loc[is_match, 'contract_name']
    if not matching_names.empty:
        return matching_names.values[0]
    return address

# Scan every transaction once: give each distinct endpoint a dense integer id
# and record one edge (plus the called function name) per row.
# Expects user_transactions to be a DataFrame with 'from', 'to' and
# 'functionName' columns that was loaded beforehand.
for idx, row in tqdm(user_transactions.iterrows(), total=len(user_transactions)):
    from_name = get_node_name(row['from'])
    to_name = get_node_name(row['to'])

    # Assign an id to each unique address/name (sender first, then receiver).
    for node_name in (from_name, to_name):
        if node_name not in node_to_id:
            node_to_id[node_name] = current_id
            current_id += 1

    src_id, dst_id = node_to_id[from_name], node_to_id[to_name]

    # TODO: we assume the function name is always the same for a given
    # (sender, receiver) pair, so repeated edges are not de-duplicated.
    # if [node_to_id[from_name], node_to_id[to_name]] not in edge_index:
    edge_index.append([src_id, dst_id])
    edge_features.append(row['functionName'])
    # Count transactions per directed edge, keyed by the same id pair that is
    # stored in edge_index.  Keying by node *names* (as before) made the
    # lookup below miss every key and silently produce all-zero weights.
    edge_weights[(src_id, dst_id)] += 1

# One weight per stored edge: the total number of transactions observed for
# that (source, target) pair.  Parallel edges therefore share the same weight.
edge_weights = torch.tensor(
    [edge_weights[(src_id, dst_id)] for src_id, dst_id in edge_index],
    dtype=torch.float,
)
# Shape (2, num_edges); the reshape keeps that shape even with zero edges.
edge_index = torch.tensor(edge_index, dtype=torch.long).reshape(-1, 2).t().contiguous()


print('1')

class SimpleGCN(torch.nn.Module):
    """Two stacked GCNConv layers with a single output channel.

    Args:
        num_features: Size of each input node feature vector.
        hidden_channels: Output width of the first convolution.
    """

    def __init__(self, num_features, hidden_channels):
        super().__init__()
        self.conv1 = GCNConv(num_features, hidden_channels)
        # The second layer maps the hidden representation to one channel.
        self.conv2 = GCNConv(hidden_channels, 1)

    def forward(self, x, edge_index, edge_weight):
        """Apply conv1 -> ReLU -> dropout(0.5) -> conv2 and drop the last axis.

        Dropout is only active while the module is in training mode.  The
        trailing ``squeeze(-1)`` removes the size-1 channel axis, so the
        result presumably holds one scalar per node — confirm against the
        GCNConv output shape.
        """
        hidden = F.relu(self.conv1(x, edge_index, edge_weight=edge_weight))
        hidden = F.dropout(hidden, p=0.5, training=self.training)
        out = self.conv2(hidden, edge_index, edge_weight=edge_weight)
        return out.squeeze(-1)

def get_predictions(model, x, edge_index, edge_weights):
    """Score every edge as the dot product of its endpoint embeddings.

    The model is switched to eval mode and run without gradient tracking, so
    the returned scores carry no autograd history and the model is left in
    eval mode afterwards.

    Args:
        model: Callable module invoked as ``model(x, edge_index, edge_weights)``
            that returns one embedding (or one scalar) per node.
        x: Node feature input forwarded to the model.
        edge_index: Integer tensor whose row 0 holds source node indices and
            row 1 holds target node indices.
        edge_weights: Per-edge weights forwarded to the model.

    Returns:
        A 1-D tensor with one score per column of ``edge_index``.
    """
    model.eval()
    with torch.no_grad():
        node_embeddings = model(x, edge_index, edge_weights)

    # A model that emits one scalar per node yields a 1-D tensor; treat it as
    # width-1 embeddings.  Without this, the final sum(-1) would reduce over
    # the *edge* axis and collapse all scores into a single number.
    if node_embeddings.dim() == 1:
        node_embeddings = node_embeddings.unsqueeze(-1)

    source_embeddings = node_embeddings[edge_index[0]]
    target_embeddings = node_embeddings[edge_index[1]]
    return (source_embeddings * target_embeddings).sum(-1)

# No node attributes are available, so each node gets a one-hot feature
# vector: x is the identity matrix with one row and one column per node.
x = torch.eye(len(node_to_id))
print(x.shape)

# Loss, model (16 hidden channels) and Adam optimizer (learning rate 0.01).
criterion = torch.nn.BCEWithLogitsLoss()
model = SimpleGCN(x.shape[1], 16)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

def train(model, x, edge_index, edge_weights, epochs=100):
    """Debugging stub of the training loop: run one prediction pass and stop.

    In its current state the function puts the model in training mode, and,
    if ``epochs`` is at least 1, zeroes the gradients of the module-level
    ``optimizer``, computes edge predictions once, prints their shape and
    leaves the loop.  No loss is computed, no parameter is updated and the
    function returns ``None``.  The intended loop body is kept commented out
    below.

    Args:
        model: Model passed through to ``get_predictions``.
        x: Node feature input passed through to ``get_predictions``.
        edge_index: Edge index tensor passed through to ``get_predictions``.
        edge_weights: Edge weights passed through to ``get_predictions``.
        epochs: Upper bound of the loop; only the first iteration ever runs.
    """
    model.train()
    # Never filled while the loss computation below stays commented out.
    losses = []
    for epoch in range(epochs):
        # `optimizer` is the module-level Adam instance, not a parameter.
        optimizer.zero_grad()
        # get_predictions switches the model to eval mode and runs under
        # torch.no_grad(), so these predictions cannot be back-propagated.
        preds = get_predictions(model, x, edge_index, edge_weights)
        print(preds.shape)
        # Debug exit: stop after inspecting the prediction shape once.
        break
# NOTE(review): the disabled body below would need predictions that track
# gradients before loss.backward() can work — see the no_grad remark above.
#         loss = criterion(preds, torch.tensor(edge_weights, dtype=torch.float))
#         loss.backward()
#         optimizer.step()
        
#         losses.append(loss.item())
        
#         # Print every 10 epochs
#         if epoch % 10 == 0:
#             print(f"Epoch {epoch}/{epochs}, Loss: {loss.item()}")
    
#     return losses

# losses = train(model, x, edge_index, edge_weights)

# # Plot the losses
# plt.plot(losses)
# plt.xlabel('Epoch')
# plt.ylabel('Loss')
# plt.title('Training Loss')
# plt.show()