In [1]:
import csv
import json
import os

import numpy as np
import pandas as pd

In [None]:
############### DISCRIMINATOR OF EOA AND CONTRACT ADDRESSES *********************

# We checked a sample of eoa_addresses and all of them were EOA addresses (the check can still be run on the remaining ones)
import requests 
import json 
from tqdm import tqdm

def check_if_contract(address, url=None, timeout=30, session=None):
    """Return True when `address` has deployed bytecode, i.e. is a contract.

    Sends a JSON-RPC ``eth_getCode`` request for the ``latest`` block and
    treats any result other than the empty code ``'0x'`` as a contract.

    Args:
        address: Hex address string to look up.
        url: JSON-RPC endpoint. When omitted, the ``ETH_RPC_URL`` environment
            variable is used, falling back to the endpoint that used to be
            hard-coded here.
        timeout: Seconds to wait for the node. Without a timeout a single
            stalled connection would block the whole batch indefinitely.
        session: Optional object exposing ``post`` (for example a
            ``requests.Session``) so repeated lookups can reuse a connection.
            Defaults to the ``requests`` module itself.

    Returns:
        bool: False when the node reports no code at the address, else True.

    Raises:
        RuntimeError: If the node answers with a JSON-RPC error object or
            without a ``result`` field.
        requests.HTTPError: If the HTTP status is 4xx/5xx.
    """
    if url is None:
        # SECURITY(review): this fallback keeps a project key in source control
        # only so existing calls keep working. Set ETH_RPC_URL and rotate the key.
        url = os.environ.get(
            "ETH_RPC_URL",
            "https://mainnet.infura.io/v3/8f890b3a78e740f2bd98be613da634f1",
        )

    payload = {
        "method": "eth_getCode",
        "params": [address, "latest"],
        "id": 1,
        "jsonrpc": "2.0"
    }

    # `json=` serialises the payload and sets the JSON Content-Type header.
    http = requests if session is None else session
    response = http.post(url, json=payload, timeout=timeout)
    response.raise_for_status()

    body = response.json()
    # A JSON-RPC failure arrives as HTTP 200 with an "error" member instead of
    # "result"; report it explicitly rather than as a bare KeyError.
    if "error" in body or "result" not in body:
        raise RuntimeError(
            f"eth_getCode failed for {address!r}: {body.get('error', body)}"
        )

    return body["result"] != '0x'

# Load the input CSV fully into memory: the header row first, then every data row
with open('dataset/eoa_addresses.csv', 'r') as input_file:
    reader = csv.reader(input_file)
    headers = next(reader)
    data = [record for record in reader]

# Guarantee an 'is_eoa' column; a freshly added column starts out blank on every row
if 'is_eoa' not in headers:
    headers.append('is_eoa')
    for row in data:
        row.append('')

# Position of the flag column that is filled in below
is_eoa_index = headers.index('is_eoa')

# Stream rows to the output file, resolving the flag only where it is still blank
with open('dataset/checked_eoa_addresses.csv', 'w', newline='') as output_file:
    writer = csv.writer(output_file)
    writer.writerow(headers)

    for row in tqdm(data, desc="Processing addresses"):
        if not row[is_eoa_index]:
            # The first column is passed as the address; contracts get 0, everything else 1
            row[is_eoa_index] = 0 if check_if_contract(row[0]) else 1
        # Rows that already carried a value are written back unchanged
        writer.writerow(row)

# prev progress bar: Processing addresses:   0%|          | 56/84234 [01:17<32:22:51,  1.38s/it]

In [None]:
#******************* DOWNLOAD CONTRACTS FROM DB ************************

# MongoClient was used in this cell without being imported anywhere in the notebook.
from pymongo import MongoClient

# SECURITY(review): the fallback URI keeps a username and password in source
# control only so the cell keeps working. Set MONGODB_URI and rotate them.
mongo_uri = os.environ.get(
    'MONGODB_URI',
    'mongodb://seshatadmin:uWBOzDTQLXJLiFFF@lg-research-1.uwaterloo.ca:8094/',
)

# Download the contracts collection directly from MongoDB. The context manager
# closes the connection once every document has been pulled into memory.
with MongoClient(mongo_uri) as client:
    db = client['test']
    collection = db['contracts']

    # Retrieve all documents in the collection
    results = collection.find()

    # Convert the cursor to a list of dictionaries while the client is still open
    documents = list(results)

# Save the documents as JSON in a file. default=str stringifies values the json
# module cannot encode natively (such as the ObjectId MongoDB assigns to `_id`
# by default); without it json.dump raises TypeError and leaves a truncated file.
with open('dataset/contracts.json', 'w') as file:
    json.dump(documents, file, default=str)

print("Data saved as contracts.json")

In [None]:

# Load the transactions and flatten nested fields into DataFrame columns
with open('dataset/transactions.json') as f:
    data = json.load(f)

df = pd.json_normalize(data)

column_names = df.columns.to_list()
print(column_names)

# Only sender and recipient are needed from here on. Dropping everything else
# also removes the 'func_args*' columns, so no separate drop is required.
columns_to_keep = ['from', 'to']
df = df.drop(columns=df.columns.difference(columns_to_keep))

# Collect every address seen as sender or recipient. Missing values are dropped
# BEFORE the string cast: astype(str) would otherwise turn them into the literal
# strings 'nan' / 'None', which would then be counted as addresses.
all_addresses = (
    pd.concat([df['from'], df['to']], ignore_index=True)
    .dropna()
    .astype(str)
)

# Sorted array of distinct addresses
unique_addresses = np.unique(all_addresses.to_numpy(dtype=object))

print(len(unique_addresses))

# Load the contracts.json file
with open('dataset/contracts.json') as file:
    contracts_data = json.load(file)

# Set of contract addresses for constant-time membership tests
contract_addresses = {contract['contractAddress'] for contract in contracts_data}

# Split the unique addresses into contract accounts and everything else (EOAs)
eoa_accounts = []
contract_accounts = []

for address in tqdm(unique_addresses, desc="Categorizing Addresses"):
    if address in contract_addresses:
        contract_accounts.append(address)
    else:
        eoa_accounts.append(address)

address_df = pd.DataFrame(unique_addresses, columns=['Address'])

# Save next to the other datasets: the EOA builder cell reads
# 'dataset/unique_addresses.csv', so writing to the working directory
# left that cell without its input file.
address_df.to_csv('dataset/unique_addresses.csv', index=False)

print("Unique addresses saved as unique_addresses.csv")


In [None]:
#################### EOA ADDRESSES DATASET BUILDER ##################

# Go over the unique addresses and look each one up in contract_addresses.
# Addresses that are not contracts are appended to eoa_addresses.csv.

# Read unique addresses from unique_addresses.csv
with open('dataset/unique_addresses.csv', 'r', newline='') as file:
    unique_addresses = {row['Address'] for row in csv.DictReader(file)}

# Read contract addresses from contract_addresses.csv
contract_addresses = set()
with open('dataset/contract_addresses.csv', 'r', newline='') as file:
    # Count data rows for the progress bar. The header line is not a row, so it
    # is excluded; counting it made the bar finish one short of its total.
    total_rows = max(sum(1 for _ in file) - 1, 0)
    file.seek(0)  # Rewind so DictReader starts from the header
    reader = csv.DictReader(file)
    for row in tqdm(reader, total=total_rows, desc="Processing Contract Addresses"):
        contract_addresses.add(row['address'])

# Find EOA addresses: everything that is not a known contract
eoa_addresses = unique_addresses - contract_addresses

# Collect what an earlier run already wrote, so that re-running this cell
# appends only new addresses instead of duplicating every row.
eoa_path = 'dataset/eoa_addresses.csv'
already_saved = set()
if os.path.exists(eoa_path):
    with open(eoa_path, 'r', newline='') as file:
        already_saved = {row.get('address') for row in csv.DictReader(file)}

# Append EOA addresses to eoa_addresses.csv
fieldnames = ['address']
with open(eoa_path, 'a', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    # An empty (or brand-new) file still needs its header row
    if file.tell() == 0:
        writer.writeheader()
    # Sorted so the file contents do not depend on set iteration order
    for address in sorted(eoa_addresses - already_saved):
        writer.writerow({'address': address})

print("EOA addresses saved to eoa_addresses.csv.")
print(len(eoa_addresses)) # 84234


In [None]:
################# PRE-RUN SCRIPT FOR TX_FETCHER.PY ###################

# When tx_fetcher is stopped, re-run these four steps before running it again:
# Step 0: merge the user-tx dataset JSON files (this cell)
# Step 1: extract all top-level keys (addresses) from the newly fetched txs
# Step 2: build the remaining-address list (every address that exists in
#         remaining.csv but not in processed.csv), then delete the latest file
#         and rename new_... to latest_...
# Step 3: update the user-tx CSV dataset

# Directory containing the JSON files
directory = "temp_tx"

# Destination of the merged data
output_file = "dataset/merged_user_transactions.json"

# Start from a merged file left behind by an interrupted run, if there is one.
# Step 3 removes that file only at its very end, so a file that still exists
# has not been fully consumed; overwriting it would lose its contents because
# the temp files it was built from have already been deleted.
merged_data = {}
if os.path.exists(output_file):
    with open(output_file, "r") as file:
        merged_data = json.load(file)

# Counter for skipped files
skipped = 0

# Paths of successfully processed files (deleted at the end)
processed_files = []

# sorted() makes the merge order, and therefore which file wins on duplicate
# keys, independent of the directory listing order.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".json"):
        continue
    file_path = os.path.join(directory, filename)

    with open(file_path, "r") as file:
        try:
            data_list = json.load(file)

            # Stage this file's entries first so that a file which fails
            # part-way through contributes nothing, matching the
            # "Skipping file" message printed below.
            file_entries = {}
            for data in data_list:
                file_entries.update(data)
        except ValueError as e:
            # Covers invalid JSON as well as entries that are not key/value mappings
            skipped += 1
            print(f"Skipping file {filename} due to error: {str(e)}")
            continue

    merged_data.update(file_entries)
    processed_files.append(file_path)

print(f"Skipped {skipped} files.")

# Write the merged data before deleting anything
with open(output_file, "w") as file:
    json.dump(merged_data, file, indent=4)

# Delete the successfully processed files
for file_path in processed_files:
    os.remove(file_path)




In [None]:

# Step1
#To extract all the top level keys (addresses) from new fetched txs
import json
import os
import csv

# Location of the merged transactions dump to read
directory = 'dataset'
filename = 'merged_user_transactions.json'

merged_path = os.path.join(directory, filename)
with open(merged_path, 'r') as f:
    data_list = json.load(f)

# The top-level keys of the dump are taken as the addresses
addresses = data_list.keys()

# Materialise the key view as a list
addresses_list = [*addresses]

# One address per row under a single 'address' header
with open('dataset/processed_eoa_addresses.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['address'])
    writer.writerows([address] for address in addresses_list)


In [None]:
# Step2, TODO: just delete the latest one and rename the new_... to latest_...
#To create the local_remaining_addresses, just finding all addresses that don't exist in processed.csv and exists in remaining.csv
import pandas as pd

# Load data from csv files
remaining_addresses_df = pd.read_csv('dataset/latest_remaining_eoa_addresses.csv')
processed_addresses_df = pd.read_csv('dataset/processed_eoa_addresses.csv')

# Keep the addresses still listed as remaining that have NOT been processed yet
already_processed = remaining_addresses_df['address'].isin(processed_addresses_df['address'])
difference_df = remaining_addresses_df.loc[~already_processed]

# Save these addresses to a new csv file
difference_df.to_csv('dataset/new_remaining_eoa_addresses.csv', index=False)

# Swap the new file into place. os.replace overwrites the destination in a
# single step, so there is no moment at which latest_remaining_... is missing
# (the previous remove-then-rename left such a gap if interrupted in between).
os.replace("dataset/new_remaining_eoa_addresses.csv", "dataset/latest_remaining_eoa_addresses.csv")


In [None]:
#Step 3
# update user-tx csv dataset
import json
import csv

# Read back the merged per-address transactions
with open("dataset/merged_user_transactions.json", "r") as file:
    merged_data = json.load(file)

# Append to the running CSV dataset
with open("dataset/user_transactions.csv", "a", newline='') as file:
    # Column layout of the CSV
    fieldnames = ["address", "timeStamp", "from", "to", "value", "gas", "gasPrice", "input", "contractAddress", "methodId", "functionName"]

    writer = csv.DictWriter(file, fieldnames=fieldnames)

    # An empty file has no header row yet
    if os.stat("dataset/user_transactions.csv").st_size == 0:
        writer.writeheader()

    # Number of addresses whose entry had no usable "normal" transaction list
    skipped_addresses = 0

    # Every column except "address" is copied straight from the transaction
    tx_fields = fieldnames[1:]

    for address, data in merged_data.items():
        normal_txs = data.get("normal") if isinstance(data, dict) else None

        # Anything other than a dict holding a list under "normal" is skipped
        if not isinstance(normal_txs, list):
            skipped_addresses += 1
            continue

        for tx in normal_txs:
            # One CSV row per transaction; absent fields become empty strings
            row = {"address": address}
            row.update((field, tx.get(field, "")) for field in tx_fields)
            writer.writerow(row)

# Report how many addresses were skipped
print(f"Skipped addresses: {skipped_addresses}")

# The merged file is temporary; remove it once its rows are in the CSV
os.remove("dataset/merged_user_transactions.json")






In [2]:
# Which users in the user-tx dataset have exactly 50 transactions?
# NOTE(review): if the intent is "at least 50", the equality test below needs
# to become >= — confirm which one is wanted.
temp_df = pd.read_csv("dataset/user_transactions.csv")

# Transactions per address, as a two-column frame (address, count)
grouped = (
    temp_df
    .groupby('address')
    .size()
    .reset_index(name='count')
)

has_fifty = grouped['count'].eq(50)
target = grouped.loc[has_fifty]
print(target)



                                          address  count
0      0x0000000000000000000000000000000000000000     50
1      0x000000000000000000000000000000000000dead     50
2      0x00000000000357848314f068feca5d42e878a1d9     50
3      0x000000000074b24153d2d44e9d1beb308d8a1eb6     50
4      0x0000000048429ba5463a4a9aa866460087dcebd0     50
...                                           ...    ...
50646  0xfffe695cb09f056e0ddfdd8f0e447037610c945b     50
50647  0xffff14106945bcb267b34711c416aa3085b8865f     50
50648  0xffff3dcb664c3f69b049d121fba7b7d7273961ef     50
50649  0xffff46e05a09314daae9176fc32dba0f4172dcdb     50
50650  0xffff83075509851dca62ca604f191478ff041fd3     50

[28833 rows x 2 columns]


In [4]:
# Number of distinct values in the 'address' column of the user-tx dataset
distinct_addresses = temp_df['address'].unique()
print(len(distinct_addresses))

50651


In [None]:
#BUGFIX: 
''' To refetch the transactions of these 28833 addresses, first run the four pre-run steps
before running the fetch_tx.py script. Then run fetch_tx.py, but update Step 3 so that a transaction whose (from, to, timestamp)
is already present is skipped and any other transaction is appended. Then run the script that finds all unique contract addresses in the new user_tx.csv that were not
in the old user_tx.csv. Next, run the smart contract downloader and verified_smart_contract to first download the new contracts
and then parse them (this yields the final parquet file of the newly parsed contracts). From the parquet files, update
contract_addressed_with_names.csv (append the new unique ones). The EOA addresses stay the same, but the EDA must be rerun for anything related
to contracts. Then rerun all recommenders.
'''