
### Ethereum Address Scan is an intuitive tool designed for users to easily retrieve detailed information about any Ethereum address. Just input an Ethereum address, and get a comprehensive overview in a user-friendly format.

## Full Project Solution

### Install the necessary libraries.

In [1]:
pip install openai

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install openai[datalib]

zsh:1: no matches found: openai[datalib]
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install urllib3==1.26.6 

Collecting urllib3==1.26.6
  Downloading urllib3-1.26.6-py2.py3-none-any.whl.metadata (44 kB)
Downloading urllib3-1.26.6-py2.py3-none-any.whl (138 kB)
Installing collected packages: urllib3
  Attempting uninstall: urllib3
    Found existing installation: urllib3 2.2.2
    Uninstalling urllib3-2.2.2:
      Successfully uninstalled urllib3-2.2.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
types-requests 2.32.0.20240914 requires urllib3>=2, but you have urllib3 1.26.6 which is incompatible.[0m[31m
[0mSuccessfully installed urllib3-1.26.6
Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.8.0-cp311-cp311-macosx_10_9_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.8.0-cp311-cp311-macosx_10_9_x86_64.whl (1.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.8.0
Note: you may need to restart the kernel to use updated packages.


### Import the libraries and environment file to gain access to the OpenAI API key
#### The key can be generated here: https://platform.openai.com/account/api-keys

In [6]:
import os
from openai import OpenAI

### Authenticate to the API using the API Key
#### Pull the key from an environment variable (recommended), or pass it directly with OpenAI(api_key="your_key_here") to hardcode it

In [7]:

# Read the API key from the OPENAI_API_KEY environment variable so the secret is
# never stored in the notebook source or its saved outputs. Set the variable before
# launching Jupyter; a missing variable raises KeyError here rather than failing
# later with an authentication error.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

### Helper Functions, taken from https://platform.openai.com/docs/guides/fine-tuning docs

### Convert CSV to JSONL

In [9]:
import csv
import json

# Dump every CSV row, keyed by column name, as one JSON object per line
csv_file = 'eth_addresses.csv'  # Replace with your CSV file path
jsonl_file = 'output.jsonl'  # Output JSONL file path

with open(csv_file, mode='r', newline='', encoding='utf-8') as csvfile:
    with open(jsonl_file, mode='w', encoding='utf-8') as jsonlfile:
        # DictReader uses the header row for the keys of each record
        reader = csv.DictReader(csvfile)

        for row in reader:
            # Serialize the record and terminate it with a newline (JSONL)
            jsonlfile.write(json.dumps(row) + '\n')

print(f"Successfully converted {csv_file} to {jsonl_file}")


Successfully converted eth_addresses.csv to output.jsonl


In [11]:
import csv
import json

# Source CSV and destination JSONL locations
csv_file = 'eth_addresses.csv'  # Replace with your CSV file path
jsonl_file = 'output.jsonl'  # Desired output JSONL file path

# System prompt repeated in every training example
SYSTEM_PROMPT = "This tool verifies Ethereum addresses. Simply enter the address you want to retrieve details about"

# CSV columns that make up the assistant's answer, in output order
ANSWER_FIELDS = ("Name", "Account Type", "Contract Type", "Entity", "Label", "Tags")

with open(csv_file, mode='r', newline='', encoding='utf-8') as csvfile:
    with open(jsonl_file, mode='w', encoding='utf-8') as jsonlfile:
        # DictReader uses the header row for the keys of each record
        reader = csv.DictReader(csvfile)

        for row in reader:
            # The user turn is just the address being asked about
            user_turn = {"role": "user", "content": row['Address']}

            # The assistant turn is a comma-separated list of "Field": "value" pairs
            answer = ", ".join(f'"{field}": "{row[field]}"' for field in ANSWER_FIELDS)

            message = {
                "messages": [
                    {"role": "system", "content": SYSTEM_PROMPT},
                    user_turn,
                    {"role": "assistant", "content": answer},
                ]
            }

            # One chat example per line (JSONL)
            jsonlfile.write(json.dumps(message) + "\n")

print(f"Successfully converted {csv_file} to {jsonl_file}")


Successfully converted eth_addresses.csv to output.jsonl


### Check File Format

https://cookbook.openai.com/examples/chat_finetuning_data_prep

In [16]:
data_path = "output.jsonl"

# Parse the JSONL file: one JSON object per line
dataset = []
with open(data_path, 'r', encoding='utf-8') as f:
    for line in f:
        dataset.append(json.loads(line))

# Quick summary of what was loaded
print("Num examples:", len(dataset))
print("First example:")
first_example = dataset[0]
for message in first_example["messages"]:
    print(message)

Num examples: 19138
First example:
{'role': 'user', 'content': '0x8ab7404063ec4dbcfd4598215992dc3f8ec853d7'}
{'role': 'assistant', 'content': '"Name": " Akropolis (AKRO)", "Account Type": "Smart Contract", "Contract Type": "Token", "Entity": "DeFi", "Label": "Legit", "Tags": "DeFi"'}


In [17]:
# Format validation of the loaded examples.
# NOTE(review): check_file_format is not defined in the cells visible here —
# presumably it comes from the helper-functions section; confirm it is defined
# before this cell runs on a fresh kernel.
check_file_format(dataset)

No errors found


### Cost Estimation

In [18]:
# Token count of every training example
conversation_length = [
    num_tokens_from_messages(example["messages"]) for example in dataset
]

# Pricing and default n_epochs estimate
MAX_TOKENS_PER_EXAMPLE = 4096
TARGET_EPOCHS = 5
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

n_train_examples = len(dataset)
examples_at_target = n_train_examples * TARGET_EPOCHS

# Start from the target epoch count, then adjust it when the total number of
# examples seen would fall outside the [MIN_TARGET_EXAMPLES, MAX_TARGET_EXAMPLES] band
n_epochs = TARGET_EPOCHS
if examples_at_target < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif examples_at_target > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

# Each example contributes at most MAX_TOKENS_PER_EXAMPLE tokens to the total
n_billing_tokens_in_dataset = sum(
    min(MAX_TOKENS_PER_EXAMPLE, length) for length in conversation_length
)
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")

# Total tokens across all epochs; used by the cost calculation that follows
num_tokens = n_epochs * n_billing_tokens_in_dataset

Dataset has ~1473815 tokens that will be charged for during training
By default, you'll train for 1 epochs on this dataset
By default, you'll be charged for ~1473815 tokens


In [20]:
# gpt-4o-mini-2024-07-18 cost is $0.0030 / 1K tokens
PRICE_PER_1K_TOKENS = 0.0030

# Scale the token count to thousands, then apply the per-1K price
cost = (num_tokens / 1000) * PRICE_PER_1K_TOKENS
print(cost)

4.421445


### Upload File 
#### Once you have the data validated, the file needs to be uploaded using the 
#### Files API in order to be used with a fine-tuning job

In [21]:
# Upload the training data through the Files API so a fine-tuning job can use it.
# The context manager closes the local file handle as soon as the upload call
# returns; a bare open() would leave it open for the life of the kernel.
with open("output.jsonl", "rb") as training_data:
    uploaded_file = client.files.create(
        file=training_data,
        purpose="fine-tune",
    )

# Display the returned FileObject; its `id` is the value passed as `training_file`
# when the fine-tuning job is created
uploaded_file

FileObject(id='file-44IgH9zSsw2N1UlRmAfpg0nF', bytes=5519454, created_at=1728295894, filename='output.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

### Create fine-tuned model

In [22]:
# Kick off the fine-tuning job.
# The job does not finish immediately: it may sit in a queue behind other jobs,
# and training itself can take minutes or hours depending on the model and the
# size of the dataset.
TRAINING_FILE_ID = "file-44IgH9zSsw2N1UlRmAfpg0nF"
BASE_MODEL = "gpt-4o-mini-2024-07-18"

client.fine_tuning.jobs.create(
    training_file=TRAINING_FILE_ID,
    model=BASE_MODEL,
    hyperparameters={"n_epochs": 3},
)

FineTuningJob(id='ftjob-jjY1hv3Z3mc5sbp9J0xLW1I2', created_at=1728295982, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=2, batch_size='auto', learning_rate_multiplier='auto'), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-Xpq7QD5G5VDB9ibrYpVDO9at', result_files=[], seed=968622431, status='validating_files', trained_tokens=None, training_file='file-44IgH9zSsw2N1UlRmAfpg0nF', validation_file=None, estimated_finish=None, integrations=[], user_provided_suffix=None)

In [23]:
# ID of the fine-tuning job to inspect
job_id = "ftjob-jjY1hv3Z3mc5sbp9J0xLW1I2"

# Fetch the job's current state; its status field can be
# running, succeeded, failed, etc.
client.fine_tuning.jobs.retrieve(fine_tuning_job_id=job_id)

FineTuningJob(id='ftjob-jjY1hv3Z3mc5sbp9J0xLW1I2', created_at=1728295982, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=2, batch_size=25, learning_rate_multiplier=1.8), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-Xpq7QD5G5VDB9ibrYpVDO9at', result_files=[], seed=968622431, status='validating_files', trained_tokens=None, training_file='file-44IgH9zSsw2N1UlRmAfpg0nF', validation_file=None, estimated_finish=None, integrations=[], user_provided_suffix=None)

### Evaluate results 

In [24]:
import io
import pandas as pd

# Once training has finished, the job lists its result file ID(s) in `result_files`.
# Look the ID up from the job itself: the training-file ID refers to the uploaded
# JSONL dataset, not to the per-step metrics CSV this cell is meant to load.
finished_job = client.fine_tuning.jobs.retrieve(job_id)
if not finished_job.result_files:
    raise RuntimeError(
        f"Job {job_id} has no result files yet (status: {finished_job.status}); "
        "re-run this cell after the job has succeeded."
    )
result_file = finished_job.result_files[0]

file_data = client.files.content(result_file)

# The content is binary, so read the bytes and wrap them in a file-like object
file_data_bytes = file_data.read()
file_like_object = io.BytesIO(file_data_bytes)

# Parse the metrics CSV into a DataFrame and display it
df = pd.read_csv(file_like_object)
df

Unnamed: 0,step,train_loss,train_accuracy,valid_loss,valid_mean_token_accuracy
0,1,1.62449,0.74194,,
1,2,1.58815,0.62963,,
2,3,1.68213,0.60000,,
3,4,2.31334,0.60870,,
4,5,1.89790,0.61290,,
...,...,...,...,...,...
500,501,0.16619,0.92593,,
501,502,0.49473,0.79412,,
502,503,0.24529,0.90909,,
503,504,0.28140,0.89286,,


### Iterate on the Model results  

In [4]:
# Launch another fine-tuning run on the same uploaded training file
TRAINING_FILE_ID = "file-44IgH9zSsw2N1UlRmAfpg0nF"
BASE_MODEL = "gpt-4o-mini-2024-07-18"

client.fine_tuning.jobs.create(
    training_file=TRAINING_FILE_ID,
    model=BASE_MODEL,
    hyperparameters={"n_epochs": 3},
)

FineTuningJob(id='ftjob-ddj5DgpOc0khmh3OCDkUxytC', created_at=1709004655, error=Error(code=None, message=None, param=None, error=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=4, batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0613', object='fine_tuning.job', organization_id='org-RZLvEijW4GW0KmC3rLIAjZlu', result_files=[], status='validating_files', trained_tokens=None, training_file='file-IntFuYDWVfJwMp6TpSrJa8aq', validation_file=None, user_provided_suffix=None)

In [15]:
# ID of the fine-tuning job to inspect
job_id = "ftjob-ddj5DgpOc0khmh3OCDkUxytC"

# Fetch the job's current state; its status field can be
# running, succeeded, failed, etc.
client.fine_tuning.jobs.retrieve(fine_tuning_job_id=job_id)

FineTuningJob(id='ftjob-ddj5DgpOc0khmh3OCDkUxytC', created_at=1709004655, error=Error(code=None, message=None, param=None, error=None), fine_tuned_model='ft:gpt-3.5-turbo-0613:keysoft::8wiiPbKa', finished_at=1709005600, hyperparameters=Hyperparameters(n_epochs=4, batch_size=1, learning_rate_multiplier=2), model='gpt-3.5-turbo-0613', object='fine_tuning.job', organization_id='org-RZLvEijW4GW0KmC3rLIAjZlu', result_files=['file-rhw44JG1hrIRtpqXjGE0PK7C'], status='succeeded', trained_tokens=27388, training_file='file-IntFuYDWVfJwMp6TpSrJa8aq', validation_file=None, user_provided_suffix=None)

In [20]:
# Once training has finished, the job lists its result file ID(s) in `result_files`.
# Look the ID up from the job itself: the training-file ID refers to the uploaded
# JSONL dataset, not to the per-step metrics CSV this cell is meant to load.
finished_job = client.fine_tuning.jobs.retrieve(job_id)
if not finished_job.result_files:
    raise RuntimeError(
        f"Job {job_id} has no result files yet (status: {finished_job.status}); "
        "re-run this cell after the job has succeeded."
    )
result_file = finished_job.result_files[0]

file_data = client.files.content(result_file)

# The content is binary, so read the bytes and wrap them in a file-like object
file_data_bytes = file_data.read()
file_like_object = io.BytesIO(file_data_bytes)

# Parse the metrics CSV into a DataFrame and display it
df = pd.read_csv(file_like_object)
df

Unnamed: 0,step,train_loss,train_accuracy,valid_loss,valid_mean_token_accuracy
0,1,0.65891,0.72727,,
1,2,0.77342,0.77778,,
2,3,1.66960,0.76923,,
3,4,0.85210,0.81081,,
4,5,2.34291,0.60870,,
...,...,...,...,...,...
399,400,1.03416,0.66667,,
400,401,0.75915,0.75862,,
401,402,0.38260,0.83871,,
402,403,0.43998,0.86207,,


### Use a fine-tuned model

In [24]:
# Name of the fine-tuned model to query
fine_tuned_model = "ft:gpt-3.5-turbo-0125:personal::AFatPoOi"

response = client.chat.completions.create(
  model=fine_tuned_model,
  messages=[
    # Each turn must be its own dict. Putting both turns in a single dict literal
    # repeats the "role"/"content" keys, so Python keeps only the last pair and
    # the system prompt is silently dropped.
    {"role": "system", "content": "Ethereum Address Scan is an intuitive tool designed for users to easily retrieve detailed information about any Ethereum address. Just input an Ethereum address, and get a comprehensive overview in a user-friendly format."},
    {"role": "user", "content": "Is this address risky - 0x9d5765ae1c95c21d4cc3b1d5bba71bad3b012b68 ?"},
  ]
)

# Print only the text of the first returned choice
print(response.choices[0].message.content)

No, sharing a cryptocurrency address like 0x9d5765ae1c95c21d4cc3b1d5bba71bad3b012b68 is not risky as it is a public identifier for a wallet. However, make sure not to share your private key with anyone.
