# Pretraining data preprocessing

Now, we continue our tutorial series with the steps necessary to extract the most meaningful information from the raw smart contract bytecode. The bytecode is composed of instructions for the Ethereum Virtual Machine (EVM). Each line contains one instruction: an opcode followed by its operand, if it has one, for instance, "PUSH1 0x60".

We could train our LLM on the continuous stream of instructions in each smart contract (SC), but this does not really represent the way the contract is executed. The contract is composed of different functions, and their order of execution varies depending on the code logic. Hence, it makes more sense to separate the SC into distinct functions and use their instructions in the correct sequence to train our model.

We used an external tool to get the code functions for this task. You can find that tool here: [https://github.com/franck44/evm-dis](https://github.com/franck44/evm-dis)

In [None]:
# First we call the necessary libraries
import os
import re
import tiktoken
import logging
import gc
import subprocess

gc.collect()

import pandas as pd
import numpy as np

In [None]:
# disable warning logs from evmdasm tool
# logging.getLogger("evmdasm").setLevel(logging.CRITICAL)

In [None]:
# Define the data we are going to use: only the columns listed in COLS are
# loaded from the verified smart contracts parquet file.
COLS = ['contract_creator', 'contract_address', 'contract_name', 'decompiled_opcodes', 'malicious', 'creation_bytecode']
pretraining_data = pd.read_parquet('/data/forta/ethereum/text/pretraining/raw/verified/verified-smart-contracts.parquet', columns=COLS)

# This notebook also preprocesses the data for the later fine-tuning phase.
# The flags below select what the following cells do:
#   extract_SC_functions       - run the per-function opcode extraction in the last cell
#   anomaly_detection_training - train only on normal contracts and validate on the malicious ones
#   only_evaluation            - only export the opcodes of the malicious contracts for evaluation
# We only want to extract the functions at the moment
extract_SC_functions = True
anomaly_detection_training = False
only_evaluation = False

In [None]:
# Count the contracts of each class by label rather than by position.
# value_counts() orders its result by frequency, so reading it positionally
# would swap the two numbers whenever malicious contracts are the majority
# and would raise an IndexError when only one class is present.
count = pretraining_data['malicious'].value_counts()
number_normal = int((pretraining_data['malicious'] == False).sum())
number_malicious = int((pretraining_data['malicious'] == True).sum())

In [None]:
# Show the names of the columns that were loaded into pretraining_data
pretraining_data.columns

In [None]:
# We define the methods that clean the bytecode data.
# We delete things that do not contribute to the training, such as the
# operands of most PUSH instructions, and keep only the operands handled
# explicitly below (PUSH4, PUSH20 and PUSH32).
_ADDRESS_MASK = '0xffffffffffffffffffffffffffffffffffffffff'


def _exp_2_feature_tokens(opcodes, creator):
    """Turn a flat list of opcode/operand tokens into the list of training tokens.

    Every opcode is kept. Operands (tokens starting with '0x') are dropped,
    except for the operand that directly follows:
      * PUSH4  -> kept as is
      * PUSH20 -> replaced by 'creator' when it equals `creator`, kept when it
                  is the all-ones 20-byte mask, replaced by 'address' otherwise
      * PUSH32 -> kept as is
    Opcodes starting with UNKNOWN or INVALID are reduced to the part before
    the first underscore.

    The operand checks are nested under the opcode branch. As `elif`
    alternatives of the "is not an operand" test they could never match,
    because a token starting with '0x' is never equal to 'PUSH4', 'PUSH20'
    or 'PUSH32'. The loop also visits the final token, which a
    `range(len(opcodes) - 1)` bound would silently discard.
    """
    features = []
    last_position = len(opcodes) - 1
    for position, token in enumerate(opcodes):
        if token.startswith('0x'):
            # Operands are emitted (or dropped) together with their opcode.
            continue
        if token.startswith(('UNKNOWN', 'INVALID')):
            features.append(token.split('_')[0])
            continue
        features.append(token)

        if position == last_position:
            # Truncated input: the opcode has no operand to look at.
            continue
        operand = opcodes[position + 1]
        if not operand.startswith('0x'):
            continue

        if token in ('PUSH4', 'PUSH32'):
            features.append(operand)
        elif token == 'PUSH20':
            if operand == creator:
                features.append('creator')
            elif operand == _ADDRESS_MASK:
                features.append(_ADDRESS_MASK)
            else:
                features.append('address')
    return features


def get_exp_2_features(row):
    """Return the cleaned, space-separated opcode text for one dataset row.

    `row` must provide 'contract_creator' and 'decompiled_opcodes' (a
    whitespace-separated opcode string); it is meant to be used with
    `DataFrame.apply(..., axis=1)`.
    """
    return get_exp_2_features_function(row['decompiled_opcodes'], row['contract_creator'])


def get_exp_2_features_function(opcodes_string, creator):
    """Return the cleaned, space-separated opcode text for an opcode string.

    `opcodes_string` is a whitespace-separated sequence of opcodes and
    operands; `creator` is the contract creator address used to label
    PUSH20 operands.
    """
    return " ".join(_exp_2_feature_tokens(opcodes_string.split(), creator))

In [None]:
# We use the evm-dis tool to get a list of functions
# for each smart contract
# Call decompiler command and return functions
def get_SC_opcodes(bytecode_hex, creator):
    """Disassemble a contract with evm-dis and return its cleaned opcodes.

    The evm-dis driver is run with `--segment --raw` on `bytecode_hex`; its
    standard output is split on the dashed separator line and each resulting
    segment is cleaned with `get_exp_2_features_function`.

    Parameters:
        bytecode_hex: bytecode of the contract, passed to evm-dis as a
            command-line argument.
        creator: contract creator address, forwarded to the cleaning step.

    Returns:
        A string with one line of space-separated tokens per segment, each
        terminated by a newline. An empty string is returned when the tool
        cannot be started (the error is printed), so one bad contract does
        not abort the processing of the whole dataset.
    """
    command = "evm-dis/build/libs/driver-py/__main__.py"
    try:
        result = subprocess.run(['python',
                                 command,
                                 '--segment',
                                 '--raw',
                                 bytecode_hex],
                                stdout=subprocess.PIPE)
    except (OSError, ValueError, TypeError, subprocess.SubprocessError) as error:
        # Without this early return the code below would fail on `None.stdout`.
        print(error)
        return ""

    separator = '--------------------------------------------'
    # Undecodable bytes are replaced instead of raising UnicodeDecodeError.
    functions = result.stdout.decode('ascii', errors='replace').split(separator)

    function_lines = []
    for function in functions:
        # Only lines starting with '00' are used (presumably the instruction
        # offset printed by evm-dis - TODO confirm); the text after the first
        # space is the instruction itself. partition() tolerates a line that
        # has no space at all.
        instructions = [
            instruction.partition(' ')[2]
            for instruction in function.splitlines()
            if instruction.startswith('00')
        ]
        function_line = get_exp_2_features_function(" ".join(instructions), creator)
        function_lines.append(function_line + "\n")

    return "".join(function_lines)

# Get the opcode of the smart contract separated by its functions
def get_SC_functions(row):
    """Extract the per-function opcodes of one contract and store them on disk.

    Reads the creator, address, creation bytecode and malicious label from
    `row`, disassembles the bytecode with `get_SC_opcodes` and hands the
    resulting text to `store_SC_functions`, using the contract address as
    the file name. Nothing is returned.
    """
    fields = ('contract_creator', 'contract_address', 'creation_bytecode', 'malicious')
    creator, address, bytecode_hex, malicious = (row[field] for field in fields)

    store_SC_functions(get_SC_opcodes(bytecode_hex, creator), address, malicious)

def store_SC_functions(SC_function_opcode, file_name, malicious,
                       base_path='/data/forta/ethereum/text/finetuning/training/'):
    """Write the opcode text of one contract to `<base_path>/<label>/<file_name>.csv`.

    Parameters:
        SC_function_opcode: text to write (one function per line).
        file_name: name of the file without extension.
        malicious: truthy to store the file under 'malicious/', otherwise
            it is stored under 'normal/'.
        base_path: root directory of the output; defaults to the
            fine-tuning training directory.

    The label directory is created when it does not exist yet, so a missing
    directory no longer makes every write fail. Errors raised while creating
    the directory or writing the file are printed and not propagated, which
    keeps a long extraction run going.
    """
    label_directory = 'malicious' if bool(malicious) else 'normal'
    directory = os.path.join(base_path, label_directory)
    file_path = os.path.join(directory, file_name + '.csv')
    try:
        os.makedirs(directory, exist_ok=True)
        with open(file_path, 'w') as file:
            file.write(SC_function_opcode)
    except (OSError, TypeError, ValueError) as e:
        print(e)


In [None]:
# Prepare data for pretraining phase
# First clean and process the opcode data
pretraining_data['experiment_2_opcodes'] = pretraining_data.apply(get_exp_2_features, axis=1)
training_data = None
validation_data = None
normal_data = None
malicious_data = None

# Files to store the data
train_file_path = '/data/forta/ethereum/text/pretraining/small_pretraining_train.csv'
val_file_path = '/data/forta/ethereum/text/pretraining/small_pretraining_val.csv'

if only_evaluation:
    # Only export the cleaned opcodes of the malicious contracts
    evaluation_file_path = "/data/forta/ethereum/text/evaluation/malicious-eval.csv"
    pretraining_data.loc[pretraining_data['malicious'] == True, 'experiment_2_opcodes'].to_csv(
        evaluation_file_path, sep=',', index=False)
elif not anomaly_detection_training:
    # Shuffle normal and malicious data if no anomaly detection
    normal_data = pretraining_data.loc[pretraining_data['malicious'] == False]
    malicious_data = pretraining_data.loc[pretraining_data['malicious'] == True]

    normal_data = normal_data.sample(frac = 1)
    malicious_data = malicious_data.sample(frac = 1)

    # Number of rows of each class that go to training (90%). The sizes are
    # taken from the filtered frames themselves so that they always match
    # the class they are applied to.
    percentaje_normal = int(len(normal_data) - (len(normal_data)*0.1))
    percentaje_malicious = int(len(malicious_data) - (len(malicious_data)*0.1))

    # Training takes rows [0, n) and validation rows [n, end): slicing the
    # training part with n-1 would leave one row of each class out of both sets.
    training_data = pd.concat([normal_data[:percentaje_normal], malicious_data[:percentaje_malicious]])
    validation_data = pd.concat([normal_data[percentaje_normal:], malicious_data[percentaje_malicious:]])

    # Mix both classes inside each set
    training_data = training_data.sample(frac = 1)
    validation_data = validation_data.sample(frac = 1)
else:
    # If anomaly detection only train with normal data
    normal_data = pretraining_data.loc[pretraining_data['malicious'] == False]
    malicious_data = pretraining_data.loc[pretraining_data['malicious'] == True]
    normal_data = normal_data.sample(frac = 1)
    malicious_data = malicious_data.sample(frac = 1)
    # All the normal contracts are used for training and all the malicious
    # ones for validation
    training_data = normal_data
    validation_data = malicious_data

In [None]:
if training_data is not None and validation_data is not None:
    # Write the full training and validation sets first
    training_data['experiment_2_opcodes'].to_csv(train_file_path, sep='\t', index=False)
    validation_data['experiment_2_opcodes'].to_csv(val_file_path, sep='\t', index=False)
    if not anomaly_detection_training:
        # Prepare data for anomaly data selection phase: each set is also
        # written split by class, in the order training/validation and
        # normal/malicious.
        per_class_outputs = [
            (training_data, False, '/data/forta/ethereum/text/pretraining/training/normal/normal.csv'),
            (training_data, True, '/data/forta/ethereum/text/pretraining/training/malicious/malicious.csv'),
            (validation_data, False, '/data/forta/ethereum/text/pretraining/validation/normal/normal.csv'),
            (validation_data, True, '/data/forta/ethereum/text/pretraining/validation/malicious/malicious.csv'),
        ]
        for dataset, label, destination in per_class_outputs:
            selected_rows = dataset.loc[dataset['malicious'] == label]
            selected_rows.to_csv(destination, columns=['experiment_2_opcodes'], sep='\t', index=False)

In [None]:
# Run the per-function extraction for every contract (row) when enabled.
# get_SC_functions disassembles the creation bytecode of the row and writes
# the resulting text to disk, so the call is made only for its side effects.
if extract_SC_functions:
    pretraining_data.apply(get_SC_functions, axis=1)