## Dataset Preprocessor

Initial experiment setup

- Initial idea: Classifier to identify vulnerable lines of code. 
- Dataset: SmartBugs Curated, using its annotated vulnerable lines of code




In [2]:
# First, get the map of contracts and respective line of code

import os
import sys
import json

# Name of the dataset directory under ../dataset that this notebook preprocesses.
dataset = "smartbugs-curated"

path = '../dataset/' + dataset
if dataset == 'smartbugs-curated':
    # Open through a context manager so the handle is closed right after parsing,
    # and pin the encoding so the result does not depend on the platform locale.
    with open(path + '/vulnerabilities.json', encoding='utf-8') as vulnerabilities_file:
        vulnerability_localization = json.load(vulnerabilities_file)
else:
    # Fail here with a clear message instead of leaving `vulnerability_localization`
    # undefined and hitting a NameError in a later cell.
    raise ValueError(f"Unsupported dataset: {dataset!r}")


In [3]:


## Now prepare the dataset: for each contract, collect the lines of code flagged as vulnerable.
# Root directory against which each contract's relative `path` entry is resolved.
base_path = f'../dataset/{dataset}'
# Function to extract vulnerable lines
def extract_vulnerable_lines(contract_data):
    """Flatten contract annotations into one record per vulnerable source line.

    Args:
        contract_data: iterable of dicts, each with the keys 'name', 'path'
            (relative to the module-level ``base_path``), 'pragma', 'source'
            and 'vulnerabilities' (a list of dicts with 'category' and
            'lines', the latter holding 1-based line numbers).

    Returns:
        A list of dicts with the keys 'contract_name', 'contract_path',
        'full_path', 'pragma_version', 'source', 'line_number',
        'vulnerability_category' and 'code'. 'code' is the stripped text of
        the referenced line, or an empty string when the line number falls
        outside the file (a warning is printed in that case).

    Contracts whose file is missing or cannot be read are reported with a
    printed message and skipped.
    """
    records = []

    for entry in contract_data:
        name = entry['name']
        rel_path = entry['path']
        source_file = os.path.join(base_path, rel_path)

        # Skip contracts whose source is not present on disk.
        if not os.path.exists(source_file):
            print(f"Warning: File not found - {source_file}")
            continue

        # Load the whole file once; lines are looked up by index below.
        try:
            with open(source_file, 'r', encoding='utf-8') as handle:
                source_lines = handle.readlines()
        except Exception as err:
            print(f"Error reading {source_file}: {err}")
            continue

        line_count = len(source_lines)

        for finding in entry['vulnerabilities']:
            category = finding['category']

            for reported_line in finding['lines']:
                # Annotations are 1-based, the list of lines is 0-based.
                index = reported_line - 1

                if index < 0 or index >= line_count:
                    print(f"Warning: Line {reported_line} out of range in {source_file}")
                    snippet = ""
                else:
                    snippet = source_lines[index].strip()

                record = {
                    'contract_name': name,
                    'contract_path': rel_path,
                    'full_path': source_file,
                    'pragma_version': entry['pragma'],
                    'source': entry['source'],
                    'line_number': reported_line,
                    'vulnerability_category': category,
                    'code': snippet
                }
                records.append(record)

    return records
        
    

In [4]:
# Build one record per annotated vulnerable line and show a quick sanity check.
vulnerable_lines = extract_vulnerable_lines(vulnerability_localization)
print(len(vulnerable_lines))
print(vulnerable_lines[:3])

# Persist the records as pretty-printed JSON next to the dataset directory.
with open(f'../dataset/{dataset}_vulnerable_lines.json', 'w') as out_file:
    out_file.write(json.dumps(vulnerable_lines, indent=4))





222
[{'contract_name': 'FibonacciBalance.sol', 'contract_path': 'dataset/access_control/FibonacciBalance.sol', 'full_path': '../dataset/smartbugs-curated/dataset/access_control/FibonacciBalance.sol', 'pragma_version': '0.4.22', 'source': 'https://github.com/sigp/solidity-security-blog', 'line_number': 31, 'vulnerability_category': 'access_control', 'code': 'require(fibonacciLibrary.delegatecall(fibSig, withdrawalCounter));'}, {'contract_name': 'FibonacciBalance.sol', 'contract_path': 'dataset/access_control/FibonacciBalance.sol', 'full_path': '../dataset/smartbugs-curated/dataset/access_control/FibonacciBalance.sol', 'pragma_version': '0.4.22', 'source': 'https://github.com/sigp/solidity-security-blog', 'line_number': 38, 'vulnerability_category': 'access_control', 'code': 'require(fibonacciLibrary.delegatecall(msg.data));'}, {'contract_name': 'arbitrary_location_write_simple.sol', 'contract_path': 'dataset/access_control/arbitrary_location_write_simple.sol', 'full_path': '../dataset/s

### Now find a way to represent the data
Feature Engineering Pipeline
#### 1. Tokenization (code)

    Tokenize Solidity code by splitting on non-alphanumeric characters.
    Remove comments and unnecessary whitespace.
    Convert tokens into numerical representations (e.g., TF-IDF, one-hot encoding, or embeddings).

#### 2. Presence of External Calls

    Check if the line contains low-level function calls like:
        call, delegatecall, staticcall, send, transfer
    Store this as a binary feature (1 if present, 0 otherwise).

#### 3. Use of require or assert

    Check if require(...) or assert(...) appears in the line.
    Store as a binary feature (1 if present, 0 otherwise).

#### 4. Encoding Categorical Features

    pragma_version: Convert Solidity versions into numerical features (e.g., split into major, minor, and patch).
    vulnerability_category: Use label encoding or one-hot encoding.
    contract_name: Encode as a categorical variable (or use hashing to avoid high-dimensionality).

#### 5. Normalization of line_number

    Scale it to a [0,1] range using Min-Max Scaling.


In [30]:
import re
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Helper function to tokenize Solidity code
def tokenize_code(code):
    """Split a snippet of Solidity code into word tokens.

    `//` line comments and `/* ... */` block comments are removed first;
    every remaining run of word characters (letters, digits, underscore)
    becomes one token.

    Args:
        code: source text to tokenize.

    Returns:
        List of token strings in order of appearance.
    """
    comment_pattern = r"//.*|/\*[\s\S]*?\*/"
    without_comments = re.sub(comment_pattern, "", code)
    return [found.group(0) for found in re.finditer(r"\w+", without_comments)]

# Feature extraction function

# Whole-word patterns. Plain substring checks misfire on very common Solidity
# identifiers: "send" is contained in `msg.sender`, "transfer" in `transferFrom`,
# "call" in `callback`, and "require" in `requirement`.
_EXTERNAL_CALL_RE = re.compile(r"\b(?:call|delegatecall|staticcall|send|transfer)\b")
_REQUIRE_ASSERT_RE = re.compile(r"\b(?:require|assert)\b")
# First `major.minor[.patch]` group in a pragma string such as "^0.4.22",
# "0.4" or ">=0.5.0 <0.7.0".
_PRAGMA_VERSION_RE = re.compile(r"(\d+)\.(\d+)(?:\.(\d+))?")


def _parse_pragma_version(version):
    """Return ``(major, minor, patch)`` as ints; ``(0, 0, 0)`` if nothing parses."""
    found = _PRAGMA_VERSION_RE.search(version) if isinstance(version, str) else None
    if found is None:
        return (0, 0, 0)
    # A missing patch component (e.g. "0.4") counts as 0 so every tuple has length 3.
    return tuple(int(part) if part is not None else 0 for part in found.groups())


def extract_features(vulnerable_lines):
    """Turn vulnerable-line records into a numeric feature table.

    Args:
        vulnerable_lines: list of dicts with the keys 'contract_name',
            'contract_path', 'full_path', 'pragma_version', 'source',
            'line_number', 'vulnerability_category' and 'code'.

    Returns:
        A pandas DataFrame with one row per record and the columns
        'contract_name' and 'vulnerability_category' (label-encoded),
        'line_number' (min-max scaled over the input), 'code_enc'
        (label-encoded code string), 'tokens' (number of tokens returned by
        ``tokenize_code``), 'has_external_call', 'has_require_assert' (0/1),
        'pragma_major', 'pragma_minor', 'pragma_patch' and 'label' (always
        True). The raw 'pragma_version', 'code', 'contract_path', 'full_path'
        and 'source' columns are dropped.
    """
    df = pd.DataFrame(vulnerable_lines)

    # Code encoding: one integer id per distinct code string.
    df["code_enc"] = LabelEncoder().fit_transform(df["code"])

    # Tokenization: only the token count is kept as a feature.
    df["tokens"] = df["code"].apply(tokenize_code).apply(len)

    # Presence of external calls (whole-word match, see the pattern comment above).
    df["has_external_call"] = (
        df["code"].apply(lambda line: bool(_EXTERNAL_CALL_RE.search(line))).astype(int)
    )

    # Presence of `require` or `assert`.
    df["has_require_assert"] = (
        df["code"].apply(lambda line: bool(_REQUIRE_ASSERT_RE.search(line))).astype(int)
    )

    # Split the pragma into numeric components; tolerant of range pragmas,
    # two-component versions, and empty or missing values.
    pragma_parts = [_parse_pragma_version(version) for version in df["pragma_version"]]
    df[["pragma_major", "pragma_minor", "pragma_patch"]] = pd.DataFrame(pragma_parts, index=df.index)

    # Encode categorical variables; a separate encoder per column keeps each
    # fitted mapping tied to exactly one column.
    df["vulnerability_category"] = LabelEncoder().fit_transform(df["vulnerability_category"])
    df["contract_name"] = LabelEncoder().fit_transform(df["contract_name"])

    # Normalize `line_number` to [0, 1] over the rows of this call.
    df["line_number"] = MinMaxScaler().fit_transform(df[["line_number"]])

    # Every input record is a known-vulnerable line.
    df["label"] = True

    return df.drop(columns=["pragma_version", "code", "contract_path", "full_path", "source"])

# Usage: build the feature table from the vulnerable-line records.
processed_df = extract_features(vulnerable_lines)
# Uncomment for a plain-text preview of the first rows:
#print(processed_df.head())

In [31]:
# Show the engineered feature table as this cell's output.
processed_df

Unnamed: 0,contract_name,line_number,vulnerability_category,code_enc,tokens,has_external_call,has_require_assert,pragma_major,pragma_minor,pragma_patch,label
0,68,0.007739,0,115,5,1,1,0,4,22,True
1,68,0.010591,0,116,5,1,1,0,4,22,True
2,70,0.006110,0,109,4,0,1,0,4,25,True
3,85,0.003259,0,59,2,0,0,0,4,24,True
4,86,0.002444,0,67,2,0,0,0,4,24,True
...,...,...,...,...,...,...,...,...,...,...,...
217,97,0.065988,9,158,3,1,0,0,4,0,True
218,100,0.003259,9,156,3,1,0,0,4,18,True
219,100,0.006110,9,94,5,1,0,0,4,18,True
220,104,0.000815,9,90,4,1,0,0,4,0,True


In [None]:
### Generate non-vulnerable lines

In [6]:
### generate training and eval data
import random
# Fix the seed of Python's built-in `random` module so that subsequent draws
# from it are repeatable across runs.
random.seed(42)


