## Dataset Preprocessor

Initial experiment setup

- Initial idea: train a classifier that identifies vulnerable lines of code.
- Dataset: SmartBugs Curated (`smartbugs-curated`), using its annotated vulnerable lines of code.




In [8]:
# First, load the map of contracts and their annotated vulnerable lines of code

import os
import sys
import json

# Name of the dataset directory under ../dataset/ that this notebook processes.
dataset = "smartbugs-curated"

path = '../dataset/' + dataset
if dataset == 'smartbugs-curated':
    # Open the annotation file in a context manager so the handle is closed
    # as soon as parsing finishes, and decode explicitly as UTF-8 instead of
    # relying on the platform default encoding.
    with open(path + '/vulnerabilities.json', 'r', encoding='utf-8') as vulnerabilities_file:
        vulnerability_localization = json.load(vulnerabilities_file)


In [19]:


# Prepare the dataset: for each contract, collect the lines of code flagged as vulnerable.
# Contract paths from the annotation file are resolved relative to this directory.
base_path = '../dataset/{}'.format(dataset)
# Function to extract vulnerable lines
def extract_vulnerable_lines(contract_data, base_dir=None):
    """Collect one record per annotated vulnerable line across all contracts.

    Args:
        contract_data: Iterable of contract entries. Each entry is a dict
            providing 'name', 'path' and 'vulnerabilities'; 'pragma' and
            'source' are read when a record is built. Every vulnerability is
            a dict with 'category' and 'lines' (1-based line numbers).
        base_dir: Directory that each contract's 'path' is joined onto. When
            omitted, the notebook-level ``base_path`` variable is used, so
            existing single-argument calls keep working.

    Returns:
        A list of dicts with the keys 'contract_name', 'contract_path',
        'full_path', 'pragma_version', 'source', 'line_number',
        'vulnerability_category' and 'code'. 'code' holds the stripped source
        line, or an empty string when the line number is outside the file.

    Contracts whose file is missing or cannot be read are reported on stdout
    and skipped; out-of-range line numbers are reported but still recorded.
    """
    if base_dir is None:
        # Looked up at call time so the notebook's current setting applies.
        base_dir = base_path

    vulnerable_lines = []

    for contract in contract_data:
        contract_name = contract['name']
        relative_path = contract['path']
        full_path = os.path.join(base_dir, relative_path)

        # Check if file exists
        if not os.path.exists(full_path):
            print(f"Warning: File not found - {full_path}")
            continue

        # Read the file content. Only I/O and decoding failures are treated
        # as "skip this contract"; anything else is a programming error and
        # should surface instead of being hidden behind a printed message.
        try:
            with open(full_path, 'r', encoding='utf-8') as file:
                file_lines = file.readlines()
        except (OSError, UnicodeDecodeError) as e:
            print(f"Error reading {full_path}: {e}")
            continue

        # Process vulnerabilities
        for vulnerability in contract['vulnerabilities']:
            category = vulnerability['category']

            for line_number in vulnerability['lines']:
                # Annotations are 1-based; the list of file lines is 0-based.
                adjusted_line_number = line_number - 1

                # Extract the actual code (if line number is valid)
                code_line = ""
                if 0 <= adjusted_line_number < len(file_lines):
                    code_line = file_lines[adjusted_line_number].strip()
                else:
                    print(f"Warning: Line {line_number} out of range in {full_path}")

                vulnerable_lines.append({
                    'contract_name': contract_name,
                    'contract_path': relative_path,
                    'full_path': full_path,
                    'pragma_version': contract['pragma'],
                    'source': contract['source'],
                    'line_number': line_number,
                    'vulnerability_category': category,
                    'code': code_line
                })

    return vulnerable_lines
        
    

In [21]:
# Run the extraction over every contract entry loaded from the annotation file,
# then show the record count and a small sample as a sanity check.
vulnerable_lines = extract_vulnerable_lines(vulnerability_localization)
print(len(vulnerable_lines))
print(vulnerable_lines[:3])

# Store the extracted records as JSON next to the dataset directory.
with open('../dataset/{}_vulnerable_lines.json'.format(dataset), 'w') as output_file:
    json.dump(vulnerable_lines, output_file, indent=4)



222
[{'contract_name': 'FibonacciBalance.sol', 'contract_path': 'dataset/access_control/FibonacciBalance.sol', 'full_path': '../dataset/smartbugs-curated/dataset/access_control/FibonacciBalance.sol', 'pragma_version': '0.4.22', 'source': 'https://github.com/sigp/solidity-security-blog', 'line_number': 31, 'vulnerability_category': 'access_control', 'code': 'require(fibonacciLibrary.delegatecall(fibSig, withdrawalCounter));'}, {'contract_name': 'FibonacciBalance.sol', 'contract_path': 'dataset/access_control/FibonacciBalance.sol', 'full_path': '../dataset/smartbugs-curated/dataset/access_control/FibonacciBalance.sol', 'pragma_version': '0.4.22', 'source': 'https://github.com/sigp/solidity-security-blog', 'line_number': 38, 'vulnerability_category': 'access_control', 'code': 'require(fibonacciLibrary.delegatecall(msg.data));'}, {'contract_name': 'arbitrary_location_write_simple.sol', 'contract_path': 'dataset/access_control/arbitrary_location_write_simple.sol', 'full_path': '../dataset/s