In [1]:
import os
import re
import ast
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE


In [2]:
# Step 1: Load smart contracts (Solidity files)

def load_smart_contracts(contract_dir):
    contracts = []
    labels = []
    for filename in os.listdir(contract_dir):
        if filename.endswith(".sol"):
            with open(os.path.join(contract_dir, filename), 'r') as file:
                contracts.append(file.read())
            # Assuming the label (secure/insecure) is part of the filename
            label = 1 if "secure" in filename else 0
            labels.append(label)
    return contracts, labels

In [3]:
# Step 2: Preprocessing the smart contracts
def preprocess_contracts(contracts):
    preprocessed_contracts = []
    for contract in contracts:
        # Remove comments and unnecessary spaces
        contract = re.sub(r'//.*?(\n|$)|/\*.*?\*/', '', contract, flags=re.S)
        preprocessed_contracts.append(contract)
    return preprocessed_contracts


In [4]:
# Step 3: Feature extraction using TF-IDF
def extract_features(contracts):
    vectorizer = TfidfVectorizer(token_pattern=r'[A-Za-z_][A-Za-z0-9_]*', max_features=1000)
    X = vectorizer.fit_transform(contracts)
    print(f"TF-IDF Vocabulary Size: {len(vectorizer.vocabulary_)}")  # Debugging
    return X

In [5]:
# Step 4: Model Training
def train_model(X, y):
    smote = SMOTE(random_state=42, k_neighbors=1)
    try:
        X_resampled, y_resampled = smote.fit_resample(X, y)
    except ValueError as e:
        print(f"SMOTE Error: {e}")
        return None

    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
    model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(classification_report(y_test, y_pred, zero_division=0))
    return model

# Main execution
contract_dir = './contracts'  # Path to the smart contracts folder
contracts, labels = load_smart_contracts(contract_dir)
if contracts and labels:
    contracts = preprocess_contracts(contracts)
    X = extract_features(contracts)
    y = np.array(labels)
    model = train_model(X, y)
else:
    print("No contracts found or invalid labels.")

TF-IDF Vocabulary Size: 495
Accuracy: 0.859375
              precision    recall  f1-score   support

           0       1.00      0.72      0.84        32
           1       0.78      1.00      0.88        32

    accuracy                           0.86        64
   macro avg       0.89      0.86      0.86        64
weighted avg       0.89      0.86      0.86        64



In [6]:
# Main execution
contract_dir = './contracts'  # Path to the smart contracts folder
contracts, labels = load_smart_contracts(contract_dir)
contracts = preprocess_contracts(contracts)
X = extract_features(contracts)
y = np.array(labels)
model = train_model(X, y)

TF-IDF Vocabulary Size: 495
Accuracy: 0.859375
              precision    recall  f1-score   support

           0       1.00      0.72      0.84        32
           1       0.78      1.00      0.88        32

    accuracy                           0.86        64
   macro avg       0.89      0.86      0.86        64
weighted avg       0.89      0.86      0.86        64



In [None]:
#Analysis of Results
#TF-IDF Vocabulary Size: A vocabulary size of 495 indicates a reasonably diverse set of features extracted from the contracts. 
#                        This size is reflective of the structure and terms used in Solidity code.

#Accuracy:  The overall accuracy of 85.94% suggests the model is performing reasonably well but is not yet perfect.
#                        This indicates that the model can differentiate between secure and insecure contracts effectively but struggles with some edge cases.

#Precision and Recall:

#Class 0 (Insecure Contracts):
#Precision is 1.00, indicating no false positives (no secure contracts were misclassified as insecure).
#Recall is 0.72, meaning 28% of insecure contracts were not identified (false negatives).
#Class 1 (Secure Contracts):
#Precision is 0.78, meaning some insecure contracts were misclassified as secure (false positives).
#Recall is 1.00, indicating all secure contracts were correctly identified.
#This imbalance shows that while the model is cautious about labeling contracts as "secure," it occasionally misses insecure ones.

#Weighted Metrics: Weighted metrics align closely with macro averages because the dataset is balanced (equal numbers of secure and insecure contracts).