<a href="https://colab.research.google.com/github/sallywang147/SCInvarinfer/blob/main/%5Bepxeriment%5D_T5_for_smart_contracts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
!pip install sentencepiece
!pip install transformers
!pip install rich[jupyter]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [18]:
from google.colab import auth
from google.auth import default
from numpy import random
import pandas as pd
import gspread
import gc
#autenticating to google
auth.authenticate_user()
creds, _ = default()
gc = gspread.authorize(creds)
#defining my worksheet
worksheet = gc.open('invariants_line_number').sheet1
#get_all_values gives a list of rows
rows = worksheet.get_all_values()
#Convert to a DataFrame 
cols = ['Source', 'Target', 'Verify_Success']
df = pd.DataFrame(rows, columns=cols)

In [19]:
df

Unnamed: 0,Source,Target,Verify_Success
0,Source,Target,Verify Success
1,1 pragma solidity >=0.4.24 <0.6.0;\n\n2 contra...,14+ assert (y == x + 4);,
2,1 // SPDX-License-Identifier: MIT\n2 pragma ex...,"109+ require(admin == msg.sender, ""Ownable: c...",
3,1 pragma solidity >=0.4.24 <0.6.0;\n\n2 contra...,7+ assert (y == x + 2);,
4,1 pragma solidity >=0.4.24 <0.6.0;\n2 contract...,20+ assert(funcA2(funcA1())==12);\n20+ assert(...,
5,1 pragma solidity >=0.4.24 <0.6.0;\n\n2 // Thi...,24+ assert(a == x + 1);\n32+ assert(a == x);\n...,
6,1 pragma solidity >=0.4.24 <0.6.0;\n2 contract...,20+ assert(funcA2(funcA1())==12);\n20+ assert(...,
7,1 pragma solidity >=0.4.24 <0.6.0;\n\n2 // Thi...,24+ assert(a == x + 1);\n32+ assert(a == x); ...,
8,"1 pragma solidity >=0.4.24<0.6.0;\n2 import ""....",6+ assert(x == y);\n6+ assert(y >= 0);\n19+ ...,
9,pragma solidity >=0.4.24 <0.6.0;\n\n//simple l...,5+ assert(x == msg.sender);\n14+ assert (z == 3);,


In [20]:
# Importing libraries
import os
import numpy as np
import pandas as pd
import plotly.express as px
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler


# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration

from rich.table import Column, Table
from rich import box
from rich.console import Console

# define a rich console logger
console=Console(record=True)

def display_df(df):
  """display dataframe in ASCII format"""

  console=Console()
  table = Table(Column("source_text", justify="center" ), Column("target_text", justify="center"), title="Sample Data",pad_edge=False, box=box.ASCII)

  for i, row in enumerate(df.values.tolist()):
    table.add_row(row[0], row[1])

  console.print(table)

def plot_loss(index_list, loss_list):
  results = {
      "epochs": index_list,
      "cross entropy loss": loss_list,
  }
  df = pd.DataFrame(results)
  fig = px.line(df, x ="epochs", y="cross entropy loss",  title="Evaluation")
  fig.show()

training_logger = Table(Column("Epoch", justify="center" ), 
                        Column("Steps", justify="center"),
                        Column("Cross Entropy Loss", justify="center"), 
                        title="Training Status",pad_edge=False, box=box.ASCII)


In [21]:
# Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [22]:
class YourDataSetClass(Dataset):
  """
  Creating a custom dataset for reading the dataset and 
  loading it into the dataloader to pass it to the neural network for finetuning the model

  """

  def __init__(self, dataframe, tokenizer, source_len, target_len, source_text, target_text):
    self.tokenizer = tokenizer
    self.data = dataframe
    self.source_len = source_len
    self.summ_len = target_len
    self.target_text = self.data[target_text]
    self.source_text = self.data[source_text]

  def __len__(self):
    return len(self.target_text)

  def __getitem__(self, index):
    source_text = str(self.source_text[index])
    target_text = str(self.target_text[index])

    #cleaning data so as to ensure data is in string type
    source_text = ' '.join(source_text.split())
    target_text = ' '.join(target_text.split())
    
    source = self.tokenizer.batch_encode_plus([source_text], \
                                              max_length= self.source_len, \
                                              pad_to_max_length=True, \
                                              truncation=True, \
                                            #  padding='longest',\
                                              return_tensors='pt')
    target = self.tokenizer.batch_encode_plus([target_text], \
                                              max_length= self.summ_len, \
                                              pad_to_max_length=True, \
                                              truncation=True, \
                                           #   padding='longest', \
                                              return_tensors='pt')   
    source_ids = source['input_ids'].squeeze()
    source_mask = source['attention_mask'].squeeze()
    target_ids = target['input_ids'].squeeze()
    target_mask = target['attention_mask'].squeeze()

    return {
        'source_ids': source_ids.to(dtype=torch.long), 
        'source_mask': source_mask.to(dtype=torch.long), 
        'target_ids': target_ids.to(dtype=torch.long),
        'target_ids_y': target_ids.to(dtype=torch.long)
    }

In [23]:
def train(epoch, tokenizer, model, device, loader, optimizer):

  """
  Function to be called for training with the parameters passed from main function

  """

  model.train()
  loss_list = []
  for _, data in enumerate(loader, 0):
    y = data['target_ids'].to(device, dtype = torch.long)
    y_ids = y[:, :-1].contiguous()
    lm_labels = y[:, 1:].clone().detach()
    lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
    ids = data['source_ids'].to(device, dtype = torch.long)
    mask = data['source_mask'].to(device, dtype = torch.long)
    outputs = model(input_ids = ids, attention_mask = mask, \
                    decoder_input_ids=y_ids, labels=lm_labels)  
    total_loss = float(outputs[0].item())
    loss_list.append(total_loss)
    if _%10==0:
      training_logger.add_row(str(epoch), str(_), str(total_loss))
      console.print(training_logger)

    optimizer.zero_grad()
    outputs[0].backward()
    optimizer.step()
    return loss_list



In [24]:
def validate(epoch, tokenizer, model, device, loader):

  """
  Function to evaluate model for predictions

  """
  model.eval()
  predictions = []
  actuals = []
  with torch.no_grad():
      for _, data in enumerate(loader, 0):
          y = data['target_ids'].to(device, dtype = torch.long)
          ids = data['source_ids'].to(device, dtype = torch.long)
          mask = data['source_mask'].to(device, dtype = torch.long)

          generated_ids = model.generate(
              input_ids = ids,
              attention_mask = mask, 
              max_length=700, 
              num_beams=2,
              repetition_penalty=2.5, 
              length_penalty=5.0, 
              early_stopping=True
              )
          print("generated token length: \n", len(generated_ids[0]))
          preds = [tokenizer.decode(g, skip_special_tokens=True, \
                                    clean_up_tokenization_spaces=True) for g in generated_ids]
          target = [tokenizer.decode(t, skip_special_tokens=True, \
                                     clean_up_tokenization_spaces=True)for t in y]
          if _%10==0:
              console.print(f'Completed {_}')
          predictions.extend(preds)
          actuals.extend(target)
  print('predictions: \n', predictions)
  print('actuals: \n', actuals)
  return predictions, actuals

In [25]:
import gc
def T5Trainer(dataframe, source_text, target_text, model_params, output_dir="./output/"):
  
  """
  T5 trainer

  """

  # Set random seeds and deterministic pytorch for reproducibility
  torch.manual_seed(model_params["SEED"]) # pytorch random seed
  np.random.seed(model_params["SEED"]) # numpy random seed
  torch.backends.cudnn.deterministic = True

  # logging
  console.log(f"""[Model]: Loading {model_params["MODEL"]}...\n""")

  # tokenzier for encoding the text
  tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])

  gc.collect()
  torch.cuda.empty_cache()
  os.environ['CUDA_VISIBLE_DEVICES']='0, 1, 2, 3'
  os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
  # Defining the model. We are using t5-base model and added a Language model layer on top for generation of Summary. 
  # Further this model is sent to device (GPU/TPU) for using the hardware.
  model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
  model = model.to(device)
  
  # logging
  console.log(f"[Data]: Reading data...\n")

  # Importing the raw dataset
  dataframe = dataframe[[source_text,target_text]]
  display_df(dataframe.head(2))

  
  # Creation of Dataset and Dataloader
  # Defining the train size. So 80% of the data will be used for training and the rest for validation. 
  train_size = 0.8
  train_dataset=dataframe.sample(frac=train_size,random_state = model_params["SEED"])
  val_dataset=dataframe.drop(train_dataset.index).reset_index(drop=True)
  train_dataset = train_dataset.reset_index(drop=True)

  console.print(f"FULL Dataset: {dataframe.shape}")
  console.print(f"TRAIN Dataset: {train_dataset.shape}")
  console.print(f"TEST Dataset: {val_dataset.shape}\n")


  # Creating the Training and Validation dataset for further creation of Dataloader
  training_set = YourDataSetClass(train_dataset, tokenizer, model_params["MAX_SOURCE_TEXT_LENGTH"], model_params["MAX_TARGET_TEXT_LENGTH"], source_text, target_text)
  val_set = YourDataSetClass(val_dataset, tokenizer, model_params["MAX_SOURCE_TEXT_LENGTH"], model_params["MAX_TARGET_TEXT_LENGTH"], source_text, target_text)


  # Defining the parameters for creation of dataloaders
  train_params = {
      'batch_size': model_params["TRAIN_BATCH_SIZE"],
      'shuffle': True,
      'num_workers': 0
      }


  val_params = {
      'batch_size': model_params["VALID_BATCH_SIZE"],
      'shuffle': False,
      'num_workers': 0
      }


  # Creation of Dataloaders for testing and validation. This will be used down for training and validation stage for the model.
  training_loader = DataLoader(training_set, **train_params)
  val_loader = DataLoader(val_set, **val_params)


  # Defining the optimizer that will be used to tune the weights of the network in the training session. 
  optimizer = torch.optim.Adam(params = model.parameters(), lr=model_params["LEARNING_RATE"])


  # Training loop
  console.log(f'[Initiating Fine Tuning]...\n')

  loss_result = []
  epoch_list = []
  for epoch in range(model_params["TRAIN_EPOCHS"]):
      loss = train(epoch, tokenizer, model, device, training_loader, optimizer)
      loss_result.extend(loss)
      epoch_list.append(epoch)

  plot_loss(epoch_list, loss_result)
  console.log(f"[Saving Model]...\n")
  #Saving the model after training
  path = os.path.join(output_dir, "model_files")
  model.save_pretrained(path)
  tokenizer.save_pretrained(path)


  # evaluating test dataset
  console.log(f"[Initiating Validation]...\n")
  for epoch in range(model_params["VAL_EPOCHS"]):
    predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
    final_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})
    final_df.to_csv(os.path.join(output_dir,'predictions.csv'))
  
  console.save_text(os.path.join(output_dir,'logs.txt'))
  
  console.log(f"[Validation Completed.]\n")
  console.print(f"""[Model] Model saved @ {os.path.join(output_dir, "model_files")}\n""")
  console.print(f"""[Validation] Generation on Validation data saved @ {os.path.join(output_dir,'predictions.csv')}\n""")
  console.print(f"""[Logs] Logs saved @ {os.path.join(output_dir,'logs.txt')}\n""")

In [26]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [27]:
model_params={
    "MODEL":"t5-small",             # model_type: t5-large
    "TRAIN_BATCH_SIZE": 8,          # training batch size: 
    "VALID_BATCH_SIZE":8,          # validation batch size
    "TRAIN_EPOCHS":30,              # number of training epochs:20 seems optimal based on experiments
    "VAL_EPOCHS":1,                # number of validation epochs
    "LEARNING_RATE":1e-4,          # learning rate
    "MAX_SOURCE_TEXT_LENGTH": 1000,  # max length of source text
    "MAX_TARGET_TEXT_LENGTH": 1000,   # max length of target text
    "SEED": random.randint(1000)    # randomized seeds to shuffle test set

}

In [28]:
#To solve CUDA out of memory error
import gc
gc.collect()
torch.cuda.empty_cache()
os.environ['CUDA_VISIBLE_DEVICES']='0, 1, 2, 3'
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

In [29]:
T5Trainer(dataframe=df, source_text="Source", target_text="Target", model_params=model_params, output_dir="outputs")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.





The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).



generated token length: 
 115


predictions: 
 ['Source Source', '1 pragma solidity >=0.4.24 0.6.0; 2 contract B  3 function funcB() public pure returns (uint)  4 return 42; 5  6 constructor() public  7  8 contract A  9 function funcA1() public pure returns (uint)  10 return 42; 11  12 function funcA2(uint x) public pure returns (uint); 17 return x+1;', '(address recipient, uint256 amount) public view returns (uint256)  25 return _totalSupply; 20  21 /** 28 * @dev See IERC20-balanceOf. 39 * - the caller must have a balance of at least amount. 42 * - recipient cannot be the zero address.', "(seller), address(this).balance); 49 // apply our spread to tokens to inflate price relative to total supply 50 uint256 tokensToSend = nShouldPurchase.mul(sellFee).div(10**2); 51 // make sure we don't buy more than the bnb in this contract 43 // calculate the sell fee from this transaction 25 uint256 tokensToSend = nShouldPurcha", '(address newOwner) external onlyOwner  70 owner = newOwner; 50  51 vvisr.safeTransferFrom(from, addre

In [30]:

test_params = {
      'batch_size': 8,
      'shuffle': False,
      'num_workers': 0
      }

class TestDataSetClass(Dataset):
  """
  Creating a custom dataset for reading the dataset and 
  loading it into the dataloader to pass it to the neural network for finetuning the model

  """

  def __init__(self, dataframe, tokenizer, source_len, source_text):
    self.tokenizer = tokenizer
    self.data = dataframe
    self.source_len = source_len
    self.source_text = self.data[source_text]

  def __len__(self):
    return len(self.source_text)

  def __getitem__(self, index):
    source_text = str(self.source_text[index])

    #cleaning data so as to ensure data is in string type
    source_text = ' '.join(source_text.split())
    print('source_text: ', source_text)
    source = self.tokenizer.batch_encode_plus([source_text], \
                                              max_length=self.source_len, \
                                              pad_to_max_length=True, \
                                              truncation=True, \
                                              padding="max_length",\
                                              return_tensors='pt')

    source_ids = source['input_ids'].squeeze()
    source_mask = source['attention_mask'].squeeze()

    return {
        'source_ids': source_ids.to(dtype=torch.long), 
        'source_mask': source_mask.to(dtype=torch.long), 
    }

In [31]:

import gc
#let's get the trained model and never-seen test contracts 
def initialize():
  gc = gspread.authorize(creds)
  trained_model = T5ForConditionalGeneration.from_pretrained("outputs/model_files")
  tokenizer = T5Tokenizer.from_pretrained("t5-small")
  col = ['Test']
  f = open('/content/drive/MyDrive/experiments/baseline benchmark/Replica.sol', "r")
  file = f.read()
  test_df = pd.DataFrame([file], columns=col)
  #Convert to a DataFrame 
  return test_df, trained_model, tokenizer

#test how the model performs on never-seen test contracts
def generate(df, model, tokenizer):
   model.eval()
   test_val = TestDataSetClass(df, tokenizer, source_len=512, source_text="Test")
   test_loader = DataLoader(test_val, **test_params)
   predictions = []
   with torch.no_grad():
      for _, data in enumerate(test_loader, 0):
          ids = data['source_ids']
          mask = data['source_mask'] #.to(device, dtype = torch.long)

          generated_ids = model.generate(
              input_ids = ids,
              attention_mask = mask, 
              max_length=150, 
              num_beams=2,
              repetition_penalty=2.5, 
              length_penalty=1.0, 
              early_stopping=True
              )
          preds = [tokenizer.decode(g, skip_special_tokens=True, \
                                    clean_up_tokenization_spaces=True) for g in generated_ids]
          print("preds: ", preds)
          if _%10==0:
              console.print(f'Completed {_}')
          predictions.extend(preds)
         # print('predictions: \n', predictions)
   return predictions



In [32]:
df2, trained_model, tokenizer = initialize()


This tokenizer was incorrectly instantiated with a model max length of 512 which will be corrected in Transformers v5.
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.



In [33]:
out = generate(df2, trained_model, tokenizer)
print(out)

preds:  ["_newRoot * @param _signature Updater's signature on latest committedRoot * or if signature is invalid* @param success TRUE if the call was executed successfully * @param success TRUE if the call was executed successfully * @param success TRUE if the call was executed successfully * @param success TRUE if the call was executed successfully * @param success TRUE if the call was executed successfully */ event Process( bytes32 indexed messageH"]


["_newRoot * @param _signature Updater's signature on latest committedRoot * or if signature is invalid* @param success TRUE if the call was executed successfully * @param success TRUE if the call was executed successfully * @param success TRUE if the call was executed successfully * @param success TRUE if the call was executed successfully * @param success TRUE if the call was executed successfully */ event Process( bytes32 indexed messageH"]


In [34]:
#for downloading purpose 
#!zip -r /content/T5-model.zip /content/outputs/model_files

In [35]:
line = ''
for letter in df2['Test'][0]:
  print(letter)
  if letter != ';':
    line += letter 
    print("what's added? ", line)
  else:
    break

print(line + ';\n')

/
what's added?  /
/
what's added?  //
 
what's added?  // 
S
what's added?  // S
P
what's added?  // SP
D
what's added?  // SPD
X
what's added?  // SPDX
-
what's added?  // SPDX-
L
what's added?  // SPDX-L
i
what's added?  // SPDX-Li
c
what's added?  // SPDX-Lic
e
what's added?  // SPDX-Lice
n
what's added?  // SPDX-Licen
s
what's added?  // SPDX-Licens
e
what's added?  // SPDX-License
-
what's added?  // SPDX-License-
I
what's added?  // SPDX-License-I
d
what's added?  // SPDX-License-Id
e
what's added?  // SPDX-License-Ide
n
what's added?  // SPDX-License-Iden
t
what's added?  // SPDX-License-Ident
i
what's added?  // SPDX-License-Identi
f
what's added?  // SPDX-License-Identif
i
what's added?  // SPDX-License-Identifi
e
what's added?  // SPDX-License-Identifie
r
what's added?  // SPDX-License-Identifier
:
what's added?  // SPDX-License-Identifier:
 
what's added?  // SPDX-License-Identifier: 
M
what's added?  // SPDX-License-Identifier: M
I
what's added?  // SPDX-License-Identifier

In [36]:
#from google.colab import files
#files.download("/content/T5-model.zip")