<a href="https://colab.research.google.com/github/sallywang147/SCInvarinfer/blob/main/GPT2_for_smart_contracts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install sentencepiece
!pip install transformers
!pip install rich[jupyter]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from google.colab import auth
from google.auth import default
from numpy import random
import gspread
import gc
#autenticating to google
auth.authenticate_user()
creds, _ = default()
gc = gspread.authorize(creds)

In [3]:
import pandas as pd
#defining my worksheet
worksheet = gc.open('invariants3').sheet1
#get_all_values gives a list of rows
rows = worksheet.get_all_values()
#Convert to a DataFrame 
cols = ['Source', 'Target', 'Verify_Success']
df = pd.DataFrame(rows, columns=cols)

In [4]:
df

Unnamed: 0,Source,Target,Verify_Success
0,pragma solidity >=0.4.24 <0.6.0; contract C {\...,pragma solidity >=0.4.24 <0.6.0; contract C {\...,1
1,pragma solidity >=0.4.24 <0.6.0;\n\ncontract A...,pragma solidity >=0.4.24 <0.6.0;\n\ncontract A...,1
2,"pragma solidity >=0.4.24<0.6.0;\nimport ""./Lib...","pragma solidity >=0.4.24<0.6.0;\nimport ""./Lib...",0
3,pragma solidity >=0.4.24 <0.6.0;\n\ncontract A...,pragma solidity >=0.4.24 <0.6.0;\n\ncontract A...,1
4,pragma solidity >=0.4.24 <0.6.0;\n\ncontract L...,pragma solidity >=0.4.24 <0.6.0;\n\ncontract L...,1
...,...,...,...
65,pragma solidity >=0.4.24 <0.6.0;\n\n// This te...,pragma solidity >=0.4.24 <0.6.0;\n\n// This te...,
66,pragma solidity >=0.4.24 <0.6.0;\ncontract B {...,pragma solidity >=0.4.24 <0.6.0;\ncontract B {...,
67,pragma solidity >=0.4.24 <0.6.0;\n\ncontract E...,pragma solidity >=0.4.24 <0.6.0;\n\ncontract E...,
68,// SPDX-License-Identifier: MIT\npragma experi...,// SPDX-License-Identifier: MIT\npragma experi...,


In [5]:
# Importing libraries
import os
import numpy as np
import pandas as pd
import plotly.express as px
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler


# Importing the T5 modules from huggingface/transformers
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import random
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import torch.nn.functional as F
import csv

from rich.table import Column, Table
from rich import box
from rich.console import Console

# define a rich console logger
console=Console(record=True)

def display_df(df):
  """display dataframe in ASCII format"""

  console=Console()
  table = Table(Column("source_text", justify="center" ), Column("target_text", justify="center"), title="Sample Data",pad_edge=False, box=box.ASCII)

  for i, row in enumerate(df.values.tolist()):
    table.add_row(row[0], row[1])

  console.print(table)

def plot_loss(index_list, loss_list):
  results = {
      "epochs": index_list,
      "cross entropy loss": loss_list,
  }
  df = pd.DataFrame(results)
  fig = px.line(df, x ="epochs", y="cross entropy loss",  title="Evaluation")
  fig.show()

training_logger = Table(Column("Epoch", justify="center" ),
                        Column("Cross Entropy Loss", justify="center"), 
                        title="Training Status",pad_edge=False, box=box.ASCII)


In [6]:
# Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [7]:
model_params={
    "MODEL":"gpt2",             # model_type: t5-large
    "MAX_LENGTH": 1024,  # max length of source text
   # "SEED": random.randint(1000)    # randomized seeds to shuffle test set

}

In [8]:
class GPTDataSetClass(Dataset):
  """
  Creating a custom dataset for reading the dataset and 
  loading it into the dataloader to pass it to the neural network for finetuning the model

  """

  def __init__(self, target_label, truncate=False, \
               gpt2_type=model_params['MODEL'], \
               max_length=model_params["MAX_LENGTH"]):
    self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
    self.target_tokens = []

    for row in df['Target']:
        self.target_tokens.append(torch.tensor(
            self.tokenizer.encode(f"<|{target_label}|>{row[:max_length]}<|endoftext|>")
            ))  
    if truncate:
            self.target_tokens = self.target_tokens[:20000]
    self.length = len(self.target_tokens)   

  def __len__(self):
    return self.length

  def __getitem__(self, index):
    return self.target_tokens[index]   

In [9]:
#Accumulated batch size (since GPT2 is so big)
def pack_tensor(new_tensor, packed_tensor, max_seq_len):
    if packed_tensor is None:
        return new_tensor, True, None
    if new_tensor.size()[1] + packed_tensor.size()[1] > max_seq_len:
        return packed_tensor, False, new_tensor
    else:
        packed_tensor = torch.cat([new_tensor, packed_tensor[:, 1:]], dim=1)
        return packed_tensor, True, None

In [10]:
def train(
    dataset, model, tokenizer,
    batch_size=16, epochs=30, lr=2e-5,
    max_seq_len=400, warmup_steps=200,
    gpt2_type="gpt2", output_dir=".", output_prefix="wreckgar",
    test_mode=False, save_model_on_epoch=False,
):
    acc_steps = 100
    device=torch.device("cuda")
    model = model.cuda()
    model.train()

    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=-1
    )

    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    loss=0
    accumulating_batch_count = 0
    input_tensor = None
    loss_list = []
    epoch_list = []
    for epoch in range(epochs):
        total_loss = []
        for idx, entry in tqdm(enumerate(train_dataloader)):
            (input_tensor, carry_on, remainder) = pack_tensor(entry, input_tensor, 768)

            if carry_on and idx != len(train_dataloader) - 1:
                continue

            input_tensor = input_tensor.to(device)
            outputs = model(input_tensor, labels=input_tensor)  
            loss = outputs[0] 
            total_loss.append(float(loss.item()))       
            loss.backward()                      
            if (accumulating_batch_count % batch_size) == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                model.zero_grad()
  
            accumulating_batch_count += 1
            input_tensor = None

        training_logger.add_row(str(epoch), str(np.mean(total_loss)))       
        if save_model_on_epoch:
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"{output_prefix}-{epoch}.pt"),
            )
        loss_list.append(np.mean(total_loss))
        epoch_list.append(epoch) 
        print(f"for epoch {epoch} the loss is {np.mean(total_loss)}\n")
    console.print(training_logger)   
    plot_loss(epoch_list, loss_list)
    return model

In [11]:
def fine_tune_GPT2(df, model_params):   
  console.log(f"""[Model]: Loading {model_params["MODEL"]}...\n""")

  # tokenzier for encoding the text
  dataset = GPTDataSetClass(df['Target'], truncate=True, gpt2_type=model_params["MODEL"]) 
  #Get the tokenizer and model
  tokenizer = GPT2Tokenizer.from_pretrained(model_params["MODEL"])
  model = GPT2LMHeadModel.from_pretrained(model_params["MODEL"])  
  trained_model = train(dataset, model, tokenizer)
  console.log(f"[Saving Model]...\n")
  #Saving the model after training
  path = os.path.join('/content/output', "model_files")
  model.save_pretrained(path)
  tokenizer.save_pretrained(path)
  console.print(f"""[Model] Model saved @ {os.path.join('/content/output', "model_files")}\n""")
  
  # logging
  console.log(f"[Data]: Reading Raw data...\n")

  # Importing the raw dataset
  display_df(df.head(2))

  # Creation of Dataset and Dataloader
  # Defining the train size. So 80% of the data will be used for training and the rest for validation. 

  console.print(f"FULL Dataset: {df.shape}")
  return trained_model, tokenizer


In [12]:

def generate(
    model,
    tokenizer,
    prompt,
    entry_count=10,
    entry_length=100, #maximum number of words
    top_p=0.8,
    temperature=1.,
):
    model.eval()
    generated_num = 0
    generated_list = []

    filter_value = -float("Inf")

    with torch.no_grad():

        for entry_idx in trange(entry_count):

            entry_finished = False
            generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)

            for i in range(entry_length):
                outputs = model(generated, labels=generated)
                loss, logits = outputs[:2]
                logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)

                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
                    ..., :-1
                ].clone()
                sorted_indices_to_remove[..., 0] = 0

                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                logits[:, indices_to_remove] = filter_value

                next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
                generated = torch.cat((generated, next_token), dim=1)

                if next_token in tokenizer.encode("<|endoftext|>"):
                    entry_finished = True

                if entry_finished:

                    generated_num = generated_num + 1

                    output_list = list(generated.squeeze().numpy())
                    output_text = tokenizer.decode(output_list)
                    generated_list.append(output_text)
                    break
            
            if not entry_finished:
              output_list = list(generated.squeeze().numpy())
              output_text = f"{tokenizer.decode(output_list)}<|endoftext|>" 
              generated_list.append(output_text)
                
    return generated_list

#Function to generate multiple sentences. Test data should be a dataframe
def text_generation(model, tokenizer, test_data):
  generated_code = []
  for i in range(len(test_data)):
    x = generate(model.to('cpu'), tokenizer, test_data['Source'][i], entry_count=1)
    generated_code.append(x)
  return generated_code

#Run the functions to generate the lyrics

def test_fine_tuned_gpt2(model, tokenizer, df): 
    test_set = df.sample(n=1)
    df = df.loc[~df.index.isin(test_set.index)]

    #Reset the indexes
    test_set = test_set.reset_index()
    df = df.reset_index()

    #For the test set only, keep last 20 words in a new column, then remove them from original column
    test_set['Target'] = test_set['Target'].str.split().apply(' '.join)
    test_set['Source'] = test_set['Source'].str.split().apply(' '.join)
    generated_code = text_generation(model, tokenizer, test_set)
    print(generated_code)


In [13]:
trained_model, tokenizer = fine_tune_GPT2(df, model_params)

70it [00:03, 23.19it/s]


for epoch 0 the loss is 2.1594077008111134



70it [00:01, 38.70it/s]


for epoch 1 the loss is 2.166194588797433



70it [00:01, 39.10it/s]


for epoch 2 the loss is 2.0968627146312167



70it [00:01, 37.89it/s]


for epoch 3 the loss is 2.1733724662235807



70it [00:01, 37.97it/s]


for epoch 4 the loss is 2.1015058517456056



70it [00:01, 37.49it/s]


for epoch 5 the loss is 2.1158788749149866



70it [00:01, 40.76it/s]


for epoch 6 the loss is 2.085200844492231



70it [00:01, 38.60it/s]


for epoch 7 the loss is 2.027850573403495



70it [00:01, 38.32it/s]


for epoch 8 the loss is 1.9658605439322336



70it [00:01, 38.11it/s]


for epoch 9 the loss is 1.970819766180856



70it [00:01, 38.43it/s]


for epoch 10 the loss is 1.9131283112934658



70it [00:01, 37.34it/s]


for epoch 11 the loss is 1.9248878444944109



70it [00:01, 37.08it/s]


for epoch 12 the loss is 1.9014925786427088



70it [00:01, 38.33it/s]


for epoch 13 the loss is 1.8670312711170742



70it [00:01, 38.11it/s]


for epoch 14 the loss is 1.8852615458624704



70it [00:01, 38.95it/s]


for epoch 15 the loss is 1.7404375519071307



70it [00:01, 38.64it/s]


for epoch 16 the loss is 1.760209390095302



70it [00:01, 36.87it/s]


for epoch 17 the loss is 1.795272626195635



70it [00:01, 37.79it/s]


for epoch 18 the loss is 1.7597379990986415



70it [00:01, 37.99it/s]


for epoch 19 the loss is 1.7151532513754708



70it [00:01, 38.32it/s]


for epoch 20 the loss is 1.6214135817119053



70it [00:01, 39.28it/s]


for epoch 21 the loss is 1.6458809852600098



70it [00:01, 38.77it/s]


for epoch 22 the loss is 1.6830742291041783



70it [00:01, 38.72it/s]


for epoch 23 the loss is 1.5923302343913488



70it [00:01, 38.26it/s]


for epoch 24 the loss is 1.5801535981042045



70it [00:01, 38.60it/s]


for epoch 25 the loss is 1.5711481298719134



70it [00:01, 37.96it/s]


for epoch 26 the loss is 1.52281653881073



70it [00:01, 38.61it/s]


for epoch 27 the loss is 1.4471357039042883



70it [00:01, 38.31it/s]


for epoch 28 the loss is 1.4692912476403373



70it [00:01, 38.46it/s]

for epoch 29 the loss is 1.4330849715641567






In [14]:
test_fine_tuned_gpt2(trained_model, tokenizer, df)

100%|██████████| 1/1 [00:11<00:00, 11.79s/it]

[['pragma solidity >=0.4.24 <0.6.0; contract Consts { constructor () public { uint a; address b; address c; a = 10; b = address(0x10); c = address(0x12); } }\n\nNotice that we are now using contracts to control how the function will return and the only "contract-specific" way of doing that is through the $contract(x) function. We\'re using contracts so that we can do this without writing the following code in our sample contract:\n\nfunction f() { if (a === 0) return false; return true; } contract Consts { constructor () public { a = 10; b = a + b; } }\n\nThis<|endoftext|>']]





In [66]:
def generate_test_contrct(path, ratio): 
  col = ['Source']
  f = open(path, "r")
  file = f.read()
  test_df = pd.DataFrame([file], columns=col)
  program_length = len(test_df['Source'][0].split())
  prompt_ratio = ratio
  prompt_length = int(prompt_ratio * program_length)
  return test_df, prompt_length

test_df, n = generate_test_contrct('/content/LoopFor.sol', 0.2)
print(test_df)

                                              Source
0  pragma solidity ^0.4.24;\nimport "./../../Libr...


In [77]:
def truncate_test(df, prompt_length):
  copy_1 = df.copy(deep=True)
  copy_2 = df.copy(deep=True)
  print('copy_1: ', copy_1)
  #true
  test_col = ['Truth', 'Prompt']
  df = pd.DataFrame(columns=test_col)
  #print(copy_1['Source'].str.split().str[-prompt_length:].apply(' '.join)[0])
  a = copy_1['Source'].str.split().str[-prompt_length:].apply(' '.join)[0]
  #masked out program 
  print(a)
  b = copy_2['Source'].str.split().str[:-prompt_length].apply(' '.join)[0]
  print(b)
  return a, b
truth, prompt = truncate_test(test_df, n)
col = ['Source']
prompt_df = pd.DataFrame([prompt], columns=col)
out = text_generation(trained_model, tokenizer, prompt_df)

copy_1:                                                Source
0  pragma solidity ^0.4.24; import "./../../Libra...
Empty DataFrame
Columns: [Truth, Prompt]
Index: []
while (x != 0) { y++; x--; } } // test Loop invariant with do-while loop function Bar(uint n) public { require
pragma solidity ^0.4.24; import "./../../Libraries/VeriSolContracts.sol"; //import "./VeriSolContracts.sol"; //import "github.com/microsoft/verisol/blob/master/Libraries/VeriSolContracts.sol"; contract LoopFor { // test Loop invariant with for loop constructor(uint n) public { require (n >= 0); uint y = 0; for (uint x = n; x != 0; x --) { y++; } } // test Loop invariant with while loop function Foo(uint n) public { require (n >= 0); uint y = 0; uint x = n;


100%|██████████| 1/1 [00:20<00:00, 20.65s/it]


In [82]:
print(out)
for row in out:
  with open('cotract.sol', 'w') as writefile: 
    for line in row:
      print(line)
      writefile.write(line + '\n')

[['pragma solidity ^0.4.24; import "./../../Libraries/VeriSolContracts.sol"; //import "./VeriSolContracts.sol"; //import "github.com/microsoft/verisol/blob/master/Libraries/VeriSolContracts.sol"; contract LoopFor { // test Loop invariant with for loop constructor(uint n) public { require (n >= 0); uint y = 0; for (uint x = n; x!= 0; x --) { y++; } } // test Loop invariant with while loop function Foo(uint n) public { require (n >= 0); uint y = 0; uint x = n; for (uint y = n; y!= 0; y --) { y++; } } } // test Limiting to 1 here // *if(function(n) { //-}function CheckVector { function assert(n < 10) if (n < 10) { return true; } } //if(function(n) { //-}function CheckVectorAgain { function assert(n < 10) if (n < 10) { return true; } } // if(<|endoftext|>']]
['pragma solidity ^0.4.24; import "./../../Libraries/VeriSolContracts.sol"; //import "./VeriSolContracts.sol"; //import "github.com/microsoft/verisol/blob/master/Libraries/VeriSolContracts.sol"; contract LoopFor { // test Loop invarian

In [None]:
#To solve CUDA out of memory error; not necesssary here 
import gc
gc.collect()
torch.cuda.empty_cache()
os.environ['CUDA_VISIBLE_DEVICES']='0, 1, 2, 3'
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

In [None]:
#for downloading purpose 
#!zip -r /content/model.zip /content/outputs/model_files