<a href="https://colab.research.google.com/github/sallywang147/SCInvarinfer/blob/main/GPT2_for_smart_contracts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install sentencepiece
!pip install transformers
!pip install rich[jupyter]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.97
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m47.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m101.8 MB/s[0m eta [36m0:00:00[

In [2]:
from google.colab import auth
from google.auth import default
from numpy import random
import gspread
import gc
#autenticating to google
auth.authenticate_user()
creds, _ = default()
gc = gspread.authorize(creds)

In [14]:
import pandas as pd
#defining my worksheet
worksheet = gc.open('short_invariants_for_gpt').sheet1
#get_all_values gives a list of rows
rows = worksheet.get_all_values()
#Convert to a DataFrame 
cols = ['Target']
df = pd.DataFrame(rows, columns=cols)

In [15]:
df

Unnamed: 0,Target
0,3 contract LoopFor {\n\n4 int x;\n5 in...
1,3 contract LoopFor {\n\n4 int x;\n5 in...
2,3 contract LoopFor {\n\n4 int x;\n5 in...
3,8 contract A {\n9 function funcA1() public ...


In [16]:
# Importing libraries
import os
import numpy as np
import pandas as pd
import plotly.express as px
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler


# Importing the T5 modules from huggingface/transformers
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import random
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import torch.nn.functional as F
import csv

from rich.table import Column, Table
from rich import box
from rich.console import Console

# define a rich console logger
console=Console(record=True)

def display_df(df):
  """display dataframe in ASCII format"""

  console=Console()
  table = Table(Column("source_text", justify="center" ), Column("target_text", justify="center"), title="Sample Data",pad_edge=False, box=box.ASCII)

  for i, row in enumerate(df.values.tolist()):
    table.add_row(row[0], row[1])

  console.print(table)

def plot_loss(index_list, loss_list):
  results = {
      "epochs": index_list,
      "cross entropy loss": loss_list,
  }
  df = pd.DataFrame(results)
  fig = px.line(df, x ="epochs", y="cross entropy loss",  title="Evaluation")
  fig.show()

training_logger = Table(Column("Epoch", justify="center" ),
                        Column("Cross Entropy Loss", justify="center"), 
                        title="Training Status",pad_edge=False, box=box.ASCII)


In [17]:
# Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [18]:
model_params={
    "MODEL":"gpt2",             # model_type: t5-large
    "MAX_LENGTH": 1024,  # max length of source text
   # "SEED": random.randint(1000)    # randomized seeds to shuffle test set

}

In [19]:
class GPTDataSetClass(Dataset):
  """
  Creating a custom dataset for reading the dataset and 
  loading it into the dataloader to pass it to the neural network for finetuning the model

  """

  def __init__(self, target_label, truncate=False, \
               gpt2_type=model_params['MODEL'], \
               max_length=model_params["MAX_LENGTH"]):
    self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
    self.target_tokens = []

    for row in df['Target']:
        self.target_tokens.append(torch.tensor(
            self.tokenizer.encode(f"<|{target_label}|>{row[:max_length]}<|endoftext|>")
            ))  
    if truncate:
            self.target_tokens = self.target_tokens[:20000]
    self.length = len(self.target_tokens)   

  def __len__(self):
    return self.length

  def __getitem__(self, index):
    return self.target_tokens[index]   

In [20]:
#Accumulated batch size (since GPT2 is so big)
def pack_tensor(new_tensor, packed_tensor, max_seq_len):
    if packed_tensor is None:
        return new_tensor, True, None
    if new_tensor.size()[1] + packed_tensor.size()[1] > max_seq_len:
        return packed_tensor, False, new_tensor
    else:
        packed_tensor = torch.cat([new_tensor, packed_tensor[:, 1:]], dim=1)
        return packed_tensor, True, None

In [21]:
def train(
    dataset, model, tokenizer,
    batch_size=16, epochs=100, lr=2e-5,
    max_seq_len=400, warmup_steps=200,
    gpt2_type="gpt2", output_dir=".", output_prefix="wreckgar",
    test_mode=False, save_model_on_epoch=False,
):
    acc_steps = 100
    device=torch.device("cuda")
    model = model.cuda()
    model.train()

    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=-1
    )

    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    loss=0
    accumulating_batch_count = 0
    input_tensor = None
    loss_list = []
    epoch_list = []
    for epoch in range(epochs):
        total_loss = []
        for idx, entry in tqdm(enumerate(train_dataloader)):
            (input_tensor, carry_on, remainder) = pack_tensor(entry, input_tensor, 768)

            if carry_on and idx != len(train_dataloader) - 1:
                continue

            input_tensor = input_tensor.to(device)
            outputs = model(input_tensor, labels=input_tensor)  
            loss = outputs[0] 
            total_loss.append(float(loss.item()))       
            loss.backward()                      
            if (accumulating_batch_count % batch_size) == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                model.zero_grad()
  
            accumulating_batch_count += 1
            input_tensor = None

        training_logger.add_row(str(epoch), str(np.mean(total_loss)))       
        if save_model_on_epoch:
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"{output_prefix}-{epoch}.pt"),
            )
        loss_list.append(np.mean(total_loss))
        epoch_list.append(epoch) 
        print(f"for epoch {epoch} the loss is {np.mean(total_loss)}\n")
    console.print(training_logger)   
    plot_loss(epoch_list, loss_list)
    return model

In [22]:
def fine_tune_GPT2(df, model_params):   
  console.log(f"""[Model]: Loading {model_params["MODEL"]}...\n""")

  # tokenzier for encoding the text
  dataset = GPTDataSetClass(df['Target'], truncate=False, gpt2_type=model_params["MODEL"]) 
  #Get the tokenizer and model
  tokenizer = GPT2Tokenizer.from_pretrained(model_params["MODEL"])
  model = GPT2LMHeadModel.from_pretrained(model_params["MODEL"])  
  trained_model = train(dataset, model, tokenizer)
  console.log(f"[Saving Model]...\n")
  #Saving the model after training
  path = os.path.join('/content/output', "model_files")
  model.save_pretrained(path)
  tokenizer.save_pretrained(path)
  console.print(f"""[Model] Model saved @ {os.path.join('/content/output', "model_files")}\n""")
  
  # logging
  console.log(f"[Data]: Reading Raw data...\n")


  # Creation of Dataset and Dataloader
  # Defining the train size. So 80% of the data will be used for training and the rest for validation. 

  console.print(f"FULL Dataset: {df.shape}")
  return trained_model, tokenizer


In [23]:

def generate(
    model,
    tokenizer,
    prompt,
    entry_count=10,
    entry_length=100, #maximum number of words
    top_p=0.8,
    temperature=1.,
):
    model.eval()
    generated_num = 0
    generated_list = []

    filter_value = -float("Inf")

    with torch.no_grad():

        for entry_idx in trange(entry_count):

            entry_finished = False
            generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)

            for i in range(entry_length):
                outputs = model(generated, labels=generated)
                loss, logits = outputs[:2]
                logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)

                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
                    ..., :-1
                ].clone()
                sorted_indices_to_remove[..., 0] = 0

                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                logits[:, indices_to_remove] = filter_value

                next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
                generated = torch.cat((generated, next_token), dim=1)

                if next_token in tokenizer.encode("<|endoftext|>"):
                    entry_finished = True

                if entry_finished:

                    generated_num = generated_num + 1

                    output_list = list(generated.squeeze().numpy())
                    output_text = tokenizer.decode(output_list)
                    generated_list.append(output_text + '\n')
                    break
            
            if not entry_finished:
              output_list = list(generated.squeeze().numpy())
              output_text = f"{tokenizer.decode(output_list)}<|endoftext|>" 
              generated_list.append(output_text + '\n')
                
    return generated_list

#Function to generate multiple sentences. Test data should be a dataframe
def text_generation(model, tokenizer, test_data):
  generated_code = []
  for i in range(len(test_data)):    
    x = generate(model.to('cpu'), tokenizer, test_data['Target'][i], entry_count=1)
    generated_code.append(x)
  return generated_code

#Run the functions to generate the lyrics

def test_fine_tuned_gpt2(model, tokenizer, df): 
    test_set = df.sample(n=1)
    df = df.loc[~df.index.isin(test_set.index)]

    #Reset the indexes
    test_set = test_set.reset_index()
    df = df.reset_index()

    #For the test set only, keep last 20 words in a new column, then remove them from original column
    test_set['Target'] = test_set['Target'].str.split().apply(' '.join)
    generated_code = text_generation(model, tokenizer, test_set)
    print(generated_code)


In [24]:
trained_model, tokenizer = fine_tune_GPT2(df, model_params)

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

4it [00:03,  1.14it/s]


for epoch 0 the loss is 2.2952758073806763



4it [00:00, 46.15it/s]


for epoch 1 the loss is 2.406958818435669



4it [00:00, 38.57it/s]


for epoch 2 the loss is 2.3600988388061523



4it [00:00, 44.34it/s]


for epoch 3 the loss is 2.3811352252960205



4it [00:00, 41.75it/s]


for epoch 4 the loss is 2.290522336959839



4it [00:00, 41.04it/s]


for epoch 5 the loss is 2.3031617403030396



4it [00:00, 50.94it/s]


for epoch 6 the loss is 2.5447524785995483



4it [00:00, 36.30it/s]


for epoch 7 the loss is 2.3493478298187256



4it [00:00, 46.46it/s]


for epoch 8 the loss is 2.3838541507720947



4it [00:00, 38.62it/s]


for epoch 9 the loss is 2.31590473651886



4it [00:00, 52.20it/s]


for epoch 10 the loss is 2.3203909397125244



4it [00:00, 38.68it/s]


for epoch 11 the loss is 2.2965511083602905



4it [00:00, 51.68it/s]


for epoch 12 the loss is 2.345776677131653



4it [00:00, 38.84it/s]


for epoch 13 the loss is 2.3792515993118286



4it [00:00, 44.45it/s]


for epoch 14 the loss is 2.269974708557129



4it [00:00, 51.15it/s]


for epoch 15 the loss is 2.3357656002044678



4it [00:00, 35.71it/s]


for epoch 16 the loss is 2.363720178604126



4it [00:00, 45.06it/s]


for epoch 17 the loss is 2.2410563230514526



4it [00:00, 51.15it/s]


for epoch 18 the loss is 2.3072195053100586



4it [00:00, 38.65it/s]


for epoch 19 the loss is 2.397513747215271



4it [00:00, 51.47it/s]


for epoch 20 the loss is 2.255880832672119



4it [00:00, 44.31it/s]


for epoch 21 the loss is 2.3187962770462036



4it [00:00, 42.90it/s]


for epoch 22 the loss is 2.3306851387023926



4it [00:00, 44.02it/s]


for epoch 23 the loss is 2.4519633054733276



4it [00:00, 33.79it/s]


for epoch 24 the loss is 2.383134126663208



4it [00:00, 48.26it/s]


for epoch 25 the loss is 2.3985060453414917



4it [00:00, 46.77it/s]


for epoch 26 the loss is 2.4440393447875977



4it [00:00, 44.25it/s]


for epoch 27 the loss is 2.4306790828704834



4it [00:00, 44.13it/s]


for epoch 28 the loss is 2.3513855934143066



4it [00:00, 38.91it/s]


for epoch 29 the loss is 2.268486976623535



4it [00:00, 44.07it/s]


for epoch 30 the loss is 2.358740210533142



4it [00:00, 44.75it/s]


for epoch 31 the loss is 2.274142861366272



4it [00:00, 40.50it/s]


for epoch 32 the loss is 2.3044183254241943



4it [00:00, 51.91it/s]


for epoch 33 the loss is 2.3938324451446533



4it [00:00, 43.09it/s]


for epoch 34 the loss is 2.428154468536377



4it [00:00, 38.66it/s]


for epoch 35 the loss is 2.382160186767578



4it [00:00, 41.83it/s]


for epoch 36 the loss is 2.396466851234436



4it [00:00, 51.12it/s]


for epoch 37 the loss is 2.357079029083252



4it [00:00, 38.99it/s]


for epoch 38 the loss is 2.344240427017212



4it [00:00, 51.19it/s]


for epoch 39 the loss is 2.36418616771698



4it [00:00, 40.89it/s]


for epoch 40 the loss is 2.277372360229492



4it [00:00, 44.28it/s]


for epoch 41 the loss is 2.3254311084747314



4it [00:00, 39.00it/s]


for epoch 42 the loss is 2.3244500160217285



4it [00:00, 52.03it/s]


for epoch 43 the loss is 2.38221538066864



4it [00:00, 38.83it/s]


for epoch 44 the loss is 2.2713325023651123



4it [00:00, 38.94it/s]


for epoch 45 the loss is 2.3345158100128174



4it [00:00, 51.86it/s]


for epoch 46 the loss is 2.3468741178512573



4it [00:00, 44.17it/s]


for epoch 47 the loss is 2.297314405441284



4it [00:00, 40.01it/s]


for epoch 48 the loss is 2.3330122232437134



4it [00:00, 44.58it/s]


for epoch 49 the loss is 2.3632397651672363



4it [00:00, 45.44it/s]


for epoch 50 the loss is 2.383010149002075



4it [00:00, 38.73it/s]


for epoch 51 the loss is 2.342361569404602



4it [00:00, 45.12it/s]


for epoch 52 the loss is 2.2631391286849976



4it [00:00, 50.08it/s]


for epoch 53 the loss is 2.375361204147339



4it [00:00, 44.46it/s]


for epoch 54 the loss is 2.3483104705810547



4it [00:00, 38.24it/s]


for epoch 55 the loss is 2.2573763132095337



4it [00:00, 40.20it/s]


for epoch 56 the loss is 2.2885451316833496



4it [00:00, 44.99it/s]


for epoch 57 the loss is 2.324442744255066



4it [00:00, 45.16it/s]


for epoch 58 the loss is 2.4006630182266235



4it [00:00, 50.99it/s]


for epoch 59 the loss is 2.316488265991211



4it [00:00, 44.33it/s]


for epoch 60 the loss is 2.3285120725631714



4it [00:00, 38.56it/s]


for epoch 61 the loss is 2.3062891960144043



4it [00:00, 52.20it/s]


for epoch 62 the loss is 2.454826235771179



4it [00:00, 38.74it/s]


for epoch 63 the loss is 2.3690825700759888



4it [00:00, 40.22it/s]


for epoch 64 the loss is 2.3315967321395874



4it [00:00, 52.36it/s]


for epoch 65 the loss is 2.2720755338668823



4it [00:00, 44.63it/s]


for epoch 66 the loss is 2.244518995285034



4it [00:00, 44.83it/s]


for epoch 67 the loss is 2.3429352045059204



4it [00:00, 42.70it/s]


for epoch 68 the loss is 2.3160425424575806



4it [00:00, 37.67it/s]


for epoch 69 the loss is 2.285865068435669



4it [00:00, 44.92it/s]


for epoch 70 the loss is 2.340181827545166



4it [00:00, 44.14it/s]


for epoch 71 the loss is 2.2769726514816284



4it [00:00, 46.09it/s]


for epoch 72 the loss is 2.3736754655838013



4it [00:00, 38.87it/s]


for epoch 73 the loss is 2.3959951400756836



4it [00:00, 45.28it/s]


for epoch 74 the loss is 2.180710196495056



4it [00:00, 45.15it/s]


for epoch 75 the loss is 2.305603265762329



4it [00:00, 50.55it/s]


for epoch 76 the loss is 2.3272980451583862



4it [00:00, 44.68it/s]


for epoch 77 the loss is 2.4380433559417725



4it [00:00, 38.33it/s]


for epoch 78 the loss is 2.2555383443832397



4it [00:00, 45.35it/s]


for epoch 79 the loss is 2.344031810760498



4it [00:00, 40.79it/s]


for epoch 80 the loss is 2.2490700483322144



4it [00:00, 51.04it/s]


for epoch 81 the loss is 2.2255570888519287



4it [00:00, 38.43it/s]


for epoch 82 the loss is 2.250330328941345



4it [00:00, 44.76it/s]


for epoch 83 the loss is 2.248690366744995



4it [00:00, 51.43it/s]


for epoch 84 the loss is 2.357328772544861



4it [00:00, 37.21it/s]


for epoch 85 the loss is 2.2694050073623657



4it [00:00, 51.39it/s]


for epoch 86 the loss is 2.176340937614441



4it [00:00, 45.43it/s]


for epoch 87 the loss is 2.3398629426956177



4it [00:00, 34.57it/s]


for epoch 88 the loss is 2.2955435514450073



4it [00:00, 44.70it/s]


for epoch 89 the loss is 2.255821943283081



4it [00:00, 44.60it/s]


for epoch 90 the loss is 2.2900243997573853



4it [00:00, 50.88it/s]


for epoch 91 the loss is 2.3152204751968384



4it [00:00, 45.14it/s]


for epoch 92 the loss is 2.2347092628479004



4it [00:00, 38.34it/s]


for epoch 93 the loss is 2.3561123609542847



4it [00:00, 52.06it/s]


for epoch 94 the loss is 2.210537075996399



4it [00:00, 39.00it/s]


for epoch 95 the loss is 2.3293851613998413



4it [00:00, 40.71it/s]


for epoch 96 the loss is 2.2305535078048706



4it [00:00, 42.59it/s]


for epoch 97 the loss is 2.3369861841201782



4it [00:00, 50.95it/s]


for epoch 98 the loss is 2.1981916427612305



4it [00:00, 38.88it/s]


for epoch 99 the loss is 2.1908375024795532



In [25]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [26]:
def generate_test_contrct(path, ratio): 
  col = ['Target']
  f = open(path, "r")
  file = f.read()
  test_df = pd.DataFrame([file], columns=col)
  program_length = len(test_df['Target'][0].split())
  prompt_ratio = ratio
  prompt_length = int(prompt_ratio * program_length)
  return test_df, prompt_length


In [27]:
def truncate_test(df, prompt_length):
  copy_1 = df.copy(deep=True)
  copy_2 = df.copy(deep=True)
  #true
  a = copy_1['Target'].str.split().str[-prompt_length:].apply(' '.join)[0]
  #masked out program 
  b = copy_2['Target'].str.split().str[:-prompt_length].apply(' '.join)[0]
  return a, b

test_df, n = generate_test_contrct('/content/drive/MyDrive/experiments/baseline benchmark/Replica.sol', 0.95)
truth, prompt = truncate_test(test_df, n)
col = ['Target']
prompt_df = pd.DataFrame([prompt], columns=col)
gpt_out = text_generation(trained_model, tokenizer, prompt_df)
print('this is the GPT prompt without T5: \n', prompt_df['Target'][0])
print('this is the GPT prediction without T5: \n', gpt_out)

100%|██████████| 1/1 [00:16<00:00, 16.38s/it]

this is the GPT prompt without T5: 
 1 // SPDX-License-Identifier: MIT OR Apache-2.0 2 //test cross bridge 3 pragma solidity >=0.6.11; 4 contract Replica { 5 /** 6 * @notice Emitted when message is processed 7 * @param messageHash Hash of message that failed to process 8 * @param success TRUE if the call was executed successfully, FALSE if the call reverted 9 * @param returnData the return data from the external call 10 */ 11 event Process(
this is the GPT prediction without T5: 
 [["1 // SPDX-License-Identifier: MIT OR Apache-2.0 2 //test cross bridge 3 pragma solidity >=0.6.11; 4 contract Replica { 5 /** 6 * @notice Emitted when message is processed 7 * @param messageHash Hash of message that failed to process 8 * @param success TRUE if the call was executed successfully, FALSE if the call reverted 9 * @param returnData the return data from the external call 10 */ 11 event Process(uint done, uint retval, uint data) => { 12 // // TODO update: a bug fixes here 3 // 4 public void setHas




In [28]:
class File():

    def __init__(self, name, mode='w'):
        self.f = open(name, mode, buffering=1)
        
    def write(self, string, newline=True):
        if newline:
            self.f.write(string + '\n')
        else:
            self.f.write(string)

In [29]:
for i in out: 
  print(i)

NameError: ignored

In [None]:
print(out)
for row in out:
  with open('/content/contract.sol', 'w') as writefile: 
    for line in row:
      sentence = []
      for letter in line: 
        if letter != ';':
             sentence.append(letter)
        writefile.write(str(sentence)+';\n')

     

In [None]:
with open('/content/contract.sol', 'r') as file: 
   for line in file: 
     print(line)

In [None]:
#To solve CUDA out of memory error; not necesssary here 
import gc
gc.collect()
torch.cuda.empty_cache()
os.environ['CUDA_VISIBLE_DEVICES']='0, 1, 2, 3'
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

In [None]:
#for downloading purpose 
!zip -r /content/model.zip /content/output/model_files

In [None]:
from google.colab import files
files.download("/content/model.zip")