<a href="https://colab.research.google.com/github/sallywang147/SCInvarinfer/blob/main/GPT2_for_smart_contracts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [85]:
!pip install sentencepiece
!pip install transformers
!pip install rich[jupyter]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [86]:
from google.colab import auth
from google.auth import default
from numpy import random
import gspread
import gc
#autenticating to google
auth.authenticate_user()
creds, _ = default()
gc = gspread.authorize(creds)

In [87]:
import pandas as pd
#defining my worksheet
worksheet = gc.open('invariants3').sheet1
#get_all_values gives a list of rows
rows = worksheet.get_all_values()
#Convert to a DataFrame 
cols = ['Source', 'Target', 'Verify_Success']
df = pd.DataFrame(rows, columns=cols)

In [88]:
df

Unnamed: 0,Source,Target,Verify_Success
0,pragma solidity >=0.4.24 <0.6.0; contract C {\...,pragma solidity >=0.4.24 <0.6.0; contract C {\...,1
1,pragma solidity >=0.4.24 <0.6.0;\n\ncontract A...,pragma solidity >=0.4.24 <0.6.0;\n\ncontract A...,1
2,"pragma solidity >=0.4.24<0.6.0;\nimport ""./Lib...","pragma solidity >=0.4.24<0.6.0;\nimport ""./Lib...",0
3,pragma solidity >=0.4.24 <0.6.0;\n\ncontract A...,pragma solidity >=0.4.24 <0.6.0;\n\ncontract A...,1
4,pragma solidity >=0.4.24 <0.6.0;\n\ncontract L...,pragma solidity >=0.4.24 <0.6.0;\n\ncontract L...,1
...,...,...,...
65,pragma solidity >=0.4.24 <0.6.0;\n\n// This te...,pragma solidity >=0.4.24 <0.6.0;\n\n// This te...,
66,pragma solidity >=0.4.24 <0.6.0;\ncontract B {...,pragma solidity >=0.4.24 <0.6.0;\ncontract B {...,
67,pragma solidity >=0.4.24 <0.6.0;\n\ncontract E...,pragma solidity >=0.4.24 <0.6.0;\n\ncontract E...,
68,// SPDX-License-Identifier: MIT\npragma experi...,// SPDX-License-Identifier: MIT\npragma experi...,


In [89]:
# Importing libraries
import os
import numpy as np
import pandas as pd
import plotly.express as px
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler


# Importing the T5 modules from huggingface/transformers
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import random
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import torch.nn.functional as F
import csv

from rich.table import Column, Table
from rich import box
from rich.console import Console

# define a rich console logger
console=Console(record=True)

def display_df(df):
  """display dataframe in ASCII format"""

  console=Console()
  table = Table(Column("source_text", justify="center" ), Column("target_text", justify="center"), title="Sample Data",pad_edge=False, box=box.ASCII)

  for i, row in enumerate(df.values.tolist()):
    table.add_row(row[0], row[1])

  console.print(table)

def plot_loss(index_list, loss_list):
  results = {
      "epochs": index_list,
      "cross entropy loss": loss_list,
  }
  df = pd.DataFrame(results)
  fig = px.line(df, x ="epochs", y="cross entropy loss",  title="Evaluation")
  fig.show()

training_logger = Table(Column("Epoch", justify="center" ),
                        Column("Cross Entropy Loss", justify="center"), 
                        title="Training Status",pad_edge=False, box=box.ASCII)


In [90]:
# Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [91]:
model_params={
    "MODEL":"gpt2",             # model_type: t5-large
    "MAX_LENGTH": 1024,  # max length of source text
   # "SEED": random.randint(1000)    # randomized seeds to shuffle test set

}

In [92]:
class GPTDataSetClass(Dataset):
  """
  Creating a custom dataset for reading the dataset and 
  loading it into the dataloader to pass it to the neural network for finetuning the model

  """

  def __init__(self, target_label, truncate=False, \
               gpt2_type=model_params['MODEL'], \
               max_length=model_params["MAX_LENGTH"]):
    self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
    self.target_tokens = []

    for row in df['Target']:
        self.target_tokens.append(torch.tensor(
            self.tokenizer.encode(f"<|{target_label}|>{row[:max_length]}<|endoftext|>")
            ))  
    if truncate:
            self.target_tokens = self.target_tokens[:20000]
    self.length = len(self.target_tokens)   

  def __len__(self):
    return self.length

  def __getitem__(self, index):
    return self.target_tokens[index]   

In [93]:
#Accumulated batch size (since GPT2 is so big)
def pack_tensor(new_tensor, packed_tensor, max_seq_len):
    if packed_tensor is None:
        return new_tensor, True, None
    if new_tensor.size()[1] + packed_tensor.size()[1] > max_seq_len:
        return packed_tensor, False, new_tensor
    else:
        packed_tensor = torch.cat([new_tensor, packed_tensor[:, 1:]], dim=1)
        return packed_tensor, True, None

In [94]:
def train(
    dataset, model, tokenizer,
    batch_size=16, epochs=100, lr=2e-5,
    max_seq_len=400, warmup_steps=200,
    gpt2_type="gpt2", output_dir=".", output_prefix="wreckgar",
    test_mode=False, save_model_on_epoch=False,
):
    acc_steps = 100
    device=torch.device("cuda")
    model = model.cuda()
    model.train()

    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=-1
    )

    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    loss=0
    accumulating_batch_count = 0
    input_tensor = None
    loss_list = []
    epoch_list = []
    for epoch in range(epochs):
        total_loss = []
        for idx, entry in tqdm(enumerate(train_dataloader)):
            (input_tensor, carry_on, remainder) = pack_tensor(entry, input_tensor, 768)

            if carry_on and idx != len(train_dataloader) - 1:
                continue

            input_tensor = input_tensor.to(device)
            outputs = model(input_tensor, labels=input_tensor)  
            loss = outputs[0] 
            total_loss.append(float(loss.item()))       
            loss.backward()                      
            if (accumulating_batch_count % batch_size) == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                model.zero_grad()
  
            accumulating_batch_count += 1
            input_tensor = None

        training_logger.add_row(str(epoch), str(np.mean(total_loss)))       
        if save_model_on_epoch:
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"{output_prefix}-{epoch}.pt"),
            )
        loss_list.append(np.mean(total_loss))
        epoch_list.append(epoch) 
        print(f"for epoch {epoch} the loss is {np.mean(total_loss)}\n")
    console.print(training_logger)   
    plot_loss(epoch_list, loss_list)
    return model

In [95]:
def fine_tune_GPT2(df, model_params):   
  console.log(f"""[Model]: Loading {model_params["MODEL"]}...\n""")

  # tokenzier for encoding the text
  dataset = GPTDataSetClass(df['Target'], truncate=True, gpt2_type=model_params["MODEL"]) 
  #Get the tokenizer and model
  tokenizer = GPT2Tokenizer.from_pretrained(model_params["MODEL"])
  model = GPT2LMHeadModel.from_pretrained(model_params["MODEL"])  
  trained_model = train(dataset, model, tokenizer)
  console.log(f"[Saving Model]...\n")
  #Saving the model after training
  path = os.path.join('/content/output', "model_files")
  model.save_pretrained(path)
  tokenizer.save_pretrained(path)
  console.print(f"""[Model] Model saved @ {os.path.join('/content/output', "model_files")}\n""")
  
  # logging
  console.log(f"[Data]: Reading Raw data...\n")

  # Importing the raw dataset
  display_df(df.head(2))

  # Creation of Dataset and Dataloader
  # Defining the train size. So 80% of the data will be used for training and the rest for validation. 

  console.print(f"FULL Dataset: {df.shape}")
  return trained_model, tokenizer


In [96]:

def generate(
    model,
    tokenizer,
    prompt,
    entry_count=10,
    entry_length=100, #maximum number of words
    top_p=0.8,
    temperature=1.,
):
    model.eval()
    generated_num = 0
    generated_list = []

    filter_value = -float("Inf")

    with torch.no_grad():

        for entry_idx in trange(entry_count):

            entry_finished = False
            generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)

            for i in range(entry_length):
                outputs = model(generated, labels=generated)
                loss, logits = outputs[:2]
                logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)

                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
                    ..., :-1
                ].clone()
                sorted_indices_to_remove[..., 0] = 0

                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                logits[:, indices_to_remove] = filter_value

                next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
                generated = torch.cat((generated, next_token), dim=1)

                if next_token in tokenizer.encode("<|endoftext|>"):
                    entry_finished = True

                if entry_finished:

                    generated_num = generated_num + 1

                    output_list = list(generated.squeeze().numpy())
                    output_text = tokenizer.decode(output_list)
                    generated_list.append(output_text + '\n')
                    break
            
            if not entry_finished:
              output_list = list(generated.squeeze().numpy())
              output_text = f"{tokenizer.decode(output_list)}<|endoftext|>" 
              generated_list.append(output_text + '\n')
                
    return generated_list

#Function to generate multiple sentences. Test data should be a dataframe
def text_generation(model, tokenizer, test_data):
  generated_code = []
  for i in range(len(test_data)):
    x = generate(model.to('cpu'), tokenizer, test_data['Source'][i], entry_count=1)
    generated_code.append(x)
  return generated_code

#Run the functions to generate the lyrics

def test_fine_tuned_gpt2(model, tokenizer, df): 
    test_set = df.sample(n=1)
    df = df.loc[~df.index.isin(test_set.index)]

    #Reset the indexes
    test_set = test_set.reset_index()
    df = df.reset_index()

    #For the test set only, keep last 20 words in a new column, then remove them from original column
    test_set['Target'] = test_set['Target'].str.split().apply(' '.join)
    test_set['Source'] = test_set['Source'].str.split().apply(' '.join)
    generated_code = text_generation(model, tokenizer, test_set)
    print(generated_code)


In [97]:
trained_model, tokenizer = fine_tune_GPT2(df, model_params)



70it [00:01, 35.82it/s]


for epoch 0 the loss is 2.1835273538316997



70it [00:01, 36.49it/s]


for epoch 1 the loss is 2.1434805699757167



70it [00:02, 34.62it/s]


for epoch 2 the loss is 2.1424297741481237



70it [00:01, 36.48it/s]


for epoch 3 the loss is 2.1495807375226703



70it [00:01, 37.48it/s]


for epoch 4 the loss is 2.136149065835135



70it [00:01, 37.28it/s]


for epoch 5 the loss is 2.089957138470241



70it [00:01, 37.97it/s]


for epoch 6 the loss is 2.1036580562591554



70it [00:01, 39.09it/s]


for epoch 7 the loss is 1.9928932973316738



70it [00:01, 37.32it/s]


for epoch 8 the loss is 1.954841034752982



70it [00:01, 37.17it/s]


for epoch 9 the loss is 1.9683968237468175



70it [00:01, 36.96it/s]


for epoch 10 the loss is 1.9875431707927158



70it [00:01, 37.90it/s]


for epoch 11 the loss is 1.8817782197679793



70it [00:01, 37.98it/s]


for epoch 12 the loss is 1.9208463566643852



70it [00:01, 37.63it/s]


for epoch 13 the loss is 1.8526899133409773



70it [00:01, 37.90it/s]


for epoch 14 the loss is 1.8188412870679582



70it [00:01, 37.87it/s]


for epoch 15 the loss is 1.8018763882773263



70it [00:01, 37.14it/s]


for epoch 16 the loss is 1.7491755383355276



70it [00:01, 38.63it/s]


for epoch 17 the loss is 1.6951265471322197



70it [00:01, 38.93it/s]


for epoch 18 the loss is 1.6914959021977016



70it [00:01, 37.11it/s]


for epoch 19 the loss is 1.6968672105244227



70it [00:01, 37.16it/s]


for epoch 20 the loss is 1.6661079202379498



70it [00:01, 37.71it/s]


for epoch 21 the loss is 1.6526125635419573



70it [00:01, 37.01it/s]


for epoch 22 the loss is 1.6186682326453072



70it [00:01, 38.37it/s]


for epoch 23 the loss is 1.56583936555045



70it [00:01, 39.21it/s]


for epoch 24 the loss is 1.571251610347203



70it [00:01, 39.01it/s]


for epoch 25 the loss is 1.5321653604507446



70it [00:01, 36.72it/s]


for epoch 26 the loss is 1.5580393041883196



70it [00:01, 37.88it/s]


for epoch 27 the loss is 1.525659329550607



70it [00:01, 37.74it/s]


for epoch 28 the loss is 1.481574010848999



70it [00:01, 36.99it/s]


for epoch 29 the loss is 1.4267426490783692



70it [00:01, 38.87it/s]


for epoch 30 the loss is 1.4064831222806657



70it [00:01, 37.84it/s]


for epoch 31 the loss is 1.3608144470623562



70it [00:01, 37.75it/s]


for epoch 32 the loss is 1.32422491141728



70it [00:01, 37.76it/s]


for epoch 33 the loss is 1.387314476285662



70it [00:01, 38.06it/s]


for epoch 34 the loss is 1.2784983617918833



70it [00:01, 37.05it/s]


for epoch 35 the loss is 1.2131601572036743



70it [00:01, 37.18it/s]


for epoch 36 the loss is 1.2408393365996224



70it [00:01, 37.26it/s]


for epoch 37 the loss is 1.1984797767230442



70it [00:01, 37.77it/s]


for epoch 38 the loss is 1.1639610477856228



70it [00:01, 37.85it/s]


for epoch 39 the loss is 1.1175963589123317



70it [00:01, 37.44it/s]


for epoch 40 the loss is 1.0668229835374015



70it [00:01, 37.28it/s]


for epoch 41 the loss is 1.0572307229042053



70it [00:01, 37.80it/s]


for epoch 42 the loss is 1.0112234660557338



70it [00:01, 38.36it/s]


for epoch 43 the loss is 0.9583831293242319



70it [00:01, 38.47it/s]


for epoch 44 the loss is 0.9395110181399754



70it [00:01, 39.31it/s]


for epoch 45 the loss is 0.8952146189553397



70it [00:01, 38.51it/s]


for epoch 46 the loss is 0.9065890431404113



70it [00:01, 37.86it/s]


for epoch 47 the loss is 0.9014723334993635



70it [00:01, 37.57it/s]


for epoch 48 the loss is 0.8887053625924247



70it [00:01, 38.98it/s]


for epoch 49 the loss is 0.7516513569014412



70it [00:01, 37.30it/s]


for epoch 50 the loss is 0.8135856074946267



70it [00:01, 37.51it/s]


for epoch 51 the loss is 0.8208842762878963



70it [00:01, 37.33it/s]


for epoch 52 the loss is 0.8189783794539315



70it [00:01, 37.92it/s]


for epoch 53 the loss is 0.8022054757390703



70it [00:01, 37.96it/s]


for epoch 54 the loss is 0.7386848168713706



70it [00:01, 38.93it/s]


for epoch 55 the loss is 0.6672179290226528



70it [00:01, 37.82it/s]


for epoch 56 the loss is 0.7747894900185721



70it [00:01, 37.77it/s]


for epoch 57 the loss is 0.6984059308256422



70it [00:01, 37.13it/s]


for epoch 58 the loss is 0.693394057239805



70it [00:01, 37.75it/s]


for epoch 59 the loss is 0.7052660422665732



70it [00:01, 38.13it/s]


for epoch 60 the loss is 0.6638242789677211



70it [00:01, 38.34it/s]


for epoch 61 the loss is 0.66578973702022



70it [00:01, 37.79it/s]


for epoch 62 the loss is 0.7229002160685403



70it [00:01, 38.26it/s]


for epoch 63 the loss is 0.6559910710368838



70it [00:01, 37.55it/s]


for epoch 64 the loss is 0.6494922633681979



70it [00:01, 37.04it/s]


for epoch 65 the loss is 0.6562515599387032



70it [00:01, 37.39it/s]


for epoch 66 the loss is 0.6965244033506939



70it [00:01, 37.47it/s]


for epoch 67 the loss is 0.7117793632405145



70it [00:01, 37.86it/s]


for epoch 68 the loss is 0.6331893290792193



70it [00:01, 37.91it/s]


for epoch 69 the loss is 0.6714016148022243



70it [00:01, 37.72it/s]


for epoch 70 the loss is 0.6367611472095762



70it [00:01, 38.26it/s]


for epoch 71 the loss is 0.6077710079295294



70it [00:01, 37.89it/s]


for epoch 72 the loss is 0.6503164431878499



70it [00:01, 38.68it/s]


for epoch 73 the loss is 0.5495846318347114



70it [00:01, 38.08it/s]


for epoch 74 the loss is 0.6193690849202019



70it [00:01, 37.67it/s]


for epoch 75 the loss is 0.7034207637820925



70it [00:01, 36.92it/s]


for epoch 76 the loss is 0.6599002399614879



70it [00:01, 38.65it/s]


for epoch 77 the loss is 0.5920100007738386



70it [00:01, 37.97it/s]


for epoch 78 the loss is 0.6166981407574245



70it [00:01, 37.70it/s]


for epoch 79 the loss is 0.614609352179936



70it [00:01, 37.54it/s]


for epoch 80 the loss is 0.6044936180114746



70it [00:01, 38.71it/s]


for epoch 81 the loss is 0.5402834832668304



70it [00:01, 37.39it/s]


for epoch 82 the loss is 0.6004708728619984



70it [00:01, 37.92it/s]


for epoch 83 the loss is 0.6089081057480403



70it [00:01, 36.84it/s]


for epoch 84 the loss is 0.5455794206687382



70it [00:01, 36.53it/s]


for epoch 85 the loss is 0.6207823932170868



70it [00:01, 39.48it/s]


for epoch 86 the loss is 0.4904466211795807



70it [00:01, 37.63it/s]


for epoch 87 the loss is 0.5558377210583005



70it [00:01, 37.51it/s]


for epoch 88 the loss is 0.5351431889193399



70it [00:01, 38.28it/s]


for epoch 89 the loss is 0.5495967469045094



70it [00:01, 38.06it/s]


for epoch 90 the loss is 0.5183845707348415



70it [00:01, 38.14it/s]


for epoch 91 the loss is 0.5417005653892245



70it [00:01, 37.97it/s]


for epoch 92 the loss is 0.4821667262486049



70it [00:01, 38.37it/s]


for epoch 93 the loss is 0.505840204017503



70it [00:01, 39.03it/s]


for epoch 94 the loss is 0.5181684936795916



70it [00:01, 38.49it/s]


for epoch 95 the loss is 0.5012339136430195



70it [00:01, 38.98it/s]


for epoch 96 the loss is 0.5401353819029672



70it [00:01, 36.55it/s]


for epoch 97 the loss is 0.593966549209186



70it [00:01, 37.07it/s]


for epoch 98 the loss is 0.5267473357064384



70it [00:01, 37.88it/s]


for epoch 99 the loss is 0.5171846487692424



In [106]:
def generate_test_contrct(path, ratio): 
  col = ['Source']
  f = open(path, "r")
  file = f.read()
  test_df = pd.DataFrame([file], columns=col)
  program_length = len(test_df['Source'][0].split())
  prompt_ratio = ratio
  prompt_length = int(prompt_ratio * program_length)
  return test_df, prompt_length

test_df, n = generate_test_contrct('/content/Bank.sol', 0.2)
print(test_df)

                                              Source
0  /**\n *Submitted for verification at Etherscan...


In [107]:
def truncate_test(df, prompt_length):
  copy_1 = df.copy(deep=True)
  copy_2 = df.copy(deep=True)
  #true
  test_col = ['Truth', 'Prompt']
  df = pd.DataFrame(columns=test_col)
  #print(copy_1['Source'].str.split().str[-prompt_length:].apply(' '.join)[0])
  a = copy_1['Source'].str.split().str[-prompt_length:].apply(' '.join)[0]
  #masked out program 
  b = copy_2['Source'].str.split().str[:-prompt_length].apply(' '.join)[0]
  return a, b
truth, prompt = truncate_test(test_df, n)
col = ['Source']
prompt_df = pd.DataFrame([prompt], columns=col)
out = text_generation(trained_model, tokenizer, prompt_df)

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:11<00:00, 11.17s/it]


In [108]:
print(out)
for row in out:
  with open('cotract.sol', 'w') as writefile: 
    for line in row:
      print(line)
      writefile.write(line + '\n')

[['/** *Submitted for verification at Etherscan.io on 2020-10-07 */ // File: openzeppelin-solidity-2.3.0/contracts/ownership/Ownable.sol //test for algebraic invariaantsss pragma solidity ^0.5.0; contract Bank{ //reentrant here function work(uint256 id, address goblin, uint256 loan, uint256 maxReturn, bytes calldata data) external payable onlyEOA accrue(msg.value) { // 1. Sanity check the input position, or add a new position of ID is 0. if (id == 0) { id = nextPositionID++; positions[id].goblin = goblin; positions[id].owner = msg.sender; } else { require(id < nextPositionID, "bad position id"); } emit Work(id, loan); // 2. Make sure the goblin can accept more debt and remove the existing debt. uint256 debt = _removeDebt(id).add(loan); // 3. Perform the actual work, using a new scope to avoid stack-too-deep errors. uint256 back; { uint256 sendETH = msg.value.add(loan); uint256 beforeETH = address(this).balance.sub(sendETH); Goblin(goblin).work.value(sendETH)(id, msg.sender, debt, data)

In [None]:
#To solve CUDA out of memory error; not necesssary here 
import gc
gc.collect()
torch.cuda.empty_cache()
os.environ['CUDA_VISIBLE_DEVICES']='0, 1, 2, 3'
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

In [101]:
#for downloading purpose 
!zip -r /content/model.zip /content/output/model_files

  adding: content/output/model_files/ (stored 0%)
  adding: content/output/model_files/special_tokens_map.json (deflated 74%)
  adding: content/output/model_files/tokenizer_config.json (deflated 69%)
  adding: content/output/model_files/pytorch_model.bin (deflated 9%)
  adding: content/output/model_files/config.json (deflated 51%)
  adding: content/output/model_files/vocab.json (deflated 68%)
  adding: content/output/model_files/merges.txt (deflated 53%)
  adding: content/output/model_files/generation_config.json (deflated 24%)


In [102]:
from google.colab import files
files.download("/content/model.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>