In [1]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import time
import random
import torch.optim as optim
from tqdm import tqdm
import pandas as pd
import os
from transformers import RobertaTokenizer, T5ForConditionalGeneration
import nltk

# Testing

In [None]:
tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-small')
model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-small')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

In [None]:
model

T5ForConditionalGeneration(
  (shared): Embedding(32100, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32100, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [None]:
text = "def greet(user): print(f'hello <extra_id_0>!')"
input_ids = tokenizer(text, return_tensors="pt").input_ids

# simply generate a single sequence
generated_ids = model.generate(input_ids, max_length=10)
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

<extra_id_0>user.id) print(f


In [None]:
text = "def return_something(): result = 1 + 4 return result"
input_ids = tokenizer(text, return_tensors="pt").input_ids

# simply generate a single sequence
generated_ids = model.generate(input_ids, max_length=10)
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

<extra_id_0>result


# Data Manipulation


In [None]:
!unzip /content/T5-Pretraining-20240304T162648Z-002.zip -d /content/Data

Archive:  /content/T5-Pretraining-20240304T162648Z-002.zip
  inflating: /content/Data/T5-Pretraining/pretrain.tsv  


In [None]:
# Data from https://drive.google.com/drive/folders/1OyERePErjkZCV-XHiEIowZ0I8nfBFAAR

dfTrain = pd.read_table('/content/Data/test-linking.tsv', names=['method','line_links'])
dfTrain

# Method shows the relevant java method with line numbers between <>
# The line links relates the <comment></comment> block to the line numbers

Unnamed: 0,method,line_links
0,private void otaPerformActivation ( ) { <1> if...,<11> <12>
1,public static boolean isTachyonEnabled ( Conte...,<28> <29> <30> <31> <32>
2,private static void updateCdmaCallStateOnNewOu...,<2> <3> <5> <6> <7>
3,private void onProfileClicked ( LocalBluetooth...,<11> <13> <14> <15>
4,"public void buildFromCursor ( Cursor cursor , ...",<59> <60> <61> <62> <63>
...,...,...
628,@ Override <1> protected void onDialogClosed (...,<25> <26>
629,private void configInputDisplay ( ) { <1> Text...,<48> <49> <50>
630,private void setSeekBarAndTextLayout ( ) { <1>...,<12> <13>
631,private void updateListenerMap ( ) { <1> Set <...,<34> <35> <36> <37> <38> <39> <40> <41> <42> <...


In [None]:
dfTrain.iloc[545]['method']

'private void handleBootCompleted ( ) { <1> // Some messages may get stuck in the outbox. At this point, they\'re probably irrelevant <2> // to the user, so mark them as failed and notify the user, who can then decide whether to <3> // resend them manually. <4> int numMoved = moveOutboxMessagesToFailedBox ( ) ; <5> if ( numMoved > 0 ) { <6> MessagingNotification . notifySendFailed ( getApplicationContext ( ) , true ) ; <7> } <8> <9> <comment> " Send any queued messages that were waiting from before the reboot. " </comment> <10> sendFirstQueuedMessage ( ) ; <11> <12> // Called off of the UI thread so ok to block. <13> MessagingNotification . blockingUpdateNewMessageIndicator ( <14> this , MessagingNotification . THREAD_ALL , false ) ; <15> } <16>'

In [None]:
dfTrain.iloc[545]['line_links']

'<11>'

In [None]:
dfTest = pd.read_table('/content/Data/test-linking.tsv', names=['method','line_links'])

# Dataset Class

In [2]:
# Downloading
!gdown 10I25gPrv9oE8KK2rW6GwEYdu3reCE9hH

Downloading...
From (original): https://drive.google.com/uc?id=10I25gPrv9oE8KK2rW6GwEYdu3reCE9hH
From (redirected): https://drive.google.com/uc?id=10I25gPrv9oE8KK2rW6GwEYdu3reCE9hH&confirm=t&uuid=c726e3b4-5e61-4b6e-b3fe-1dd0227e4667
To: /content/train_preprocessed.csv
100% 115M/115M [00:01<00:00, 107MB/s] 


In [3]:
class CustomDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.data = df
        #.set_index('index').T.to_dict('list')
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Extract the values as strings. They are Pandas
        src_method = self.data['src_method'][idx]
        src_doc = self.data['src_javadoc'][idx]
        dst_method = self.data['dst_method'][idx]
        dst_doc = self.data['dst_javadoc'][idx]

        # Tokenize
        srcMTokens = tokenizer(src_method, return_tensors="pt", padding='max_length', truncation=True).input_ids
        dstMTokens = tokenizer(dst_method, return_tensors="pt", padding='max_length', truncation=True).input_ids
        srcDTokens = tokenizer(src_doc, return_tensors="pt", padding='max_length', truncation=True).input_ids
        dstDTokens = tokenizer(dst_doc, return_tensors="pt", padding='max_length', truncation=True).input_ids

        input_ids = torch.cat([srcMTokens, dstMTokens, srcDTokens], dim=1)
        labels = dstDTokens

        # Return the method and line_links as a tuple
        return input_ids, labels

In [25]:
dfTest = pd.read_csv('/content/Data/train_preprocessed.csv')
dfTest

Unnamed: 0,sample_id,full_name,commit_id,src_method,dst_method,src_javadoc,dst_javadoc
0,3055905,opennetworkinglab/onos,4ae5aa8b207090bfc9776dd802d4ada04b732d00,public Interface getInterface() {\n ret...,public Interface getInterface() {\n ret...,Get the PIM Interface.,Return the ONOS Interface.
1,2571481,kaaproject/kaa,5c56e1972ed0201beb4f41f1d98a89e1cd03cda6,"@RequestMapping(value = ""CTLSchemas"", method =...","@RequestMapping(value = ""CTLSchema"", method = ...",Gets CTL schemas with the given fully qualifie...,Gets CTL schemas by their shared fully qualifi...
2,5552502,unofficial-openjdk/openjdk,4cb2ac82d248a1cd62bf2272649576e3bbca4951,"private static void writeKtab0(String tab, boo...","public void writeKtab(String tab, boolean appe...",Writes or appends KDC keys into a keytab. See ...,Writes or appends keys into a keytab.\n\nAtten...
3,3246481,plutext/docx4j,96e616b8e5740b47a4d644228713d0c9755810bc,public List<Object> getCustomXmlOrSmartTagOrSd...,public List<Object> getAccOrBarOrBox() {\n ...,Gets the value of the customXmlOrSmartTagOrSdt...,Gets the value of the accOrBarOrBox property.\...
4,3246481,plutext/docx4j,96e616b8e5740b47a4d644228713d0c9755810bc,public List<Object> getCustomXmlOrSmartTagOrSd...,public List<Object> getAccOrBarOrBox() {\n ...,Gets the value of the customXmlOrSmartTagOrSdt...,Gets the value of the accOrBarOrBox property.\...
...,...,...,...,...,...,...,...
85652,463357,apache/calcite,cabdcf44e4aec4d4ceea7f97c8c6fd9e9dbd36b1,"private static <T> List<T> flatList_(T[] t, bo...",private static <T extends Comparable> List<T> ...,"Creates a memory-, CPU- and cache-efficient im...","Creates a memory-, CPU- and cache-efficient co..."
85653,4005378,apache/karaf,d888f462d95de77b8f034ec92b9ebb34c1f5ce94,private Dictionary getProperties() {\n\t\tDict...,private Dictionary getProperties() {\n\t\tDict...,Called by on an\nAdminPermission which was co...,Called by on an AdminPermission which was\nco...
85654,4005385,apache/karaf,d888f462d95de77b8f034ec92b9ebb34c1f5ce94,private static int skipWildCards(List dnChainP...,private static int skipWildCards(List dnChainP...,This method will return an 'index' which point...,This method will return an 'index' which point...
85655,4005386,apache/karaf,d888f462d95de77b8f034ec92b9ebb34c1f5ce94,private static ArrayList parseDNchainPattern(S...,private static List parseDNchainPattern(String...,Parses a distinguished name chain pattern and ...,Parses a distinguished name chain pattern and ...


## Accuracy Metrics

In [None]:
!pip install rouge-score

In [None]:
from rouge_score import rouge_scorer

def calculate_metrics(predictions, labels):

  meteorScore = nltk.translate.meteor_score.meteor_score(predictions, labels)

  bleuScore = nltk.translate.bleu_score.sentence_bleu(predictions, labels)*100

  scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
  rougeScores = scorer.score(labels, predictions)

  return meteorScore, bleuScore, rougeScores


# Training

In [4]:
def train_my_model(train_loader,val_loader,num_epochs,model,tokenizer,criterion,optimizer,target_folder, name_experiment, with_validation=False):
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
  print("Device: {}".format(device))
  model.to(device)

  total_steps = len(train_loader)
  t1 = time.time()

  training_losses, training_accuracies = [], []
  validation_losses, validation_accuracies = [], []
  model.train()
  for epoch in range(num_epochs):
      training_loss, training_acc = [], []
      validation_loss, validation_acc = [], []

      model.train()
      for i, data in enumerate(tqdm(train_loader)):
          method, links = data[0].to(device), data[1].squeeze(1).to(device)
          method = method.squeeze()
          # Forward pass
          outputs = model(input_ids=method, labels=links)
          loss = outputs[0]
          #loss = criterion(outputs.logits, links)
          # Backprop and optimization
          optimizer.zero_grad()
          loss.backward()
          optimizer.step()
          training_loss.append(loss.detach().cpu())
      training_losses.append(np.mean(training_loss))

      if with_validation and epoch%5==0:
        model.eval()
        print("--- validation")
        for i, data in enumerate(tqdm(val_loader)):
          method, links = data[0].to(device), data[1].squeeze(1).to(device)
          method = method.squeeze()
          # Forward pass
          outputs=model(input_ids=method, labels=links)
          loss = outputs[0]
          #loss = criterion(outputs.logits, links)
          validation_loss.append(loss.detach().cpu())
        validation_losses.append(np.mean(validation_loss))
      print('Epoch [{}/{}], Loss: {:.4f}'
                .format(epoch + 1, num_epochs, loss.item()))
  if with_validation:
    dict = {'Training loss': training_losses}
    dict_val = {'Validation loss': validation_losses}
    df = pd.DataFrame(dict_val)
    # saving the dataframe
    df.to_csv(os.path.join(target_folder,name_experiment+'-validation.csv'))
  else:
    dict = {'Training loss': training_losses}
    df = pd.DataFrame(dict)
    # saving the dataframe
    df.to_csv(os.path.join(target_folder,name_experiment+'-train.csv'))
  print("Saving the model in ",target_folder)
  torch.save(model.state_dict(), os.path.join(target_folder,'model-'+name_experiment+'.ckpt'))
  print("######## Training Finished in {} seconds ###########".format(time.time()-t1))

In [5]:
def run_exp(model,tokenizer,config,datafolder,saveFolder):
  print("running experiment ",config['name'],"\n")

  #loading the data
  df = pd.read_csv(os.path.join(datafolder,'/content/Data/train_preprocessed.csv'))
  #dfTest = pd.read_table(os.path.join(datafolder,'test-linking.tsv'), names=['method','line_links'])
  # Manipulating the dataframe
  df = df.dropna()
  df.reset_index(drop=True, inplace=True)
  # df['index'] = df.index
  # df[['index'] + [col for col in df.columns if col != 'index']]


  #Creating dataset and loaders
  dataset = CustomDataset(df, tokenizer)
  train_dataset, val_dataset = torch.utils.data.random_split(dataset, [0.9, 0.1])

  #training parameters
  learning_rate = config['lr']
  batch_size = config['batch_size']

  #criterion and optimizer
  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)

  #dataloader
  train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=False)#define train loader
  val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False)#define val loader

  #setting up the training and folders were experiments will be saved
  target_folder = saveFolder
  name_experiment = config['name']
  val = True
  #training
  epochs = config['num_epochs']
  train_my_model(train_loader,val_loader,epochs,model,tokenizer,criterion,optimizer,target_folder, name_experiment, with_validation=True)

  print("Training for experiment ",config['name']," is done!")

## Testing

In [6]:
tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-small')
model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-small')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

In [None]:
config = {'name': 'Training', 'batch_size': 4, 'num_epochs': 1, 'lr': 0.001}

run_exp(model,tokenizer,config,'/content/Data','/content/ResultsFolder')

running experiment  Training 

Device: cuda:0


 13%|█▎        | 2444/19273 [37:11<4:24:14,  1.06it/s]

In [10]:
# To resest mem
import gc
torch.cuda.empty_cache()
gc.collect()

0

### Testing after training

In [None]:
import re
testNumber = 92

text = dfTrain.iloc[testNumber]['method']

input_ids = tokenizer(text, return_tensors="pt").input_ids
input_ids = input_ids.to("cuda")
# simply generate a single sequence
generated_ids = model.generate(input_ids, max_length=850)

In [None]:
tags_with_numbers = re.findall(r'<\d+>', text)

splits = re.split(r'<\d+>', text)
result = [(s.strip(), t) for s, t in zip(splits, tags_with_numbers)]

for s, t in result:
    print(f"String: {s} - {t}")

print()
print("Prediction", tokenizer.decode(generated_ids[0], skip_special_tokens=True))
print("True Answer: ", dfTrain.iloc[testNumber]['line_links'])

String: private void sendNotifyRespInd ( int status ) throws MmsException , IOException { - <1>
String: // Create the M-NotifyResp.ind - <2>
String: NotifyRespInd notifyRespInd = new NotifyRespInd ( - <3>
String: PduHeaders . CURRENT_MMS_VERSION , - <4>
String: mNotificationInd . getTransactionId ( ) , - <5>
String: status ) ; - <6>
String:  - <7>
String: <comment> " Pack M-NotifyResp.ind and send it " </comment> - <8>
String: if ( MmsConfig . getNotifyWapMMSC ( ) ) { - <9>
String: sendPdu ( new PduComposer ( mContext , notifyRespInd ) . make ( ) , mContentLocation ) ; - <10>
String: } else { - <11>
String: sendPdu ( new PduComposer ( mContext , notifyRespInd ) . make ( ) ) ; - <12>
String: } - <13>
String: } - <14>

Prediction <23> <24>
True Answer:  <9> <10> <11> <12> <13>
