In [1]:
import numpy as np
import pandas as pd
import time
import datetime
import random
import re
import transformers
import torch
import torch.nn as nn

from nltk.corpus import stopwords
from torch.optim import AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler,random_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import RobertaForSequenceClassification, get_linear_schedule_with_warmup, RobertaTokenizer

seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [3]:
#from google.colab import drive
#drive.mount('/content/drive')
#!ls "/content/drive/MyDrive/Colab Notebooks/gb_final_proj"
#df=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/gb_final_proj/gb.csv')
df = pd.read_csv('gb.csv')

In [6]:
print(df.columns)

Index(['proj_spec', 'proj_id', 'func_id', 'namespace', 'func', 'e', 'r', 'w',
       'x', 'i_entry', 'i_exit', 'c', 'p_decision', 'p_functionCall',
       'p_calculation', 'lang', 'eff', 'Unnamed: 17'],
      dtype='object')


## TRAIN TEST SPLIT

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df['func'], df['p_functionCall'],
                                                    test_size=0.2, random_state=13)
display(y_train)

577     1
1024    3
1012    3
1261    1
437     2
       ..
866     0
742     8
74      3
176     1
338     0
Name: p_functionCall, Length: 1052, dtype: int64

In [8]:
df = pd.DataFrame(data={'code': X_train, 'p_functionCall': y_train}).reset_index(drop=True)
test_df = pd.DataFrame(data={'code': X_test, 'p_functionCall': y_test}).reset_index(drop=True)

display(df)
display(test_df)

Unnamed: 0,code,p_functionCall
0,private void Start()\r\n{\r\n displayError ...,1
1,public void Open_L_cement()\r\n{\r\n if (ma...,3
2,public void OpenSeedBundle()\r\n{\r\n marke...,3
3,public void ShowBuildigConstructionSpeedUpAd()...,1
4,public void StartInstantSimulateOnLoadCallback...,2
...,...,...
1047,public int getPremiumSeedAmount()\r\n{\r\n ...,0
1048,public void OnIgnoreButtonClicked()\r\n{\r\n ...,8
1049,public void On_AGCAuth_Authorized(string i...,3
1050,public static int Parse_Int(string txt)\r\n{\r...,1


Unnamed: 0,code,p_functionCall
0,private void CheckStartFireMainLoop(int tick =...,36
1,public static int Hex_to_Dec(string hex)\r\n{\...,1
2,"public bool TryUseTechFood(double cfValue, int...",0
3,public override string ToString()\r\n{\r\n ...,0
4,private void FixedUpdate()\n{\n // --bluepr...,12
...,...,...
258,public void OpenFarmBuildingBar()\r\n{\r\n ...,0
259,public bool IsAssistanMessageQueueEmpty()\r\n{...,0
260,private void Start()\r\n{\r\n saveManager =...,2
261,public void ChangeWhichObjectToSetDetails(Plac...,1


In [9]:
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
tokenizer.model_max_length = 512

In [10]:
print(' Original: ', list(df['code'])[0])

# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(list(df['code'])[0]))

# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(list(df['code'])[0])))

 Original:  private void Start()
{
    displayError = DisplayError.Instance;
    screensManager = ScreenManager.Instance;
    environment = EnvironmentManager.Instance;
    resourceManager = RawResource.Instance;
    utilitiesManager = Utilities.Instance;
    buildingInfo = BuildingInfo.Instance;
    leaderManager = Leader.Instance;
    saveManager = SaveManager.Instance;
    isolatedSimulator = IsolatedSimulator.Instance;
    achievementsManager = AchievementsManager.Instance;
    playerAuthManager = PlayerAuthManager.Instance;
    marketplace = Marketplace.Instance;
    technologyManager = TechnologyManager.Instance;
    leaderboardManager = LeaderboardManager.Instance;

    grid = GridBuildingSystem.Instance.GetGrid();

    // object pooling init for UI anim
    Init();
}
Tokenized:  ['private', 'Ġvoid', 'ĠStart', '()', 'č', 'Ċ', '{', 'č', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġdisplay', 'Error', 'Ġ=', 'ĠDisplay', 'Error', '.', 'Instance', ';', 'č', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġscreens

In [11]:
max_len = 512
ids_longer = []

for i, code in enumerate(list(df['code'])):

    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(code, add_special_tokens=False)
    if len(input_ids) > max_len:
        ids_longer.append(i)
        
print(len(ids_longer), "out of", len(df), "train samples are removed.")
df = df.drop(ids_longer)
df = df.reset_index(drop=True)

ids_longer = []
for i, code in enumerate(list(test_df['code'])):

    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(code, add_special_tokens=False)
    if len(input_ids) > max_len:
        ids_longer.append(i)

print(len(ids_longer), "out of", len(test_df), "test samples are removed.")
test_df = test_df.drop(ids_longer)
test_df = test_df.reset_index(drop=True)

labels = torch.from_numpy(df['p_functionCall'].values)
test_labels = torch.from_numpy(test_df['p_functionCall'].values)

Token indices sequence length is longer than the specified maximum sequence length for this model (1044 > 512). Running this sequence through the model will result in indexing errors


137 out of 1052 train samples are removed.
38 out of 263 test samples are removed.


In [12]:
input_ids = []
attention_masks = []

# For every tweet...
for code in list(df['code']):
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        code,               # Sentence to encode.
                        add_special_tokens = False, # Add '[CLS]' and '[SEP]'
                        truncation=True,
                        max_length = 512,           # Pad & truncate all sentences.
                        padding = 'max_length',
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )

    # Add the encoded sentence to the list.
    input_ids.append(encoded_dict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = labels.clone().detach().float()

# Print sentence 0, now as a list of IDs.
print('Original:\n', list(df['code'])[5])
print('Token IDs:\n', input_ids[5])

Original:
 public static Vector3 GetMouseWorldPosition_OffsetForFarmDrag(Vector3Int offset) => Instance.GetMouseWorldPosition_Instance_OffSetted_ForDrag(offset);
Token IDs:
 tensor([15110, 25156, 40419,   246,  2315, 48394, 10988, 46884,  1215, 48772,
         2709, 36587, 45608,  1640, 48417,   246, 22886,  6147,    43, 46161,
         8857,  2389,     4, 14181, 48394, 10988, 46884,  1215, 49483,  1215,
        22985, 28512,  5357,  1215,  2709, 45608,  1640, 48025,  4397,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,    

In [13]:
print(input_ids.shape)
print(attention_masks.shape)
print(labels.shape)

torch.Size([915, 512])
torch.Size([915, 512])
torch.Size([915])


In [14]:
# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels)

# Calculate the number of samples to include in each set.
train_size = int(0.8 * len(dataset))
val_size = len(dataset)  - train_size

# Divide the dataset by randomly selecting samples.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

  732 training samples
  183 validation samples


In [15]:
# The DataLoader needs to know our batch size for training, so we specify it
# here. For fine-tuning BERT on a specific task, the authors recommend a batch
# size of 16 or 32.
batch_size = 64

# Create the DataLoaders for our training and validation sets.
# We'll take training samples in random order.
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

In [16]:
# Load the pretrained codebert model with a single linear classification layer on top.
model = RobertaForSequenceClassification.from_pretrained(
    "microsoft/codebert-base", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 1, # The number of output labels=2 for binary classification.
                    # You can increase this for multi-class tasks.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

model = model.to(device)

Some weights of the model checkpoint at microsoft/codebert-base were not used when initializing RobertaForSequenceClassification: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be 

In [17]:
optimizer = AdamW(model.parameters(),
                  lr = 1e-4, # args.learning_rate - default is 5e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8
                )

In [18]:
# Number of training epochs. The BERT authors recommend between 2 and 4.
# We chose to run for 4, but we'll see later that this may be over-fitting the training data.
NUM_EPOCHS = 4

# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples).
TOTAL_STEPS = len(train_dataloader) * NUM_EPOCHS

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = TOTAL_STEPS)

In [19]:
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [20]:
import math

def get_metrics(preds, trues):
        res = [(np.abs(preds[i]-trues[i]) / trues[i]) for i in range(len(preds))]
        ses = [math.pow(preds[i]-trues[i], 2) for i in range(len(preds))]
        aes = [np.abs(preds[i]-trues[i]) for i in range(len(preds))]

        resUnder30 = [re for re in res if re < 0.3]
        pred30 = len(resUnder30)/len(res)
        mre = np.mean(res)
        mse = np.mean(ses)
        mae = np.mean(aes)
        return {'pred30': pred30, 'mre': mre, 'mse': mse, 'mae': mae}

## RUN TRAIN

In [21]:
from tqdm.notebook import tqdm

pb_train = tqdm(range(TOTAL_STEPS))

t_start = time.time()

for epoch in range(NUM_EPOCHS):

    # ========================================
    #               Training
    # ========================================

    epoch_training_loss = 0
    model.train()
    for batch in train_dataloader:
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)
        output = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = output.loss.item()
        epoch_training_loss += loss

        logits = output.logits.to('cpu').detach().numpy()
        trues = batch[2]
        preds = logits.flatten().tolist()

        output.loss.backward()
        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        
        pb_train.set_postfix({"Loss": loss})
        pb_train.update(1)
    
    avg_training_loss = epoch_training_loss/len(train_dataloader)
    print(f"============ EPOCH {epoch} ============")
    print(f"Training Loss:       {avg_training_loss:.2f}")

    # ========================================
    #               Validation
    # ========================================
    # Measure performance at the end of each epoch

    epoch_eval_preds, epoch_eval_trues = [], []
    model.eval()
    for batch in validation_dataloader:
        input_ids = batch[0].to(device)
        input_mask = batch[1].to(device)
        labels = batch[2].to(device)
        with torch.no_grad():
            output= model(input_ids, token_type_ids=None, attention_mask=input_mask, labels=labels)
        loss = output.loss.item()

        logits = output.logits.to('cpu').detach().numpy()
        trues = batch[2]
        preds = logits.flatten().tolist()

        epoch_eval_preds.extend(preds)
        epoch_eval_trues.extend(trues)

    eval_metrics = get_metrics(epoch_eval_preds, epoch_eval_trues)
    print(f"Evaluation PRED(30): {eval_metrics['pred30']:.2f}")
    print(f"Evaluation MRE:      {eval_metrics['mre']:.2f}")
    print(f"Evaluation MSE:      {eval_metrics['mse']:.2f}")
    print(f"Evaluation MAE:      {eval_metrics['mae']:.2f}")

print(f"Total time {format_time(time.time() - t_start)} (h:mm:ss)")

  0%|          | 0/48 [00:00<?, ?it/s]

Training Loss:       6.28
Evaluation PRED(30): 0.47
Evaluation MRE:      inf
Evaluation MSE:      4.35
Evaluation MAE:      1.05
Training Loss:       3.05
Evaluation PRED(30): 0.51
Evaluation MRE:      inf
Evaluation MSE:      3.03
Evaluation MAE:      0.64
Training Loss:       3.28
Evaluation PRED(30): 0.40
Evaluation MRE:      inf
Evaluation MSE:      3.33
Evaluation MAE:      0.77
Training Loss:       1.85
Evaluation PRED(30): 0.55
Evaluation MRE:      inf
Evaluation MSE:      2.68
Evaluation MAE:      0.54
Total time 0:00:56 (h:mm:ss)
