In [21]:
import os, shutil, argparse, re, json, random, copy, logging
from datetime import datetime
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns
import numpy as np
import pandas as pd
from numpy import argmax, save, load, sum, sqrt
from numpy.linalg import norm

from transformers import (RobertaConfig, RobertaModel, RobertaTokenizer)
from transformers import WEIGHTS_NAME, get_linear_schedule_with_warmup


import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss, MSELoss
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler
from transformers import WEIGHTS_NAME, get_linear_schedule_with_warmup
from torch.optim import AdamW 

from collections import Counter
from tqdm.notebook import tqdm
# import helpers_data_process as dproc

In [22]:
def create_directory(directory):
    if not os.path.exists(directory):
        os.makedirs(directory, exist_ok=True)

In [23]:
def set_seed(seed=42):
    random.seed(seed)
    os.environ['PYHTONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [24]:
# model
class RobertaClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size*2, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.out_proj = nn.Linear(config.hidden_size, 2)

    def forward(self, features, **kwargs):
        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
        x = x.reshape(-1,x.size(-1)*2)
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x
        
class Model(nn.Module):   
    def __init__(self, encoder,config,tokenizer,args):
        super(Model, self).__init__()
        self.encoder = encoder
        self.config=config
        self.tokenizer=tokenizer
        self.classifier=RobertaClassificationHead(config)
        self.args=args
    
    def forward(self, input_ids=None,labels=None): 
        input_ids=input_ids.view(-1,self.args.block_size)
        outputs = self.encoder(input_ids= input_ids,attention_mask=input_ids.ne(1))[0]
        #print('here', outputs)
        logits=self.classifier(outputs)
        prob=F.softmax(logits, dim=-1)
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits, labels)
            return loss,prob
        else:
            return prob

In [25]:

class InputFeatures(object):
    """A single training/test features for a example."""
    def __init__(self, input_tokens, input_ids, id1,id2, label):
        self.input_tokens = input_tokens
        self.input_ids = input_ids
        self.label = label
        self.id1 = id1
        self.id2 = id2
        

def convert_examples_to_features(args, x): 
    
    code_1 = x[0]
    code_2 = x[1]
    label = x[2]
    tokenizer=x[3]
    
    id1 = x[4]
    id2=x[5]

    code1_tokens=tokenizer.tokenize(code_1)    
    code1_tokens=code1_tokens[:args.block_size-2]
    code1_tokens =[tokenizer.cls_token]+code1_tokens+[tokenizer.sep_token]
    code1_ids=tokenizer.convert_tokens_to_ids(code1_tokens)
    padding_length = args.block_size - len(code1_ids)
    code1_ids+=[tokenizer.pad_token_id]*padding_length
    
    code2_tokens=tokenizer.tokenize(code_2)
    code2_tokens=code2_tokens[:args.block_size-2]
    code2_tokens =[tokenizer.cls_token]+code2_tokens+[tokenizer.sep_token]      
    code2_ids=tokenizer.convert_tokens_to_ids(code2_tokens)
    padding_length = args.block_size - len(code2_ids)
    code2_ids+=[tokenizer.pad_token_id]*padding_length
    source_tokens=code1_tokens+code2_tokens
    source_ids=code1_ids+code2_ids
    #input_tokens, input_ids, label, uid_func, uid_file)
    return InputFeatures(source_tokens, source_ids, id1, id2, int(label)) #input_tokens, input_ids, id1,id2, label


class TextDataset(Dataset):
    def __init__(self, args, tokenizer, file_path=None):
       
        self.examples = []
        codes={}
        with open(args.codes) as f:
            for line in f:
                line=line.strip()
                js=json.loads(line)
                codes[js['idx']]=js['func']

        # Read the CSV file with specified column names
        f = pd.read_csv(file_path, names=["id1", "id2", "label"])
        data = []
        for ind, row in f.iterrows():
            id1 = row['id1']
            id2 = row['id2']
            label = row['label']
            data.append((codes[id1], codes[id2], label, tokenizer, id1, id2))
        print(len(data))

        if 'valid' in file_path:
            data = random.sample(data,int(len(data)*0.1))
        #convert example to input features   ddd
        for x in data:
            try:
                self.examples.append(convert_examples_to_features(args, x))
            except:
                continue
        
                
    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):       
        return torch.tensor(self.examples[i].input_ids),torch.tensor(self.examples[i].label), torch.tensor(self.examples[i].id1),torch.tensor(self.examples[i].id2)
        
def create_dataset(args, tokenizer, fp):
    dx_dataset = TextDataset(args, tokenizer, fp)
    dx_sampler = RandomSampler(dx_dataset) 
    dx_dataloader = DataLoader(dx_dataset, sampler=dx_sampler, batch_size=args.batch_size)
    return dx_dataloader



In [26]:
def evaluation_metrics(y_trues, y_preds):
    accuracy= accuracy_score(y_trues, y_preds)
    f1 = f1_score(y_trues, y_preds)
    precision = precision_score(y_trues, y_preds)
    recall = recall_score(y_trues, y_preds)
    result = { 
            "precision": round(precision,4),
            "recall": round(recall,4),
            "f1": round(f1,4),
            "accuracy": round(accuracy,4)
            }
    return result
    
def model_evaluation(args, model, dx_dataloader):
    y_preds = []
    y_trues = []
    for batch in dx_dataloader:
        loss, logits_= model(batch[0].to(args.device), batch[1].to(args.device))
        y_preds.append(logits_.detach().cpu().numpy())
        y_trues.append(batch[1].detach().cpu().numpy())
    # best_threshold=0.5
    y_preds=np.concatenate(y_preds,0)
    y_trues=np.concatenate(y_trues,0)
    y_preds= y_preds.argmax(-1)  #y_preds[:,1]>best_threshold
    return evaluation_metrics(y_trues, y_preds)

In [27]:
def train_loop(args, model,tokenizer):
    train_dataloader = create_dataset(args, tokenizer, args.train_data_file)
    max_steps=args.epochs*len( train_dataloader)
    save_steps=len(train_dataloader)//10
    warmup_steps=max_steps//5
    if save_steps==0:
        save_steps = 2
    gradient_accumulation_steps = 1
    max_grad_norm=1

    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],'weight_decay': 0.0},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    optimizer = AdamW (optimizer_grouped_parameters, lr=2e-5, eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=max_steps)
    global_step=0
    tr_loss, logging_loss,avg_loss,tr_nb,tr_num,train_loss = 0.0, 0.0,0.0,0,0,0
    best_f1=-1

    #training loop
    model.zero_grad()
    for idx in range(args.epochs): 
        bar = tqdm(train_dataloader,total=len(train_dataloader))
        tr_num=0
        train_loss=0
        for step, batch in enumerate(bar):
            inputs = batch[0].to(args.device)
            labels = batch[1].to(args.device)
        
            model.train()
            loss,logits = model(inputs, labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            tr_loss += loss.item()
            tr_num+=1
            train_loss+=loss.item()
            if avg_loss==0:
                avg_loss=tr_loss               
            avg_loss=round(train_loss/tr_num,5)
            bar.set_description("epoch {} loss {}".format(idx,avg_loss))
            if (step + 1) % gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()  
                global_step += 1
                output_flag=True
                avg_loss=round(np.exp((tr_loss - logging_loss) /(global_step- tr_nb)),4)

                if global_step % save_steps == 0: 
                    # Save model checkpoint
                    valid_data = create_dataset(args, tokenizer, args.valid_data_file)
                    results = model_evaluation(args, model, valid_data)
                    eval_f1=results['f1']
                    if eval_f1>best_f1:
                        
                        best_f1=eval_f1 #results['eval_f1']                             
                        model_to_save = model.module if hasattr(model,'module') else model
                        output_dir = f"{args.output_dir}/codebert_best.bin"
                        print("saving model checkpoint to "+ output_dir)
                        torch.save(model_to_save.state_dict(), output_dir)
                        

In [28]:
def get_model(args):
    config = RobertaConfig.from_pretrained(args.pretrained_model_name)
    tokenizer = RobertaTokenizer.from_pretrained(args.pretrained_model_name)
    model = RobertaModel.from_pretrained(args.pretrained_model_name,config=config)   
    model = Model(model,config,tokenizer, args)
    model.to(args.device)
    return model, tokenizer


In [29]:
def run(args):
    print(args)
    set_seed(args.seed)
    create_directory(args.output_dir)
    #results_dict
    results_dict = {}
    # get model tokenizer
    model, tokenizer = get_model(args)
    
    test_dataset= create_dataset(args, tokenizer, args.test_data_file)
    
    #BASELINE MODEL EVAL
    print("evaluate on test data on baseline model\n")
    results_dict['baseline'] = model_evaluation(args, model, test_dataset) 
    print("baseline: ", results_dict['baseline'])
    #TRAIN AND EVAL
    print("fine-tuning model")
    train_loop(args, model,tokenizer)
    #eval
    print("evaluate on test data using trained model\n")
    results_dict['fine_tuned'] = model_evaluation(args, model, test_dataset) 
    print("trained model: ",results_dict['fine_tuned'])
    
    #BEST MODEL EVAL
    model, tokenizer = get_model(args) #get new model to load saved model
    model_path = f"{args.output_dir}/codebert_best.bin"
    model.load_state_dict(torch.load(model_path, map_location=lambda storage, loc:storage), strict=False)
    model.to(args.device)
    
    print("evaluate on test data using best model\n")
    results_dict['best_model'] = model_evaluation(args, model, test_dataset) 
    print("best model: ", results_dict['best_model'])
    
    # #BEST MODEL ON GROUND TRUTH DATA
    # # gt_data = create_dataset(args, tokenizer, args.gt_data_file)
    # # results_dict['best_model_gt'] = model_evaluation_and_save_preds(args, model, gt_data, "best_model_gt") 
    # # print("best_model_gt model: ", results_dict['best_model_gt'])
    return results_dict
    

In [30]:
if __name__=="__main__":
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--device',default= "cuda" if torch.cuda.is_available() else "cpu")
    parser.add_argument('--epochs', default=1)
    parser.add_argument('--block_size', default=512)
    parser.add_argument('--batch_size', default=2) #16 for A6000
    parser.add_argument('--pretrained_model_name', default='microsoft/codebert-base')
    parser.add_argument('--data', default='/student/egk204/projects/clone-type-iv/data') 
    parser.add_argument('--seed', default= 42)
    parser.add_argument('--do_train', default=False)
    parser.add_argument('--test_saved_models', default=False)

    parser.add_argument('--benchmark', default="gptcb") #benchmark
    # parser.add_argument('--train_data_file', default="")
    # parser.add_argument('--valid_data_file', default="")
    # parser.add_argument('--test_data_file', default="")
    args=parser.parse_args([])
    # create_directory("saved_models/checkpoint-best-f1")
    
    #modify required arguements: system specific
    #
    args.system_storage = f"./"
    args.output_dir =  f"saved_models/checkpoint-best-f1/{args.benchmark}"
    args.dataset_path = f"{args.data}/{args.benchmark}"
    
    #for this task these files are in specific directory, no need to pass from terminal
    args.codes = f"{args.dataset_path}/data.jsonl"
    args.train_data_file = f"{args.dataset_path}/train.csv"
    args.valid_data_file = f"{args.dataset_path}/valid.csv"
    args.test_data_file = f"{args.dataset_path}/test.csv"
    
    # # get model tokenizer
    # model, tokenizer = get_model(args)
    
    # test_dataset= create_dataset(args, tokenizer, args.test_data_file)
    
    
    #baseline, train, saved results
    all_results = run(args)
    # print(all_results)
    all_results = pd.DataFrame.from_dict(all_results, orient="index")
    all_results.to_csv(f"{args.system_storage}/codebert_results_{args.benchmark}.csv")
    

Namespace(device='cuda', epochs=1, block_size=512, batch_size=2, pretrained_model_name='microsoft/codebert-base', data='/student/egk204/projects/clone-type-iv/data', seed=42, do_train=False, test_saved_models=False, benchmark='gptcb', system_storage='./', output_dir='saved_models/checkpoint-best-f1/gptcb', dataset_path='/student/egk204/projects/clone-type-iv/data/gptcb', codes='/student/egk204/projects/clone-type-iv/data/gptcb/data.jsonl', train_data_file='/student/egk204/projects/clone-type-iv/data/gptcb/train.csv', valid_data_file='/student/egk204/projects/clone-type-iv/data/gptcb/valid.csv', test_data_file='/student/egk204/projects/clone-type-iv/data/gptcb/test.csv')
1115
evaluate on test data on baseline model

baseline:  {'precision': 0.5495, 'recall': 0.1091, 'f1': 0.1821, 'accuracy': 0.5085}
fine-tuning model
8912


  0%|          | 0/4456 [00:00<?, ?it/s]

1114
saving model checkpoint to saved_models/checkpoint-best-f1/gptcb/codebert_best.bin
1114
saving model checkpoint to saved_models/checkpoint-best-f1/gptcb/codebert_best.bin
1114
saving model checkpoint to saved_models/checkpoint-best-f1/gptcb/codebert_best.bin
1114
1114
saving model checkpoint to saved_models/checkpoint-best-f1/gptcb/codebert_best.bin
1114
1114
saving model checkpoint to saved_models/checkpoint-best-f1/gptcb/codebert_best.bin
1114
1114
1114
evaluate on test data using trained model

trained model:  {'precision': 0.9946, 'recall': 0.9857, 'f1': 0.9901, 'accuracy': 0.9901}
evaluate on test data using best model

best model:  {'precision': 0.9893, 'recall': 0.9893, 'f1': 0.9893, 'accuracy': 0.9892}


AttributeError: 'DataLoader' object has no attribute 'values'