We use the GPT-2 model from the HuggingFace Transformers library. <br>
We referred to the following guide: [🎻Fine-tune Transformers in PyTorch using 🤗 Transformers](https://gmihaila.medium.com/fine-tune-transformers-in-pytorch-using-transformers-57b40450635).
We propose using large pre-trained language models to predict the usefulness of code comments.
The main advantage of these models is that they need little feature engineering. They are pretrained on large code bases and text corpora and can readily produce high-quality features.

BERT/GPT-based models also come with subword/SentencePiece tokenizers. These tokenizers can easily handle out-of-vocabulary (OOV) tokens. Also, since we are working with code snippets, we didn't do any pre-processing (textual pre-processing won't be useful for code-based language).

### 1. Model and Tokenizer

🤗 Transformers already provides a GPT-2 model prepared for classification, `GPT2ForSequenceClassification`. Very thankful! <br>
Here's link: https://huggingface.co/transformers/model_doc/gpt2.html#transformers.GPT2ForSequenceClassification

In [None]:
from transformers import set_seed, GPT2Config, GPT2Tokenizer, GPT2ForSequenceClassification

# Seed every RNG before the model is built so that any random initialisation
# done while loading is reproducible across runs.
set_seed(731)

# Both the config and the weights come from the same pretrained checkpoint.
gpt2_checkpoint = 'gpt2'

# Two output labels: this is a binary classification task.
model_config = GPT2Config.from_pretrained(gpt2_checkpoint, num_labels=2)
model = GPT2ForSequenceClassification.from_pretrained(gpt2_checkpoint, config=model_config)

tokenizer = GPT2Tokenizer.from_pretrained(gpt2_checkpoint)
# Pad on the left (flagged "Very Important" by the original author) and reuse
# the end-of-sequence token as the padding token.
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

# Keep the embedding matrix and the model's padding id in sync with the tokenizer.
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = model.config.eos_token_id

In [None]:
from transformers import set_seed, GPT2Config, GPT2Tokenizer, GPT2ForSequenceClassification

# Re-seed with the same value used in the model-setup cell. The original
# author reports train_loss: 0.773, train_acc: 0.567 in epoch 0 with this seed.
set_seed(731) # My Birthday!, you should get train_loss: 0.773, train_acc: 0.567 in epoch 0.


In [1]:
import pandas as pd
#pd.read_csv("../input/irecdata/IRSE_Test_Data_preprocessed.csv")

In [None]:
# Directory containing the IRSE CSV files. Keep the trailing slash: a later
# cell builds a file path with plain string concatenation (dataPath + name).
dataPath = "../input/irecdata/"

### 2. Build Dataset

In [None]:
import os
import pandas as pd
from torch.utils.data import Dataset

class TweetDataset(Dataset):
    """Map-style dataset over the IRSE comment/code CSV files.

    Each item is a dict with a ``'text'`` prompt built from the ``Comments``
    and ``Surrounding Code Context`` columns, and an integer ``'label'``
    (0 for the class ``'Not Useful'``, 1 for anything else).
    """

    def __init__(self, train=True):
        """Load the training CSV when ``train`` is true, else the test CSV.

        The file is looked up inside the module-level ``dataPath`` directory.
        """
        super().__init__()
        self.train = train
        if train:
            csv_name = 'IRSE_Training_Data_preprocessed.csv'
        else:
            csv_name = 'IRSE_Test_Data_preprocessed.csv'
        self.data = pd.read_csv(os.path.join(dataPath, csv_name))

    def __len__(self):
        """Return the number of rows in the loaded CSV."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``{'text': ..., 'label': ...}`` for the row at ``index``."""
        row = self.data.iloc[index]
        # str.join (like ``+``) raises TypeError if a column value is not a string.
        prompt = "".join(("Comment: ", row['Comments'],
                          " code: ", row['Surrounding Code Context']))
        return {'text': prompt, 'label': int(row['Class'] != 'Not Useful')}
        

# Labelled training data (``train`` defaults to True) and the held-out test file.
train_dataset = TweetDataset()
test_dataset = TweetDataset(train=False)

In [None]:
# Eyeball the first ten prompt strings the model will receive.
for sample_idx in range(10):
    print(train_dataset[sample_idx]['text'])

### 3. Data Collator

In [None]:
class Gpt2ClassificationCollator(object):
    """Collate ``{'text', 'label'}`` samples into one tokenized batch.

    The tokenizer is called with padding and truncation enabled and
    ``max_length=max_seq_len``; the labels are attached to its result under
    the ``'labels'`` key as a tensor.
    """

    def __init__(self, tokenizer, max_seq_len=None):
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def __call__(self, sequences):
        """Tokenize the samples' texts and return the batch with labels added."""
        texts, labels = [], []
        for sample in sequences:
            texts.append(sample['text'])
            labels.append(int(sample['label']))

        batch = self.tokenizer(
            text=texts,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=self.max_seq_len,
        )
        batch.update(labels=torch.tensor(labels))
        return batch

# Collator used by the DataLoaders below; max_seq_len=60 is forwarded to the
# tokenizer as max_length (with truncation enabled) for every batch.
gpt2classificationcollator = Gpt2ClassificationCollator(tokenizer=tokenizer,
                                                        max_seq_len=60)

### 4. DataLoader

In [None]:
from torch.utils.data import DataLoader, random_split

# Hold out 20% of the labelled data for validation.
# NOTE: `train_dataset` is rebound to the 80% subset, so re-running this cell
# without re-creating the dataset would split the subset again.
n_labelled = len(train_dataset)
train_size = int(n_labelled * 0.8)
val_size = n_labelled - train_size
train_dataset, val_dataset = random_split(train_dataset, [train_size, val_size])

# Settings shared by all three loaders.
_loader_options = {'batch_size': 32, 'collate_fn': gpt2classificationcollator}

# Only the training loader shuffles; validation/test keep the dataset order.
train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, **_loader_options)
val_dataloader = DataLoader(dataset=val_dataset, shuffle=False, **_loader_options)
test_dataloader = DataLoader(dataset=test_dataset, shuffle=False, **_loader_options)

### 5. Optimizer & Lr Scheduler

In [None]:
from transformers import AdamW, get_cosine_schedule_with_warmup

total_epochs = 10

# (name, parameter) pairs of the model being fine-tuned.
param_optimizer = list(model.named_parameters())

# Name fragments of parameters that must NOT receive weight decay: biases and
# layer-norm weights. GPT-2 in 🤗 Transformers names its layer norms `ln_1`,
# `ln_2` and `ln_f`, so the BERT-style `LayerNorm.*` fragments alone never
# match a GPT-2 layer-norm weight; the GPT-2 names are listed explicitly.
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight',
            'ln_1.weight', 'ln_2.weight', 'ln_f.weight']
optimizer_grouped_parameters = [
    # Everything else is regularised with weight decay 0.01 ...
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    # ... while biases and layer-norm weights are left undecayed.
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters,
                  lr=1e-5,
                  eps=1e-8)

# The scheduler is stepped once per batch, so the schedule length is
# (batches per epoch) x (epochs); the first 10% of the steps are warm-up.
num_train_steps = len(train_dataloader) * total_epochs
num_warmup_steps = int(num_train_steps * 0.1)

lr_scheduler = get_cosine_schedule_with_warmup(optimizer,
                                               num_warmup_steps=num_warmup_steps,
                                               num_training_steps=num_train_steps)

### 6. Train & Validation

In [None]:
import torch

def train(dataloader, optimizer, scheduler, device_):
    """Run one training pass over ``dataloader`` on the module-level ``model``.

    For every batch: forward pass, backward pass, gradient-norm clipping at
    1.0, then one optimizer step and one scheduler step.

    Args:
        dataloader: iterable of batches (mappings of tensors incl. ``'labels'``).
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per batch.
        device_: device the batch tensors are moved to.

    Returns:
        Tuple ``(true_labels, prediction_labels, total_loss)``: flat lists of
        gold labels and argmax predictions, plus one loss value per batch.
    """
    global model
    model.train()

    gold, predicted, batch_losses = [], [], []

    for batch in dataloader:
        # Record the gold labels while the tensors are still on the CPU.
        gold.extend(batch['labels'].numpy().flatten().tolist())
        on_device = {
            name: tensor.type(torch.long).to(device_)
            for name, tensor in batch.items()
        }

        loss, logits = model(**on_device)[:2]
        batch_losses.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        # Clip the global gradient norm to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        batch_predictions = logits.detach().cpu().numpy().argmax(axis=-1)
        predicted.extend(batch_predictions.flatten().tolist())

    return gold, predicted, batch_losses

def validation(dataloader, device_):
    """Evaluate the module-level ``model`` on ``dataloader`` without gradients.

    Args:
        dataloader: iterable of batches (mappings of tensors incl. ``'labels'``).
        device_: device the batch tensors are moved to.

    Returns:
        Tuple ``(true_labels, prediction_labels, total_loss)``: flat lists of
        gold labels and argmax predictions, plus one loss value per batch.
    """
    global model
    model.eval()

    gold, predicted, batch_losses = [], [], []

    with torch.no_grad():
        for batch in dataloader:
            gold.extend(batch['labels'].numpy().flatten().tolist())
            on_device = {
                name: tensor.type(torch.long).to(device_)
                for name, tensor in batch.items()
            }

            loss, logits = model(**on_device)[:2]
            batch_losses.append(loss.item())

            batch_predictions = logits.detach().cpu().numpy().argmax(axis=-1)
            predicted.extend(batch_predictions.flatten().tolist())

    return gold, predicted, batch_losses

### 7. Run!

In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Losses are stored per batch (all epochs concatenated); accuracies per epoch.
all_loss = dict(train_loss=[], val_loss=[])
all_acc = dict(train_acc=[], val_acc=[])

for epoch in range(total_epochs):
    # One pass over the training split, then one over the validation split.
    y, y_pred, train_loss = train(train_dataloader, optimizer, lr_scheduler, device)
    train_acc = accuracy_score(y, y_pred)

    y, y_pred, val_loss = validation(val_dataloader, device)
    val_acc = accuracy_score(y, y_pred)

    all_loss['train_loss'].extend(train_loss)
    all_loss['val_loss'].extend(val_loss)
    all_acc['train_acc'].append(train_acc)
    all_acc['val_acc'].append(val_acc)

    # The printed losses are the mean of the per-batch losses of this epoch.
    print(f'Epoch: {epoch}, train_loss: {torch.tensor(train_loss).mean():.3f}, train_acc: {train_acc:.3f}, val_loss: {torch.tensor(val_loss).mean():.3f}, val_acc: {val_acc:.3f}')

### 7.1. Check Loss with Graph

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

fig = plt.figure(figsize=(20, 20))

# Top half: two stacked panels with the per-batch training and validation loss.
a = fig.add_subplot(4, 1, 1)
a.plot(all_loss['train_loss'])

b = fig.add_subplot(4, 1, 2)
b.plot(all_loss['val_loss'])

# Bottom half: per-epoch accuracy for both splits on one axis.
c = fig.add_subplot(2, 1, 2)
for acc_key in ('train_acc', 'val_acc'):
    c.plot(all_acc[acc_key])
c.set(xlabel='epoch', ylabel='accuracy')
c.legend(['train', 'val'])

# Presumably here so the cell does not end on an expression whose value the
# notebook would echo.
pass

### 8. Run on Test Data

In [None]:
# Gold labels, predictions and per-batch losses on the validation split.
y, y_pred, val_loss = validation(val_dataloader, device)

In [None]:
from sklearn.metrics import classification_report

# Index 0 / 1 matches the dataset's label encoding ('Not Useful' -> 0, else 1).
target_names = ['Not Useful', 'Useful']
report = classification_report(y, y_pred, target_names=target_names)
print(report)

In [None]:
# Same evaluation on the test loader; note that `val_loss` now holds the
# per-batch losses of the test set, not of the validation split.
y, y_pred, val_loss = validation(test_dataloader, device)

In [None]:
from sklearn.metrics import classification_report

# Index 0 / 1 matches the dataset's label encoding ('Not Useful' -> 0, else 1).
target_names = ['Not Useful', 'Useful']
report = classification_report(y, y_pred, target_names=target_names)
print(report)

In [None]:
# Predict the test set once more; only the predictions are needed here.
_, y_pred, _ = validation(test_dataloader, device)

# The test loader does not shuffle, so predictions line up with the CSV rows.
test_frame = pd.read_csv(dataPath + 'IRSE_Test_Data_preprocessed.csv')
submit = test_frame.assign(target=y_pred)

submit.to_csv('IRSE_Test_Data_preprocessed_GPT.csv', index=False)

In [None]:
# Persist only the weights (state_dict); reloading requires rebuilding the
# model object first and then calling load_state_dict on it.
torch.save(model.state_dict(), "gpt-2_model")

## Bert based model for Baseline

In [None]:
from transformers import BertTokenizer, BertForNextSentencePrediction
import torch


In [None]:
# From here on `tokenizer` and `model` refer to BERT instead of GPT-2.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
# NOTE(review): this next-sentence-prediction model is rebound by the later
# BertForSequenceClassification cell before anything in this file uses it.
model = BertForNextSentencePrediction.from_pretrained(bert_checkpoint)


In [None]:
import os
import pandas as pd
from torch.utils.data import Dataset

class TweetDataset(Dataset):
    """Map-style dataset over the IRSE comment/code CSV files (BERT variant).

    Each item is a dict with a ``'text'`` string of the form
    ``"<comment> [SEP] <code context>"`` and an integer ``'label'``
    (0 for the class ``'Not Useful'``, 1 for anything else).
    """

    def __init__(self, train=True):
        """Load the training CSV when ``train`` is true, else the test CSV.

        The file is looked up inside the module-level ``dataPath`` directory.
        """
        super().__init__()
        self.train = train
        self.data = pd.read_csv(os.path.join(dataPath, 'IRSE_Training_Data_preprocessed.csv' if train else 'IRSE_Test_Data_preprocessed.csv'))

    def __len__(self):
        """Return the number of rows in the loaded CSV."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``{'text': ..., 'label': ...}`` for the row at ``index``."""
        record = self.data.iloc[index]
        # No manual "[CLS] " prefix: BertTokenizer adds [CLS] (and the final
        # [SEP]) itself, so writing it here made every input start with two
        # [CLS] tokens. The inner " [SEP] " still separates comment and code.
        text = record['Comments'] + " [SEP] " + record['Surrounding Code Context']
        label = 0 if record['Class'] == 'Not Useful' else 1
        return {'text': text, 'label': label}

# Fresh, un-split datasets for the BERT run (``train`` defaults to True).
train_dataset = TweetDataset()
test_dataset = TweetDataset(train=False)

In [None]:
# Leftover draft of a train-only return branch, kept as a bare string literal;
# the cell only evaluates the string and none of the text inside it runs.
'''
if self.train:
            return {'text': text, 'label': label}

'''

In [None]:
# Eyeball the first ten input strings the model will receive.
for sample_idx in range(10):
    print(train_dataset[sample_idx]['text'])

In [None]:
class BertClassificationCollator(object):
    """Collate ``{'text', 'label'}`` samples into one tokenized batch.

    The tokenizer is called with padding and truncation enabled and
    ``max_length=max_seq_len``; the labels are attached to its result under
    the ``'labels'`` key as a tensor.
    """

    def __init__(self, tokenizer, max_seq_len=None):
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def __call__(self, sequences):
        """Tokenize the samples' texts and return the batch with labels added."""
        texts, labels = [], []
        for sample in sequences:
            texts.append(sample['text'])
            labels.append(int(sample['label']))

        batch = self.tokenizer(
            text=texts,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=self.max_seq_len,
        )
        batch.update(labels=torch.tensor(labels))
        return batch

# The original statement rebinds the class name `BertClassificationCollator`
# to an instance, so running this cell a second time tried to *call* the
# instance with constructor arguments and failed. Resolve the class first so
# the cell can be re-run safely.
_bert_collator_cls = (
    BertClassificationCollator
    if isinstance(BertClassificationCollator, type)
    else type(BertClassificationCollator)
)
bert_classification_collator = _bert_collator_cls(tokenizer=tokenizer,
                                                  max_seq_len=60)
# Later cells pass `BertClassificationCollator` as `collate_fn`, so the name
# must keep pointing at the collator instance.
BertClassificationCollator = bert_classification_collator

In [None]:
from torch.utils.data import DataLoader, random_split

# Hold out 20% of the labelled data for validation.
# NOTE: `train_dataset` is rebound to the 80% subset, so re-running this cell
# without re-creating the dataset would split the subset again.
n_labelled = len(train_dataset)
train_size = int(n_labelled * 0.8)
val_size = n_labelled - train_size
train_dataset, val_dataset = random_split(train_dataset, [train_size, val_size])

# Settings shared by all three loaders; at this point the name
# `BertClassificationCollator` refers to the collator instance.
_loader_options = {'batch_size': 32, 'collate_fn': BertClassificationCollator}

# Only the training loader shuffles; validation/test keep the dataset order.
train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, **_loader_options)
val_dataloader = DataLoader(dataset=val_dataset, shuffle=False, **_loader_options)
test_dataloader = DataLoader(dataset=test_dataset, shuffle=False, **_loader_options)

In [None]:
from transformers import BertForSequenceClassification, AdamW, BertConfig


# BERT with a sequence-classification head; this rebinds `model`.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.   
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Move the model to the GPU when one is available. The unconditional
# `model.cuda()` raised on CPU-only machines, whereas the training cells
# already fall back to the CPU.
model.to('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Materialise the model's (name, tensor) parameter pairs once.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

# Header text and the slice of `params` printed underneath it.
_param_sections = (
    ('==== Embedding Layer ====\n', slice(0, 5)),
    ('\n==== First Transformer ====\n', slice(5, 21)),
    ('\n==== Output Layer ====\n', slice(-4, None)),
)

for section_title, section_span in _param_sections:
    print(section_title)
    for param_name, param_tensor in params[section_span]:
        # Name left-aligned in 55 columns, shape right-aligned in 12.
        print("{:<55} {:>12}".format(param_name, str(tuple(param_tensor.size()))))


In [None]:
# AdamW here is the HuggingFace class (not the PyTorch one); the original
# author believes the 'W' stands for "Weight Decay fix".
# NOTE(review): a later cell rebuilds `optimizer` with grouped parameters and
# lr=1e-5 before training starts, so this instance only feeds `scheduler`.
_adamw_options = {
    'lr': 5e-5,   # args.learning_rate - default is 5e-5
    'eps': 1e-8,  # args.adam_epsilon  - default is 1e-8.
}
optimizer = AdamW(model.parameters(), **_adamw_options)

In [None]:
from transformers import get_linear_schedule_with_warmup

# Number of training epochs. The BERT authors recommend between 2 and 4; the
# original author chose 2 after seeing the model overfit beyond 2 epochs.
# NOTE(review): the training loop further down uses `total_epochs` and
# `lr_scheduler` instead, so `epochs` and `scheduler` are not used by it.
epochs = 2

# One scheduler step per batch: [number of epochs] x [number of batches]
# (not the same as the number of training samples).
total_steps = epochs * len(train_dataloader)

# Linear decay without warm-up (0 is the default value in run_glue.py).
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
import numpy as np

def flat_accuracy(preds, labels):
    """Return the fraction of rows whose argmax prediction equals the label.

    Args:
        preds: 2-D array of per-class scores; the class is taken along axis 1.
        labels: NumPy array of gold class ids (flattened before comparison).
    """
    predicted_classes = np.argmax(preds, axis=1).ravel()
    targets = labels.ravel()
    n_correct = (predicted_classes == targets).sum()
    return n_correct / targets.size



In [None]:
import time
import datetime

def format_time(elapsed):
    '''
    Format a duration given in seconds as the string form of a
    datetime.timedelta (e.g. "1:01:01"), after rounding to whole seconds.
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return f"{duration}"


In [None]:
from transformers import AdamW, get_cosine_schedule_with_warmup

total_epochs = 10

# (name, parameter) pairs of the model being fine-tuned.
param_optimizer = list(model.named_parameters())

# Parameters whose name contains one of these fragments get no weight decay.
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

_decayed, _undecayed = [], []
for _param_name, _param in param_optimizer:
    _exempt = any(fragment in _param_name for fragment in no_decay)
    (_undecayed if _exempt else _decayed).append(_param)

optimizer_grouped_parameters = [
    {'params': _decayed, 'weight_decay': 0.01},
    {'params': _undecayed, 'weight_decay': 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5, eps=1e-8)

# The scheduler is stepped once per batch, so the schedule length is
# (batches per epoch) x (epochs); the first 10% of the steps are warm-up.
num_train_steps = len(train_dataloader) * total_epochs
num_warmup_steps = int(num_train_steps * 0.1)

lr_scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_train_steps,
)

In [None]:
from sklearn.metrics import classification_report, accuracy_score
# Train on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Losses are stored per batch (all epochs concatenated); accuracies per epoch.
all_loss = dict(train_loss=[], val_loss=[])
all_acc = dict(train_acc=[], val_acc=[])

for epoch in range(total_epochs):
    # One pass over the training split, then one over the validation split.
    y, y_pred, train_loss = train(train_dataloader, optimizer, lr_scheduler, device)
    train_acc = accuracy_score(y, y_pred)

    y, y_pred, val_loss = validation(val_dataloader, device)
    val_acc = accuracy_score(y, y_pred)

    all_loss['train_loss'].extend(train_loss)
    all_loss['val_loss'].extend(val_loss)
    all_acc['train_acc'].append(train_acc)
    all_acc['val_acc'].append(val_acc)

    # The printed losses are the mean of the per-batch losses of this epoch.
    print(f'Epoch: {epoch}, train_loss: {torch.tensor(train_loss).mean():.3f}, train_acc: {train_acc:.3f}, val_loss: {torch.tensor(val_loss).mean():.3f}, val_acc: {val_acc:.3f}')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

fig = plt.figure(figsize=(20, 20))

# Top half: two stacked panels with the per-batch training and validation loss.
a = fig.add_subplot(4, 1, 1)
a.plot(all_loss['train_loss'])

b = fig.add_subplot(4, 1, 2)
b.plot(all_loss['val_loss'])

# Bottom half: per-epoch accuracy for both splits on one axis.
c = fig.add_subplot(2, 1, 2)
for acc_key in ('train_acc', 'val_acc'):
    c.plot(all_acc[acc_key])
c.set(xlabel='epoch', ylabel='accuracy')
c.legend(['train', 'val'])

# Presumably here so the cell does not end on an expression whose value the
# notebook would echo.
pass

In [None]:
# Gold labels, predictions and per-batch losses on the validation split.
y, y_pred, val_loss = validation(val_dataloader, device)

In [None]:
from sklearn.metrics import classification_report

# Index 0 / 1 matches the dataset's label encoding ('Not Useful' -> 0, else 1).
target_names = ['Not Useful', 'Useful']
report = classification_report(y, y_pred, target_names=target_names)
print(report)

In [None]:
# Same evaluation on the test loader; note that `val_loss` now holds the
# per-batch losses of the test set, not of the validation split.
y, y_pred, val_loss = validation(test_dataloader, device)

In [None]:
from sklearn.metrics import classification_report

# Index 0 / 1 matches the dataset's label encoding ('Not Useful' -> 0, else 1).
target_names = ['Not Useful', 'Useful']
report = classification_report(y, y_pred, target_names=target_names)
print(report)

In [None]:
# Predict the test set once more; only the predictions are needed here.
_, y_pred, _ = validation(test_dataloader, device)

# The test loader does not shuffle, so predictions line up with the CSV rows.
test_frame = pd.read_csv(dataPath + 'IRSE_Test_Data_preprocessed.csv')
submit = test_frame.assign(target=y_pred)

submit.to_csv('IRSE_Test_Data_preprocessed_BERT.csv', index=False)

In [None]:
# Persist only the weights (state_dict); reloading requires rebuilding the
# model object first and then calling load_state_dict on it.
torch.save(model.state_dict(), "bert_model")