In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Load the pre-processed reviews into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file location exists.
downsized_data = pd.read_csv(
    r"C:\Users\Atharva Kulkarni\Desktop\WPI_Courses\Semester-2\NLP\Final_project\pre_processed.csv"
)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs. Inputs are the 'text' column, targets the
# 'stars' column.
x_train, x_test, y_train, y_test = train_test_split(
    downsized_data['text'],
    downsized_data['stars'],
    test_size=0.2,
    random_state=42,
)

In [None]:
import transformers
from transformers import RobertaModel, RobertaTokenizer, RobertaForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
import torch
from torch import nn,optim
from torch.utils.data import Dataset,DataLoader,TensorDataset, RandomSampler, SequentialSampler
import torch.nn as nn
import torch.nn.functional as F
import time
import datetime

In [None]:
# Name of the pre-trained checkpoint; passed to RobertaTokenizer.from_pretrained.
pre_trained_model = 'roberta-base'

In [None]:
# Tokenizer matching the checkpoint named in `pre_trained_model`.
tokenizer = RobertaTokenizer.from_pretrained(pre_trained_model)

In [None]:
# Token count of every training text. Encoding truncates at 512 tokens, so
# no recorded length exceeds that cap.
x_train_token_lens = [
    len(tokenizer.encode(text, max_length=512, truncation=True))
    for text in x_train
]

In [None]:
# Token count of every test text. Encoding truncates at 512 tokens, so
# no recorded length exceeds that cap.
x_test_token_lens = [
    len(tokenizer.encode(text, max_length=512, truncation=True))
    for text in x_test
]

In [None]:
# Fixed sequence length (in tokens): used as `max_length` when encoding the
# texts and as `maxlen` when padding them.
MAX_SEQ_LENGTH = 240

In [None]:
# Cast both label Series to integers.
# NOTE(review): the classifier presumably needs labels in [0, num_labels) —
# confirm the 'stars' values are already 0-based.
y_train, y_test = y_train.astype(int), y_test.astype(int)

In [None]:
# Convert the text inputs to plain Python lists.
# `list(...)` is used rather than `.tolist()` so that re-running this cell
# (when the names are already lists) is a harmless no-op instead of an
# AttributeError.
x_train = list(x_train)
x_test = list(x_test)

In [None]:
# Encode every text to a list of token ids, adding the model's special tokens
# and truncating anything longer than MAX_SEQ_LENGTH.
train_input_ids = [
    tokenizer.encode(text, add_special_tokens=True, max_length=MAX_SEQ_LENGTH, truncation=True)
    for text in x_train
]
test_input_ids = [
    tokenizer.encode(text, add_special_tokens=True, max_length=MAX_SEQ_LENGTH, truncation=True)
    for text in x_test
]

In [None]:
from keras.preprocessing.sequence import pad_sequences

# The padding value must come from the tokenizer: in RoBERTa's vocabulary
# id 0 is the <s> (start / classification) token, not <pad>. Padding with 0
# and masking on `token_id > 0` would fill the tail with <s> tokens and, worse,
# mask out the real leading <s> token of every sequence.
PAD_TOKEN_ID = tokenizer.pad_token_id

# Right-pad (and, defensively, right-truncate) every sequence to MAX_SEQ_LENGTH.
train_input_ids = pad_sequences(train_input_ids, maxlen=MAX_SEQ_LENGTH, dtype="long", value=PAD_TOKEN_ID, truncating="post", padding="post")
test_input_ids = pad_sequences(test_input_ids, maxlen=MAX_SEQ_LENGTH, dtype="long", value=PAD_TOKEN_ID, truncating="post", padding="post")

# Attention masks: 1 for every real token (including <s> and </s>), 0 for
# padding. Kept as nested lists of ints, one inner list per review.
train_attention_masks = (train_input_ids != PAD_TOKEN_ID).astype(int).tolist()
test_attention_masks = (test_input_ids != PAD_TOKEN_ID).astype(int).tolist()

In [None]:
# Wrap the training arrays (token ids, attention masks, labels) as tensors.
train_inputs = torch.tensor(train_input_ids)
train_masks = torch.tensor(train_attention_masks)
train_labels = torch.tensor(y_train.values)

# Same three tensors for the held-out split.
test_inputs = torch.tensor(test_input_ids)
test_masks = torch.tensor(test_attention_masks)
test_labels = torch.tensor(y_test.values)

In [None]:
batch_size = 16

# Training batches are drawn in random order.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    sampler=train_sampler,
    batch_size=batch_size,
)

# Evaluation batches are read in their stored order.
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(
    test_data,
    sampler=test_sampler,
    batch_size=batch_size,
)

In [None]:
# Number of distinct label values present in the training split; the bare
# name on the last line displays it as the cell output.
n_classes = y_train.nunique()
n_classes

In [None]:
# Build the sequence classifier from the same checkpoint as the tokenizer and
# size the classification head from the number of classes found in the
# training labels, instead of a hard-coded 2.
# NOTE(review): the labels must lie in [0, n_classes); confirm the 'stars'
# values are 0-based, otherwise remap them before training.
model = RobertaForSequenceClassification.from_pretrained(
    pre_trained_model,
    num_labels = n_classes,
    output_attentions = False,
    output_hidden_states = False,
)

In [None]:
# Use the first CUDA GPU when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [None]:
# Move the model's parameters to the selected device (GPU or CPU).
model = model.to(device)

In [None]:
# Fine-tuning setup: number of passes over the training data and the optimizer.
epochs = 2
optimizer = AdamW(model.parameters(), lr=3e-5)

# The scheduler is stepped once per training batch, so the schedule spans
# (batches per epoch) * epochs steps; the learning rate decays linearly with
# no warm-up phase.
total_steps = epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

# Stand-alone cross-entropy criterion.
# NOTE(review): the training loop in this notebook reads the loss from the
# model output rather than from `loss_fn` — confirm whether this is still needed.
loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
def format_time(elapsed):
    """Render a duration given in seconds as a human-readable string.

    The value is rounded to the nearest whole second and formatted the way
    ``datetime.timedelta`` prints itself (e.g. ``'1:01:01'``, or
    ``'1 day, 1:00:00'`` for durations of a day or more).

    Parameters
    ----------
    elapsed : float or int
        Duration in seconds.

    Returns
    -------
    str
        The formatted duration.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
def accuracy(preds, labels):
    """Return the fraction of examples whose highest-scoring class is correct.

    Parameters
    ----------
    preds : array-like, 2-D
        One row of per-class scores (e.g. logits) per example; the predicted
        class is the arg-max along axis 1.
    labels : array-like
        Integer class label for each example; flattened before comparison.

    Returns
    -------
    float
        Share of correct predictions in [0, 1]. An empty batch yields 0.0
        instead of a NaN from a zero division.
    """
    # np.asarray lets plain Python lists be passed as well as numpy arrays.
    predicted = np.argmax(np.asarray(preds), axis=1).flatten()
    expected = np.asarray(labels).flatten()
    if expected.size == 0:
        return 0.0
    return float(np.sum(predicted == expected) / expected.size)

In [None]:
# Per-epoch history of the average loss / accuracy on the training and the
# held-out data (one entry appended per epoch).
loss_train_values = []
acc_train_values = []
loss_val_values = []
acc_val_values = []

for epoch in range(epochs):

    #             --- TRAIN ---

    print("\n Epoch {:}/{:} :".format(epoch+1,epochs))
    print('Training....')

    # Measure how long the training epoch takes
    t0 = time.time()

    # Running sums for this epoch. Each batch contributes in proportion to its
    # size, so a shorter final batch does not skew the epoch averages.
    total_loss = 0
    total_acc = 0
    n_train_examples = 0

    # Put the model in training mode
    model.train()

    for step, batch in enumerate(train_dataloader):
        # Report progress every 100 batches (but not before the first one)
        if step % 100 == 0 and step != 0:
            elapsed = format_time(time.time() - t0)
            print(' Batch {:>5,} of {:>5,}. Elapsed:{:}.'.format(step,len(train_dataloader),elapsed))

        # Unpack the batch (ids, attention mask, labels), make sure every
        # tensor is int64 and move it to the target device
        b_input_ids = batch[0].long().to(device)
        b_attention_mask = batch[1].long().to(device)
        b_labels = batch[2].long().to(device)
        batch_len = b_labels.size(0)

        # Clear gradients left over from the previous step
        model.zero_grad()

        # Forward pass; because labels are supplied, the output carries both
        # the loss and the raw class scores (logits)
        outputs = model(input_ids=b_input_ids, attention_mask=b_attention_mask, labels=b_labels)
        loss = outputs.loss

        total_loss += loss.item() * batch_len

        # Move logits and labels to the CPU as numpy arrays for the metric
        preds = outputs.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Accuracy of this batch, accumulated weighted by batch size
        tmp_train_accuracy = accuracy(preds, label_ids)
        total_acc += tmp_train_accuracy * batch_len
        n_train_examples += batch_len

        # Backward pass to compute gradients
        loss.backward()

        # Clip the gradient norm to 1.0 to guard against exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update the weights, then advance the learning-rate schedule
        optimizer.step()
        scheduler.step()

    # Per-example averages over the whole training set
    avg_total_loss = total_loss / n_train_examples
    loss_train_values.append(avg_total_loss)

    avg_train_acc = total_acc / n_train_examples
    acc_train_values.append(avg_train_acc)

    print("")
    print("\nAverage training accuracy: {0:.2f}".format(avg_train_acc))

    print('Average training loss : {0:.2f}'.format(avg_total_loss))
    print('Training epoch took: {:}'.format(format_time(time.time()- t0)))

    #             --- VALIDATION ---

    # After each epoch, measure performance on the held-out data
    print('\n Running validation...')

    t0 = time.time()

    # Put the model in evaluation mode
    model.eval()

    # Tracking variables (weighted by batch size, as in training)
    total_eval_accuracy = 0
    total_eval_loss = 0
    n_eval_examples = 0

    for batch in test_dataloader:
        # Same int64 cast as in the training pass: the loss expects int64
        # class labels, and the casts keep both passes consistent
        b_input_ids = batch[0].long().to(device)
        b_input_mask = batch[1].long().to(device)
        b_labels = batch[2].long().to(device)
        batch_len = b_labels.size(0)

        # No gradients are needed for evaluation; skipping them saves memory
        # and time
        with torch.no_grad():
            outputs = model(input_ids=b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        logits = outputs.logits

        total_eval_loss += loss.item() * batch_len

        # Move logits and labels to the CPU as numpy arrays for the metric
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        total_eval_accuracy += accuracy(logits, label_ids) * batch_len
        n_eval_examples += batch_len

    # Per-example averages over the whole held-out set
    avg_val_accuracy = total_eval_accuracy / n_eval_examples
    acc_val_values.append(avg_val_accuracy)

    avg_val_loss = total_eval_loss / n_eval_examples
    loss_val_values.append(avg_val_loss)

    # Measure how long the validation run took
    validation_time = format_time(time.time() - t0)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))