<a href="https://colab.research.google.com/github/robgon-art/ai8ball/blob/main/AI_8_Ball_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install Python packages into the runtime. `transformers` is imported below;
# NOTE(review): `wikipedia` and `pynytimes` are not referenced in the cells
# shown here - confirm they are still needed.
!pip install transformers
!pip install wikipedia
!pip install pynytimes
# Copy the BoolQ train and dev splits from the gs://boolq bucket into the
# current working directory.
!gsutil cp gs://boolq/train.jsonl .
!gsutil cp gs://boolq/dev.jsonl .

In [None]:
import random
import torch
import numpy as np
import pandas as pd
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW

In [None]:
# Use a GPU if you have one available (Runtime -> Change runtime type -> GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set seeds for reproducibility. The seeds are set before the model is
# created so that any random initialisation performed while loading it is
# covered as well.
random.seed(26)
np.random.seed(26)
torch.manual_seed(26)

# Pretrained RoBERTa-large tokenizer and sequence-classification model
tokenizer = AutoTokenizer.from_pretrained("roberta-large")
model = AutoModelForSequenceClassification.from_pretrained("roberta-large")
model.to(device)  # Send the model to the GPU if we have one

learning_rate = 1e-5
# Use PyTorch's own AdamW: the AdamW class shipped with transformers is
# deprecated in favour of torch.optim.AdamW. weight_decay is set to 0.0
# explicitly because torch's default (0.01) differs from the 0.0 default of
# the transformers implementation this replaces.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-8,
                              weight_decay=0.0)

In [None]:
def encode_data(tokenizer, questions, passages, max_length):
    """Tokenize (question, passage) pairs into fixed-length model inputs.

    Args:
        tokenizer: Object exposing ``encode_plus(text, text_pair, **kwargs)``
            that returns a mapping with ``"input_ids"`` and
            ``"attention_mask"`` entries.
        questions: Iterable of question strings.
        passages: Iterable of passage strings, paired with ``questions`` by
            position (pairs are formed with ``zip``, so extra items in the
            longer iterable are ignored).
        max_length: Length every encoded pair is padded or truncated to.

    Returns:
        A tuple ``(input_ids, attention_masks)`` of NumPy arrays with one row
        per (question, passage) pair.
    """
    input_ids = []
    attention_masks = []
    for question, passage in zip(questions, passages):
        # `padding` / `truncation` are the current tokenizer arguments; they
        # replace the deprecated `pad_to_max_length=True` and
        # `truncation_strategy="longest_first"` with the same meaning.
        encoded_data = tokenizer.encode_plus(
            question, passage,
            max_length=max_length,
            padding="max_length",
            truncation="longest_first")
        input_ids.append(encoded_data["input_ids"])
        attention_masks.append(encoded_data["attention_mask"])
    return np.array(input_ids), np.array(attention_masks)

# Loading data
# The BoolQ files are read relative to the current working directory, which is
# where the download step (`gsutil cp ... .`) places them. This avoids relying
# on the working directory being exactly /content.
train_data_df = pd.read_json("train.jsonl", lines=True, orient="records")
dev_data_df = pd.read_json("dev.jsonl", lines=True, orient="records")
passages_train = train_data_df.passage.values
questions_train = train_data_df.question.values
# Answers are cast to int so they can be used as class indices
answers_train = train_data_df.answer.values.astype(int)
passages_dev = dev_data_df.passage.values
questions_dev = dev_data_df.question.values
answers_dev = dev_data_df.answer.values.astype(int)

# Encoding data
# Every (question, passage) pair is padded/truncated to max_seq_length tokens
max_seq_length = 256
input_ids_train, attention_masks_train = encode_data(tokenizer, questions_train, passages_train, max_seq_length)
input_ids_dev, attention_masks_dev = encode_data(tokenizer, questions_dev, passages_dev, max_seq_length)
# Feature tuples are ordered (input ids, attention masks, labels)
train_features = (input_ids_train, attention_masks_train, answers_train)
dev_features = (input_ids_dev, attention_masks_dev, answers_dev)

In [None]:
batch_size = 8


def _as_long_tensors(features):
    """Turn every array of a feature tuple into a ``torch.long`` tensor."""
    return [torch.tensor(array, dtype=torch.long) for array in features]


# Training split: batches are drawn in random order
train_features_tensors = _as_long_tensors(train_features)
train_dataset = TensorDataset(*train_features_tensors)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    sampler=train_sampler,
)

# Dev split: batches are drawn in their stored order
dev_features_tensors = _as_long_tensors(dev_features)
dev_dataset = TensorDataset(*dev_features_tensors)
dev_sampler = SequentialSampler(dev_dataset)
dev_dataloader = DataLoader(
    dev_dataset,
    batch_size=batch_size,
    sampler=dev_sampler,
)

In [None]:
from tqdm import tqdm
batch_size = 8  # per-step batch size; one optimizer update covers batch_size * grad_acc_steps examples
epochs = 3
grad_acc_steps = 4
train_loss_values = []  # mean (unscaled) training loss per epoch
dev_acc_values = []     # dev-set accuracy per epoch

num_train_batches = len(train_dataloader)
# Size of the final, incomplete accumulation group (0 when the number of
# batches is an exact multiple of grad_acc_steps).
tail_batches = num_train_batches % grad_acc_steps

for _ in tqdm(range(epochs), desc="Epoch"):
    # Training
    epoch_train_loss = 0
    model.train()
    model.zero_grad()
    for step, batch in enumerate(train_dataloader):
        input_ids = batch[0].to(device)
        attention_masks = batch[1].to(device)
        labels = batch[2].to(device)
        outputs = model(input_ids, token_type_ids=None,
                        attention_mask=attention_masks, labels=labels)
        loss = outputs[0]
        # Record the unscaled loss so the reported value is the true mean
        # per-batch loss, not the loss divided by grad_acc_steps.
        epoch_train_loss += loss.item()

        # Scale by the size of the accumulation group this batch belongs to,
        # so the accumulated gradient is an average over that group.
        in_tail_group = step >= num_train_batches - tail_batches
        group_size = tail_batches if in_tail_group else grad_acc_steps
        (loss / group_size).backward()

        # Update on every full group AND on the last batch of the epoch;
        # otherwise gradients of a trailing partial group would be thrown
        # away by the zero_grad() at the start of the next epoch.
        is_last_batch = (step + 1) == num_train_batches
        if (step + 1) % grad_acc_steps == 0 or is_last_batch:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            model.zero_grad()
    epoch_train_loss = epoch_train_loss / num_train_batches
    train_loss_values.append(epoch_train_loss)

    # Evaluation
    correct_predictions = 0
    total_examples = 0
    model.eval()
    for batch in dev_dataloader:
        input_ids = batch[0].to(device)
        attention_masks = batch[1].to(device)
        labels = batch[2]
        with torch.no_grad():
            outputs = model(input_ids, token_type_ids=None,
                            attention_mask=attention_masks)
        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        predictions = np.argmax(logits, axis=1).flatten()
        labels = labels.numpy().flatten()
        # Count examples rather than averaging per-batch accuracies, so a
        # smaller final batch does not get extra weight.
        correct_predictions += int(np.sum(predictions == labels))
        total_examples += len(labels)

    epoch_dev_accuracy = correct_predictions / total_examples
    dev_acc_values.append(epoch_dev_accuracy)

In [None]:
!mkdir roberta-large_fine-tuned

In [None]:
import os
# Write the fine-tuned model and its tokenizer into the same directory
model_path = "roberta-large_fine-tuned"
model.save_pretrained(save_directory=model_path)
tokenizer.save_pretrained(save_directory=model_path)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Line chart of the per-epoch training loss, one x tick per epoch index
sns.set()
fig, ax = plt.subplots()
ax.plot(train_loss_values, label="train_loss")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.set_title("Training Loss")
ax.legend()
ax.set_xticks(np.arange(0, epochs))
plt.show()

In [None]:
# Line chart of the per-epoch dev-set accuracy, one x tick per epoch index
fig, ax = plt.subplots()
ax.plot(dev_acc_values, label="dev_acc")
ax.set_xlabel("Epoch")
ax.set_ylabel("Accuracy")
ax.set_title("Evaluation Accuracy")
ax.legend()
ax.set_xticks(np.arange(0, epochs))
plt.show()

In [None]:
import math
import torch
def predict(question, passage):
  """Print the model's yes/no probabilities for a question about a passage.

  Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

  Args:
    question: Yes/no question text.
    passage: Passage text the question should be answered from. The encoded
      pair is truncated to at most 512 tokens.

  Returns:
    None. Prints the question, the rounded probabilities of class 1 ("Yes")
    and class 0 ("No"), and a rounded confidence score in [0, 1].
  """
  # Inference must not depend on the mode the model was left in: switch to
  # eval mode so the output is deterministic.
  model.eval()
  sequence = tokenizer.encode_plus(question, passage, return_tensors="pt",
    max_length=512, truncation=True)['input_ids'].to(device)
  # No gradients are needed for prediction; skip building the autograd graph.
  with torch.no_grad():
    logits = model(sequence)[0]
  probabilities = torch.softmax(logits, dim=1).cpu().tolist()[0]
  vector = logits.cpu().tolist()[0]
  # Confidence is the length of the 2-d logit vector, scaled and capped at 1.
  # NOTE(review): the 3.6 scale factor looks empirical - its origin is not
  # shown in this notebook.
  confidence = min(math.sqrt(vector[0]**2+vector[1]**2)/3.6, 1)
  proba_yes = round(probabilities[1], 2)
  proba_no = round(probabilities[0], 2)
  conf_round = round(confidence, 2)
  print("Question:", question, "Yes:", proba_yes, "No:", proba_no, "Conf.:",
    conf_round)
  
# Reference passage that the demo questions below are answered against.
passage_magic_8_ball = """The Magic 8-Ball is a plastic sphere, made to look
  like an eight-ball, that is used for fortune-telling or seeking advice. It was
  invented in 1950 by Albert C. Carter and Abe Bookman and is currently
  manufactured by Mattel. The user asks a yes–no question to the ball and then
  turns it over to reveal an answer in a window on the ball."""

# Demo questions about the passage: pairs that differ in one detail
# (shape, year, inventors).
magic_8_ball_questions = [
  "Is the Magic 8-Ball a sphere?", 
  "Is the Magic 8-Ball a cube?",
  "Was the Magic 8-Ball invented in 1940?", 
  "Was the Magic 8-Ball invented in 1950?", 
  "Was the Magic 8-Ball invented by Carter and Bookman?", 
  "Was the Magic 8-Ball invented by Black and Decker?", 
]

# Print the model's yes/no probabilities and confidence for each question.
for s_question in magic_8_ball_questions:
  predict(s_question, passage_magic_8_ball)