In [None]:
# Mount Google Drive inside the Colab runtime so that notebook code can read and
# write files under /content/drive/. Re-running this cell when the drive is already
# mounted is harmless; Colab just reports that it is already mounted.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# Change to the desired directory
# After this call every relative path in the notebook resolves against the
# Task-1 folder on the mounted Drive, so the mount cell must have run first.
import os
os.chdir('/content/drive/MyDrive/Task-1')

In [None]:
!pip install datasets



In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer
from datasets import load_dataset

In [None]:
# Load the SQuAD dataset
# Only the 'train' split is requested. The HF_TOKEN notice printed by this cell is
# informational: authentication is optional for public datasets.
train_data = load_dataset('squad', split='train')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Load pre-trained tokenizer
# Only the 'bert-base-uncased' tokenizer is loaded here; it is used to turn text
# into token ids and to supply the vocabulary size for the embedding layer of the
# LSTM model defined further down.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [None]:
def prepare_data(data, tokenizer, max_length=512):
    """Tokenize a batch of context/question pairs and compute answer spans.

    NOTE: a later cell in this notebook defines `prepare_data` again; whichever
    definition was executed last is the one actually used.

    Args:
        data: Object indexable by 'context', 'question' and 'answers'. Each of
            these must yield one entry per example. Every answer is a dict with
            'answer_start' and 'text'; both may be scalars, or lists holding one
            entry per reference answer (the Hugging Face `squad` layout), in
            which case the first reference answer is used.
        tokenizer: Callable invoked as
            ``tokenizer(contexts, questions, truncation=True,
            padding='max_length', max_length=max_length)``.
        max_length: Length every encoded pair is padded/truncated to.

    Returns:
        Tuple ``(encodings, start_positions, end_positions)`` where the two
        position lists hold one integer per example: the answer's start offset
        and its exclusive end offset (``start + len(answer_text)``).

        NOTE(review): these are character offsets into the raw context string,
        not token indices. If they are used as targets over the tokenized
        sequence they must be mapped to token positions first - confirm against
        the training loop.

    Raises:
        ValueError: If an example carries an empty list of answers.
    """
    encodings = tokenizer(
        data['context'],
        data['question'],
        truncation=True,
        padding='max_length',
        max_length=max_length,
    )

    start_positions = []
    end_positions = []
    for index, answer in enumerate(data['answers']):
        start, text = answer['answer_start'], answer['text']
        if isinstance(start, (list, tuple)):
            # List-valued fields: adding len(...) to the list itself would raise
            # TypeError, so pick the first reference answer explicitly.
            if not start:
                raise ValueError(f"example {index} has no answer to build a span from")
            start, text = start[0], text[0]
        start_positions.append(start)
        end_positions.append(start + len(text))

    return encodings, start_positions, end_positions

In [None]:
# Define a simple RNN model for Question Answering
class RNNForQA(nn.Module):
    """Embedding -> LSTM -> two linear heads scoring answer start and end.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding (LSTM input size).
        hidden_dim: LSTM hidden size (input size of both heads).
        output_dim: Number of outputs each head produces per time step.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        # Token ids -> dense vectors.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # batch_first=True: batched input/output carry the batch dimension first.
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        # Independent heads applied to every LSTM output step.
        self.fc_start = nn.Linear(hidden_dim, output_dim)
        self.fc_end = nn.Linear(hidden_dim, output_dim)

    def forward(self, input_ids):
        """Score every position of `input_ids` as answer start and answer end.

        Args:
            input_ids: Integer tensor of token ids; for batched use this is
                (batch, seq_len) given `batch_first=True`.

        Returns:
            Tuple ``(start_logits, end_logits)``; each has the shape of the LSTM
            output with the last dimension replaced by `output_dim`.
        """
        # The LSTM's final (h, c) state is not needed, only the per-step outputs.
        hidden_states, _ = self.rnn(self.embedding(input_ids))
        return self.fc_start(hidden_states), self.fc_end(hidden_states)


In [None]:
# Dataset class to preprocess and load the data
class SquadDataset(Dataset):
    """Map-style dataset serving token ids and answer positions as tensors.

    Args:
        encodings: Object whose 'input_ids' entry can be indexed per example.
        start_positions: Per-example answer start positions.
        end_positions: Per-example answer end positions.
    """

    def __init__(self, encodings, start_positions, end_positions):
        # Inputs are stored untouched; tensors are only built in __getitem__.
        self.encodings = encodings
        self.start_positions = start_positions
        self.end_positions = end_positions

    def __len__(self):
        # The dataset size is taken from the number of start positions.
        return len(self.start_positions)

    def __getitem__(self, idx):
        """Return the example at `idx` as a dict of freshly created tensors."""
        sample = {}
        sample['input_ids'] = torch.tensor(self.encodings['input_ids'][idx])
        sample['start_positions'] = torch.tensor(self.start_positions[idx])
        sample['end_positions'] = torch.tensor(self.end_positions[idx])
        return sample

In [None]:
# Prepare data for training
def prepare_data(data, tokenizer, max_length=512):
    """Tokenize context/question pairs and compute answer spans.

    Works for a single example as well as for a batch / whole dataset split.

    Args:
        data: Object indexable by 'context', 'question' and 'answers'.
            * Single example: 'context' and 'question' are strings and
              'answers' is a dict with list-valued 'answer_start' and 'text'.
            * Batch or dataset: each key yields one entry per example and
              'answers' yields one dict per example. Its 'answer_start' and
              'text' may be scalars or lists with one entry per reference
              answer; with lists, the first reference answer is used.
        tokenizer: Callable invoked as
            ``tokenizer(contexts, questions, truncation=True,
            padding='max_length', max_length=max_length)``.
        max_length: Length every encoded pair is padded/truncated to.

    Returns:
        Tuple ``(encodings, start_positions, end_positions)``.
        For a single example the position lists hold one entry per reference
        answer; for a batch they hold one integer per example. End positions
        are exclusive (``start + len(answer_text)``).

        NOTE(review): these are character offsets into the raw context string,
        not token indices. If they are used as targets over the tokenized
        sequence they must be mapped to token positions first - confirm against
        the training loop.

    Raises:
        ValueError: If a batched example carries an empty list of answers.
    """
    answers = data['answers']

    if isinstance(answers, dict):
        # Single example: same computation as before, fields are parallel lists.
        encodings = tokenizer(
            data['context'],
            data['question'],
            truncation=True,
            padding='max_length',
            max_length=max_length,
        )
        start_positions = answers['answer_start']
        end_positions = [
            start + len(text)
            for start, text in zip(answers['answer_start'], answers['text'])
        ]
        return encodings, start_positions, end_positions

    # Batch / dataset: `answers` is a sequence of per-example dicts, so it cannot
    # be indexed with a string key. Depending on the `datasets` version the text
    # columns may be lazy column views rather than lists; materialise them so the
    # tokenizer receives plain lists of strings.
    encodings = tokenizer(
        list(data['context']),
        list(data['question']),
        truncation=True,
        padding='max_length',
        max_length=max_length,
    )

    start_positions = []
    end_positions = []
    for index, answer in enumerate(answers):
        start, text = answer['answer_start'], answer['text']
        if isinstance(start, (list, tuple)):
            if not start:
                raise ValueError(f"example {index} has no answer to build a span from")
            start, text = start[0], text[0]
        start_positions.append(start)
        end_positions.append(start + len(text))

    return encodings, start_positions, end_positions

In [None]:
# Define hyperparameters

# Model architecture
vocab_size = tokenizer.vocab_size  # embedding table sized to the tokenizer's vocabulary
embedding_dim = 128
hidden_dim = 128
output_dim = 1  # each head emits a single score per sequence position

# Training settings (batch_size feeds the DataLoader; learning_rate and epochs
# are presumably consumed by a later training cell)
batch_size = 2
learning_rate = 0.001
epochs = 1

In [None]:
# Load and preprocess the data
# Tokenizes the whole training split in a single call (every pair is padded or
# truncated to the max_length set inside prepare_data) and computes answer spans.
encodings, start_positions, end_positions = prepare_data(train_data, tokenizer)
# Wrap the tokenized inputs and spans so they can be fetched example by example.
train_dataset = SquadDataset(encodings, start_positions, end_positions)
# shuffle=True: examples are drawn in a new random order on each pass.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [None]:
# Initialize the model
# NOTE: this cell needs the cell that defines `RNNForQA` to have been executed in
# the current kernel; the NameError recorded under this cell is what happens when
# it has not (e.g. after a restart without re-running the definition cell).
model = RNNForQA(vocab_size, embedding_dim, hidden_dim, output_dim)
# Use the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Moves the parameters to `device`; as the last expression it also displays the model.
model.to(device)

NameError: name 'RNNForQA' is not defined