Shubh Agarwal 
AIML B2
22070126108 

In [2]:
# Imports
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    
    T5Tokenizer, T5ForConditionalGeneration,
    DistilBertTokenizer, DistilBertForQuestionAnswering,
    GPT2Tokenizer, GPT2LMHeadModel,
    AdamW
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm


from nltk.translate.bleu_score import sentence_bleu
import nltk
import logging

# Download NLTK data for BLEU score calculation
nltk.download('punkt')

# Ignore Warnings
logging.disable(logging.WARNING)

# Device setup: Use GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Using device: cuda


In [2]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: nltk
Successfully installed nltk-3.9.1
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
# Load CoQA dataset
def load_coqa_data(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
    return data['data']

# Load the data
data = load_coqa_data('/kaggle/input/coqa-train-v/coqa-train-v1.0.json')

In [4]:
# Custom dataset class
class CoQADataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512, model_type='bert'):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.model_type = model_type

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        context = item['story']
        question = item['questions'][0]['input_text']
        answer = item['answers'][0]['input_text']

        if self.model_type == 't5':
            # For T5, we format the input as text-to-text
            input_text = f"question: {question} context: {context}"
            target_text = answer

            # Tokenize inputs and targets
            input_ids = self.tokenizer.encode(
                input_text,
                max_length=self.max_length,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            ).squeeze()

            target_ids = self.tokenizer.encode(
                target_text,
                max_length=self.max_length // 4,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            ).squeeze()

            return {
                'input_ids': input_ids,
                'attention_mask': (input_ids != self.tokenizer.pad_token_id).long(),
                'labels': target_ids,
                'answer': answer
            }

        elif self.model_type == 'gpt2':
            # For GPT-2, we prepare input for language modeling
            input_text = f"Question: {question} Context: {context} Answer:"
            target_text = answer

            # Concatenate input and target for GPT-2
            full_text = input_text + " " + target_text

            input_ids = self.tokenizer.encode(
                full_text,
                max_length=self.max_length,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            ).squeeze()

            return {
                'input_ids': input_ids,
                'attention_mask': (input_ids != self.tokenizer.pad_token_id).long(),
                'labels': input_ids,
                'answer': answer
            }

        else:
            # For BERT and DistilBERT (extractive QA models)
            # Tokenize the input
            inputs = self.tokenizer.encode_plus(
                question,
                context,
                add_special_tokens=True,
                max_length=self.max_length,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )

            # Find the start and end positions of the answer in the tokenized input
            input_ids = inputs['input_ids'].squeeze()
            attention_mask = inputs['attention_mask'].squeeze()
            token_type_ids = inputs.get('token_type_ids', None)
            if token_type_ids is not None:
                token_type_ids = token_type_ids.squeeze()

            answer_tokens = self.tokenizer.encode(answer, add_special_tokens=False)
            start_position = None
            end_position = None

            for i in range(len(input_ids) - len(answer_tokens) + 1):
                if input_ids[i:i+len(answer_tokens)].tolist() == answer_tokens:
                    start_position = i
                    end_position = i + len(answer_tokens) - 1
                    break

            # If the answer is not found, use the CLS token position as a default
            if start_position is None:
                start_position = 0
                end_position = 0

            item = {
                'input_ids': input_ids,
                'attention_mask': attention_mask,
                'start_positions': torch.tensor(start_position),
                'end_positions': torch.tensor(end_position),
                'answer': answer
            }
            if token_type_ids is not None:
                item['token_type_ids'] = token_type_ids

            return item


In [5]:
# Split the data
train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(test_data, test_size=0.5, random_state=42)

print(f"Train samples: {len(train_data)}")
print(f"Validation samples: {len(val_data)}")
print(f"Test samples: {len(test_data)}")


Train samples: 5039
Validation samples: 1080
Test samples: 1080


In [6]:
!pip install sentencepiece





In [14]:
!pip install --upgrade pip

Collecting pip
  Downloading pip-24.2-py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.0.1
    Uninstalling pip-23.0.1:
      Successfully uninstalled pip-23.0.1
Successfully installed pip-24.2
[0m

In [16]:
!pip install --upgrade sentencepiece




[0m

In [9]:
!pip install --upgrade --no-cache-dir transformers sentencepiece


Collecting transformers
  Downloading transformers-4.46.0-py3-none-any.whl (10.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m67.6 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.45.1
    Uninstalling transformers-4.45.1:
      Successfully uninstalled transformers-4.45.1
Successfully installed transformers-4.46.0
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [10]:
import sentencepiece as spm
print(spm.__version__)


0.2.0


In [7]:
# Initialize tokenizer and model
t5_tokenizer = T5Tokenizer.from_pretrained('t5-base')
t5_model = T5ForConditionalGeneration.from_pretrained('t5-base').to(device)

# Prepare datasets and dataloaders
train_dataset_t5 = CoQADataset(train_data, t5_tokenizer, model_type='t5')
val_dataset_t5 = CoQADataset(val_data, t5_tokenizer, model_type='t5')
test_dataset_t5 = CoQADataset(test_data, t5_tokenizer, model_type='t5')

train_loader_t5 = DataLoader(train_dataset_t5, batch_size=4, shuffle=True)
val_loader_t5 = DataLoader(val_dataset_t5, batch_size=4)
test_loader_t5 = DataLoader(test_dataset_t5, batch_size=4)


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [8]:
# Training function
def train_t5(model, train_loader, optimizer, device):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc="Training T5")
    for batch in progress_bar:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

        progress_bar.set_postfix({'loss': loss.item()})

    return total_loss / len(train_loader)


In [9]:
# Validation function
def validate_t5(model, val_loader, device):
    model.eval()
    total_loss = 0
    progress_bar = tqdm(val_loader, desc="Validating T5")
    with torch.no_grad():
        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss
            total_loss += loss.item()

            progress_bar.set_postfix({'loss': loss.item()})

    return total_loss / len(val_loader)


In [10]:
!pip install ipywidgets




In [11]:
# Set optimizer
optimizer_t5 = AdamW(t5_model.parameters(), lr=3e-5)

# Training loop
num_epochs = 2
best_loss = float('inf')
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train_loss = train_t5(t5_model, train_loader_t5, optimizer_t5, device)
    val_loss = validate_t5(t5_model, val_loader_t5, device)
    print(f"Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")
    if val_loss < best_loss:
        best_loss = val_loss
        torch.save(t5_model.state_dict(), 't5_qa_model.pth')
        print("Model saved!")
    else:
        print("Validation Loss Increased. Model Not Saved.")
    print("*" * 50)




Epoch 1/2


Training T5: 100%|██████████| 1260/1260 [13:59<00:00,  1.50it/s, loss=0.0352] 
Validating T5: 100%|██████████| 270/270 [01:02<00:00,  4.34it/s, loss=0.0438] 


Train Loss: 0.4914, Validation Loss: 0.0385
Model saved!
**************************************************
Epoch 2/2


Training T5: 100%|██████████| 1260/1260 [14:04<00:00,  1.49it/s, loss=0.0359] 
Validating T5: 100%|██████████| 270/270 [01:02<00:00,  4.35it/s, loss=0.0427] 


Train Loss: 0.0355, Validation Loss: 0.0322
Model saved!
**************************************************


In [12]:
# Testing function
def test_t5(model, test_loader, tokenizer, device):
    model.eval()
    all_predictions = []
    all_answers = []
    progress_bar = tqdm(test_loader, desc="Testing T5")
    with torch.no_grad():
        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            answers = batch['answer']

            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=50
            )

            predictions = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outputs]
            all_predictions.extend(predictions)
            all_answers.extend(answers)

    bleu_score = calculate_bleu(all_predictions, all_answers)
    return bleu_score


In [14]:
# Create a simple QA bot
def qa_bot_t5(context, question):
    input_text = f"question: {question} context: {context}"
    input_ids = t5_tokenizer.encode(input_text, return_tensors='pt', max_length=512, truncation=True).to(device)

    with torch.no_grad():
        outputs = t5_model.generate(input_ids=input_ids, max_length=50)
        answer = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

# Example usage of the QA bot
context = test_data[0]['story']
question = test_data[0]['questions'][0]['input_text']
answer = qa_bot_t5(context, question)
print(f"Question: {question}")
print(f"Answer: {answer}")

Question: Who was a major influence on the theory on world travel?
Answer: Napoleon


In [15]:
# Initialize tokenizer and model
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
distilbert_model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased').to(device)

# Prepare datasets and dataloaders
train_dataset_distilbert = CoQADataset(train_data, distilbert_tokenizer, model_type='bert')
val_dataset_distilbert = CoQADataset(val_data, distilbert_tokenizer, model_type='bert')
test_dataset_distilbert = CoQADataset(test_data, distilbert_tokenizer, model_type='bert')

train_loader_distilbert = DataLoader(train_dataset_distilbert, batch_size=8, shuffle=True)
val_loader_distilbert = DataLoader(val_dataset_distilbert, batch_size=8)
test_loader_distilbert = DataLoader(test_dataset_distilbert, batch_size=8)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [16]:
# Training function
def train_distilbert(model, train_loader, optimizer, device):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc="Training DistilBERT")
    for batch in progress_bar:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        outputs = model(
            input_ids,
            attention_mask=attention_mask,
            start_positions=start_positions,
            end_positions=end_positions
        )
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

        progress_bar.set_postfix({'loss': loss.item()})

    return total_loss / len(train_loader)


In [17]:
# Validation function
def validate_distilbert(model, val_loader, device):
    model.eval()
    total_loss = 0
    progress_bar = tqdm(val_loader, desc="Validating DistilBERT")
    with torch.no_grad():
        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)

            outputs = model(
                input_ids,
                attention_mask=attention_mask,
                start_positions=start_positions,
                end_positions=end_positions
            )
            loss = outputs.loss
            total_loss += loss.item()

            progress_bar.set_postfix({'loss': loss.item()})

    return total_loss / len(val_loader)


In [18]:
# Set optimizer
optimizer_distilbert = AdamW(distilbert_model.parameters(), lr=5e-5)

# Training loop
num_epochs = 2
best_loss = float('inf')
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train_loss = train_distilbert(distilbert_model, train_loader_distilbert, optimizer_distilbert, device)
    val_loss = validate_distilbert(distilbert_model, val_loader_distilbert, device)
    print(f"Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")
    if val_loss < best_loss:
        best_loss = val_loss
        torch.save(distilbert_model.state_dict(), 'distilbert_qa_model.pth')
        print("Model saved!")
    else:
        print("Validation Loss Increased. Model Not Saved.")
    print("*" * 50)


Epoch 1/2


Training DistilBERT: 100%|██████████| 630/630 [05:06<00:00,  2.06it/s, loss=1.78] 
Validating DistilBERT: 100%|██████████| 135/135 [00:28<00:00,  4.71it/s, loss=2.16]


Train Loss: 2.7911, Validation Loss: 2.1208
Model saved!
**************************************************
Epoch 2/2


Training DistilBERT: 100%|██████████| 630/630 [05:07<00:00,  2.05it/s, loss=1.35] 
Validating DistilBERT: 100%|██████████| 135/135 [00:28<00:00,  4.70it/s, loss=1.91] 


Train Loss: 1.4349, Validation Loss: 2.0061
Model saved!
**************************************************


In [19]:
# Testing function
def test_distilbert(model, test_loader, tokenizer, device):
    model.eval()
    all_predictions = []
    all_answers = []
    progress_bar = tqdm(test_loader, desc="Testing DistilBERT")
    with torch.no_grad():
        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            answers = batch['answer']

            outputs = model(input_ids, attention_mask=attention_mask)
            start_scores = outputs.start_logits
            end_scores = outputs.end_logits

            for i in range(input_ids.shape[0]):
                start_index = torch.argmax(start_scores[i])
                end_index = torch.argmax(end_scores[i])
                prediction = tokenizer.decode(input_ids[i][start_index:end_index+1])
                all_predictions.append(prediction)
                all_answers.append(answers[i])

    bleu_score = calculate_bleu(all_predictions, all_answers)
    return bleu_score


In [None]:
# Create a simple QA bot
def qa_bot_distilbert(context, question):
    inputs = distilbert_tokenizer.encode_plus(question, context, return_tensors='pt', max_length=512, truncation=True)
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = distilbert_model(input_ids, attention_mask=attention_mask)
        start_scores = outputs.start_logits
        end_scores = outputs.end_logits

    start_index = torch.argmax(start_scores)
    end_index = torch.argmax(end_scores)
    answer = distilbert_tokenizer.decode(input_ids[0][start_index:end_index+1])
    return answer

# Example usage of the QA bot
context = test_data[0]['story']
question = test_data[0]['questions'][0]['input_text']
answer = qa_bot_distilbert(context, question)
print(f"Question: {question}")
print(f"Answer: {answer}")


In [21]:
# Initialize tokenizer and model
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token  # Add padding token
gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)

# Prepare datasets and dataloaders
train_dataset_gpt2 = CoQADataset(train_data, gpt2_tokenizer, model_type='gpt2')
val_dataset_gpt2 = CoQADataset(val_data, gpt2_tokenizer, model_type='gpt2')
test_dataset_gpt2 = CoQADataset(test_data, gpt2_tokenizer, model_type='gpt2')

train_loader_gpt2 = DataLoader(train_dataset_gpt2, batch_size=2, shuffle=True)
val_loader_gpt2 = DataLoader(val_dataset_gpt2, batch_size=2)
test_loader_gpt2 = DataLoader(test_dataset_gpt2, batch_size=2)


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [22]:
# Training function
def train_gpt2(model, train_loader, optimizer, device):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc="Training GPT-2")
    for batch in progress_bar:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(
            input_ids=input_ids,
            labels=labels
        )
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

        progress_bar.set_postfix({'loss': loss.item()})

    return total_loss / len(train_loader)


In [23]:
# Validation function
def validate_gpt2(model, val_loader, device):
    model.eval()
    total_loss = 0
    progress_bar = tqdm(val_loader, desc="Validating GPT-2")
    with torch.no_grad():
        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                labels=labels
            )
            loss = outputs.loss
            total_loss += loss.item()

            progress_bar.set_postfix({'loss': loss.item()})

    return total_loss / len(val_loader)


In [24]:
# Set optimizer
optimizer_gpt2 = AdamW(gpt2_model.parameters(), lr=5e-5)

# Training loop
num_epochs = 1  # GPT-2 is heavy; adjust as per resources
best_loss = float('inf')
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train_loss = train_gpt2(gpt2_model, train_loader_gpt2, optimizer_gpt2, device)
    val_loss = validate_gpt2(gpt2_model, val_loader_gpt2, device)
    print(f"Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")
    if val_loss < best_loss:
        best_loss = val_loss
        torch.save(gpt2_model.state_dict(), 'gpt2_qa_model.pth')
        print("Model saved!")
    else:
        print("Validation Loss Increased. Model Not Saved.")
    print("*" * 50)


Epoch 1/1


Training GPT-2: 100%|██████████| 2520/2520 [13:25<00:00,  3.13it/s, loss=0.963]
Validating GPT-2: 100%|██████████| 540/540 [00:49<00:00, 10.88it/s, loss=2.19]


Train Loss: 2.2776, Validation Loss: 2.1235
Model saved!
**************************************************


In [29]:
# Testing function
def test_gpt2(model, test_loader, tokenizer, device):
    model.eval()
    all_predictions = []
    all_answers = []
    progress_bar = tqdm(test_loader, desc="Testing GPT-2")
    with torch.no_grad():
        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            answers = batch['answer']

            outputs = model.generate(
                input_ids=input_ids,
                max_length=521,
                do_sample=True,
                top_k=50,
                top_p=0.95,
                num_return_sequences=1
            )

            for i in range(len(outputs)):
                generated = outputs[i][input_ids.size(1):]  # Skip the prompt part
                prediction = tokenizer.decode(generated, skip_special_tokens=True)
                all_predictions.append(prediction.strip())
                all_answers.append(answers[i])

    bleu_score = calculate_bleu(all_predictions, all_answers)
    return bleu_score


In [None]:
# Test the model
bleu_score = test_gpt2(gpt2_model, test_loader_gpt2, gpt2_tokenizer, device)
print(f"GPT-2 BLEU Score: {bleu_score:.4f}")


In [32]:
# Create a simple QA bot
def qa_bot_gpt2(context, question):
    input_text = f"Question: {question} Context: {context} Answer:"
    input_ids = gpt2_tokenizer.encode(input_text, return_tensors='pt', max_length=512, truncation=True).to(device)

    with torch.no_grad():
        outputs = gpt2_model.generate(
            input_ids=input_ids,
            max_length=410,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            num_return_sequences=1
        )
        generated = outputs[0][input_ids.size(1):]  # Skip the prompt part
        answer = gpt2_tokenizer.decode(generated, skip_special_tokens=True).strip()
    return answer

# Example usage of the QA bot
context = test_data[0]['story']
question = test_data[0]['questions'][0]['input_text']
answer = qa_bot_gpt2(context, question)
print(f"Question: {question}")
print(f"Answer: {answer}")


Question: Who was a major influence on the theory on world travel?
Answer: Henry Ford


## Gradio UI

In [2]:
!pip install gradio


Collecting gradio
  Downloading gradio-5.3.0-py3-none-any.whl.metadata (15 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting anyio<5.0,>=3.0 (from gradio)
  Downloading anyio-4.6.2.post1-py3-none-any.whl.metadata (4.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.3-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.4.2 (from gradio)
  Downloading gradio_client-1.4.2-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting huggingface-hub>=0.25.1 (from gradio)
  Downloading huggingface_hub-0.26.1-py3-none-any.whl.metadata (13 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.10.10-cp312-none-win_amd64.whl.metadata (51 kB)
     ---------------------------------------- 0.0/51.8 


[notice] A new release of pip is available: 24.1.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [12]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, DistilBertTokenizer, DistilBertForQuestionAnswering, T5ForConditionalGeneration,T5Tokenizer
import gradio as gr

In [4]:
# Load the GPT-2 model and tokenizer
gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2')  # Initialize GPT-2 model
gpt2_model.load_state_dict(torch.load(r'C:\Users\Suyash Tambe\Desktop\NLP Lab\gpt2_qa_model.pth'))  # Load trained weights
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')  # Default GPT-2 tokenizer

In [6]:
distilbert_model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')  # Initialize model
distilbert_model.load_state_dict(torch.load(r'C:\Users\Suyash Tambe\Desktop\NLP Lab\distilbert_qa_model.pth'))  # Load trained weights
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')  # Default tokenizer


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
  size (`size`):


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [13]:
# Load the T5 model and tokenizer
t5_model = T5ForConditionalGeneration.from_pretrained('t5-base')  # Initialize T5 model
t5_model.load_state_dict(torch.load(r'C:\Users\Suyash Tambe\Desktop\NLP Lab\t5_qa_model.pth'))  # Load trained weights
t5_tokenizer = T5Tokenizer.from_pretrained('t5-base')  # Default T5 tokenizer

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [14]:
# Inference functions for the models
def qa_bot_gpt2(context, question):
    input_text = f"Question: {question} Context: {context} Answer:"
    input_ids = gpt2_tokenizer.encode(input_text, return_tensors='pt', max_length=512, truncation=True).to('cpu')

    with torch.no_grad():
        outputs = gpt2_model.generate(
            input_ids=input_ids,
            max_length=410,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            num_return_sequences=1
        )
        generated = outputs[0][input_ids.size(1):]  # Skip the prompt part
        answer = gpt2_tokenizer.decode(generated, skip_special_tokens=True).strip()
    return answer

def qa_bot_distilbert(context, question):
    inputs = distilbert_tokenizer.encode_plus(question, context, return_tensors='pt')
    input_ids = inputs['input_ids'].to('cpu')
    attention_mask = inputs['attention_mask'].to('cpu')
    
    outputs = distilbert_model(input_ids, attention_mask=attention_mask)
    start_scores, end_scores = outputs.start_logits, outputs.end_logits

    # Get the most likely answer
    answer_start = torch.argmax(start_scores)
    answer_end = torch.argmax(end_scores) + 1
    answer = distilbert_tokenizer.convert_tokens_to_string(
        distilbert_tokenizer.convert_ids_to_tokens(input_ids[0][answer_start:answer_end])
    )
    return answer

def qa_bot_t5(context, question):
    input_text = f"question: {question} context: {context}"
    input_ids = t5_tokenizer.encode(input_text, return_tensors="pt").to('cpu')

    with torch.no_grad():
        outputs = t5_model.generate(input_ids=input_ids, max_length=512, num_beams=4, early_stopping=True)
        answer = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

In [15]:
# Main function to handle model selection and question answering
def generate_answer(model_name, article, question):
    if model_name == "GPT-2":
        return qa_bot_gpt2(article, question)
    elif model_name == "DistilBERT":
        return qa_bot_distilbert(article, question)
    elif model_name == "T5":
        return qa_bot_t5(article, question)
    else:
        return "Model not recognized."

In [16]:
# Create Gradio Interface
def create_interface():
    # Dropdown for model selection
    model_dropdown = gr.Dropdown(choices=["GPT-2", "DistilBERT", "T5"], label="Choose a Model")

    # Text boxes for article and question inputs
    article_input = gr.Textbox(lines=10, placeholder="Enter article here", label="Article")
    question_input = gr.Textbox(lines=2, placeholder="Enter your question", label="Question")
    
    # Output text box for the answer
    answer_output = gr.Textbox(label="Answer")
    
    # Create the Gradio interface
    interface = gr.Interface(
        fn=generate_answer,
        inputs=[model_dropdown, article_input, question_input],
        outputs=answer_output,
        title="Model-based Q&A",
        description="Select a model (GPT-2, DistilBERT, or T5), input an article, ask a question, and receive an answer.",
    )
    
    return interface

In [18]:
# Launch the UI
if __name__ == "__main__":
    ui = create_interface()
    ui.launch()

* Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.
