# Natural Language Processing 2024 – Final Project

Add the names and ID of the submitting students here:

1.Gal Ein Dor 209070671

2.David Koplev 208870279

3.Rotem Kashani 209073352

Our project aims to create a smart system that can answer questions or provide relevant information from written text, we're going to use two different methods to do this: one by training a seq2seq LSTM model using a dataset found called AG News Classification Dataset, the dataset is about 4 different types of topics:
1-World, 2-Sports, 3-Business, 4-Sci/Tech.

And we aim to accurately answer questions about those topics using the model.
The second method is by fine-tuning ChatGPT using the dataset , we believe that this method will provide better accuracy and we wish to be able to use OpenAI API models to accurately answer questions.


---

# Step 1: Data Collection and Preprocessing
Gather a diverse dataset of texts/article and preprocess the data by cleaning, tokenizing, and annotating questions and answers




##1.1 Import Required Libraries

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

##1.2 Load the Dataset

In [None]:
test_df = pd.read_csv('test.csv', nrows=100)
train_df = pd.read_csv('train.csv', nrows=1000)

##1.3 DataProccessor Class

In [None]:
class DataProcessor:
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def remove_stopwords(self, column, stop_words):
        self.dataframe[column] = self.dataframe[column].apply(lambda x: [word for word in x if word not in stop_words])

    def lowercase(self, column):
        self.dataframe[column] = self.dataframe[column].apply(lambda x: x.lower())

    def remove_special_characters(self, column):
        self.dataframe[column] = self.dataframe[column].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))

    def tokenize(self, column):
        self.dataframe[column] = self.dataframe[column].apply(word_tokenize)

    def lemmatize(self, column):
        from nltk.stem import WordNetLemmatizer
        lemmatizer = WordNetLemmatizer()
        self.dataframe[column] = self.dataframe[column].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

    def preprocess(self, column, stop_words):
        self.lowercase(column)
        self.remove_special_characters(column)
        self.tokenize(column)
        self.remove_stopwords(column, stop_words)
        self.lemmatize(column)

    def get_processed_dataframe(self):
        return self.dataframe

##1.4 Get proccessed data


In [None]:
stop_words = set(stopwords.words('english'))

processor = DataProcessor(train_df)
processor.preprocess('Title', stop_words)
processor.preprocess('Description', stop_words)

processor = DataProcessor(test_df)
processor.preprocess('Title', stop_words)
processor.preprocess('Description', stop_words)

---

# Step 2: Seq2Seq LSTM Model
Train a seq2seq LSTM model to map questions to answers for Q&A and experiment with architectures and hyperparameters for optimal performance.

##2.1 Import Required Libraries

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import torch.nn.functional as F
import re
import nltk
from tqdm import tqdm

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

##2.2 Building Vocabulary

In [None]:
def build_vocab(data):
    vocab = set()
    for _, row in data.iterrows():
        title_tokens = row['Title']
        desc_tokens = row['Description']
        for token in title_tokens:
            vocab.add(token)
        for token in desc_tokens:
            vocab.add(token)
    return vocab

# Build vocabulary
vocab = build_vocab(train_df)

##2.3 Mapping tokens to indices

In [None]:
# Map tokens to indices
word_to_index = {word: idx + 1 for idx, word in enumerate(vocab)}

##2.4 Adding a special token for unknown words

In [None]:
# Add a special token for unknown words
word_to_index['<UNK>'] = len(word_to_index)
word_to_index['<SOS>'] = len(word_to_index)

##2.5 Preprocessing Data


In [None]:
def text_to_indices(tokens):
    return [word_to_index.get(token, word_to_index['<UNK>']) for token in tokens]

# Apply preprocessing to data
train_df['Title'] = train_df['Title'].apply(text_to_indices)
train_df['Description'] = train_df['Description'].apply(text_to_indices)
test_df['Title'] = test_df['Title'].apply(text_to_indices)
test_df['Description'] = test_df['Description'].apply(text_to_indices)

##2.6 Loading Data

In [None]:
class QADataset(Dataset):
    def __init__(self, data):
        self.data = data

        # Determine maximum sequence length dynamically
        self.max_seq_length = max(max(len(row['Title']), len(row['Description'])) for _, row in self.data.iterrows())

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_seq = torch.tensor(self.data.iloc[idx]['Title'], dtype=torch.long)
        target_seq = torch.tensor(self.data.iloc[idx]['Description'], dtype=torch.long)

        # Pad sequences to the maximum sequence length
        input_seq = F.pad(input_seq, (0, self.max_seq_length - len(input_seq)), value=0)
        target_seq = F.pad(target_seq, (0, self.max_seq_length - len(target_seq)), value=0)

        return input_seq, target_seq

# Create DataLoader objects
batch_size = 1
train_dataset = QADataset(train_df)
test_dataset = QADataset(test_df)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

##2.7 Define Seq2Seq Model

In [None]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.gru(embedded)
        return output, hidden

class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=2)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input, hidden):
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.gru(embedded, hidden)
        output = self.softmax(self.out(output))
        return output, hidden

##2.8 Training Loop

In [None]:
learning_rate = 0.001
num_epochs = 10
size = len(vocab) + 2
hidden_size = 256

encoder = EncoderRNN(size, hidden_size)
decoder = DecoderRNN(hidden_size, size)

optimizer = optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

for epoch in range(num_epochs):
    encoder.train()
    decoder.train()
    total_loss = 0

    # Use tqdm for progress visualization
    for input_seq, target_seq in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        optimizer.zero_grad()

        if input_seq.size(1) == 0:
            continue

        encoder_outputs, encoder_hidden = encoder(input_seq)
        decoder_hidden = encoder_hidden.squeeze(0).unsqueeze(0)  # Ensure correct dimensionality
        decoder_input = torch.tensor([[word_to_index['<SOS>']] * input_seq.size(0)], dtype=torch.long)
        decoder_hidden = decoder_hidden.view(1, 1, -1)[:, :, :256]

        loss = 0
        for di in range(30):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            loss += criterion(decoder_output.squeeze(1), target_seq[:, di])
            decoder_input = target_seq[:, di].unsqueeze(1)

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Calculate average loss per batch
    average_loss = total_loss / len(train_loader)

    print(f"Epoch {epoch+1}, Loss: {average_loss}")

Epoch 1/10: 100%|██████████| 1000/1000 [05:13<00:00,  3.19it/s]


Epoch 1, Loss: 169.93704300308227


Epoch 2/10: 100%|██████████| 1000/1000 [05:15<00:00,  3.17it/s]


Epoch 2, Loss: 127.122130859375


Epoch 3/10: 100%|██████████| 1000/1000 [05:30<00:00,  3.02it/s]


Epoch 3, Loss: 83.85162349510193


Epoch 4/10: 100%|██████████| 1000/1000 [05:28<00:00,  3.04it/s]


Epoch 4, Loss: 49.366813178539275


Epoch 5/10: 100%|██████████| 1000/1000 [05:29<00:00,  3.04it/s]


Epoch 5, Loss: 27.466068338871


Epoch 6/10: 100%|██████████| 1000/1000 [05:21<00:00,  3.11it/s]


Epoch 6, Loss: 16.358784922599792


Epoch 7/10: 100%|██████████| 1000/1000 [05:27<00:00,  3.05it/s]


Epoch 7, Loss: 11.0859784886837


Epoch 8/10: 100%|██████████| 1000/1000 [05:24<00:00,  3.08it/s]


Epoch 8, Loss: 8.33750065869093


Epoch 9/10: 100%|██████████| 1000/1000 [05:27<00:00,  3.05it/s]


Epoch 9, Loss: 6.8493596145808695


Epoch 10/10: 100%|██████████| 1000/1000 [05:30<00:00,  3.03it/s]

Epoch 10, Loss: 5.596354971945286





##2.9 Evaluation

In [None]:
# Define the end-of-sequence token
EOS_token = 0  # Assign a unique integer value

def tensorFromSentence(lang, sentence):
    indexes = [lang.word2index[word] for word in sentence.split()]
    indexes.append(EOS_token)  # Add end-of-sequence token
    return torch.tensor(indexes, dtype=torch.long).view(-1, 1)

def evaluate(encoder, decoder, sentence, input_lang, output_lang):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden)

        _, topi = decoder_outputs.topk(1)
        decoded_ids = topi.squeeze()

        decoded_words = []
        for idx in decoded_ids:
            if idx.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            decoded_words.append(output_lang.index2word[idx.item()])
    return decoded_words, decoder_attn

encoder.eval()
decoder.eval()

DecoderRNN(
  (embedding): Embedding(7260, 256)
  (gru): GRU(256, 256, batch_first=True)
  (out): Linear(in_features=256, out_features=7260, bias=True)
  (softmax): LogSoftmax(dim=2)
  (dropout): Dropout(p=0.1, inplace=False)
)

##2.10 Regularization Model

In [None]:
# Update the model architecture with dropout layers
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.gru(embedded)
        return output, hidden

class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=2)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input, hidden):
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.gru(embedded, hidden)
        output = self.softmax(self.out(output))
        return output, hidden

---

# Step 3: Fine-tuning GPT2 for Q&A
Fine-tune GPT2 on the dataset for Q&A and implement mechanisms to quote relevant passages from the dataset.

##3.1 Install Dependencies

In [None]:
!pip install pytorch-transformers
!pip install accelerate
!pip install pytorch_transformers

Collecting pytorch-transformers
  Downloading pytorch_transformers-1.2.0-py3-none-any.whl (176 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/176.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.4/176.4 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Collecting boto3 (from pytorch-transformers)
  Downloading boto3-1.34.61-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
Collecting sacremoses (from pytorch-transformers)
  Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
Collecting botocore<1.35.0,>=1.34.61 (from boto3->pytorch-transformers)
  Downloading botocore-1.34.61-py3-none-any.whl (12.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m 

##3.2 Import the necessary libraries

In [None]:
import torch
from pytorch_transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm

##3.3 Load GPT2 Model

In [None]:
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.eval()

100%|██████████| 665/665 [00:00<00:00, 2086172.15B/s]
100%|██████████| 548118077/548118077 [00:16<00:00, 33744400.92B/s]


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

##3.4 Preprocess the Dataset

In [None]:
import torch

# Combine Title and Description columns to form the context for the model
train_df['context'] = train_df['Title'].astype(str) +' ' + train_df['Description'].astype(str)
test_df['context'] = test_df['Title'].astype(str) + ' ' + test_df['Description'].astype(str)

class QADataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=512):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        context = self.data.iloc[idx]['context']

          # Tokenize and convert to input IDs
        inputs = self.tokenizer.encode(context)

      # Define maximum sequence length
        max_length = 512

      # Pad or trunc ate the inputs to the maximum length
        inputs = inputs[:max_length] if len(inputs) > max_length else inputs + [0] * (max_length - len(inputs))

          # Convert input IDs to tensor
        input_ids = torch.tensor(inputs, dtype=torch.long)
        attention_mask = torch.ones_like(input_ids)  # Assuming no padding token is used


        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': input_ids.clone()  # Labels are the same as inputs for language modeling task
        }



# Initialize the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Create train and test datasets
train_dataset = QADataset(train_df, tokenizer)
test_dataset = QADataset(test_df, tokenizer)

# Define batch size and create data loaders
batch_size = 4
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


100%|██████████| 1042301/1042301 [00:00<00:00, 3425109.26B/s]
100%|██████████| 456318/456318 [00:00<00:00, 1501495.20B/s]


##3.5 Fine-tuning

In [None]:
# Define a function for fine-tuning
def fine_tune_model(model, train_loader, test_loader, optimizer, scheduler, device, epochs=3):
    model.train()
    for epoch in range(epochs):
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, labels=labels)
            loss = outputs[0]

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

# Initialize the model
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Initialize optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader)*3)

# Fine-tune the model
fine_tune_model(model, train_loader, test_loader, optimizer, scheduler, device)



Epoch 1/3, Loss: 0.31816565990448
Epoch 2/3, Loss: 0.3566182553768158
Epoch 3/3, Loss: 0.3496329188346863


##3.6 Quoting Mechanism

In [None]:
# Function to quote relevant passages from the text based on a query
def quote_relevant_passage(query, text):
    sentences = text.split('.')
    for sentence in sentences:
        if isinstance(query, str) and query in sentence:
            return f'"{sentence.strip()}."'
    return None


# Apply the quoting mechanism to the train_df
train_df['quoted_passage'] = train_df.apply(lambda row: quote_relevant_passage(row['Description'], row['context']), axis=1)

# Apply the quoting mechanism to the test_df
test_df['quoted_passage'] = test_df.apply(lambda row: quote_relevant_passage(row['Description'], row['context']), axis=1)

---

# Step 4: Integration and Evaluation
Compare the seq2seq LSTM model and ChatGPT results and evaluate the systems performances using accuracy metrics.


## 4.1 Import the necessary libraries

In [None]:
# Importing necessary libraries
import numpy as np
from sklearn.metrics import accuracy_score
import torch
import torch.nn as nn

##4.2 Define Evaluation Metrics

In [None]:
# Define evaluation function to calculate accuracy
def calculate_accuracy(predictions, targets):
    return accuracy_score(predictions, targets)

# Define evaluation function for Seq2Seq LSTM model
def evaluate_lstm_model(model, test_loader):
    model.eval()
    predictions = []
    targets = []
    for batch in test_loader:
        inputs, targets_batch = batch['input_ids'], batch['labels']
        outputs = model(inputs)

        # Ensure predicted_classes is a 1-dimensional array before extending predictions
        predicted_classes = torch.argmax(outputs).cpu().numpy()
        if predicted_classes.ndim == 1:
            predictions.extend(predicted_classes.tolist())
            targets.extend(targets_batch.cpu().numpy().tolist())

    return predictions, targets



# Define evaluation function for ChatGPT
def evaluate_chatgpt(model, tokenizer, test_loader, device):
    model.eval()
    predictions = []
    targets = []
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids)
            logits = outputs[0]
            predicted_ids = torch.argmax(logits, dim=-1)
            predictions.extend(predicted_ids.flatten().cpu().numpy())
            targets.extend(labels.flatten().cpu().numpy())
    return predictions, targets



##4.3 Evaluate Seq2Seq LSTM Model

In [None]:
# Define your LSTM model class
class Seq2SeqLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(Seq2SeqLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(input_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
      # Convert the input tensor x to the desired data type (e.g., torch.float32)
      x = x.to(torch.float32)

      # Pass the converted input tensor to the LSTM module
      lstm_out, _ = self.lstm(x)
      output = self.fc(lstm_out[-1])
      return output

# Instantiate your LSTM model
input_dim = 10 # Example input dimension
hidden_dim = 20 # Example hidden dimension
output_dim = 2 # Example output dimension
lstm_model = Seq2SeqLSTM(input_dim, hidden_dim, output_dim)

# Assuming you have your LSTM model and test_loader ready
lstm_predictions, lstm_targets = evaluate_lstm_model(lstm_model, test_loader)

# Assuming you have your LSTM model and test_loader ready
lstm_accuracy = calculate_accuracy(lstm_predictions, lstm_targets)
print(f"Accuracy of Seq2Seq LSTM Model: {lstm_accuracy}")


Accuracy of Seq2Seq LSTM Model: nan


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


##4.4 Evaluate ChatGPT

In [None]:
chatgpt_predictions, chatgpt_targets = evaluate_chatgpt(model, tokenizer, test_loader, device)
chatgpt_accuracy = calculate_accuracy(chatgpt_predictions, chatgpt_targets)
print(f"Accuracy of ChatGPT: {chatgpt_accuracy}")

Accuracy of ChatGPT: 0.7953125


##4.5 Compare Results

In [None]:
# Comparing results
print("Comparison of Results:")
print(f"Seq2Seq LSTM Model Accuracy: {lstm_accuracy}")
print(f"ChatGPT Accuracy: {chatgpt_accuracy}")

Comparison of Results:
Seq2Seq LSTM Model Accuracy: nan
ChatGPT Accuracy: 0.7953125
