https://github.com/sajjjadayobi/PersianQA

https://huggingface.co/docs/transformers/main/tasks/document_question_answering

https://www.google.com/search?q=question+and+answer+task+rnn+pytorch&oq=question+and+answer+task+rnn+pytorch&aqs=chrome..69i57.182j0j1&sourceid=chrome&ie=UTF-8

https://pytorch.org/tutorials/beginner/chatbot_tutorial.html

In [1]:
import pandas as pd

In [2]:
import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import corpus_bleu

def calculate_accuracy(predictions, ground_truth):
    correct_count = 0
    total_count = len(predictions)
    
    for pred, gt in zip(predictions, ground_truth):
        if pred == gt:
            correct_count += 1
    
    accuracy = correct_count / total_count
    return accuracy

def calculate_bleu_score(predictions, ground_truth):
    # Convert predictions and ground truth to list of tokens
    predictions = [prediction.split() for prediction in predictions]
    ground_truth = [[answer.split()] for answer in ground_truth]
    
    # Calculate BLEU score
    bleu_score = corpus_bleu(ground_truth, predictions)
    return bleu_score

# Example usage
predictions = ["The answer is A.", "I think it's B.", "The correct answer is C."]
ground_truth = ["The answer is A.", "The correct answer is B.", "The answer is C."]

# Calculate accuracy
accuracy = calculate_accuracy(predictions, ground_truth)
print("Accuracy:", accuracy)

# Calculate BLEU score
bleu_score = calculate_bleu_score(predictions, ground_truth)
print("BLEU Score:", bleu_score)

Accuracy: 0.3333333333333333
BLEU Score: 0.43884190960972586


# Data

In [3]:
df = pd.read_csv('./data/Load/QA.csv')

In [7]:
from hazm import word_tokenize, Normalizer
normalizer = Normalizer()

tokenized_texts = [word_tokenize(normalizer.normalize(text)) for text in pd.concat([df['question'], df['answer']])]

vocab_index = {'' : 0}
index_vocab = {0 : ''}
index = 1

for text_tokens in tokenized_texts:
    for token in text_tokens:
        token = normalizer.normalize(token)
        if token not in vocab_index:
            vocab_index[token] = index
            index_vocab[index] = token
            index += 1

In [9]:
MAX_LENGTH = len(max(tokenized_texts, key=len))

In [10]:
df['answer_token'] = [[vocab_index[token] for token in word_tokenize(normalizer.normalize(text))] for text in df['answer']]
df['question_token'] = [[vocab_index[token] for token in word_tokenize(normalizer.normalize(text))] for text in df['question']]

In [11]:
print("max len question:", df['question_token'].str.len().max())
print("max len answer:", df['answer_token'].str.len().max())

max len question: 484
max len answer: 2020


In [12]:
print(df['answer'][1])
output_tokens = [index_vocab[index] for index in df['answer_token'][1]]
print(" ".join(output_tokens))

 احتیاط ترک تقلید ابتدایی میت است اما بقای بر تقلید میت در مسائلی که عمل کرده یا اخذ فتوا برای عمل نموده جایز است.
احتیاط ترک تقلید ابتدایی میت است اما بقای بر تقلید میت در مسائلی که عمل کرده یا اخذ فتوا برای عمل نموده جایز است .


# models

In [8]:
import torch
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split

from torch.nn.utils.rnn import pad_sequence
# Step 4: Padding
padded_questions = pad_sequence([torch.tensor(encoded) for encoded in df['question_token']], batch_first=True, padding_value=0)
padded_answers = pad_sequence([torch.tensor(encoded) for encoded in df['answer_token']], batch_first=True, padding_value=0)

# Pad the sequences to the maximum length
padded_questions = torch.nn.functional.pad(padded_questions, (0, max_length - padded_questions.size(1)))
padded_answers = torch.nn.functional.pad(padded_answers, (0, max_length - padded_answers.size(1)))


In [9]:
# Step 6: Train-test split
train_questions, test_questions, train_answers, test_answers = train_test_split(
    padded_questions, padded_answers, test_size=0.2
)

# Print some information
print("Train data size:", len(train_questions))
print("Test data size:", len(test_questions))

Train data size: 1914
Test data size: 479


In [80]:
import torch
import torch.nn as nn

class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNNModel, self).__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input_seq):
        hidden = torch.zeros(1, self.hidden_size).to(torch.float32)
        input_seq = input_seq.to(torch.float32)
        output, _ = self.rnn(input_seq, hidden)
        output = self.fc(output[0])  # Select the last time step's output
        return output


In [95]:
input_size = 2000
hidden_size = 1000
output_size = 2000
learning_rate = 0.01
num_epochs = 5

model = RNNModel(input_size, hidden_size, output_size)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    for input, label in zip(train_questions, train_answers):
        optimizer.zero_grad()
        input = input.unsqueeze(0)
        output = model(input)
        print(label)
        print(output)
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()

with torch.no_grad():
    correct = 0
    total = 0
    for input, label in zip(test_questions, test_answers):
        output = model(input)
        _, predicted = torch.max(output.data, 1)
        total += label.size(0)
        correct += (predicted == label).sum().item()

    accuracy = 100 * correct / total
    print('Test Accuracy: {:.2f}%'.format(accuracy))


tensor([8878,   52, 4533,  ...,    0,    0,    0])
tensor([ 0.2720,  0.6777,  0.6288,  ..., -1.0869, -0.6478,  0.4955],
       grad_fn=<AddBackward0>)


RuntimeError: Expected floating point type for target with class probabilities, got Long

In [94]:
output

tensor([ 0.4676,  0.0028, -0.7883,  ..., -0.4431, -0.5125, -0.0923],
       grad_fn=<AddBackward0>)