In [13]:
!pip install datasets



In [14]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import load_dataset, DatasetDict

In [15]:
# Load the tokenizer and the seq2seq model from the same checkpoint name.
model_name = "t5-small" 
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [16]:
# Load the "squad" dataset; later cells read its "train"/"validation" splits
# and the "context"/"question" fields of each example.
dataset = load_dataset("squad")

In [17]:
def preprocess_data(examples):
    """Tokenize a batch of examples for the question-generation task.

    Each context is prefixed with "generate question: " and becomes the
    encoder input; the matching question becomes the decoder target.

    Args:
        examples: Batch mapping with parallel "context" and "question" lists
            (this function is applied with ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the inputs with an added "labels" entry
        holding the target token ids, where padding positions are -100.
    """
    inputs = ["generate question: " + context for context in examples["context"]]
    targets = list(examples["question"])
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    target_ids = tokenizer(targets, max_length=128, truncation=True, padding="max_length").input_ids

    # Targets are padded to a fixed length. Padding positions must not count
    # towards the loss, so they are replaced with -100 (the index the
    # cross-entropy loss ignores); otherwise the model is trained to emit padding.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token if token != pad_id else -100 for token in row] for row in target_ids
    ]
    return model_inputs

In [18]:
# Apply preprocess_data to the dataset; batched=True means the function
# receives lists of examples rather than one example at a time.
tokenized_datasets = dataset.map(preprocess_data, batched=True)

In [19]:
# Keep only the first half of each split (rows 0 .. size-1, in stored order).
train_size = len(tokenized_datasets["train"]) // 2
validation_size = len(tokenized_datasets["validation"]) // 2

train_dataset = tokenized_datasets["train"].select(range(train_size))
validation_dataset = tokenized_datasets["validation"].select(range(validation_size))


In [20]:


# Configuration for the fine-tuning run, grouped by concern.
training_args = TrainingArguments(
    # Output locations and reporting.
    output_dir="./results",
    logging_dir="./logs",
    report_to="none",
    logging_steps=50,
    # Optimisation.
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=5,
    # Evaluation and checkpointing both run once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # NOTE(review): save_steps presumably has no effect while save_strategy
    # is "epoch" -- confirm against the TrainingArguments documentation.
    save_steps=10_000,
    save_total_limit=2,
    # Best-model selection is keyed on the evaluation loss.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
)

# Run training; the return value is kept for later inspection.
train_output = trainer.train()


  trainer = Trainer(
  0%|          | 0/27375 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
  0%|          | 2/27375 [00:15<60:02:46,  7.90s/it]

KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Log entries recorded by the trainer, one row per entry.
logs = trainer.state.log_history

df_logs = pd.DataFrame(logs)


def _logged_metric(frame, column):
    """Return (steps, values) for the rows in which `column` was recorded.

    Both results are empty when the column is absent (for example when
    training stopped before the metric was ever logged).
    """
    if column not in frame.columns:
        return [], []
    rows = frame[frame[column].notnull()]
    return rows["step"], rows[column]


# Training and validation losses are logged at different steps, so each
# series is paired with its own x-axis instead of sharing the training steps.
steps, train_loss = _logged_metric(df_logs, "loss")
eval_steps, eval_loss = _logged_metric(df_logs, "eval_loss")

plt.figure(figsize=(10, 5))
plt.plot(steps, train_loss, label="Training Loss", color="blue")
# Markers keep sparse validation points visible even when there is only one.
plt.plot(eval_steps, eval_loss, label="Validation Loss", color="orange", marker="o")
plt.title("Training vs Validation Loss")
plt.xlabel("Steps")
plt.ylabel("Loss")
plt.legend()
plt.show()

if "eval_accuracy" in df_logs.columns:
    accuracy_steps, eval_accuracy = _logged_metric(df_logs, "eval_accuracy")
    plt.figure(figsize=(10, 5))
    plt.plot(accuracy_steps, eval_accuracy, label="Validation Accuracy", color="green", marker="o")
    plt.title("Validation Accuracy Over Steps")
    plt.xlabel("Steps")
    plt.ylabel("Accuracy")
    plt.legend()
    plt.show()


In [1]:
# Persist the model and tokenizer to ./t5-qna; later cells reload them from
# that directory. This needs `model` and `tokenizer` to still exist in the
# kernel -- the recorded NameError shows this cell being run without them.
model.save_pretrained("./t5-qna")
tokenizer.save_pretrained("./t5-qna")


NameError: name 'model' is not defined

In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the model and tokenizer from the "t5-qna" path.
model = T5ForConditionalGeneration.from_pretrained("t5-qna")
tokenizer = T5Tokenizer.from_pretrained("t5-qna")

context = "Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi's Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the 'golden anniversary' with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as Super Bowl L, so that the logo could prominently feature the Arabic numerals 50."

# Generation settings shared by both passes below.
beam_search_options = {"max_length": 50, "num_beams": 4, "early_stopping": True}

# Pass 1: produce a question from the context alone.
input_question = f"generate question: {context}"
input_ids_question = tokenizer.encode(input_question, return_tensors="pt")
outputs_question = model.generate(input_ids_question, **beam_search_options)
question = tokenizer.decode(outputs_question[0], skip_special_tokens=True)

# Pass 2: feed the generated question back in together with the context.
input_answer = f"answer the question: {question} context: {context}"
input_ids_answer = tokenizer.encode(input_answer, return_tensors="pt")
outputs_answer = model.generate(input_ids_answer, **beam_search_options)
answer = tokenizer.decode(outputs_answer[0], skip_special_tokens=True)

print(f"Generated Question: {question}")
print(f"Generated Answer: {answer}")


  from .autonotebook import tqdm as notebook_tqdm


Generated Question: What was the name of each Super Bowl game?
Generated Answer: Super Bowl 50


In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the model and tokenizer from the ./t5-qna directory.
model = T5ForConditionalGeneration.from_pretrained("./t5-qna")
tokenizer = T5Tokenizer.from_pretrained("./t5-qna")

def generate_questions(context, model, tokenizer):
    """Generate a single question for `context` with beam search.

    The context is prefixed with "generate question: ", encoded, passed to
    ``model.generate`` and the first returned sequence is decoded.

    NOTE: a later cell defines another function with this same name; once
    that cell has run, it replaces this definition.

    Args:
        context: Passage text to ask a question about.
        model: Object exposing ``generate``.
        tokenizer: Object exposing ``encode`` and ``decode``.

    Returns:
        The decoded text of the first generated sequence.
    """
    prompt = f"generate question: {context}"
    encoded_prompt = tokenizer.encode(prompt, return_tensors="pt")
    candidates = model.generate(
        encoded_prompt,
        max_length=50,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(candidates[0], skip_special_tokens=True)

In [4]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
import string
import re
from PyPDF2 import PdfReader



In [5]:
# Load the model and tokenizer from ./t5-qna; `predict` below reads these
# two module-level names directly.
# NOTE(review): an earlier cell performs the same load -- presumably one of
# the two can be dropped; confirm before removing.
model = T5ForConditionalGeneration.from_pretrained("./t5-qna")
tokenizer = T5Tokenizer.from_pretrained("./t5-qna")

In [6]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Passed straight to ``PdfReader``.

    Returns:
        One string with the pages' text joined without any separator. Pages
        that yield no text contribute nothing.
    """
    pdf_reader = PdfReader(pdf_path)
    # `or ""` guards against a page whose extraction yields None, which would
    # otherwise raise when concatenated; join avoids rebuilding the string
    # once per page.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)

In [8]:
def generate_questions(chunk, model, tokenizer):
    """Generate one question for a chunk of text with beam search.

    This definition replaces the earlier `generate_questions` in this
    notebook; unlike that one it truncates over-long input.

    Args:
        chunk: Text to ask a question about.
        model: Object exposing ``generate``.
        tokenizer: Callable returning a mapping with "input_ids" (and
            normally "attention_mask"); also exposes ``decode``.

    Returns:
        The decoded text of the first generated sequence.
    """
    input_text = f"generate question: {chunk}"

    inputs = tokenizer(input_text, return_tensors='pt', padding=True, truncation=True)

    # No `temperature` here: it only applies when sampling is enabled, and
    # this call uses plain beam search, so the value had no effect. The
    # attention mask is forwarded so generate() does not have to infer it.
    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs.get('attention_mask'),
        max_length=150,
        num_beams=5,
        early_stopping=True,
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [9]:
def predict(context, query):
    """Answer `query` from `context` using the module-level model and tokenizer.

    Args:
        context: Passage the answer should come from.
        query: Question text.

    Returns:
        The decoded text of the first generated sequence, or a fixed
        fallback message when the output merely repeats the query.
    """
    input_text = f"question: {query} context: {context}"

    inputs = tokenizer(input_text, return_tensors='pt', padding=True, truncation=True)

    # No `temperature` here: it only applies when sampling is enabled, and
    # this call uses plain beam search, so the value had no effect. The
    # attention mask is forwarded so generate() does not have to infer it.
    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs.get('attention_mask'),
        max_length=150,
        num_beams=8,
        early_stopping=True,
    )

    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Echoing the question back is treated as "no answer found".
    if answer == query:
        answer = "The answer is unclear, please try again."

    return answer

In [10]:
def normalize_text(s):
    """Canonicalise text for answer comparison.

    Steps, in order: lowercase, delete ASCII punctuation, replace the
    standalone words "a", "an" and "the" with a space, then collapse all
    whitespace runs to single spaces and trim the ends.
    """
    lowered = s.lower()
    without_punctuation = lowered.translate(str.maketrans("", "", string.punctuation))
    without_articles = re.sub(r"\b(a|an|the)\b", " ", without_punctuation, flags=re.UNICODE)
    return " ".join(without_articles.split())

In [11]:
def compute_exact_match(prediction, truth):
    """Return 1 when both strings are equal after `normalize_text`, else 0."""
    is_match = normalize_text(prediction) == normalize_text(truth)
    return 1 if is_match else 0

In [13]:
def compute_f1(prediction, truth):
    """Token-level F1 between a predicted and a reference answer.

    Both strings are normalised with `normalize_text` and split on
    whitespace. Overlap is counted per token occurrence, so repeated words
    are matched as many times as they appear in both strings.

    Args:
        prediction: Predicted answer text.
        truth: Reference answer text.

    Returns:
        1 or 0 when either side has no tokens (1 only if both are empty),
        0 when nothing overlaps, otherwise the harmonic mean of precision
        and recall as a float.
    """
    from collections import Counter

    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()

    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)

    # Multiset intersection: a plain set would count a repeated token once
    # while the denominators count every occurrence, which deflates the
    # score (identical answers containing a repeated word scored below 1).
    overlap = sum((Counter(pred_tokens) & Counter(truth_tokens)).values())

    if overlap == 0:
        return 0

    prec = overlap / len(pred_tokens)
    rec = overlap / len(truth_tokens)
    return 2 * (prec * rec) / (prec + rec)

In [14]:
def process_pdf_and_generate_questions_with_context(pdf_path, model, tokenizer, max_context_length=1024):
    """Build question/answer pairs from consecutive slices of a PDF's text.

    The extracted text is cut into slices of `max_context_length` characters
    (the last one may be shorter). For every slice a question is produced by
    `generate_questions` and then answered by `predict`.

    Args:
        pdf_path: Forwarded to `extract_text_from_pdf`.
        model: Forwarded to `generate_questions`.
        tokenizer: Forwarded to `generate_questions`.
        max_context_length: Slice size, measured in characters.

    Returns:
        A list of dicts with "context", "question" and "answer" keys, one
        per slice, in document order.
    """
    document_text = extract_text_from_pdf(pdf_path)

    qa_pairs = []
    for start in range(0, len(document_text), max_context_length):
        window = document_text[start:start + max_context_length]
        generated_question = generate_questions(window, model, tokenizer)
        # predict() takes no model/tokenizer arguments, so the ones received
        # here are only used for question generation.
        generated_answer = predict(window, generated_question)
        qa_pairs.append(
            {"context": window, "question": generated_question, "answer": generated_answer}
        )

    return qa_pairs

In [15]:
pdf_path = "technology_paragraphs_removed.pdf"

qa_pairs = process_pdf_and_generate_questions_with_context(pdf_path, model, tokenizer)

# Print every generated pair, numbered from 1, with a blank line after each.
for number, pair in enumerate(qa_pairs, start=1):
    print(
        f"Question {number}: {pair['question']}",
        f"Context: {pair['context']}",
        f"Predicted Answer: {pair['answer']}\n",
        sep="\n",
    )



Question 1: In what century did the steam engine revolutionize transportation and industry?
Context: The evolution of technology has been one of the most significant factors shaping human history.
From the advent of the wheel to the rise of artificial intelligence (AI), technology has played a pivotal
role in the progress of society. In ancient times, humans developed basic tools to make their lives
easier, but it was during the industrial revolution that technological advancements began to rapidly
transform the world. The invention of the steam engine, for example, revolutionized transportation
and industry, leading to an era of mass production and global trade.
The 20th century saw an explosion of technological breakthroughs. The invention of the telephone,
radio, and television changed the way humans communicated and interacted. The development of
computers and the internet has been equally transformative, enabling a level of connectivity and
information-sharing that was previously 