In [None]:
!pip install accelerate -U
!pip install transformers[torch]
!pip install transformers PyMuPDF
!pip install datasets



In [None]:
import json
import torch
from transformers import BertTokenizerFast, BertForQuestionAnswering, Trainer, TrainingArguments, pipeline
from datasets import Dataset
import numpy as np
import fitz  # PyMuPDF
from sklearn.model_selection import train_test_split
import cProfile
import pstats

# Function to read the JSON file and return a dataset
def read_squad(path):
    """Load a SQuAD-format JSON file and flatten it into parallel lists.

    One row is emitted per answer: a question with several reference answers
    produces several rows, and a question with an empty ``answers`` list
    produces none.

    Args:
        path: Path to a JSON file with the nesting
            ``data -> paragraphs -> (context, qas -> (question, answers))``.

    Returns:
        dict: ``{'context': [...], 'question': [...], 'answers': [...]}`` where
        the three lists have equal length and ``answers[i]`` is the answer
        object exactly as stored in the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If an expected key is missing from the structure.
    """
    # Read as UTF-8 explicitly instead of relying on the platform's locale
    # default, which would mis-decode non-ASCII passages on some systems.
    with open(path, 'r', encoding='utf-8') as f:
        squad_dict = json.load(f)

    contexts = []
    questions = []
    answers = []
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                # Repeat context/question once per answer to keep lists aligned
                for answer in qa['answers']:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)

    return {
        'context': contexts,
        'question': questions,
        'answers': answers
    }

# Location of the SQuAD-format training JSON
file_path = '/content/train-v1.1.json'
dataset = read_squad(file_path)

# Hold out 20% of the flattened rows for validation; the fixed random_state
# makes the split repeatable across runs.
(
    train_contexts,
    val_contexts,
    train_questions,
    val_questions,
    train_answers,
    val_answers,
) = train_test_split(
    dataset['context'],
    dataset['question'],
    dataset['answers'],
    test_size=0.2,
    random_state=42,
)

# Wrap each split in a Dataset object
train_dataset = Dataset.from_dict(
    {'context': train_contexts, 'question': train_questions, 'answers': train_answers}
)
validation_dataset = Dataset.from_dict(
    {'context': val_contexts, 'question': val_questions, 'answers': val_answers}
)

# Show a summary (features and row counts) of each split
print("Train Dataset:", train_dataset)
print("Validation Dataset:", validation_dataset)

# Tokenizer matching the checkpoint that is fine-tuned later in the notebook
tokenizer = BertTokenizerFast.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")

# Optimized preprocessing function
def preprocess_function_optimized(examples, indices=None):
    """Tokenize a batch of QA rows and attach answer-span token labels.

    Each row is encoded as ``(question, context)`` -- question first, context
    second -- which is the input layout the question-answering pipeline uses at
    inference time. Only the context may be truncated, so the question is
    never cut.

    Args:
        examples: Batch dict with ``"question"`` and ``"context"`` (lists of
            strings) and ``"answers"`` (list of dicts carrying ``"text"`` and
            ``"answer_start"``, a character offset into the context).
        indices: Row indices supplied by ``Dataset.map(..., with_indices=True)``.
            Unused; accepted (and optional) for compatibility with that call.

    Returns:
        The tokenizer encoding with ``offset_mapping`` removed and
        ``start_positions`` / ``end_positions`` added. A row whose answer is
        not fully contained in the (possibly truncated) context is labelled
        ``(0, 0)``.
    """
    questions = [q.lstrip() for q in examples["question"]]
    contexts = examples["context"]

    inputs = tokenizer(
        questions,
        contexts,
        max_length=384,
        truncation="only_second",  # truncate the context, never the question
        padding="max_length",
        return_offsets_mapping=True,
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"]
        end_char = start_char + len(answer["text"])

        sequence_ids = inputs.sequence_ids(i)

        # Offsets are relative to each segment's own text, so question tokens
        # also cover character 0..len(question). Restrict the search to context
        # tokens (segment 1) so an answer offset can never match a question
        # token; special and padding tokens have a sequence id of None.
        context_tokens = [
            (j, span) for j, span in enumerate(offsets) if sequence_ids[j] == 1
        ]

        # First context token containing the answer's first character ...
        start_idx = next(
            (j for j, (tok_start, tok_end) in context_tokens if tok_start <= start_char < tok_end),
            None,
        )
        # ... and first context token containing its last character.
        end_idx = next(
            (j for j, (tok_start, tok_end) in context_tokens if tok_start < end_char <= tok_end),
            None,
        )

        if start_idx is None or end_idx is None:
            # Answer missing or truncated away: label with position 0
            start_positions.append(0)
            end_positions.append(0)
        else:
            start_positions.append(start_idx)
            end_positions.append(end_idx)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions

    return inputs

# Profile the preprocessing step to see where the time goes
profiler = cProfile.Profile()
profiler.enable()

# Tokenize both splits. A failure is reported and then re-raised: swallowing it
# would leave the tokenized datasets undefined and make later cells fail with
# an unrelated-looking NameError. The profiler is always stopped.
try:
    tokenized_train_dataset = train_dataset.map(preprocess_function_optimized, batched=True, with_indices=True)
    tokenized_validation_dataset = validation_dataset.map(preprocess_function_optimized, batched=True, with_indices=True)
    print("Tokenized Train Dataset:", tokenized_train_dataset)
    print("Tokenized Validation Dataset:", tokenized_validation_dataset)
except Exception as e:
    print(f"Error during dataset preprocessing: {e}")
    raise
finally:
    profiler.disable()

stats = pstats.Stats(profiler).sort_stats('tottime')
stats.print_stats(10)  # Print the 10 functions with the highest internal time


Train Dataset: Dataset({
    features: ['context', 'question', 'answers'],
    num_rows: 70079
})
Validation Dataset: Dataset({
    features: ['context', 'question', 'answers'],
    num_rows: 17520
})


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/70079 [00:00<?, ? examples/s]

Map:   0%|          | 0/17520 [00:00<?, ? examples/s]

Tokenized Train Dataset: Dataset({
    features: ['context', 'question', 'answers', 'input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 70079
})
Tokenized Validation Dataset: Dataset({
    features: ['context', 'question', 'answers', 'input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 17520
})
         2425057 function calls (2404996 primitive calls) in 139.137 seconds

   Ordered by: internal time
   List reduced from 830 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
       89   78.759    0.885   78.759    0.885 {method 'encode_batch' of 'tokenizers.Tokenizer' objects}
     1424   26.419    0.019   27.740    0.019 /usr/local/lib/python3.10/dist-packages/datasets/arrow_writer.py:161(__arrow_array__)
    87599   15.439    0.000   15.782    0.000 /usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_fast.py:274(_convert

<pstats.Stats at 0x7f3c89d6e6e0>

In [None]:
# Load the model
# The checkpoint name indicates a BERT-large model already fine-tuned on SQuAD,
# so the training below continues from that state rather than from a base model.
# This is the same checkpoint name the tokenizer was loaded from.
model = BertForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")

# Define compute_metrics function
def compute_metrics(p):
    """Compute the fraction of examples whose predicted span matches exactly.

    Args:
        p: Evaluation result exposing ``predictions`` and ``label_ids``
            (e.g. the ``EvalPrediction`` the Trainer passes in).
            ``predictions`` holds ``(start_logits, end_logits)``, each of shape
            ``(num_examples, seq_len)``. ``label_ids`` is either the pair
            ``(start_positions, end_positions)`` or one ``(num_examples, 2)``
            array with those two as columns.

    Returns:
        dict: ``{"exact_match": float}`` -- the share of examples where both
        the arg-max start token and the arg-max end token equal the labels.
    """
    # The logits live on `p.predictions`; unpacking `p` itself would yield
    # (predictions, label_ids) instead of the two logit arrays.
    start_logits, end_logits = p.predictions[:2]

    labels = p.label_ids
    if isinstance(labels, (tuple, list)):
        # Models with two label columns hand the labels over as a pair of arrays
        start_positions, end_positions = (np.asarray(part) for part in labels)
    else:
        labels = np.asarray(labels)
        start_positions, end_positions = labels[:, 0], labels[:, 1]

    start_predictions = np.argmax(start_logits, axis=1)
    end_predictions = np.argmax(end_logits, axis=1)

    # Exact match: both span boundaries must be right
    exact_match = (start_positions == start_predictions) & (end_positions == end_predictions)

    return {
        "exact_match": float(exact_match.mean()),
    }

# Define training arguments
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; the installs above are unpinned, so confirm against the
# installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Define Trainer
# Uses the tokenized splits produced by the preprocessing cell and reports the
# metric returned by compute_metrics on the validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

# Evaluate the model
# The result is kept in `eval_results`, which a later cell prints.
eval_results = trainer.evaluate()

# Save the model
# Model and tokenizer go to the same directory so the inference cell can reload
# both from one path.
model.save_pretrained("./fine-tuned-bert-qa")
tokenizer.save_pretrained("./fine-tuned-bert-qa")

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Print evaluation results
# `eval_results` is assigned from trainer.evaluate() in the training cell, so
# that cell must have run in this kernel session first.
print(f"Evaluation results: {eval_results}")

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        str: The result of ``page.get_text()`` for each page, joined without a
        separator. Empty string if the document has no pages.
    """
    # Open as a context manager so the file handle is released even if text
    # extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        # join() avoids rebuilding the string once per page
        return "".join(page.get_text() for page in doc)

# Function to get answers from the text
def get_answers_from_text(text, questions):
    """Run every question against ``text`` and collect the answer strings.

    Relies on the module-level ``qa_pipeline`` callable, which must be defined
    before this function is called.

    Args:
        text: Context passed to the pipeline for each question.
        questions: Iterable of question strings.

    Returns:
        list: The ``'answer'`` field of each pipeline result, in question order.
    """
    return [
        qa_pipeline(question=query, context=text)['answer']
        for query in questions
    ]

# Load the fine-tuned model and tokenizer
# Both are read back from the directory the training cell saved them to; this
# rebinds the notebook-level `tokenizer` and `model` names.
model_path = "./fine-tuned-bert-qa"
tokenizer = BertTokenizerFast.from_pretrained(model_path)
model = BertForQuestionAnswering.from_pretrained(model_path)

# Initialize the QA pipeline
# get_answers_from_text looks this name up at call time, so it must be defined
# before that function is invoked below.
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

# Example usage
pdf_path = "/content/3 (1).pdf"  # Path to your PDF file
questions = [
    "What is the main topic of the document?",
    "Who is the author?",
    "What are the key findings?",
]

# Extract the full document text once, then ask every question against it
text = extract_text_from_pdf(pdf_path)
answers = get_answers_from_text(text, questions)
for question, answer in zip(questions, answers):
    print(f"Question: {question}")
    print(f"Answer: {answer}")