<a href="https://colab.research.google.com/github/pranab308/SAP/blob/master/CV_Answers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
#!pip install PyPDF2 transformers torch

import PyPDF2
from transformers import pipeline
import os # Import os for checking file existence

# --- Configuration ---
# Both values are read by the main-execution section at the bottom of the cell.
# Path of the PDF whose text is extracted; the default is a '/content/...' path,
# so point it at your own file before running.
PDF_FILE_PATH = '/content/PranabKumar.pdf' # <<< IMPORTANT: Change this to your PDF path
# Question handed to the QA model together with the text extracted from the PDF.
QUESTION = "My phone number and my email address" # <<< Your question

# --- 1. PDF Text Extraction ---
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in the PDF at ``pdf_path``.

    Each page that yields text contributes that text followed by a newline.
    Pages that yield nothing are reported with a warning and skipped.

    Failures are reported with ``print`` and signalled by returning ``None``:
    the path does not exist, the PDF has no pages, an exception is raised
    while opening or parsing the file, or the combined text is blank.
    """
    # Guard: bail out early when there is nothing at the given path.
    if not os.path.exists(pdf_path):
        print(f"Error: File not found at {pdf_path}")
        return None

    chunks = []
    try:
        with open(pdf_path, 'rb') as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            pages = reader.pages

            if len(pages) == 0:
                print(f"Error: PDF file at {pdf_path} contains no pages.")
                return None

            # Page numbers in messages are 1-based for human readers.
            for number, page in enumerate(pages, start=1):
                extracted = page.extract_text()
                if not extracted:
                    print(f"Warning: Could not extract text from page {number}.")
                    continue
                # Trailing newline keeps consecutive pages visually separated.
                chunks.append(extracted + "\n")
    except Exception as e:
        print(f"An error occurred while reading the PDF: {e}")
        return None

    text = "".join(chunks)

    # A PDF whose pages produced only whitespace is treated as unreadable.
    if not text.strip():
        print(f"Error: No readable text extracted from PDF file at {pdf_path}.")
        return None

    return text

# --- 2. Question Answering ---
def answer_question_from_text(
    text,
    question,
    *,
    model_name="bert-large-uncased-whole-word-masking-finetuned-squad",
    confidence_threshold=0.8,
    qa_pipeline=None,
):
    """Answer ``question`` from ``text`` with an extractive QA model.

    Args:
        text: Context to search for the answer. ``None``, empty, or
            whitespace-only text is rejected without loading a model.
        question: The question to ask about ``text``.
        model_name: Checkpoint name handed to ``pipeline`` when no
            ``qa_pipeline`` is supplied. Alternatives include
            'distilbert-base-cased-distilled-squad' and
            'deepset/roberta-base-squad2'.
        confidence_threshold: The answer is reported as confident only when
            its score is strictly greater than this value.
        qa_pipeline: Optional already-built callable accepting ``question=``
            and ``context=`` keyword arguments and returning a dict with
            'answer' and 'score'. Supplying it avoids reloading the model on
            every call.

    Returns:
        A human-readable string: the confident answer, a low-confidence
        notice that includes the best match, or an "Error: ..." message.
        Exceptions raised while loading or running the model are printed and
        converted into the generic error string rather than propagated.
    """
    # Whitespace-only context cannot contain an answer, so treat it as empty.
    if not text or not text.strip():
        print("Cannot answer question: Input text is empty.")
        return "Error: Input text is empty."

    no_answer = 'Could not find an answer.'

    try:
        if qa_pipeline is None:
            # Loading is the expensive step; callers that ask several
            # questions should build the pipeline once and pass it in.
            qa_pipeline = pipeline("question-answering", model=model_name)
            print("Question Answering pipeline loaded.")

        # The pipeline takes the question and its context as keyword arguments.
        result = qa_pipeline(question=question, context=text)

        # The result is a dict holding the answer text and its score.
        answer = result.get('answer', no_answer)
        score = result.get('score', 0.0)

        if score > confidence_threshold and answer != no_answer:
            return f"Answer: {answer} (Confidence: {score:.2f})"

        # Low confidence: surface the best candidate but flag it as unreliable.
        return f"Could not find a confident answer. (Best match: '{answer}' with score {score:.2f})"

    except Exception as e:
        # Deliberate catch-all: this function reports failures as a string.
        # Re-raise here instead when a full traceback is needed for debugging.
        print(f"An error occurred during question answering: {e}")
        return "Error: Could not generate answer due to an unexpected error."


# --- Main Execution ---
if __name__ == "__main__":
    # Script entry point: extract the PDF text, then ask the configured question.
    print(f"Attempting to extract text from: {PDF_FILE_PATH}")
    cv_text = extract_text_from_pdf(PDF_FILE_PATH)

    if not cv_text:
        # Extraction returned nothing usable, so there is no context to query.
        print("\n--- Result ---")
        print("Failed to extract text from the PDF. Cannot answer the question.")
    else:
        print(f"Successfully extracted text ({len(cv_text)} characters).")
        print(f"\nAsking the question: '{QUESTION}'")
        answer = answer_question_from_text(cv_text, QUESTION)
        # Header and answer are emitted on separate lines.
        print("\n--- Answer ---", answer, sep="\n")

Attempting to extract text from: /content/PranabKumar.pdf
Successfully extracted text (5840 characters).

Asking the question: 'My phone number and my email address'


Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


Question Answering pipeline loaded.

--- Answer ---
Could not find a confident answer. (Best match: 'pranab308@gmail.com' with score 0.24)
