In [2]:
import base64
import requests
import tiktoken 
import json

### Encode PDF files

In [3]:
# PDF documents to process, given as paths relative to the working directory.
files = ["mothersday.pdf", "theaddress.pdf", "beautifulwhitehorse.pdf"]


def _read_as_base64(path):
    """Return the raw bytes of the file at ``path`` as a base64 UTF-8 string."""
    with open(path, "rb") as pdf_file:
        raw_bytes = pdf_file.read()
    return base64.b64encode(raw_bytes).decode("utf-8")


# One record per PDF; the text and token-count slots start as None and are
# filled in by later cells.
pdf_info = {}
for pdf_path in files:
    record = {
        "base64": _read_as_base64(pdf_path),
        "text_content": None,
        "textcontent_num_tokens": None,
    }
    pdf_info[pdf_path] = record


### Generate text using OCR

In [4]:

# Endpoint of the OCR service this notebook talks to.
url = "http://localhost:8000/ocr"

# Upper bound (in seconds) on how long to wait for one OCR response. Without a
# timeout, requests waits indefinitely and a hung service would block the cell.
OCR_TIMEOUT_SECONDS = 300

for file_path, info in pdf_info.items():
    payload = {
        "file_base64": info["base64"],
        "start_page": 1,
        # NOTE(review): presumably an upper bound on the pages to OCR -- confirm
        # against the OCR service's API.
        "end_page": 100
    }

    # Start from empty text so every failure path leaves the record in the
    # same, known state.
    info["text_content"] = ""

    try:
        resp = requests.post(url, json=payload, timeout=OCR_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        # Connection refused, timeout, etc.: report it and move on to the
        # next file instead of aborting the whole loop.
        print(f"OCR request failed for {file_path}: {exc}")
        continue

    if resp.status_code == 200:
        try:
            info["text_content"] = resp.json().get("ocr_results", "")
        except ValueError:
            # resp.json() raises a ValueError subclass when the body is not JSON.
            print(f"OCR returned a non-JSON response for {file_path}")
    else:
        print(f"OCR failed for {file_path}: {resp.status_code}")
        # Show the start of the error body; the status code alone rarely
        # explains why the service rejected the request.
        if resp.text:
            print(f"  response body: {resp.text[:300]}")


OCR failed for mothersday.pdf: 500
OCR failed for theaddress.pdf: 500
OCR failed for beautifulwhitehorse.pdf: 500


### Count tokens for each PDF

In [None]:
# Tokenizer for the target model; change the model name to match yours.
encoding = tiktoken.encoding_for_model("gpt-4")

# Records without OCR text are skipped, so their token count stays None.
for info in pdf_info.values():
    text = info["text_content"]
    if not text:
        continue
    token_ids = encoding.encode(text)
    info["textcontent_num_tokens"] = len(token_ids)

In [None]:
import json

# Save dictionary to file.
# `file_contents` is built here from `pdf_info` because no earlier cell in the
# notebook defines it. The "base64" payload is left out: it can be regenerated
# from the PDFs and would make the JSON file (and any displayed copy of it)
# very large.
file_contents = {
    file_path: {key: value for key, value in info.items() if key != "base64"}
    for file_path, info in pdf_info.items()
}

with open("file_contents.json", "w", encoding="utf-8") as f:
    json.dump(file_contents, f, ensure_ascii=False, indent=2)

In [None]:
# Read the saved JSON back into a dictionary and display it.
with open("file_contents.json", encoding="utf-8") as json_file:
    loaded_file_contents = json.loads(json_file.read())

loaded_file_contents

### Sample Questions

In [1]:
# Sample questions, grouped by the story they refer to.
user_questions = [
    # --- The Summer of the Beautiful White Horse ---
    "Why was it difficult for Aram to believe that his cousin Mourad had stolen the white horse, even when he saw him riding it?",
    "Uncle Khosrove is described as having a distinct temperament. What was his typical reaction to any problem?",
    "In the Garoghlanian family, what was the relationship between flesh (physical inheritance) and spirit (temperament/qualities), and how does Mourad embody this idea?",

    # --- Mother’s Day ---
    "Why does Mrs Fitzgerald advise Mrs Pearson to be firm with her family?",
    "How do Doris and Cyril react when they first notice the change in their mother’s attitude?",
    "The play highlights gender and family dynamics in the 1950s. Do you think these issues are still relevant today? Justify briefly.",

    # --- The Address ---
    "Why did the narrator feel uncertain when she first visited Mrs Dorling’s house after the war?",
    "What role do familiar household objects play in evoking memories for the narrator?",
    "Why does the narrator finally decide to ‘forget the address’? Do you think it was the right choice?",
    "Compare the pre-war and post-war circumstances of the narrator. How did her perception of belongings change with time?",
]


### Create Prompt

In [2]:

# Combine all OCR text into a single context.
# This cell reads `pdf_info` and `user_questions`, which are defined in earlier
# cells; after a kernel restart, run the notebook from the top so both exist.
context = "".join(
    info["text_content"] + "\n\n"
    for info in pdf_info.values()
    if info["text_content"]
)

if not context:
    # With no OCR text at all, the prompt would ask the model to answer from
    # an empty context, so make that visible instead of failing silently.
    print("Warning: no OCR text available; the prompt context is empty.")

# Render the questions as a JSON array so each one appears verbatim in the
# prompt, rather than as the Python repr of a list.
questions_json = json.dumps(user_questions, ensure_ascii=False, indent=2)

# Build a single prompt asking for dictionary-style answers
prompt = f"""
You are an AI assistant. Use the following context to answer the questions. 
Return your answers as a JSON dictionary where each question is a key and the answer is the value.

Context:
{context}

Questions:
{questions_json}

Answer:
"""

print(prompt[:500], "...")  # preview first 500 chars


NameError: name 'pdf_info' is not defined