# Dataset creation

### Read PDF file

In [1]:
import os

FILES = ['../pdf/' + fname for fname in os.listdir('../pdf/')]
print('Total PDF files in the directory:', len(FILES))

Total PDF files in the directory: 85


In [2]:
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.strategies import PartitionStrategy

EXAMPLE_FILE = FILES[1]
example_chunks = partition_pdf(
    filename=EXAMPLE_FILE,
    strategy=PartitionStrategy.HI_RES,
    languages=['eng'],
    chunking_strategy='by_title',
    max_characters=10000,
    combine_text_under_n_chars=2000,
    new_after_n_chars=6000,
    extract_images_in_pdf=False, # skip image extraction
    infer_table_structure=False # skip table extraction
)

print(f'Total chunk from {EXAMPLE_FILE.split("/")[-1]}: {len(example_chunks)}')

Total chunk from Tips+Tricks_Stunning+Flash+Portraits.pdf: 6


In [3]:
print(f'{EXAMPLE_FILE} file chunks:')
for chunk in example_chunks:
    print(chunk.text)

../pdf/Tips+Tricks_Stunning+Flash+Portraits.pdf file chunks:
photzyTM

TIPS AND TRICKS FOR CAPTURING STUNNING PORTRAITS USING YOUR FLASH

Quick Guide Written by Kevin Landwer-Johan

Advertise with us SPONSORED Before you dive into this guide, here's a few other free resources to help you learn photography: Free Photography eBooks 3 Free Photography Cheat Sheets What is Your #1 Photography Killer? Take this 30 second quiz to find out Free access to our library of 250+ Grab 3 free photography cheat the #1 thing holding your downloadable (pdf) tutorials on sheets that will help you photography back. everything you can imagine. understand the basics. Take Quiz → Download Cheat Sheets → Download eBooks → Want quick photography tips? Check out our friends at DailyPhotoTips.com they'll send you 1 solid photography tip to your inbox, 5 days a week. So you can start your day right, with actionable tips to help you on your creative journey. Subscribe now → (free for a limited time) SPONSORED Adv

In [4]:
import re

def clean_text(text):
    text = re.sub(r'\n+', '\n', text)
    return text.strip()

cleaned_text = [clean_text(chunk.text) for chunk in example_chunks]

In [5]:
cleaned_text

["photzyTM\nTIPS AND TRICKS FOR CAPTURING STUNNING PORTRAITS USING YOUR FLASH\nQuick Guide Written by Kevin Landwer-Johan\nAdvertise with us SPONSORED Before you dive into this guide, here's a few other free resources to help you learn photography: Free Photography eBooks 3 Free Photography Cheat Sheets What is Your #1 Photography Killer? Take this 30 second quiz to find out Free access to our library of 250+ Grab 3 free photography cheat the #1 thing holding your downloadable (pdf) tutorials on sheets that will help you photography back. everything you can imagine. understand the basics. Take Quiz → Download Cheat Sheets → Download eBooks → Want quick photography tips? Check out our friends at DailyPhotoTips.com they'll send you 1 solid photography tip to your inbox, 5 days a week. So you can start your day right, with actionable tips to help you on your creative journey. Subscribe now → (free for a limited time) SPONSORED Advertise with us\nWhat is Your #1 Photography Killer?\nTake Q

### Intialize langchain and chromadb

In [6]:
from langchain_chroma import Chroma
from langchain_ollama.embeddings import OllamaEmbeddings

embedding_function = OllamaEmbeddings(model='nomic-embed-text')
vectorstore = Chroma(
    collection_name='photography_collection', 
    embedding_function=embedding_function,
    persist_directory='../chromadb'
)

vectorstore.reset_collection()

In [7]:
from langchain.storage import InMemoryStore
from langchain.retrievers.multi_vector import MultiVectorRetriever

docstore = InMemoryStore()
id_key = 'doc_id'

retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=docstore,
    id_key=id_key
)

In [8]:
from uuid import uuid4
from langchain_core.documents import Document

doc_ids = [str(uuid4()) for _ in example_chunks]
texts = [
    Document(
        page_content=text,
        metadata={
            id_key: doc_ids[i],
            'content': text,
            'filename': example_chunks[i].metadata.filename,
            'page_number': example_chunks[i].metadata.page_number
        }
    ) for i, text in enumerate(cleaned_text)
]

retriever.vectorstore.add_documents(texts)
retriever.docstore.mset(list(zip(doc_ids, cleaned_text)))

#### Testing

In [9]:
result = retriever.invoke('How do I control my flash during my photography?')
result

['SETTINGS FOR FLASH PHOTOGRAPHY\nI manage my exposures manually, including my flash output. I find this allows me the highest level of control. Let me walk you through the process I often use when using flash when taking a portrait photograph.\nI start by positioning my subject with the desired background and lighting. As I do this, I consider the brightness of the background in relation to the amount of light illuminating my subject. I’ll take a spot meter reading from my subject’s face and then another spot reading from the background. Doing this gives me a clear indication of what difference, if any, there is between the light values in those areas of my composition.\nNext, I set my flash so it will provide sufficient light on my subject. This may be a little over or under the value of the ambient light. I adjust my camera’s exposure setting to match the output of my flash.\nWhen my flash is set to emit a greater amount of light than the ambient light on the background, that area o

### Create question

In [10]:
from pydantic import BaseModel
from typing import List

class Question(BaseModel):
    prompt: List[str]

In [11]:
from langchain_anthropic import ChatAnthropic
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import PydanticOutputParser

model = ChatAnthropic(
    model='claude-3-5-sonnet-20241022', 
    api_key='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',
    temperature=.8
)

question_parser = PydanticOutputParser(pydantic_object=Question)
question_prompt = PromptTemplate(
    template="""Generate 3 different FaQ question based on the text and only generate question strictly on the text provided. Do not include any markdown formatting or code blocks in your response.
    {format_instructions}
    {chunk}

    Return the output in the following format:
    {{
        "prompt": ["First question here", "Second question here", "Third question here"]
    }}
    """,
    input_variables=['chunk'],
    partial_variables={"format_instructions": question_parser.get_format_instructions()}
)

question_chain = question_prompt | model | question_parser
example_questions = question_chain.invoke(cleaned_text[0])
example_questions

Question(prompt=['When is it appropriate to use flash for portrait photography?', 'How did the author first learn to use flash in portrait photography?', 'What is the key to successful flash use in portraits according to the text?'])

### Generate answer candidate

In [12]:
from langchain_core.messages import HumanMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough

class Answer(BaseModel):
    prompt: List[str]

def parse_docs(docs):
    return [doc for doc in docs]

def build_prompt(kwargs):
    question = kwargs['question']
    context = kwargs['context']
    
    answer_prompt = [
        {
            'type': 'text',
            'text': f"""Generate 3 different FaQ answers based only on the following context, which only include text. Do not include any markdown formatting or code blocks in your response.
            Context: {context}
            Question: {question}

            Return the output in the following JSON format:
            {{
                "prompt": ["First answer here", "Second answer here", "Third answer here"]
            }}
            """
        }
    ]

    return ChatPromptTemplate.from_messages(
        [
            HumanMessage(content=answer_prompt)
        ]
    )

answer_chain = (
    {
        'context': retriever | RunnableLambda(parse_docs),
        'question': RunnablePassthrough()
    }
    | RunnableLambda(build_prompt)
    | model
    | PydanticOutputParser(pydantic_object=Answer)
)

example_question = example_questions.prompt[0]
candidate_example_question = answer_chain.invoke(example_question)
candidate_example_question

Answer(prompt=["Flash can be used any time you want to make a portrait, not just when it's dark. It's particularly useful for balancing ambient light to add a professional look to your photographs.", "Flash is appropriate when the light is very strong and casts hard shadows, as it can be used to soften them. It's also useful when the available light is very flat and dull, as it can add depth to the portrait.", "Flash is suitable when the ambient light is not suitable for the desired portrait effect. As a former newspaper photographer demonstrated, it's an essential tool when you need to ensure you get the shot regardless of lighting conditions."])

### Ranking the response

In [None]:
from langchain_anthropic import ChatAnthropic
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import PydanticOutputParser

model = ChatAnthropic(
    model='claude-3-5-sonnet-20241022', 
    api_key='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',
    temperature=.8
)

parser = PydanticOutputParser(pydantic_object=QnAPair)

qa_prompt = PromptTemplate(
    template="""Generate 3 different FaQ based on the text and only generate question and answer strictly on the text provided. Do not include any markdown formatting or code blocks in your response.
    {format_instructions}
    {chunk}

    Return the output in the following format:
    {{
        "pairs": [
            {{
                "question": "First question here",
                "answer": "First answer here"
            }},
            {{
                "question": "Second question here",
                "answer": "Second answer here"
            }},
            {{
                "question": "Third question here",
                "answer": "Third answer here"
            }}
        ]
    }}""",
    input_variables=['chunk'],
    partial_variables={"format_instructions": parser.get_format_instructions()}
)

chain = qa_prompt | model | parser
example_result = chain.invoke(cleaned_text[0])
example_result.pairs

[QnA(question='When can flash be used for portraits?', answer="Flash can be used any time you want to make a portrait, not just when it's dark. It can be balanced with ambient light to add a professional look, used to soften hard shadows in strong light, or add depth when available light is flat and dull."),
 QnA(question='What is the key to successful flash use for portraits?', answer='The key to successful flash use for portraits is to control its level compared to the ambient light, adding either a little more power or less power and adjusting the aperture setting to capture the desired exposure.'),
 QnA(question='How did the author initially learn to use flash for portraits?', answer='The author first began using flash for portraits as a newspaper photographer, where he had to use flash whenever the ambient light was not suitable. He carried minimal equipment with just one simple flash, which helped him better understand how to work with ambient light and balance it with flash.')]

### Ranking the response

In [115]:
example_prompt = example_result.dict()['pairs'][0]
example_prompt

/var/folders/17/504r7zwd36jcb_pwxly46r940000gn/T/ipykernel_95385/3380009285.py:1: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  example_prompt = example_result.dict()['pairs'][0]


{'question': 'When can flash be used for portraits?',
 'answer': "Flash can be used any time you want to make a portrait, not just when it's dark. It can be balanced with ambient light to add a professional look, used to soften hard shadows in strong light, or add depth when available light is flat and dull."}

In [None]:
reference = vectorstore.similarity_search(example_prompt['question'], k=1)[0].page_content


'SETTINGS FOR FLASH PHOTOGRAPHY\nI manage my exposures manually, including my flash output. I find this allows me the highest level of control. Let me walk you through the process I often use when using flash when taking a portrait photograph.\nI start by positioning my subject with the desired background and lighting. As I do this, I consider the brightness of the background in relation to the amount of light illuminating my subject. I’ll take a spot meter reading from my subject’s face and then another spot reading from the background. Doing this gives me a clear indication of what difference, if any, there is between the light values in those areas of my composition.\nNext, I set my flash so it will provide sufficient light on my subject. This may be a little over or under the value of the ambient light. I adjust my camera’s exposure setting to match the output of my flash.\nWhen my flash is set to emit a greater amount of light than the ambient light on the background, that area of