### Read PDF file

In [None]:
import os

FILES = ['../pdf/' + fname for fname in os.listdir('../pdf/')]
print('Total PDF files in the directory:', len(FILES))

Total PDF files in the directory: 19


In [2]:
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.strategies import PartitionStrategy

EXAMPLE_FILE = FILES[1]
example_chunks = partition_pdf(
    filename=EXAMPLE_FILE,
    strategy=PartitionStrategy.HI_RES,
    languages=['eng'],
    chunking_strategy='by_title',
    max_characters=10000,
    combine_text_under_n_chars=2000,
    new_after_n_chars=6000,
    extract_images_in_pdf=False, # skip image extraction
    infer_table_structure=False # skip table extraction
)

print(f'Total chunk from {EXAMPLE_FILE.split("/")[-1]}: {len(example_chunks)}')

Total chunk from Color+Theory_Make+it+Work.pdf: 6


In [3]:
print(f'{EXAMPLE_FILE} file chunks:')
for chunk in example_chunks:
    print(chunk.text)
    break

../pdf/Color+Theory_Make+it+Work.pdf file chunks:
photzyTM

COLOR THEORY (AND HOW TO MAKE IT WORK FOR YOU)

Quick Guide Written by Jo Plumridge

Advertise with us SPONSORED Before you dive into this guide, here's a few other free resources to help you learn photography: Free Photography eBooks 3 Free Photography Cheat Sheets What is Your #1 Photography Killer? Take this 30 second quiz to find out Free access to our library of 250+ Grab 3 free photography cheat the #1 thing holding your downloadable (pdf) tutorials on sheets that will help you photography back. everything you can imagine. understand the basics. Take Quiz → Download Cheat Sheets → Download eBooks → Want quick photography tips? Check out our friends at DailyPhotoTips.com they'll send you 1 solid photography tip to your inbox, 5 days a week. So you can start your day right, with actionable tips to help you on your creative journey. Subscribe now → (free for a limited time) SPONSORED Advertise with us

What is Your #1 Photo

In [4]:
import re

def clean_text(text):
    text = re.sub(r'\n+', '\n', text)
    return text.strip()

cleaned_text = [clean_text(chunk.text) for chunk in example_chunks]

In [5]:
cleaned_text[:5]

["photzyTM\nCOLOR THEORY (AND HOW TO MAKE IT WORK FOR YOU)\nQuick Guide Written by Jo Plumridge\nAdvertise with us SPONSORED Before you dive into this guide, here's a few other free resources to help you learn photography: Free Photography eBooks 3 Free Photography Cheat Sheets What is Your #1 Photography Killer? Take this 30 second quiz to find out Free access to our library of 250+ Grab 3 free photography cheat the #1 thing holding your downloadable (pdf) tutorials on sheets that will help you photography back. everything you can imagine. understand the basics. Take Quiz → Download Cheat Sheets → Download eBooks → Want quick photography tips? Check out our friends at DailyPhotoTips.com they'll send you 1 solid photography tip to your inbox, 5 days a week. So you can start your day right, with actionable tips to help you on your creative journey. Subscribe now → (free for a limited time) SPONSORED Advertise with us\nWhat is Your #1 Photography Killer?\nTake Quiz →\nDownload eBooks →\n

### Intialize langchain and chromadb

In [6]:
from langchain_chroma import Chroma
from langchain_ollama.embeddings import OllamaEmbeddings

embedding_function = OllamaEmbeddings(model='nomic-embed-text')
vectorstore = Chroma(
    collection_name='photography_collection', 
    embedding_function=embedding_function,
    persist_directory='../chromadb'
)

vectorstore.reset_collection()

In [7]:
from langchain.storage import InMemoryStore
from langchain.retrievers.multi_vector import MultiVectorRetriever

docstore = InMemoryStore()
id_key = 'doc_id'

retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=docstore,
    id_key=id_key
)

In [8]:
from uuid import uuid4
from langchain_core.documents import Document

doc_ids = [str(uuid4()) for _ in example_chunks]
texts = [
    Document(
        page_content=text,
        metadata={
            id_key: doc_ids[i],
            'content': text,
            'filename': example_chunks[i].metadata.filename,
            'page_number': example_chunks[i].metadata.page_number
        }
    ) for i, text in enumerate(cleaned_text)
]

retriever.vectorstore.add_documents(texts)
retriever.docstore.mset(list(zip(doc_ids, cleaned_text)))

#### Testing

In [9]:
result = retriever.invoke('How do I control my flash during my photography?')
result

["photzyTM\nCOLOR THEORY (AND HOW TO MAKE IT WORK FOR YOU)\nQuick Guide Written by Jo Plumridge\nAdvertise with us SPONSORED Before you dive into this guide, here's a few other free resources to help you learn photography: Free Photography eBooks 3 Free Photography Cheat Sheets What is Your #1 Photography Killer? Take this 30 second quiz to find out Free access to our library of 250+ Grab 3 free photography cheat the #1 thing holding your downloadable (pdf) tutorials on sheets that will help you photography back. everything you can imagine. understand the basics. Take Quiz → Download Cheat Sheets → Download eBooks → Want quick photography tips? Check out our friends at DailyPhotoTips.com they'll send you 1 solid photography tip to your inbox, 5 days a week. So you can start your day right, with actionable tips to help you on your creative journey. Subscribe now → (free for a limited time) SPONSORED Advertise with us\nWhat is Your #1 Photography Killer?\nTake Quiz →\nDownload eBooks →\n

### Create question

In [10]:
from pydantic import BaseModel
from typing import List

class Question(BaseModel):
    prompt: List[str]

In [None]:
import os
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from langchain_ollama.chat_models import ChatOllama
from langchain_core.runnables import RunnableLambda

model = ChatOllama(model='llama3.1', temperature=.8)
question_parser = PydanticOutputParser(pydantic_object=Question)
question_prompt = PromptTemplate(
    template="""Generate 3 different FaQ questions based on the text and only generate questions strictly on the text provided. Do not include any markdown formatting or code blocks in your response.
    {format_instructions}
    {chunk}

    Only return the output in the following format:
    {{
        "prompt": ["First question here", "Second question here", "Third question here"]
    }}
    """,
    input_variables=['chunk'],
    partial_variables={"format_instructions": question_parser.get_format_instructions()}
)

clean_output = RunnableLambda(
lambda x: x.split("</think>")[-1].strip() if isinstance(x, str) 
else x.content.split("</think>")[-1].strip()
)

question_chain = question_prompt | model | clean_output | question_parser
example_questions = question_chain.invoke(cleaned_text[0])
example_questions

Question(prompt=['What is color theory?', 'How can the color wheel be used to enhance my photography?', 'Are there any additional resources recommended in this text?'])

### Generate answer candidate

In [12]:
from langchain_core.messages import HumanMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

class Answer(BaseModel):
    prompt: List[str]

def parse_docs(docs):
    return [doc for doc in docs]

def build_prompt(kwargs):
    question = kwargs['question']
    context = kwargs['context']
    
    answer_prompt = [
        {
            'type': 'text',
            'text': f"""Generate 3 different FaQ answers based only on the following context, which only include text. Do not include any markdown formatting or code blocks in your response.
            Context: {context}
            Question: {question}

            Return the output in the following JSON format:
            {{
                "prompt": ["First answer here", "Second answer here", "Third answer here"]
            }}
            """
        }
    ]

    return ChatPromptTemplate.from_messages(
        [
            HumanMessage(content=answer_prompt)
        ]
    )

answer_chain = (
    {
        'context': retriever | RunnableLambda(parse_docs),
        'question': RunnablePassthrough()
    }
    | RunnableLambda(build_prompt)
    | model
    | clean_output
    | PydanticOutputParser(pydantic_object=Answer)
)

example_question = example_questions.prompt[0]
candidate_example_question = answer_chain.invoke(example_question)
candidate_example_question

Answer(prompt=['Color theory is a fundamental concept in art and photography that explains how colors interact to create specific visual effects and emotions.', 'It involves understanding the components of color, such as hue (the actual color), value (the lightness or darkness), and saturation (the intensity or purity of the color).', 'There are various color schemes used in color theory, such as complementary (opposite colors on the color wheel), triadic (three evenly spaced colors), tetradic (two sets of complementary colors), and others, each serving a different aesthetic purpose.'])

### Ranking the response

In [13]:
import torch

reference = retriever.vectorstore.similarity_search(example_question, k=1)[0].page_content

# score similarity
candidate_embeds = embedding_function.embed_documents(candidate_example_question.prompt)
reference_embed = embedding_function.embed_query(reference)

scores = [
    torch.nn.functional.cosine_similarity(
        torch.tensor(reference_embed),
        torch.tensor(candidate_embed),
        dim=0
    ).item()
    for candidate_embed in candidate_embeds
]

sorted_pairs = sorted(zip(candidate_example_question.prompt, scores), key=lambda x: x[1], reverse=True)
sorted_answer = [pair[0] for pair in sorted_pairs]
sorted_answer

['Color theory is a fundamental concept in art and photography that explains how colors interact to create specific visual effects and emotions.',
 'It involves understanding the components of color, such as hue (the actual color), value (the lightness or darkness), and saturation (the intensity or purity of the color).',
 'There are various color schemes used in color theory, such as complementary (opposite colors on the color wheel), triadic (three evenly spaced colors), tetradic (two sets of complementary colors), and others, each serving a different aesthetic purpose.']

### Create preference dataset

In [14]:
dataset = {
    'prompt': example_question,
    'chosen': sorted_answer[0],
    'rejected': sorted_answer[-1]
}
dataset

{'prompt': 'What is color theory?',
 'chosen': 'Color theory is a fundamental concept in art and photography that explains how colors interact to create specific visual effects and emotions.',
 'rejected': 'There are various color schemes used in color theory, such as complementary (opposite colors on the color wheel), triadic (three evenly spaced colors), tetradic (two sets of complementary colors), and others, each serving a different aesthetic purpose.'}

### Pipeline

In [15]:
from IPython import display

def chunk_file(fname):
    def clean_text(text):
        text = re.sub(r'\n+', '\n', text)
        return text.strip()
    
    chunks = partition_pdf(
        filename=fname,
        strategy=PartitionStrategy.HI_RES,
        languages=['eng'],
        chunking_strategy='by_title',
        max_characters=10000,
        combine_text_under_n_chars=2000,
        new_after_n_chars=6000,
        extract_images_in_pdf=False, # skip image extraction
        infer_table_structure=False # skip table extraction
    )

    return [clean_text(chunk.text) for chunk in chunks]

chunk_dict = {}
data_dir = os.listdir('../pdf')
for i, fname in enumerate(data_dir):
    print(f'[{i+1}/{len(data_dir)}] Chunking {fname}')
    key = fname.split('.')[0]
    path = os.path.join('../pdf', fname)
    result = chunk_file(path)
    chunk_dict[key] = result
    print(f'[{i+1}/{len(data_dir)}] Chunking {fname} - Total chunk: {len(result)}')
display.clear_output()
print('DONE')

DONE


In [16]:
vectorstore.reset_collection()

def populate_db(fname, chunks):
    doc_ids = [str(uuid4()) for _ in chunks]
    texts = [
        Document(
            page_content=text,
            metadata={
                id_key: doc_ids[i],
                'content': text,
                'filename': fname
            }
        ) for i, text in enumerate(chunks)
    ]

    retriever.vectorstore.add_documents(texts)
    retriever.docstore.mset(list(zip(doc_ids, chunks)))

for k, v in chunk_dict.items():
    print(f'Popuplating {k}')
    populate_db(k, v)
display.clear_output()
print('DONE')

DONE


In [17]:
result = retriever.invoke('How do I control my flash during my photography?')
result[0]

'CONTINUOUS SHOOTING MODE\nContinuous mode, sometimes also called burst mode, allows the camera to keep shooting while holding down the shutter release button.\nContinuous mode is very helpful when photographing fast action, like wildlife, sports, or pets and children playing.\nThis action tells the camera to take photos one right after another, capturing the images in rapid succession.\nIt also works well for wedding photographers who need to capture spur-of-the-moment candid photos.\nSome cameras have two options for continuous shooting – continuous high and continuous low.\nThe high setting takes the series of photos faster than the low setting.\nThe maximum speed of the continuous drive modes can range from 3 to 10 frames per second.\nUsing continuous drive mode gives the photographer a wider range of shots to choose from!\nHaving a series of shots taken one right after another helps to ensure you don’t miss the peak action.\nThe image on the following page was taken in a series of

In [19]:
def generate_question(text):
    parser = PydanticOutputParser(pydantic_object=Question)
    prompt = PromptTemplate(
        template="""Generate 3 different FaQ question based on the text and only generate question strictly on the text provided. Do not include any markdown formatting or code blocks in your response.
        {format_instructions}
        {chunk}

        Return the output in the following format:
        {{
            "prompt": ["First question here", "Second question here", "Third question here"]
        }}
        """,
        input_variables=['chunk'],
        partial_variables={"format_instructions": parser.get_format_instructions()}
    )

    chain = prompt | model | clean_output | parser
    return chain.invoke(text)

question_lists = []
for k, chunks in chunk_dict.items():
    print(f'Generating question for {k}')
    for chunk in chunks:
        question = generate_question(chunk)
        question_lists.append(question)
display.clear_output()
question_lists[:5]

Generating question for Camera+Basics_Drive+Modes+Explained
Generating question for Color+Theory_Make+it+Work
Generating question for Composition+Framing+and+Space
Generating question for 2023+LR+Spring+Release
Generating question for Asymmetrical+Balance
Generating question for Creative+Nature
Generating question for Crazy+High+Contrast
Generating question for Composition+Puzzle
Generating question for Bugs_Spiders_Not_Included


OutputParserException: Invalid json output: ```json
{
    "prompt": [
        "What is macro photography?"
        "Which camera settings can be set to Auto to help you get better results?"
        "Is it necessary to travel to specialized locations to shoot insects? Is a noisy image still considered a great image."
    ]
}
```
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE 

In [36]:
def generate_answer(text):
    def parse_docs(docs):
        return [doc for doc in docs]

    def build_prompt(kwargs):
        question = kwargs['question']
        context = kwargs['context']
        
        answer_prompt = [
            {
                'type': 'text',
                'text': f"""Generate 3 different FaQ answers based only on the following context, which only include text. Do not include any markdown formatting or code blocks in your response.
                Context: {context}
                Question: {question}

                Return the output in the following JSON format:
                {{
                    "prompt": ["First answer here", "Second answer here", "Third answer here"]
                }}
                """
            }
        ]

        return ChatPromptTemplate.from_messages(
            [
                HumanMessage(content=answer_prompt)
            ]
        )

    chain = (
        {
            'context': retriever | RunnableLambda(parse_docs),
            'question': RunnablePassthrough()
        }
        | RunnableLambda(build_prompt)
        | model
        | clean_output
        | PydanticOutputParser(pydantic_object=Answer)
    )

    return chain.invoke(text)

qna_pairs = {}
for chunk in question_lists:
    print(f'Generating answer for {chunk.prompt}')
    for question in chunk.prompt:
        qna_pairs[question] = generate_answer(question)
display.clear_output()

BadRequestError: Error code: 400 - {'type': 'error', 'error': {'type': 'invalid_request_error', 'message': 'Your credit balance is too low to access the Anthropic API. Please go to Plans & Billing to upgrade or purchase credits.'}}