Installing Dependencies


1. Uninstall all LlamaIndex-related packages
pip uninstall -y llama-index llama-index-core llama-index-vector-stores-qdrant

2. Clear pip cache to prevent reinstalling cached versions
pip cache purge

3. Reinstall the latest versions
pip install -U qdrant_client fastembed
pip install llama-index
pip install llama-index-vector-stores-qdrant

In [None]:
# %pip install llama-index-vector-stores-qdrant llama-index-readers-file llama-index-embeddings-fastembed llama-index-llms-openai

In [2]:
# pip show llama-index-vector-stores-qdrant llama-index-core llama-index 

In [1]:
pwd

'/Users/silviaseceleanu/Dev/norm-ai-takehome-fullstack/app'

In [2]:
cd ..

/Users/silviaseceleanu/Dev/norm-ai-takehome-fullstack


### Implementing DocumentService in utils.py

- Your task is to process data and create a list of Document objects from it.
- Implement the create_documents() method. 
    - This involves reading the PDF, 
    - parsing the text into meaningful sections, and 
    - creating Document objects that encapsulate these sections and their content.

The parsing logic should accurately identify and separate different laws or sections within the PDF, ensuring the data structure aligns with the Document class requirements.


### Parse PDF


In [3]:
# Alternative approach (left disabled): parse the PDF with the `unstructured`
# library instead of PyMuPDF.
# import unstructured

# !pip install unstructured
# !pip install unstructured[local-inference] 
# !pip install pdfminer.six

# from unstructured.partition.pdf import partition_pdf

In [None]:
%%writefile app/extract_text.py
import fitz  # PyMuPDF
import os

pdf_file_path = 'docs/laws.pdf'
text_file_path = 'docs/laws_text.txt'

def extract_pdf_text(pdf_path: str, output_path: str) -> None:
    """
    Extract the text of every page of a PDF and save it to a UTF-8 text file.

    This is a best-effort helper: any failure (unreadable PDF, unwritable
    output path, ...) is reported on stdout instead of being raised, so the
    function always returns ``None``.

    Args:
        pdf_path: Path to the PDF file
        output_path: Path where to save the extracted text
    """
    # Sentinel so the cleanup in `finally` knows whether the open succeeded.
    doc = None
    try:
        # Open the PDF
        doc = fitz.open(pdf_path)

        # Collect the text of all pages and join once, instead of growing a
        # string with repeated `+=`.
        text = "".join(page.get_text() for page in doc)

        # Save the text to a file
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(text)

        print(f"Successfully extracted text from {pdf_path} to {output_path}")

    except Exception as e:
        print(f"Error extracting text: {str(e)}")
    finally:
        if doc is not None:
            doc.close()

if __name__ == "__main__":
    # The configured paths are relative to the project root, which is the
    # parent of the directory containing this script.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    root_dir = os.path.dirname(script_dir)

    # Extract the PDF text and write it next to the source document.
    extract_pdf_text(
        os.path.join(root_dir, pdf_file_path),
        os.path.join(root_dir, text_file_path),
    )

In [None]:
%%writefile app/process_text.py
from pydantic import BaseModel, Field
from llama_index.core.schema import Document
# from datetime import datetime, date
import os
import re
import json

class DocumentMetadata(BaseModel):
    """
    Metadata attached to each chunk built by DocumentService.create_documents().

    NOTE(review): create_documents() passes the heading text verbatim as
    ``level_id``, so numeric ids keep the heading's trailing dot (e.g. "6.3.")
    and word headings keep their colon (e.g. "Citations:"), unlike the
    dot-less example in the field description below. ``section_id`` is the
    part of the heading before the first ".".
    """
    level_id: str = Field(..., description="The identifier of the level (e.g., '6.11.1')")
    section_id: str = Field(..., description="The highest level section identifier (e.g., '6')")
    # Candidate fields that are currently disabled:
    # section_title: Optional[str] = Field(None, description="The title of the section (e.g., 'Thievery')")
    # source_file: str = Field(..., description="Source document name")
    title: str = Field(..., description="Document title")
    # page_number: Optional[int] = Field(None, description="Page number in the source document")
    # parsed_at: date = Field(default_factory=date.today, description="Date document was parsed")


class DocumentService(BaseModel):
    """
    Turn a plain-text law file into llama-index ``Document`` objects.

    Attributes:
        file_path: Path of the text file read by ``create_documents()``.
    """
    file_path: str

    @staticmethod
    def chunk_text(text: str, has_title: bool = True) -> tuple[list[dict], str]:
        """
        Splits `text` into chunks, each chunk containing:
            {
            "heading": <the heading line>,
            "text": <subsequent lines until the next heading, joined by spaces>
            }
        A line is considered a heading if, after stripping whitespace, it
        matches either:
        - numeric_heading_pattern (like "1.2.")
        - word_colon_pattern (like "Citations:")

        Lines that appear before the first heading (other than the title
        line) belong to no chunk and are discarded.

        Arguments:
            text: str - The text to chunk
            has_title: bool - Whether the first line of the text is a title line
        Returns:
            tuple[list[dict], str] - A tuple containing:
                list[dict] - A list of chunks, each chunk containing a heading and the subsequent text
                title: str - The title of the document; None when `has_title`
                    is False or the text is empty
        """
        # Assumptions about header format
        numeric_heading_pattern = re.compile(r'^[0-9]+(?:\.[0-9]+)*\.$')
        word_colon_pattern = re.compile(r'^[A-Za-z]+:$', re.IGNORECASE)

        def _is_new_heading(stripped: str) -> bool:
            """
            Returns True if the (already stripped) line matches either:
            - A numeric heading like "1.1."
            - A single word followed by a colon (e.g. "Citations:", "References:")
            """
            return bool(
                numeric_heading_pattern.match(stripped)
                or word_colon_pattern.match(stripped)
            )

        lines = text.splitlines()

        # Assumption: first line title. Guard against empty input so that an
        # empty file yields ([], None) instead of raising IndexError.
        title = None
        if has_title and lines:
            title = lines[0]
            lines = lines[1:]

        chunks = []
        current_heading = None
        current_body = []

        for line in lines:
            stripped = line.strip()
            if _is_new_heading(stripped):
                # Finalize the previous chunk if we have one
                if current_heading is not None:
                    chunks.append({
                        "heading": current_heading,
                        "text": " ".join(current_body).strip()
                    })
                # Start a new heading. The body is reset unconditionally so
                # that text preceding the first heading cannot leak into the
                # first chunk.
                current_heading = stripped
                current_body = []
            else:
                # Accumulate lines for the current chunk
                current_body.append(stripped)

        # Finalize the last chunk
        if current_heading is not None:
            chunks.append({
                "heading": current_heading,
                "text": " ".join(current_body).strip()
            })

        return chunks, title

    def create_documents(self) -> list[Document]:
        """
        Read `self.file_path`, chunk it by heading and return one Document per
        chunk, with DocumentMetadata (level id, section id, title) attached.

        The first line of the file is treated as the document title.
        """
        # The extraction step writes this file as UTF-8, so read it the same
        # way instead of relying on the platform default encoding.
        with open(self.file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        docs = []
        chunks, title = self.chunk_text(text, has_title=True)
        # add metadata to each chunk
        for chunk in chunks:
            metadata = DocumentMetadata(
                level_id=chunk['heading'],
                # Everything before the first "." is the top-level section.
                section_id=chunk['heading'].split('.')[0],
                title=title,
            )

            docs.append(Document(
                metadata=metadata.model_dump(),
                text=chunk['text']
            ))

        return docs

    @staticmethod
    def save_docs_to_json(docs: list[Document], path: str) -> None:
        """Serialize `docs` (via ``Document.to_dict``) to a JSON file at `path`."""
        docs_dict = [doc.to_dict() for doc in docs]
        with open(path, "w", encoding="utf-8") as f:
            json.dump(docs_dict, f, indent=2)

    @staticmethod
    def read_docs_from_json(path: str) -> list[Document]:
        """Load Documents previously written by ``save_docs_to_json`` from `path`."""
        with open(path, "r", encoding="utf-8") as f:
            docs_dict = json.load(f)
        return [Document.from_dict(doc_dict) for doc_dict in docs_dict]


if __name__ == "__main__":
    # Both paths are relative, so they resolve against the working directory
    # the script is started from.
    source_text_path = 'docs/laws_text.txt'
    processed_json_path = 'docs/laws_processed.json'

    # Parse the extracted text into Documents and persist them as JSON.
    service = DocumentService(file_path=source_text_path)
    parsed_docs = service.create_documents()
    DocumentService.save_docs_to_json(parsed_docs, processed_json_path)


In [30]:
!python app/process_text.py

In [None]:

from app.process_text import DocumentService

# Demo 1: no title line. The first line is already a heading, so `title`
# stays None and "Peace" becomes the body of chunk "1.".
text = """1.
Peace
1.1.
The law requires petty lords and landed knights to take their disputes to their
liege lord, and abide by his judgment, while disputes between great houses were
adjudicated by the Crown.
"""
chunks, title = DocumentService.chunk_text(text, has_title=False)
print(title)
for i, c in enumerate(chunks, start=1):
    print(f"Chunk {i}: {repr(c)}\n")

# Demo 2: the first line is consumed as the document title.
text = """The World Economic Forum 
1.1.
The law requires petty lords and landed knights to take their disputes to their
liege lord, and abide by his judgment, while disputes between great houses were
adjudicated by the Crown.
"""

chunks, title = DocumentService.chunk_text(text, has_title=True)
# The title belongs to the whole document, so print it once (as in demo 1)
# rather than once per chunk.
print(title)
for i, c in enumerate(chunks, start=1):
    print(f"Chunk {i}: {repr(c)}\n")

In [None]:
from app.process_text import DocumentService

# Parse the extracted law text end-to-end; the path is relative to the
# notebook's current working directory.
document_service = DocumentService(file_path='docs/laws_text.txt')
docs = document_service.create_documents()
# Show the first five Documents as the cell output for a quick sanity check.
docs[:5]

### Enhancing QdrantService in utils.py: 
- This step requires completing the query method.
- Implement logic to initialize the query engine, run the query with the provided string, and format the results into an Output object containing the query, response, and relevant citations.
- Ensure that self.k, which determines the number of similar vectors to return, is effectively used in the query processing.
- Feel free to complete this section however you would like, but one option is the CitationQueryEngine from llama_index.

In [3]:
%%writefile app/qdrant_service.py
from pydantic import BaseModel
import qdrant_client
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core.schema import Document
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core import VectorStoreIndex, Settings, StorageContext
from llama_index.core.query_engine import CitationQueryEngine
from dataclasses import dataclass
import os
import yaml

# Raises KeyError at import time when OPENAI_API_KEY is not set.
# NOTE(review): `key` is not referenced again in this module; presumably the
# OpenAI clients read the variable from the environment themselves — confirm.
key = os.environ['OPENAI_API_KEY']


@dataclass
class Input:
    """Pairs a query string with a file path.

    NOTE(review): nothing else in this module references ``Input``; confirm
    whether it is still needed.
    """
    query: str
    file_path: str

@dataclass
class Citation:
    """One source passage backing a query response."""
    # Human-readable origin; QdrantService.query builds it as "<title> - <level_id>".
    source: str
    # Text of the cited node.
    text: str
    # Similarity score, already formatted as a string by QdrantService.query
    # (the float rounded to two decimals).
    score: str

class Output(BaseModel):
    # Result of a single query: the question asked, the generated answer and
    # the citations that support it.
    query: str
    response: str
    citations: list[Citation]

    def __str__(self):
        # One labelled line per field, in declaration order.
        labelled_fields = (
            f"Query: {self.query}",
            f"Response: {self.response}",
            f"Citations: {self.citations}",
        )
        return "\n".join(labelled_fields)

class QdrantService:
    """
    Question answering with citations over an in-memory Qdrant collection.

    Usage: call ``connect()`` once to create the index, ``load(docs)`` to add
    documents, then ``query(...)`` as often as needed.
    """

    def __init__(self, k: int = 5):
        """
        Configure the global llama-index LLM and embedding model from
        ``app/config.yml`` (keys ``llm_model`` and ``embed_model``).

        Args:
            k: Number of most similar nodes to retrieve per query.
        """
        self.index = None
        self.k = k

        with open("app/config.yml", 'r', encoding="utf-8") as file:
            config = yaml.safe_load(file)

        Settings.llm = OpenAI(model=config['llm_model'])
        Settings.embed_model = OpenAIEmbedding(model=config['embed_model'])

    def connect(self, docs: "list[Document] | None" = None) -> None:
        """
        Create the in-memory Qdrant collection and the (empty) vector index.

        Args:
            docs: Unused; accepted only for backward compatibility. Use
                ``load()`` to add documents.
        """
        client = qdrant_client.QdrantClient(location=":memory:")

        vector_store = QdrantVectorStore(
            client=client,
            collection_name='medieval_laws',
        )
        storage_context = StorageContext.from_defaults(
            vector_store=vector_store
        )

        self.index = VectorStoreIndex.from_vector_store(
            vector_store=vector_store,
            storage_context=storage_context
        )

    def load(self, docs: "list[Document] | None" = None) -> None:
        """
        Parse `docs` into nodes and insert them into the index.

        Args:
            docs: Documents to index. ``None`` or an empty list is a no-op.

        Raises:
            RuntimeError: If ``connect()`` has not been called yet.
        """
        if self.index is None:
            raise RuntimeError("QdrantService.connect() must be called before load()")
        if not docs:
            return

        parser = SimpleNodeParser()
        nodes = parser.get_nodes_from_documents(docs)
        self.index.insert_nodes(nodes)

    # Alternative (unused): build the index directly from the documents.
    # def load(self, docs: list[Document]) -> None:
    #     self.index = VectorStoreIndex.from_documents(docs)

    def query(self, query_str: str) -> Output:
        """
        Answer `query_str` with a CitationQueryEngine and return the result as
        a pydantic Output (serialized to JSON by the FastAPI endpoint).

        The engine retrieves the ``self.k`` most similar nodes. Each retrieved
        source node becomes one Citation whose source is
        "<title> - <level_id>" taken from the node metadata and whose score is
        the similarity rounded to two decimals.

        Args:
            query_str: The natural-language question.

        Returns:
            Output with the query, the response text and its citations.

        Raises:
            RuntimeError: If ``connect()`` has not been called yet.
        """
        if self.index is None:
            raise RuntimeError("QdrantService.connect() must be called before query()")

        query_engine = CitationQueryEngine.from_args(
            index=self.index,
            similarity_top_k=self.k
        )
        response = query_engine.query(query_str)

        # Process citations
        citations = []
        for node_with_score in response.source_nodes:
            node = node_with_score.node
            # Guard against a missing score so formatting never raises.
            raw_score = node_with_score.score
            score = "n/a" if raw_score is None else str(round(raw_score, 2))
            metadata = node.metadata
            source = f"{metadata.get('title')} - {metadata.get('level_id')}"
            # NOTE(review): in the notebook output the node text starts with a
            # "Source N:" label, which appears to correspond to the [N]
            # markers in the response.
            text = node.get_text()
            citations.append(Citation(source=source, text=text, score=score))

        # Create the Output object
        return Output(
            query=query_str,
            response=str(response),
            citations=citations
        )

Overwriting app/qdrant_service.py


In [4]:
from app.process_text import DocumentService
from app.qdrant_service import QdrantService
import yaml

# Load the notebook-level configuration (path is relative to the project root).
with open("app/config.yml", 'r') as file:
    config = yaml.safe_load(file)

# Re-use the Documents serialized earlier instead of re-parsing the text file.
docs = DocumentService.read_docs_from_json(config['processed_file_path'])
# NOTE: despite its name, `index` is the QdrantService wrapper, not the
# underlying vector index; later cells call `index.query(...)`.
index = QdrantService() 
index.connect() 
index.load(docs)


  from .autonotebook import tqdm as notebook_tqdm
  self._client.create_payload_index(


In [5]:
from app.qdrant_service import Output

def print_output(output: Output) -> None:
    """Print the query, the response and one line per citation (with score),
    separated by divider lines."""
    data = output.model_dump()
    divider = '--------------------------------'

    print(f"Query: {data['query']}\n")
    print(divider)
    print(f"Response: {data['response']}\n")
    print(divider)

    # Each citation is rendered as "<source> <text> [Score: <score>]" plus a newline.
    rendered = []
    for citation in data['citations']:
        rendered.append(
            f'{citation["source"]} {citation["text"]} [Score: {citation["score"]}]\n'
        )
    citations_text = ''.join(rendered)
    print(f"Citations: \n{citations_text}")

In [6]:
# Same theft question as the next cell, but worded with "pilfer" instead of
# "steal".
output = index.query("what happens if I pilfer?")
print_output(output)

Query: what happens if I pilfer?

--------------------------------
Response: If you pilfer, or steal, it is customary for you to be punished by losing a finger or a hand [3]. If you steal from a sept, you can be considered to have stolen from the gods and thus receive a harsher punishment [1].

--------------------------------
Citations: 
Laws of the Seven Kingdoms - 6.3. Source 1:
Those who steal from a sept can be considered to have stolen from the gods, and thus receive a harsher punishment.
 [Score: 0.78]
Laws of the Seven Kingdoms - 6.2. Source 2:
Pickpockets can likewise be punished by cutting off a hand.
 [Score: 0.78]
Laws of the Seven Kingdoms - 6.1. Source 3:
It is customary for a thief to be punished by losing a finger or a hand.
 [Score: 0.78]
Laws of the Seven Kingdoms - 6. Source 4:
Thievery
 [Score: 0.77]
Laws of the Seven Kingdoms - 7.1. Source 5:
Poaching is forbidden. Lords are generally not tolerant towards poachers and punishments for poaching can include being forc

In [7]:
# Theft question worded with "steal"; `index` is the QdrantService instance
# built in an earlier cell.
output = index.query("what happens if I steal?")
print_output(output)

Query: what happens if I steal?

--------------------------------
Response: If you steal, it is customary for you to be punished by losing a finger or a hand [2]. If you steal from a sept, you can be considered to have stolen from the gods, and thus receive a harsher punishment [1].

--------------------------------
Citations: 
Laws of the Seven Kingdoms - 6.3. Source 1:
Those who steal from a sept can be considered to have stolen from the gods, and thus receive a harsher punishment.
 [Score: 0.82]
Laws of the Seven Kingdoms - 6.1. Source 2:
It is customary for a thief to be punished by losing a finger or a hand.
 [Score: 0.8]
Laws of the Seven Kingdoms - 6.2. Source 3:
Pickpockets can likewise be punished by cutting off a hand.
 [Score: 0.8]
Laws of the Seven Kingdoms - 6. Source 4:
Thievery
 [Score: 0.8]
Laws of the Seven Kingdoms - 7.1. Source 5:
Poaching is forbidden. Lords are generally not tolerant towards poachers and punishments for poaching can include being forced to join the

### 3. Setting up the FastAPI endpoint and containerization using Docker:
- Create an API endpoint that accepts a query string and returns a JSON response.
- This endpoint should interact with the QdrantService to process the query and return the results.
- Ensure the output is correctly serialized using the Output class from pydantic.
- Use Docker to containerize the application. Feel free to modify the existing Dockerfile to suit any
changes made during development.

In [None]:
%%writefile app/main.py
"""
Please create an endpoint that accepts a query string, e.g., "what happens if I steal 
from the Sept?" and returns a JSON response serialized from the Pydantic Output class.
"""
import yaml
from fastapi import FastAPI, Query, HTTPException
from pydantic import BaseModel
from llama_index.core.schema import Document
from app.qdrant_service import QdrantService, Output
from app.process_text import DocumentService

# Read the application settings once at import time; the path is relative to
# the process working directory.
with open("app/config.yml", 'r') as file:
    config = yaml.safe_load(file)

app = FastAPI()
# Single shared service instance; its index is populated in the startup hook.
# `k` is the number of similar vectors retrieved per query.
qdrant_service = QdrantService(k=config['k'])


@app.on_event("startup")
def startup_event():
    """Populate the shared QdrantService on application startup: read the
    pre-processed documents from JSON, create the index and load them."""
    processed_path = config['processed_file_path']
    processed_docs = DocumentService.read_docs_from_json(processed_path)
    qdrant_service.connect()
    qdrant_service.load(processed_docs)

@app.get("/query", response_model=Output)
def query_endpoint(query: str = Query(..., min_length=1, max_length=1000)):
    """
    Answer a natural-language question about the indexed laws.

    Args:
        query: The question, 1 to 1000 characters long.

    Returns:
        The Output (query, response, citations) produced by QdrantService.

    Raises:
        HTTPException: 500 when the query fails for any reason.
    """
    try:
        return qdrant_service.query(query)
    except Exception as e:
        # Chain the original exception so the server-side traceback keeps the
        # root cause.
        # NOTE(review): `detail=str(e)` exposes internal error text to API
        # clients; consider a generic message plus server-side logging.
        raise HTTPException(status_code=500, detail=str(e)) from e

In [None]:
# !pip list 
# !pip freeze > requirements2.txt
!pip list | grep llama-index
!pip list | grep qdrant



In [None]:
import requests

query = "What happens if I steal from the Sept?"
# Targets the API on localhost port 80. A timeout is set so the cell fails
# instead of hanging forever when the server is not reachable.
response = requests.get("http://localhost/query", params={"query": query}, timeout=60)
print(response.json())

### docker

#### docker cleanup

In [38]:
!docker ps -q | xargs -r docker stop

01e04006c034


In [39]:
!docker ps

CONTAINER ID   IMAGE     COMMAND   CREATED   STATUS    PORTS     NAMES


In [40]:
!docker container prune -f

Deleted Containers:
5e8c24c6b980e1a182ee1a6023782753298ac9e709be2a10e4e455284cc83d93
01e04006c034dc7fdf892d279769d5f2d087e6ce83d36a1c808bf38626e43021
628f2a34d574ffe27d9516d0054d1663c88aa3934e1a183100b0eac21b317a76

Total reclaimed space: 20.48kB


In [41]:
!docker rmi qdrant_fastapi_app

Untagged: qdrant_fastapi_app:latest
Deleted: sha256:e84ca7dda4d32fe1ecfe7ac636c7fc36837aac4f4d3b3625e38476064439f456


In [17]:
# !docker builder prune -f

#### docker build and run

In [42]:
!docker build -t qdrant_fastapi_app .

[1A[1B[0G[?25l[+] Building 0.0s (0/0)  docker:desktop-linux
[?25h[1A[0G[?25l[+] Building 0.0s (0/1)                                    docker:desktop-linux
[?25h[1A[0G[?25l[+] Building 0.2s (11/12)                                  docker:desktop-linux
[34m => => transferring dockerfile: 751B                                       0.0s
[0m[34m => [internal] load metadata for docker.io/library/python:3.11-slim        0.0s
[0m[34m => [internal] load .dockerignore                                          0.0s
[0m[34m => => transferring context: 2B                                            0.0s
[0m[34m => [1/7] FROM docker.io/library/python:3.11-slim@sha256:82c07f2f6e35255b  0.0s
[0m[34m => => resolve docker.io/library/python:3.11-slim@sha256:82c07f2f6e35255b  0.0s
[0m[34m => [internal] load build context                                          0.0s
[0m[34m => => transferring context: 2.12kB                                        0.0s
[0m[34m => CACHED [2/7] WO

In [43]:
# Pass every variable defined in .env (e.g. OPENAI_API_KEY) into the container
!docker run -d -p 80:80 --env-file .env qdrant_fastapi_app

3d681456f3f80b003fda933584a841815c5417664b7f8723978c537328a71827


In [45]:
!docker ps

CONTAINER ID   IMAGE                COMMAND                  CREATED          STATUS         PORTS                NAMES
3d681456f3f8   qdrant_fastapi_app   "uvicorn app.main:ap…"   10 seconds ago   Up 8 seconds   0.0.0.0:80->80/tcp   admiring_mcnulty


In [46]:
!docker logs 3d681456f3f80b003fda933584a841815c5417664b7f8723978c537328a71827

INFO:     Started server process [1]
INFO:     Waiting for application startup.
  self._client.create_payload_index(
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:80 (Press CTRL+C to quit)


In [35]:
# !docker exec -it e713a790221d pip list | grep llama-index