In [28]:
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
)

from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import Chroma, FAISS
from langchain.storage import LocalFileStore
from langchain.chains import RetrievalQA
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda

# Data Loaders and Splitters

In [2]:
splitter = RecursiveCharacterTextSplitter()

loader = UnstructuredFileLoader("files/chapter_one.docx")

docs = loader.load()

splitter.split_documents(docs)

[Document(page_content="Part 1, Chapter 1\n\nPart One\n\n1\n\nIt was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.\n\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. The flat was seven flights up, and Winston, who was thirty-nine and had

In [3]:
loader = UnstructuredFileLoader("files/chapter_one.docx")

loader.load_and_split(text_splitter=splitter)

[Document(page_content="Part 1, Chapter 1\n\nPart One\n\n1\n\nIt was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.\n\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. The flat was seven flights up, and Winston, who was thirty-nine and had

In [4]:
loader = UnstructuredFileLoader("files/chapter_one.docx")

splitter = RecursiveCharacterTextSplitter(chunk_size=200)
loader.load_and_split(text_splitter=splitter)

[Document(page_content='Part 1, Chapter 1\n\nPart One\n\n1', metadata={'source': 'files/chapter_one.docx'}),
 Document(page_content='It was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors', metadata={'source': 'files/chapter_one.docx'}),
 Document(page_content='was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of', metadata={'source': 'files/chapter_one.docx'}),
 Document(page_content='cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory', metadata={'source': 'files/chapter_one.docx'}),
 Document(page_content='day in April, and the clocks were striking thirtee

In [5]:
loader = UnstructuredFileLoader("files/chapter_one.docx")

splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)
result = loader.load_and_split(text_splitter=splitter)

In [6]:
print(result[1].page_content)
print()
print(result[2].page_content)

It was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors

wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.


In [7]:
loader = UnstructuredFileLoader("files/chapter_one.docx")

splitter = CharacterTextSplitter(separator="\n", chunk_size=600, chunk_overlap=100)
loader.load_and_split(text_splitter=splitter)

Created a chunk of size 963, which is longer than the specified 600
Created a chunk of size 774, which is longer than the specified 600
Created a chunk of size 954, which is longer than the specified 600
Created a chunk of size 922, which is longer than the specified 600
Created a chunk of size 1168, which is longer than the specified 600
Created a chunk of size 821, which is longer than the specified 600
Created a chunk of size 700, which is longer than the specified 600
Created a chunk of size 745, which is longer than the specified 600
Created a chunk of size 735, which is longer than the specified 600
Created a chunk of size 1110, which is longer than the specified 600
Created a chunk of size 991, which is longer than the specified 600
Created a chunk of size 990, which is longer than the specified 600
Created a chunk of size 1741, which is longer than the specified 600
Created a chunk of size 2001, which is longer than the specified 600
Created a chunk of size 1900, which is longe

[Document(page_content='Part 1, Chapter 1\nPart One\n1\nIt was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.', metadata={'source': 'files/chapter_one.docx'}),
 Document(page_content='The hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. The fl

# Tiktoken

In [8]:
loader = UnstructuredFileLoader("files/chapter_one.docx")

splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100,
)
loader.load_and_split(text_splitter=splitter)

[Document(page_content='Part 1, Chapter 1\nPart One\n1\nIt was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. The flat was seven flights up, and Winston, who was thirty-nine and had a varic

# Vector Store

In [9]:
embedder = OpenAIEmbeddings()

vector = embedder.embed_query("Hi")

len(vector)

1536

In [10]:
vectors = embedder.embed_documents(
    [
        "Hi",
        "How",
        "are",
        "you",
    ]
)

vectors

[[-0.03629858192333016,
  -0.007224538187570183,
  -0.033718855541097256,
  -0.028663632678071895,
  -0.026865641732513677,
  0.03460482274185761,
  -0.012318847263635711,
  -0.007752209747023989,
  0.0019380524367559973,
  -0.0027018730680822924,
  0.024781013901381176,
  -0.0024771241998875156,
  -0.005732726535614379,
  -0.0029054499465086627,
  0.0066773232887656405,
  -0.0030324821179497563,
  0.03384914384922042,
  -0.0015032120884641694,
  0.021093827586875214,
  -0.008996472123429593,
  -0.02171921630874401,
  0.010384052476961034,
  0.0062441115908914826,
  0.007081220210444346,
  -0.01231233266196503,
  0.0008998100308185957,
  0.005876044512740216,
  -0.009888952994538019,
  -0.0030731974470689,
  -0.024572550373209837,
  0.010742348118267585,
  -0.013810659381252822,
  -0.024429232861745306,
  -0.014110324538845857,
  0.0024347802203507018,
  -0.018878911447619544,
  0.000561872345109932,
  -0.01127001874639878,
  0.018110203351640992,
  -0.009967126351940966,
  0.013028923

In [11]:
len(vectors[-1])

1536

In [12]:
cache_dir = LocalFileStore("./.cache/")

splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100,
)

loader = UnstructuredFileLoader("files/chapter_one.txt")
docs = loader.load_and_split(text_splitter=splitter)

embeddings = OpenAIEmbeddings()

cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    embeddings,
    cache_dir,
)

vectorstore = Chroma.from_documents(docs, cached_embeddings)

In [13]:
result = vectorstore.similarity_search("where does winston live?")

len(result)

4

In [14]:
result

[Document(page_content="The Ministry of Love was the really frightening one. There were no windows in it at all. Winston had never been inside the Ministry of Love, nor within half a kilometre of it. It was a place impossible to enter except on official business, and then only by penetrating through a maze of barbed-wire entanglements, steel doors, and hidden machine-gun nests. Even the streets leading up to its outer barriers were roamed by gorilla-faced guards in black uniforms, armed with jointed truncheons.\nWinston turned round abruptly. He had set his features into the expression of quiet optimism which it was advisable to wear when facing the telescreen. He crossed the room into the tiny kitchen. By leaving the Ministry at this time of day he had sacrificed his lunch in the canteen, and he was aware that there was no food in the kitchen except a hunk of dark-coloured bread which had got to be saved for tomorrow's breakfast. He took down from the shelf a bottle of colourless liqu

In [4]:
from langchain.chat_models import ChatOpenAI

llm = ChatOpenAI()
llm.invoke("Hello, world!")

  llm = ChatOpenAI()


AIMessage(content='Hello! How can I assist you today?', additional_kwargs={}, response_metadata={'token_usage': <OpenAIObject at 0x17dc3cdd0> JSON: {
  "prompt_tokens": 11,
  "completion_tokens": 10,
  "total_tokens": 21,
  "prompt_tokens_details": {
    "cached_tokens": 0,
    "audio_tokens": 0
  },
  "completion_tokens_details": {
    "reasoning_tokens": 0,
    "audio_tokens": 0,
    "accepted_prediction_tokens": 0,
    "rejected_prediction_tokens": 0
  }
}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-60399752-3f2c-42f8-a044-f5610d264639-0')

In [4]:
import langsmith.run_helpers

print(dir(langsmith.run_helpers))

['Any', 'AsyncGenerator', 'Awaitable', 'Callable', 'Dict', 'Generator', 'LangSmithExtra', 'List', 'Mapping', 'Optional', 'TYPE_CHECKING', 'TypedDict', '_METADATA', '_PARENT_RUN_TREE', '_PROJECT_NAME', '_TAGS', '_TraceableContainer', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', '_collect_extra', '_get_inputs', '_setup_run', 'annotations', 'as_runnable', 'cast', 'client', 'contextlib', 'contextvars', 'functools', 'futures', 'get_run_tree_context', 'inspect', 'is_traceable_function', 'logger', 'logging', 'os', 'run_trees', 'trace', 'traceable', 'traceback', 'utils', 'uuid']


# RetrievalQA

In [15]:
cache_dir = LocalFileStore("./.cache/")

splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100,
)

loader = UnstructuredFileLoader("files/chapter_one.txt")
docs = loader.load_and_split(text_splitter=splitter)

embeddings = OpenAIEmbeddings()

cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    embeddings,
    cache_dir,
)

vectorstore = FAISS.from_documents(docs, cached_embeddings)

In [22]:
llm = ChatOpenAI(model="gpt-4o-mini")
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="map_rerank",
    retriever=vectorstore.as_retriever(),
)

chain.run("Where does Winston live?")



''

# Stuff LCEL Chain

In [23]:
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.1,
)

cache_dir = LocalFileStore("./.cache/")

splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100,
)

loader = UnstructuredFileLoader("files/chapter_one.txt")
docs = loader.load_and_split(text_splitter=splitter)

embeddings = OpenAIEmbeddings()

cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    embeddings,
    cache_dir,
)

vectorstore = FAISS.from_documents(docs, cached_embeddings)

In [27]:
retriver = vectorstore.as_retriever()

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a helpful assistant. Answer questions using only the following context. If you don't know the answer jsut say you don't know. Don't make it up:\n\n{context}",
        ),
        ("human", "{question}"),
    ]
)

chain = (
    {
        "context": retriver,
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
)

chain.invoke("Describe Victory Mashions")

AIMessage(content='Victory Mansions is described as a building that Winston Smith enters at the beginning of the story. It is characterized by a hallway that smells of boiled cabbage and old rag mats. The building has glass doors, and the lift is seldom working, requiring Winston to climb seven flights of stairs to reach his flat. The overall atmosphere of Victory Mansions is grimy, and it is part of a landscape that includes rotting nineteenth-century houses and bombed sites. The building itself is not elaborately described, but it serves as a setting that reflects the bleakness of the world Winston lives in.', additional_kwargs={}, response_metadata={'token_usage': <OpenAIObject at 0x37631a030> JSON: {
  "prompt_tokens": 2173,
  "completion_tokens": 119,
  "total_tokens": 2292,
  "prompt_tokens_details": {
    "cached_tokens": 0,
    "audio_tokens": 0
  },
  "completion_tokens_details": {
    "reasoning_tokens": 0,
    "audio_tokens": 0,
    "accepted_prediction_tokens": 0,
    "reje

# Map Reduce LCEL Chain

In [29]:
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.1,
)

cache_dir = LocalFileStore("./.cache/")

splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100,
)

loader = UnstructuredFileLoader("files/chapter_one.txt")
docs = loader.load_and_split(text_splitter=splitter)

embeddings = OpenAIEmbeddings()

cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    embeddings,
    cache_dir,
)

vectorstore = FAISS.from_documents(docs, cached_embeddings)

retriver = vectorstore.as_retriever()

In [32]:
map_doc_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
            Use the following portion of a long document to see if any of the text is relevant to answer the question. Return any relevant text verbatim. If there is no relevant text, return : ''
            -------
            {context}
            """,
        ),
        ("human", "{question}"),
    ]
)
map_doc_chain = map_doc_prompt | llm


def map_docs(inputs):
    documents = inputs["documents"]
    question = inputs["question"]
    return "\n\n".join(
        map_doc_chain.invoke(
            {"context": doc.page_content, "question": question}
        ).content
        for doc in documents
    )


map_chain = {"documents": retriver, "question": RunnablePassthrough()} | RunnableLambda(
    map_docs
)


final_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
            Given the following extracted parts of a long document and a question, create a final answer. 
            If you don't know the answer, just say that you don't know. Don't try to make up an answer.
            ------
            {context}
            """,
        ),
        ("human", "{question}"),
    ]
)

chain = {"context": map_chain, "question": RunnablePassthrough()} | final_prompt | llm

result = chain.invoke("Describe Victory Mashions")

In [34]:
print(result.content)

Victory Mansions is a building where Winston Smith lives. It is characterized by glass doors that let in gritty dust, and the hallway has a smell of boiled cabbage and old rag mats. The building is poorly maintained, with a lift that seldom works and electricity cut off during daylight hours as part of an economy drive. Winston's flat is located seven flights up, contributing to the generally dreary and uninviting atmosphere of the environment.
