### Document Loader

In [None]:
# CSV: load 'some_data/penguins.csv' with LangChain's CSVLoader.
from langchain.document_loaders import CSVLoader
loader = CSVLoader('some_data/penguins.csv')
data = loader.load()
print(data)  # a list (presumably one entry per CSV row -- confirm against CSVLoader docs)
print(data[0].page_content)  # text content of the first loaded entry

In [None]:
# HTML: load a local HTML file with BSHTMLLoader.
# The loader is installed alongside beautifulsoup4 here (notebook shell command).
!pip install beautifulsoup4
from langchain.document_loaders import BSHTMLLoader
loader = BSHTMLLoader('some_data/some_website.html')
data = loader.load()
data  # bare expression: the notebook displays the loaded result

In [None]:
# PDF: load a local PDF with PyPDFLoader (pypdf is installed via a notebook shell command).
!pip install pypdf
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader('some_data/SomeReport.pdf')
# load_and_split() loads the file and splits it in one call.
pages = loader.load_and_split()
type(pages)  # list
print(pages[0].page_content)  # text content of the first entry

### Integrations

In [None]:
# Integration example: load a Hacker News page through HNLoader.
from langchain.document_loaders import HNLoader  # Hacker News
# NOTE(review): `link_of_hacker_news` is not defined anywhere in the visible
# source; it must be assigned (presumably a Hacker News URL string) before
# this cell runs -- confirm.
loader = HNLoader(link_of_hacker_news)
data = loader.load()
# Show both the text and the metadata of the first loaded entry.
print(data[0].page_content, data[0].metadata)

from langchain.prompts import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.chat_models import ChatOpenAI

# Read the OpenAI API key. The context manager closes the file handle, and
# strip() removes the trailing newline that key files usually end with
# (a newline inside the key would corrupt the request header).
with open('key.txt') as f:
    api_key = f.read().strip()
model = ChatOpenAI(openai_api_key=api_key)

# Single human message asking for a one-sentence summary of {document}.
human_prompt = HumanMessagePromptTemplate.from_template('Please give me a single sentence summary of the following:\n{document}')
chat_prompt = ChatPromptTemplate.from_messages([human_prompt])

# Fill the template with the first loaded document and send it to the chat model.
result = model(chat_prompt.format_prompt(document=data[0].page_content).to_messages())
print(result.content)

In [None]:
# Example: use a Wikipedia page to answer a question
# !pip install wikipedia
from langchain.prompts import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import WikipediaLoader


def answer_question_about(person_name, question):
    """Print the chat model's answer to ``question`` about ``person_name``.

    The first Wikipedia result for ``person_name`` is passed to the model as
    extra context alongside the question.

    Args:
        person_name: Query string handed to ``WikipediaLoader``.
        question: Question to ask the model.

    Returns:
        None. The model's reply is printed.

    Raises:
        IndexError: If the Wikipedia loader returns no documents.
    """
    # Get Wikipedia Article (only the first match is requested and used)
    docs = WikipediaLoader(query=person_name, load_max_docs=1)
    context_text = docs.load()[0].page_content

    # Connect to OpenAI Model. The context manager closes the key file, and
    # strip() drops a trailing newline that would corrupt the key.
    with open('keyfile.txt') as f:
        api_key = f.read().strip()
    model = ChatOpenAI(openai_api_key=api_key)

    # Ask Model Question
    human_prompt = HumanMessagePromptTemplate.from_template('Answer this question\n{question}, here is some extra context:\n{document}')

    # Assemble chat prompt
    chat_prompt = ChatPromptTemplate.from_messages([human_prompt])

    # Use the article fetched above as context. The previous version read the
    # unrelated module-level `data` variable here, so the Wikipedia text was
    # never sent to the model.
    result = model(chat_prompt.format_prompt(question=question, document=context_text).to_messages())

    print(result.content)

### Document Transformers

In [None]:
# Read the whole speech into memory; later cells reuse `speech_text`.
with open('some_data/FDR_State_of_Union_1944.txt') as file:
    speech_text = file.read()

# Split by character: break on blank lines ("\n\n") with a target chunk size of 1000.
from langchain.text_splitter import CharacterTextSplitter
# NOTE(review): the original comment called 1000 the default chunk_size; the
# LangChain TextSplitter default appears to be 4000 -- confirm against the docs.
text_splitter = CharacterTextSplitter(separator="\n\n",chunk_size=1000)
# create_documents takes a list of raw strings, hence the one-element list.
texts = text_splitter.create_documents([speech_text])
print(type(texts)) # list
print('\n')
print(texts[0].page_content)
type(texts[0]) # langchain.schema.document.Document

In [None]:
# Split by token (requires tiktoken).
# !pip install tiktoken
# Relies on `CharacterTextSplitter` and `speech_text` from the previous cell.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size = 500) # chunk size is now a hard length measured in tokens
texts = text_splitter.split_text(speech_text) # texts is a list; this rebinds the `texts` name from the previous cell

### Text Embeddings

In [None]:
from langchain.embeddings import OpenAIEmbeddings
import os
# Export the OpenAI API key for the embeddings client. The context manager
# closes the key file, and strip() removes a trailing newline.
with open('key.txt') as f:
    os.environ['OPENAI_API_KEY'] = f.read().strip()

# Embed a single query string.
embeddings = OpenAIEmbeddings()
text = "Some normal text to send to OpenAI to be embedded into a N dimensional vector"
embedded_text = embeddings.embed_query(text)
type(embedded_text)  # list

# Embed documents
from langchain.document_loaders import CSVLoader
loader = CSVLoader('some_data/penguins.csv')
data = loader.load()
type(data)  # list
type(data[0])  # langchain.schema.document.Document
# embed_documents takes plain strings, so pass each document's page_content.
# The loop variable is named `doc` so it does not shadow the `text` string above.
embedded_docs = embeddings.embed_documents([doc.page_content for doc in data])

### Vector Store

In [None]:
# Print the installed versions of chromadb, langchain and pydantic so they can
# be compared against a known-good combination.
# Be careful with pydantic: install it last, since chroma and langchain
# install it automatically as a dependency.
# To install a specific version of a package:
# !pip install package_name==0.3.26
import chromadb
print(chromadb.__version__)
import langchain
print(langchain.__version__)
import pydantic
print(pydantic.__version__)

import chromadb
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader

# Load the speech as a document
loader = TextLoader("some_data/FDR_State_of_Union_1944.txt")
documents = loader.load()

# Split it into chunks of at most 500 tokens
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=500)
docs = text_splitter.split_documents(documents)

import os
# Export the OpenAI API key for the embedding function. The context manager
# closes the key file, and strip() removes a trailing newline.
with open('key.txt') as f:
    os.environ['OPENAI_API_KEY'] = f.read().strip()
embedding_function = OpenAIEmbeddings()

# Embed the chunks and load them into a Chroma store kept on disk
db = Chroma.from_documents(docs, embedding_function, persist_directory='./speech_embedding_db')

# Helpful to force a save
db.persist()

# Load Embeddings from db: open the persisted store from disk, using the same
# embedding function that was used to build it.
db_connection = Chroma(persist_directory='./speech_embedding_db/', embedding_function=embedding_function)

# Get a similarity search: the query is plain text; the first result is printed.
new_doc = "What did FDR say about the cost of food law?"
docs = db_connection.similarity_search(new_doc)  # rebinds `docs` (previously the split chunks)
print(docs[0].page_content)

# Add a second document to the same persisted store
# Load the document
loader = TextLoader("some_data/Lincoln_State_of_Union_1862.txt")
documents = loader.load()
# Split it into chunks of at most 500 tokens
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=500)
docs = text_splitter.split_documents(documents)
# Load it into Chroma, using the same persist_directory as the first speech
db = Chroma.from_documents(docs, embedding_function,persist_directory='./speech_embedding_db')
# Force a save, as done after every other from_documents call in this
# notebook, so the newly added chunks are written to disk.
db.persist()

### Vector Store Retriever

In [None]:
# Search parameters belong on the retriever itself. Keyword arguments passed
# to get_relevant_documents() are not applied by the vector-store retriever,
# so the previous version silently ignored score_threshold and k.
# A score threshold also requires the "similarity_score_threshold" search type.
search_kwargs = {"score_threshold":0.8, "k":4}
retriever = db_connection.as_retriever(search_type="similarity_score_threshold",
                                       search_kwargs=search_kwargs)
docs = retriever.get_relevant_documents("President")
# With a threshold in effect the result may be empty; guard the index so the
# cell shows nothing instead of raising IndexError.
docs[0].page_content if docs else None

### Use Chat Model to Multi Query

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.retrievers.multi_query import MultiQueryRetriever
question = "When was this declassified?"
# No API key is passed here; presumably it is read from the OPENAI_API_KEY
# environment variable set in an earlier cell -- confirm.
llm = ChatOpenAI(temperature=0)
# Wrap the vector-store retriever so the chat model takes part in retrieval.
# NOTE(review): `db` is whichever store was most recently assigned by an earlier cell.
retriever_from_llm = MultiQueryRetriever.from_llm(retriever=db.as_retriever(), llm=llm)

# Set logging for the queries (uncomment to enable INFO logging for the multi_query module)
# import logging
# logging.basicConfig()
# logging.getLogger('langchain.retrievers.multi_query').setLevel(logging.INFO)

unique_docs = retriever_from_llm.get_relevant_documents(query=question)
print(unique_docs[0].page_content)  # text of the first retrieved document

### Context Compression

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
# The compressor is built from a chat model and wrapped around the plain
# vector-store retriever.
llm = ChatOpenAI(temperature=0)
compressor = LLMChainExtractor.from_llm(llm)
compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=db_connection.as_retriever())

# Uncompressed baseline: the raw similarity-search hit for the question.
docs = db_connection.similarity_search('When was this declassified?')
print(docs[0])

# Compressed result for the same question. The previous version printed
# `compressed_docs` without ever assigning it (NameError) and never queried
# the compression retriever.
compressed_docs = compression_retriever.get_relevant_documents('When was this declassified?')
print(compressed_docs[0].page_content)

### Example

In [None]:
# Ask a Legal Research Assistant Bot about the US Constitution

# Takes in a question about the US Constitution and returns the most relevant
# part of the constitution. Notice it may not directly answer the actual question!
# This cell relies on names imported by earlier cells (TextLoader,
# CharacterTextSplitter, OpenAIEmbeddings, Chroma).

# PART ONE:
# Load "some_data/US_Constitution.txt" into a Document object
loader = TextLoader("some_data/US_Constitution.txt")
documents = loader.load()

# PART TWO
# Split the document into chunks (you choose how and what size);
# here: token-based splitting with chunk_size=500.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=500)
docs = text_splitter.split_documents(documents)

# PART THREE
# Embed the documents (now in chunks) into a persisted ChromaDB
embedding_function = OpenAIEmbeddings()
db = Chroma.from_documents(docs, embedding_function, persist_directory='./US_Constitution')
db.persist()  # force a save to disk

def us_constitution_helper(question):
    """Return the part of the US Constitution most relevant to ``question``.

    PART FOUR: uses ChatOpenAI and ContextualCompressionRetriever on top of
    the module-level Chroma store ``db`` to return the most relevant part of
    the documents.

    Args:
        question: Natural-language question about the US Constitution.

    Returns:
        The ``page_content`` of the first compressed document, or a short
        explanatory message when no relevant passage is returned.
    """
    # A plain db.similarity_search(question) returns whole chunks; the
    # compression retriever is used so only the relevant part comes back.
    llm = ChatOpenAI(temperature=0)
    compressor = LLMChainExtractor.from_llm(llm)

    compression_retriever = ContextualCompressionRetriever(base_compressor=compressor,
                                                           base_retriever=db.as_retriever())

    compressed_docs = compression_retriever.get_relevant_documents(question)

    # The compression step can return no passages at all; indexing [0] on an
    # empty list would raise IndexError, so report that case explicitly.
    if not compressed_docs:
        return "No relevant passage was found in the US Constitution for this question."

    return compressed_docs[0].page_content

print(us_constitution_helper("What is the 13th Amendment?"))