In [1]:
import os, json, re, getpass
from dotenv import load_dotenv
import pandas as pd
from langchain_groq import ChatGroq
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from langchain_core.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
import config

In [3]:
import config
from dotenv import load_dotenv
load_dotenv(".env", override=True)
if "GROQ_API_KEY" not in os.environ:
    os.environ["GROQ_API_KEY"] = config.groq_key

In [20]:
llm = ChatGroq(model="llama3-8b-8192")

In [4]:
system_template = 'You are an reciepe assistant that specializes in {diet_pref} dishes that can be prepared in {time} time'
sys_msg_prmpt = SystemMessagePromptTemplate.from_template(system_template)

In [5]:
human_template = '{recipe_req}'
hmn_msg_prmpt = HumanMessagePromptTemplate.from_template(human_template)

In [6]:
sys_msg_prmpt.input_variables, hmn_msg_prmpt.input_variables

(['diet_pref', 'time'], ['recipe_req'])

In [7]:
chat_prmpt = ChatPromptTemplate.from_messages([sys_msg_prmpt, hmn_msg_prmpt])
chat_prmpt.input_variables

['diet_pref', 'recipe_req', 'time']

In [8]:
prompt = chat_prmpt.format_prompt(diet_pref='Indian', time='20 min', recipe_req='snack').to_messages()
result = llm.invoke(prompt)
result

# Exercise 1: Prompt and Models

Creating a function to generate a hobby itneary for a given budget using llm

In [40]:
def travel_idea(hobby, budget):

    llm = ChatGroq(model="llama3-8b-8192")
    system_template = 'Create a travel itenary for {hobby} within the given budget {budget}.'
    system_prmpt = SystemMessagePromptTemplate.from_template(system_template)
    chat_prmpt = ChatPromptTemplate.from_messages([system_prmpt])
    prompt = chat_prmpt.format_prompt(hobby=hobby, budget=budget).to_messages()
    result = llm.invoke(prompt)

    return result

In [41]:
travel_idea('fishing', '$10000').to_json()["kwargs"]["content"]

"What a great adventure you're about to embark on! With a budget of $10,000, you can explore some fantastic fishing destinations around the world. Here's a suggested itinerary for you:\n\n**Day 1-5: Galapagos Islands, Ecuador**\n\n* Fly from Quito, Ecuador to the Galapagos Islands ( approx. $800 return)\n* Spend 5 days exploring the unique fishing spots in the islands, targeting species like marlin, sailfish, and tuna\n* Guided fishing trips with local experts will cost around $200-300 per day\n* Accommodation: Stay in a comfortable eco-lodge or hotel in Puerto Ayora, Santa Cruz Island (approx. $100-150 per night)\n\n**Day 6-12: Costa Rica**\n\n* Fly from the Galapagos to San Jose, Costa Rica (approx. $500 return)\n* Spend 6 days exploring the Pacific coast of Costa Rica, targeting species like tarpon, snook, and roosterfish\n* Guided fishing trips with local experts will cost around $250-350 per day\n* Accommodation: Stay in a beachfront hotel or eco-lodge in Tamarindo or Puerto Viejo

# Few Shot Prompt Template

Give an example of input output pairs for the LLM

In [28]:
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, AIMessagePromptTemplate

In [51]:
# AI Bot legal --> Simple Terms

system_template = 'You are a helpful legal assistant that translates complex legal terms into plain and simple terms.'
sys_msg_prompt = SystemMessagePromptTemplate.from_template(system_template)

In [46]:
# Few Shot Examples

# Input Human
legal_text = """This contract, effective upon execution, requires both parties to adhere to its terms. 
                Any changes must be agreed upon in writing. Breach of any provision entitles the other 
                party to pursue legal remedies. All disputes arising from this agreement will be resolved 
                in the designated jurisdiction's courts."""
example_input_1 = HumanMessagePromptTemplate.from_template(legal_text)
# AI Output
output = """Contract effective upon signing; disputes resolved in specified court."""

example_output_1 = AIMessagePromptTemplate.from_template(output)

In [47]:
human_template = "{legal_text}"
human_prompt = HumanMessagePromptTemplate.from_template(human_template)

In [55]:
chat_prmpt = ChatPromptTemplate.from_messages([sys_msg_prompt, human_prompt, example_input_1, example_output_1])

In [56]:
example_legal_text = """This agreement binds both parties to its terms upon signing. 
                        Any changes must be in writing and signed by both parties. 
                        If either party fails to follow the terms, the other may seek legal remedies. 
                        Any disputes will be resolved in the court of the specified jurisdiction."""
request = chat_prmpt.format_prompt(legal_text=example_legal_text).to_messages()

In [58]:
llm = ChatGroq(model="llama-3.1-8b-instant")
result = llm.invoke(request)
result

AIMessage(content='', additional_kwargs={}, response_metadata={'token_usage': {'completion_tokens': 1, 'prompt_tokens': 188, 'total_tokens': 189, 'completion_time': 0.001333333, 'prompt_time': 0.044276491, 'queue_time': -9223372036.899054, 'total_time': 0.045609824}, 'model_name': 'llama-3.1-8b-instant', 'system_fingerprint': 'fp_f66ccb39ec', 'finish_reason': 'stop', 'logprobs': None}, id='run-01fef76d-2cc9-48be-9355-5eac6394b9e3-0', usage_metadata={'input_tokens': 188, 'output_tokens': 1, 'total_tokens': 189})

Few shot prompt not working with groq, maybe due to token limits

# Parsing Outputs

In [75]:
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser, CommaSeparatedListOutputParser

# Step 1: Import Parser and make an instance
output_parser = JsonOutputParser()
output_parser.get_format_instructions()

'Return a JSON object.'

In [76]:
human_template = "{request}\n{format_instructions}"
human_prompt = HumanMessagePromptTemplate.from_template(human_template)
chat_prompt = ChatPromptTemplate.from_messages([human_prompt])
model_request = chat_prompt.format_prompt(request='Give me five facts about dogs',
                                          format_instructions=output_parser.get_format_instructions()).to_messages()
output_parser.parse(llm.invoke(model_request).content)

{'breeds': 340,
 'communication': {'body_language': True,
  'barking': True,
  'whining': True,
  'vocalizations': True},
 'senses': {'smell': '700 times stronger than humans',
  'hearing': '4 times more sensitive than humans',
  'vision': 'limited color spectrum compared to humans'},
 'lifespan': {'average': '10-13 years', 'varies_by_breed': True},
 'training': {'capable_of_learning': True,
  'responds_to_positive_reinforcement': True}}

## What to do when the parser is not good enough for formating outputs

In [100]:
from langchain.output_parsers import DatetimeOutputParser, OutputFixingParser
op_parse = DatetimeOutputParser()
op_parse.get_format_instructions()

"Write a datetime string that matches the following pattern: '%Y-%m-%dT%H:%M:%S.%fZ'.\n\nExamples: 0827-02-04T18:01:46.873161Z, 0892-12-02T12:27:08.938545Z, 0408-06-21T08:30:56.457048Z\n\nReturn ONLY this string, no other words!"

In [115]:
system_template = "Provide the date required by the user in the format {system_format_instruction}"
system_prompt = SystemMessagePromptTemplate.from_template(system_template)

In [121]:
template_text = '{request}\n{format_instructions}'
human_prompt = HumanMessagePromptTemplate.from_template(template_text)
chat_prmpt = ChatPromptTemplate.from_messages([human_prompt]) # system_prompt, 

In [122]:
model_request = chat_prmpt.format_prompt(request="What date was the 13th Ammendement ratified in te US?",
                                        format_instructions = op_parse.get_format_instructions(),
                                        system_format_instruction =  op_parse.get_format_instructions())

In [123]:
llm = ChatGroq(model="llama3-8b-8192", temperature=0)
result = llm.invoke(model_request).content

In [124]:
result

'December 18, 1865'

In [125]:
## OutputParser doesn't always fix the problem
new_parser = OutputFixingParser.from_llm(parser=op_parse, llm=llm)
#op_parse.parse(result)
new_parser.parse(result)

datetime.datetime(2022, 12, 18, 0, 0)

### Using Pydantic Library
Is used for type validation

- List doesn't work in the parser and groq output

In [182]:
from langchain_core.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field

In [189]:
class Scientist(BaseModel):

    name: str = Field(description='Name of a Scientist')
    discoversies: tuple = Field(description='Python tuple of discoveries')


In [190]:
parser = PydanticOutputParser(pydantic_object=Scientist)

In [191]:
print(parser.get_format_instructions())

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"properties": {"name": {"description": "Name of a Scientist", "title": "Name", "type": "string"}, "discoversies": {"description": "Python tuple of discoveries", "items": {}, "title": "Discoversies", "type": "array"}}, "required": ["name", "discoversies"]}
```


In [192]:
system_prompt = SystemMessagePromptTemplate.from_template("You should only return the output in the requested format and nothing else.")
human_prompt = HumanMessagePromptTemplate.from_template("{request}\n Output in {format} format")
chat_prompt = ChatPromptTemplate.from_messages([system_prompt, human_prompt])
request = chat_prompt.format_prompt(request="Tell me about a famous Scientist",
                                    format = parser.get_format_instructions()).to_messages()

In [193]:
result = llm.invoke(request)
result.content

'{"name": "Marie Curie", "discoversies": ["Radioactivity", "Elements Polonium and Radium"]}'

In [194]:
parser.parse(result.content)

Scientist(name='Marie Curie', discoversies=('Radioactivity', 'Elements Polonium and Radium'))

In [174]:
type(parser.parse(result.content))

__main__.Scientist

## Serialization: Saving and Loading Prompts

In [197]:
from langchain_core.prompts import PromptTemplate

In [199]:
template = "Tell me a fact about {planet}"
prompt = PromptTemplate(template=template, input_variables=["planet"])

In [200]:
prompt.save("saved_prompts/my_prompt.json")

In [201]:
from langchain.prompts import load_prompt
loaded_prompt = load_prompt("saved_prompts/my_prompt.json")
loaded_prompt

PromptTemplate(input_variables=['planet'], input_types={}, partial_variables={}, template='Tell me a fact about {planet}')

# Excercise 2: Build a historical quiz bot

Build a quiz bot which does the following:

- Class should be present
- Write a class which has the date as the correct answer
    - First method which generates a history question on the given topic
    - Second method, send the generated question to the AI chatbot which returns the datetime object
    - Get the user answer: yyyy-mm-dd
    - Find how off the user's answer was from the ai answer

In [4]:
from langchain_groq import ChatGroq
import pandas as pd
from langchain.output_parsers import DatetimeOutputParser
from langchain_core.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
import config
import os
from dotenv import load_dotenv
load_dotenv(".env", override=True)
if "GROQ_API_KEY" not in os.environ:
    os.environ["GROQ_API_KEY"] = config.groq_key

In [2]:
class Solution:

    def __init__(self):
        self.llm = ChatGroq(model="llama3-8b-8192", temperature=0)
        self.system_prompt = SystemMessagePromptTemplate.from_template("You are a datetime bot which returns only the asked date in the {ques_format} format, nothing else")
        self.op_parser = DatetimeOutputParser()


    def gen_ques(self, topic:str):
        """Generate a question for the given topic"""

        # human_template = "On which date did {topic} end?"
        system_template = """You generate a single question inquiring about a date related to {topic}. You only return the quiz question. 
                             Do not ask about multiple dates."""
        system_prompt = SystemMessagePromptTemplate.from_template(template=system_template)
        human_prompt = HumanMessagePromptTemplate.from_template(template="Generate a question about {topic}.")
        chat_prompt = ChatPromptTemplate.from_messages([system_prompt, human_prompt])
        request = chat_prompt.format_prompt(topic=topic).to_messages()
        question = self.llm.invoke(request)
        print("Question: ",question.content)
        return question.content
    
    def get_ai_ans(self, question):
        
        chat_prompt =  ChatPromptTemplate.from_messages([self.system_prompt, question])
        request = chat_prompt.format_prompt(ques_format=self.op_parser.get_format_instructions()).to_messages()
        model_ans = self.llm.invoke(request)
        parsed_ans = self.op_parser.parse(model_ans.content)

        return parsed_ans
    
    def get_user_ans(self):

        date = input("Enter date in dd-mm-yyyy format: ")
        date = pd.to_datetime(date, format="%d-%m-%Y")
        print("User date: ", date)
        return date
    
    def check_ans(self, ai_ans, user_ans):

        system_prompt = SystemMessagePromptTemplate.from_template("""You have to tell the difference between two dates in days, months and years, with sign values date_1 - date_2. 
                                                                  Only give the final difference in a single line, no explanation required. The answer should be similar to:
                                                                  The difference in dates is x years, y months and z dates, where x, y, z are the differences with the proper sign values. 
                                                                  """)
        human_prompt = HumanMessagePromptTemplate.from_template("Find  {date_1} - {date_2}")
        chat_prompt = ChatPromptTemplate.from_messages([system_prompt, human_prompt])
        request = chat_prompt.format_prompt(date_1=ai_ans, date_2=user_ans).to_messages()
        ans = self.llm.invoke(request).content
        print(ans)



In [5]:
obj = Solution()
request = obj.gen_ques("World War 2")
ai_ans = obj.get_ai_ans(request)
user_date = obj.get_user_ans()
print("AI ans: ", ai_ans)
obj.check_ans(ai_ans, user_date)

Question:  On what date did the United States drop an atomic bomb on the Japanese city of Hiroshima?
User date:  2023-12-04 00:00:00
AI ans:  1945-08-06 06:15:27
The difference in dates is 78 years, 4 months and 28 days.


# Data Connections

- Load Documents: pdf, html, etc
- Convert to vector embeddings
- Query data from those documents

install pypdf and beautifulsoup4

In [1]:
import pandas as pd
from langchain_core.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
from langchain_groq import ChatGroq
from langchain.output_parsers import DatetimeOutputParser
import config
import os
from dotenv import load_dotenv
load_dotenv(".env", override=True)
if "GROQ_API_KEY" not in os.environ:
    os.environ["GROQ_API_KEY"] = config.groq_key
os.environ["HTTP_PROXY"] = config.proxy
os.environ["HTTPS_PROXY"] = config.proxy

In [13]:
from langchain.document_loaders import CSVLoader, BSHTMLLoader, PyPDFLoader

In [3]:
loader = CSVLoader("data/inputs/test_data.csv") # does lazy loading
data = loader.load()
type(data[0])

langchain_core.documents.base.Document

In [4]:
print(data[0].page_content)

ï»¿Name: Jane Doe
Point1: 15/20
Point2: 30/50
Point3: 18/20
Extra: Had extraodinary participation in MUN.


In [9]:
loader = BSHTMLLoader("data/inputs/test_html.html")

In [10]:
data = loader.load()
data

[Document(metadata={'source': 'data/inputs/test_html.html', 'title': ''}, page_content='Heading 1')]

In [12]:
data[0].page_content

'Heading 1'

In [15]:
loader = PyPDFLoader("data/inputs/test_pdf.pdf")
data = loader.load()
data[0].page_content

'PRAJUAL PILLAI\nML Engineer\n@prajualpillai05@gmail.com ♂phone+91-9415008195 /githubhttps:/ /github.com/prajualpillai\n/linkedinhttps:/ /www.linkedin.com/in/prajual-pillai-b4945112b/\nEXPERIENCE\nML Engineer II\nNissan Digital LLP\nὌ5September 2023 – Ongoing ♂¶ap-¶arkerTrivandrum, India\n•Developing an in-house MMM model with mape of 22%\n•Developed a forecasting model to predict the powetrain mix till 2030\n•Part of the team which was runner up in Inter-Nissan AWS Gameday\n•Developed a used car price forecasting poc which was one of the 8\nshortlisted ideas(out of 200+ ) for Nissan Ideathon/Hackathon.\n•Held a session on the usage of DVC for model & data versioning\nML Engineer\nSearce Inc.\nὌ5July 2021 – September 2023 ♂¶ap-¶arkerPune, India\nClient: Futureproof July, 2023 – July, 2023\n•Developed a CI/CD pipeline for model training pipeline using DVC\n•Set up the architecture for integrating DVC with VertexAI\n•Developed the model and data versioning module for the pipeline\nClient

## Integrations

- Document loaders which are integrated to some specific 3rd party, such as google cloud, wiki, aws, etc 

In [5]:
from langchain.document_loaders import HNLoader
os.environ["HTTP_PROXY"] = config.proxy
os.environ["HTTPS_PROXY"] = config.proxy

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [6]:
loader = HNLoader("https://en.wikipedia.org/wiki/Main_Page")

In [7]:
data = loader.load()
data

SSLError: HTTPSConnectionPool(host='en.wikipedia.org', port=443): Max retries exceeded with url: /wiki/Main_Page (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1129)')))

## Document Loader Exercise

Using wikipedia integration, make a function that accepts a famous historical figure name and a question about them, and then uses a ChatModel to answer questions with the additional context.

- Make a call to wiki to get the doc
- Insert additional context to the chat model and answer the user's question

In [1]:
import pandas as pd
from langchain_groq import ChatGroq
from langchain_core.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
import config
import os
from langchain.document_loaders import WikipediaLoader
if "GROQ_API_KEY" not in os.environ:
    os.environ["GROQ_API_KEY"] = config.groq_key
# os.environ["HTTP_PROXY"] = config.proxy
# os.environ["HTTPS_PROXY"] = config.proxy

In [2]:
# doc = WikipediaLoader(query="HUNTER X HUNTER", load_max_docs=2).load()

In [12]:
llm = ChatGroq(model="llama3-8b-8192", temperature=0)

def get_wiki_content(topic):
    content = WikipediaLoader(query=topic, load_max_docs=2).load()
    ans = ''
    for page in content:
        ans += " " + page.page_content
    return ans

def get_answer(content, question):

    system_template = "Given the content {content}, fetch the answer to the question asked by the user."
    system_prompt = SystemMessagePromptTemplate.from_template(system_template)

    human_template = "{question}"
    human_prompt = HumanMessagePromptTemplate.from_template(human_template)

    chat_prompt = ChatPromptTemplate([system_prompt, human_prompt])
    request = chat_prompt.format_prompt(content = content, question = question).to_messages()
    ans = llm.invoke(request).content

    return ans

content = get_wiki_content("Nicholas Cage")
print(content)
ans = get_answer(content, "When was he born?")
print("Ans: ", ans)

 Nicolas Kim Coppola (born January 7, 1964), known professionally as Nicolas Cage, is an American actor and film producer. He is the recipient of various accolades, including an Academy Award, a Screen Actors Guild Award, and a Golden Globe Award as well as nominations for two BAFTA Awards. Known for his versatility as an actor, his participation in various film genres has gained him a cult following.
Born into the Coppola family, Cage began his career in films such as Fast Times at Ridgemont High (1982) and Valley Girl (1983), as well as various films by his uncle Francis Ford Coppola such as Rumble Fish (1983), The Cotton Club (1984), and Peggy Sue Got Married (1986). He received critical success for his roles in Moonstruck and  Raising Arizona (both 1987), before earning an Academy Award for Best Actor for the dramatic film Leaving Las Vegas (1995). He was Oscar-nominated for playing twins Charlie and Donald Kaufman in the comedy-drama film Adaptation (2002).
Cage established himsel

## Document Transformers

The length of string fed from a document might be too large to feed into a model. Transformers splits these strings into chunks which will later serve useful for uembeddings, which can be then looked up using a dustance similarity later on.

Splitting can be of two types:
- Splitting on a specific character
- Splitting based on token coubt

In [14]:
from langchain.text_splitter import CharacterTextSplitter

In [15]:
with open('data/fdr_speech.txt') as file:
    speech_text = file.read()

In [18]:
len(speech_text.split(" "))

3415

In [19]:
test_splitter = CharacterTextSplitter(separator = "\n\n", chunk_size=1000)
texts = test_splitter.create_documents([speech_text])

In [21]:
texts[0]

Document(metadata={}, page_content='IN FULFILLING my duty to report upon the State of the Union, I am proud to say to you that the spirit of the American people was never higher than it is todayâ€”the Union was never more closely knit togetherâ€”this country was never more deeply determined to face the solemn tasks before it.\n\nThe response of the American people has been instantaneous, and it will be sustained until our security is assured.\n\nExactly one year ago today I said to this Congress: "When the dictators. . . are ready to make war upon us, they will not wait for an act of war on our part. . . . Theyâ€”not weâ€”will choose the time and the place and the method of their attack."\n\nWe now know their choice of the time: a peaceful Sunday morningâ€” December 7, 1941.\n\nWe know their choice of the place: an American outpost in the Pacific.\n\nWe know their choice of the method: the method of Hitler himself.')

In [23]:
# !pip install --proxy=10.203.32.190:8080 tiktoken

In [26]:
test_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=500)
texts = test_splitter.split_text(speech_text)

## Text Embeddings

- Choose your embedding model carefully, embedding using model A might not be same as that created by model B.
- If a change is to be made all the historical data will have to be changed to the new one

In [5]:
# !pip install sentence_transformers
# !pip install --proxy=10.203.32.190:8080 --upgrade --quiet  langchain sentence_transformers

You should consider upgrading via the 'C:\Users\ndh00961\Desktop\personal_data\personal_code\langchain_personal_exp\lang_env\Scripts\python.exe -m pip install --upgrade pip' command.


In [1]:
from langchain.embeddings import HuggingFaceEmbeddings
import pandas as pd
from langchain_groq import ChatGroq
from langchain_core.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
import config
import os
from langchain.document_loaders import WikipediaLoader
if "GROQ_API_KEY" not in os.environ:
    os.environ["GROQ_API_KEY"] = config.groq_key
# os.environ["HTTP_PROXY"] = config.proxy
# os.environ["HTTPS_PROXY"] = config.proxy




In [2]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
  from tqdm.autonotebook import tqdm, trange
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [4]:
text = "this is a normal text string that I want to embedd as a vector"
embedded_text = embeddings.embed_query(text=text)

In [8]:
from langchain.document_loaders import CSVLoader
loader = CSVLoader("data/inputs/test_data.csv")
data = loader.load()
data

[Document(metadata={'source': 'data/inputs/test_data.csv', 'row': 0}, page_content='ï»¿Name: Jane Doe\nPoint1: 15/20\nPoint2: 30/50\nPoint3: 18/20\nExtra: Had extraodinary participation in MUN.')]

In [11]:
embeded_docs = embeddings.embed_documents([text.page_content for text in data])
embeded_docs

[[-0.03533982113003731,
  0.01674029417335987,
  -0.009271835908293724,
  0.019534962251782417,
  -0.005103622563183308,
  0.020624931901693344,
  -0.0357869528234005,
  0.02432936243712902,
  -0.0183588694781065,
  0.021252483129501343,
  0.050105124711990356,
  -0.005557351280003786,
  0.04851042106747627,
  0.05048724263906479,
  -0.034426599740982056,
  0.023692147806286812,
  0.010839156806468964,
  -0.0028201015666127205,
  0.05511929839849472,
  -0.03273586928844452,
  -0.03653135150671005,
  0.03010758012533188,
  -0.046307001262903214,
  0.006526881828904152,
  -0.015589394606649876,
  0.028450822457671165,
  0.04805194213986397,
  0.015371604822576046,
  0.014143470674753189,
  -0.02525527961552143,
  0.007123374845832586,
  0.017143912613391876,
  -0.030109021812677383,
  -0.07371345907449722,
  1.7974085722016753e-06,
  -0.047407034784555435,
  0.0011947546154260635,
  0.011869785375893116,
  -0.046314746141433716,
  -0.02209063619375229,
  -0.055808067321777344,
  0.002864

## Vector Store

- Store embeddings to a permanent space
- We will require the vector and the text that it was embeded from
- VS should be able to 'quiered'

Notes:
- The embeddings are identified in the vector db using uuids
- So if the same uuid is passed to merge into an existing db, it'll throw an error
- But if you recode your embeddings, i.e create new embeddings of the same text, you'll create new uuids for the same embeddings
- This will cause multiple embeddings of the same text to be present in the db

In [16]:
# !pip install --proxy=10.203.32.190:8080 faiss-cpu

### WORKFLOW

- Load a document
- Split it into chunks
- Use the embedding model to convert the chunks into vectors/embeddings
- Save the vectors to vector store
- 'query' to similarity search the db

In [19]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader

In [60]:
# Loading the document
doc = TextLoader('data/inputs/fdr_speech.txt').load()

# Splitting the document into chunks
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=500)
chunked_text = text_splitter.split_documents(doc)

# Convert text to vector embeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Save the vector chunks to vector_store
library = FAISS.from_documents(chunked_text, embedding=embeddings)
library.save_local("./vector_db/course_test/fdr_speech")

# Query similarity search the vector store
loaded_lib = FAISS.load_local("./vector_db/course_test/fdr_speech", 
                              embeddings=embeddings,
                              allow_dangerous_deserialization=True)

In [None]:
query = "What is the meaning of victory?"
ans = loaded_lib.similarity_search(query)
print(ans[0].page_content)

In [62]:
# Loading saved db
loaded_lib = FAISS.load_local("./vector_db/course_test/fdr_speech", 
                              embeddings=embeddings,
                              allow_dangerous_deserialization=True)

l_doc = TextLoader("./data/inputs/lincon_speech.txt").load()
l_docs = text_splitter.split_documents(l_doc)

new_db_text = FAISS.from_documents(l_docs, embedding=embeddings)

# merging new db with old db
loaded_lib.merge_from(new_db_text)

# Saving updated dbh
loaded_lib.save_local("./vector_db/course_test/fdr_speech")

Created a chunk of size 686, which is longer than the specified 500


In [65]:
loaded_lib = FAISS.load_local("./vector_db/course_test/fdr_speech", 
                              embeddings=embeddings,
                              allow_dangerous_deserialization=True)
print(len(loaded_lib.docstore._dict), len(new_db_text.docstore._dict), len(library.docstore._dict))

44 30 14


In [67]:
sim_doc = loaded_lib.similarity_search('salvery')
sim_doc[0].metadata

{'source': './data/inputs/lincon_speech.txt'}

### Vector Store Retrievers

- Sometimes vector stores have to be passed as retriever objects
- This can be achieved using the as_retrievers() method
- The documents can be fetched using the .invoke method

In [72]:
retriever = loaded_lib.as_retriever()

In [73]:
results = retriever.invoke("hitler")
results

[Document(metadata={'source': 'data/inputs/fdr_speech.txt'}, page_content="We must guard against complacency. We must not underrate the enemy. He is powerful and cunningâ€”and cruel and ruthless. He will stop at nothing that gives him a chance to kill and to destroy. He has trained his people to believe that their highest perfection is achieved by waging war. For many years he has prepared for this very conflict- planning, and plotting, and training, arming, and fighting. We have already tasted defeat. We may suffer further setbacks. We must face the fact of a hard war, a long war, a bloody war, a costly war.\n\nWe must, on the other hand, guard against defeatism. That has been one of the chief weapons of Hitler's propaganda machineâ€”used time and again with deadly results. It will not be used successfully on the American people.\n\nWe must guard against divisions among ourselves and among all the other United Nations. We must be particularly vigilant against racial discrimination in 

### Multi Query Retrievers

- MQR uses LLM to generate multiple query variations allowing us to focus on key ideas rather than the exact phrasing

In [82]:
doc = TextLoader("./data/inputs/fdr_speech.txt").load()
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=500)
docs = text_splitter.split_documents(doc)

In [90]:
# embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
# embedding_function = FAISS.from_documents(docs, embedding=embeddings)
vectore_store = FAISS.load_local("./vector_db/course_test/fdr_speech",
                                      embeddings=embeddings,
                                      allow_dangerous_deserialization=True)


In [86]:
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_groq import ChatGroq

In [130]:
if "GROQ_API_KEY" not in os.environ:
    os.environ["GROQ_API_KEY"] = config.groq_key
os.environ["HTTP_PROXY"] = config.proxy
os.environ["HTTPS_PROXY"] = config.proxy
question = "what did fdr say about Hitler?"
llm = ChatGroq(model="llama3-8b-8192", temperature=0)

In [97]:
retriever_from_llm = MultiQueryRetriever.from_llm(retriever=vectore_store.as_retriever(),
                                                  llm = llm)


In [98]:
unique_documents = retriever_from_llm.get_relevant_documents(query=question)
unique_documents

[Document(metadata={'source': './data/inputs/lincon_speech.txt'}, page_content='A moderate reservation from the interest on the bonds would compensate the United States for the preparation and distribution of the notes and a general supervision of the system, and would lighten the burden of that part of the public debt employed as securities. The public credit, moreover, would be greatly improved and the negotiation of new loans greatly facilitated by the steady market demand for Government bonds which the adoption of the proposed system would create. It is an additional recommendation of the measure, of considerable weight, in my judgment, that it would reconcile as far as possible all existing interests by the opportunity offered to existing institutions to reorganize under the act, substituting only the secured uniform national circulation for the local and various circulation, secured and unsecured, now issued by them.\n\nThe receipts into the treasury from all sources, including l

### Context Compression
Use LLMs to compress our outputs/display only the relevant outputs

Worflow:
 - Decide on the LLM to use
 - Use a LLMChanExtractor which uses this LLM
 - The LLMChainExtractor is used in Contextual Compression

In [102]:
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain_groq import ChatGroq
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

In [104]:
vector_db = FAISS.load_local("./vector_db/course_test/fdr_speech",
                                      embeddings=embeddings,
                                      allow_dangerous_deserialization=True)
compressor = LLMChainExtractor.from_llm(llm=llm)

compression_retriever = ContextualCompressionRetriever(base_compressor = compressor,
                                                       base_retriever = vector_db.as_retriever())

In [107]:
compressed_doc = compression_retriever.invoke("What is the impact of war?")

In [116]:
print(compressed_doc[0].page_content, end="\n")
print()
print(compressed_doc[0].metadata["source"])

Here are the extracted relevant parts:

War costs money. So far, we have hardly even begun to pay for it. We have devoted only 15 percent of our national income to national defense. As will appear in my Budget Message tomorrow, our war program for the coming fiscal year will cost 56 billion dollars or, in other words, more than half of the estimated annual national income. That means taxes and bonds and bonds and taxes. It means cutting luxuries and other non-essentials. In a word, it means an "all-out" war by individual effort and family effort in a united country.

This part of the context is relevant to the question "What is the impact of war?" as it discusses the financial costs of war and the impact it has on the country, including the need for increased taxes and reduced luxuries.

data/inputs/fdr_speech.txt


## Data Connections Exercise

Create an LLM bot which can:
- Takes questions about the US Constitution and returns the most relevant part on the constitution as the answer.
- It may not directly answer the question.
- Return the compressed version of the answer

In [173]:
from langchain.document_loaders import PyPDFLoader

def create_vector_db():
    
    # Load text
    docs = TextLoader("data/langchain-course-main/01-Data-Connections/some_data/US_Constitution.txt").load()
    # docs = PyPDFLoader("./data/inputs/constitution.pdf").load()
    
    # Chunk the text using text_splitter
    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=500)
    chunked_doc = text_splitter.split_documents(docs)

    # Convert text to embeddings/create vector db
    library = FAISS.from_documents(chunked_doc, embedding=embeddings)

    # Save Embeddings
    library.save_local("vector_db/law_bot")

    pass

def law_bot(question):

    # Load vector_db
    vector_db = FAISS.load_local("vector_db/law_bot",
                                 embeddings=embeddings,
                                 allow_dangerous_deserialization=True)
    
    # Convert the vector db to a retriever
    llm = ChatGroq(model="llama3-8b-8192", temperature=0)
    # retriever_from_llm = MultiQueryRetriever.from_llm(retriever=vector_db.as_retriever(),
    #                                                   llm = llm)
    retriever = vector_db.as_retriever()
    # Initialize Compressor
    compressor = LLMChainExtractor.from_llm(llm=llm)

    # Pass the compressor and the retiever to the ContextualCompressor to get the compressed answer
    compressed_retriever = ContextualCompressionRetriever(base_compressor = compressor,
                                                          base_retriever = retriever)
    
    # Invoke the retriever
    ans = compressed_retriever.invoke(question)

    return ans

create_vector_db()

In [174]:
question = "12th Amendment?"
ans = law_bot(question)
ans

[Document(metadata={'source': 'data/langchain-course-main/01-Data-Connections/some_data/US_Constitution.txt'}, page_content='Here are the extracted relevant parts:\n\nThe Electors shall meet in their respective states and vote by ballot for President and Vice-President, one of whom, at least, shall not be an inhabitant of the same state with themselves...\n\nThe person having the greatest number of votes for President, shall be the President, if such number be a majority of the whole number of Electors appointed...\n\nThe person having the greatest number of votes as Vice-President, shall be the Vice-President, if such number be a majority of the whole number of Electors appointed...\n\nNote that these parts are directly related to the 12th Amendment, which deals with the process of electing the President and Vice-President.'),
 Document(metadata={'source': 'data/langchain-course-main/01-Data-Connections/some_data/US_Constitution.txt'}, page_content='Here are the extracted relevant par