In [1]:
# Prompt -> LLM -> Response
# (New data/external data) + Prompt -> LLM -> Response
# How are we retrieving this information to add it to the prompt
# Copy paste new information
# Can programmatically read the new information
# Can scrape the information

### CONTEXT WINDOW ###
# chunk -> -> embeddings -> DB -> Retrieve appropriate information to add it in the prompt
# Appropriate -> data -> embeddings (models) -> DB -> perform query (similarity) -> get relevant content data
# appropriate + prompt -> LLM -> Response

# Document Loaders

## CSV Loader

In [2]:
import os

In [3]:
from langchain.document_loaders import CSVLoader

In [4]:
os.path.isdir("../datasets/sns_datasets")

True

In [5]:
loader = CSVLoader(file_path="../datasets/sns_datasets/titanic.csv")

In [6]:
data = loader.load()

In [7]:
type(data[0])

langchain_core.documents.base.Document

In [8]:
data[0].page_content

'survived: 0\npclass: 3\nsex: male\nage: 22.0\nsibsp: 1\nparch: 0\nfare: 7.25\nembarked: S\nclass: Third\nwho: man\nadult_male: True\ndeck: \nembark_town: Southampton\nalive: no\nalone: False'

In [9]:
data[0].metadata

{'source': '../datasets/sns_datasets/titanic.csv', 'row': 0}

In [10]:
print(data[0].page_content)

survived: 0
pclass: 3
sex: male
age: 22.0
sibsp: 1
parch: 0
fare: 7.25
embarked: S
class: Third
who: man
adult_male: True
deck: 
embark_town: Southampton
alive: no
alone: False


In [11]:
loader = CSVLoader(file_path="../datasets/sns_datasets/titanic.csv", source_column='sex')

In [12]:
loader.load()

[Document(page_content='survived: 0\npclass: 3\nsex: male\nage: 22.0\nsibsp: 1\nparch: 0\nfare: 7.25\nembarked: S\nclass: Third\nwho: man\nadult_male: True\ndeck: \nembark_town: Southampton\nalive: no\nalone: False', metadata={'source': 'male', 'row': 0}),
 Document(page_content='survived: 1\npclass: 1\nsex: female\nage: 38.0\nsibsp: 1\nparch: 0\nfare: 71.2833\nembarked: C\nclass: First\nwho: woman\nadult_male: False\ndeck: C\nembark_town: Cherbourg\nalive: yes\nalone: False', metadata={'source': 'female', 'row': 1}),
 Document(page_content='survived: 1\npclass: 3\nsex: female\nage: 26.0\nsibsp: 0\nparch: 0\nfare: 7.925\nembarked: S\nclass: Third\nwho: woman\nadult_male: False\ndeck: \nembark_town: Southampton\nalive: yes\nalone: True', metadata={'source': 'female', 'row': 2}),
 Document(page_content='survived: 1\npclass: 1\nsex: female\nage: 35.0\nsibsp: 1\nparch: 0\nfare: 53.1\nembarked: S\nclass: First\nwho: woman\nadult_male: False\ndeck: C\nembark_town: Southampton\nalive: yes\nal

## HTML Loader

In [13]:
from langchain.document_loaders import UnstructuredHTMLLoader

In [14]:
loader = UnstructuredHTMLLoader(file_path="../datasets/harry_potter_html/001.htm")

In [15]:
data = loader.load()

data

ModuleNotFoundError: No module named 'iso639.language'

In [None]:
len(data)

1

In [None]:
print(data[0].page_content)

A Day of Very Low Probability

Beneath the moonlight glints a tiny fragment of silver, a fraction of a line…

(black robes, falling)

…blood spills out in litres, and someone screams a word.

Every inch of wall space is covered by a bookcase. Each bookcase has six shelves, going almost to the ceiling. Some bookshelves are stacked to the brim with hardback books: science, maths, history, and everything else. Other shelves have two layers of paperback science fiction, with the back layer of books propped up on old tissue boxes or lengths of wood, so that you can see the back layer of books above the books in front. And it still isn’t enough. Books are overflowing onto the tables and the sofas and making little heaps under the windows.

This is the living-room of the house occupied by the eminent Professor Michael Verres-Evans, and his wife, Mrs. Petunia Evans-Verres, and their adopted son, Harry James Potter-Evans-Verres.

There is a letter lying on the living-room table, and an unstampe

In [None]:
from langchain.document_loaders import BSHTMLLoader

In [None]:
loader = BSHTMLLoader(file_path="../datasets/harry_potter_html/001.htm")

data = loader.load()

In [None]:
len(data)

1

In [None]:
print(data[0].page_content)

A Day of Very Low Probability

Beneath the moonlight glints a tiny fragment of silver, a fraction of a line…
(black robes, falling)
…blood spills out in litres, and someone screams a word.

Every inch of wall space is covered by a bookcase. Each bookcase has six shelves, going almost to the ceiling. Some bookshelves are stacked to the brim with hardback books: science, maths, history, and everything else. Other shelves have two layers of paperback science fiction, with the back layer of books propped up on old tissue boxes or lengths of wood, so that you can see the back layer of books above the books in front. And it still isn’t enough. Books are overflowing onto the tables and the sofas and making little heaps under the windows.
This is the living-room of the house occupied by the eminent Professor Michael Verres-Evans, and his wife, Mrs. Petunia Evans-Verres, and their adopted son, Harry James Potter-Evans-Verres.
There is a letter lying on the living-room table, and an unstamped en

## JSON Loader

In [None]:
import json

json_filepath = "../datasets/population_data.json"

with open(json_filepath) as f:
    loaded_json = json.loads(f.read())
    
len(loaded_json)

12407

In [None]:
loaded_json

[{'Country Name': 'Arab World',
  'Country Code': 'ARB',
  'Year': '1960',
  'Value': '96388069'},
 {'Country Name': 'Arab World',
  'Country Code': 'ARB',
  'Year': '1961',
  'Value': '98882541.4'},
 {'Country Name': 'Arab World',
  'Country Code': 'ARB',
  'Year': '1962',
  'Value': '101474075.8'},
 {'Country Name': 'Arab World',
  'Country Code': 'ARB',
  'Year': '1963',
  'Value': '104169209.2'},
 {'Country Name': 'Arab World',
  'Country Code': 'ARB',
  'Year': '1964',
  'Value': '106978104.6'},
 {'Country Name': 'Arab World',
  'Country Code': 'ARB',
  'Year': '1965',
  'Value': '109907857'},
 {'Country Name': 'Arab World',
  'Country Code': 'ARB',
  'Year': '1966',
  'Value': '112969031.8'},
 {'Country Name': 'Arab World',
  'Country Code': 'ARB',
  'Year': '1967',
  'Value': '116161856.2'},
 {'Country Name': 'Arab World',
  'Country Code': 'ARB',
  'Year': '1968',
  'Value': '119466968.6'},
 {'Country Name': 'Arab World',
  'Country Code': 'ARB',
  'Year': '1969',
  'Value': '1

In [None]:
from langchain.document_loaders import JSONLoader

In [None]:
loader = JSONLoader(file_path=json_filepath, jq_schema="Value")

data = loader.load()

data

ImportError: jq package not found, please install it with `pip install jq`

## Markdown Loader

In [None]:
from langchain.document_loaders import UnstructuredMarkdownLoader

In [None]:
md_filepath = "../datasets/harry_potter_md/001.md"

os.path.isfile(md_filepath)

True

In [None]:
loader = UnstructuredMarkdownLoader(file_path=md_filepath)

data = loader.load()

data

[Document(page_content='A Day of Very Low Probability\n\nBeneath the moonlight glints a tiny fragment of silver, a fraction of a line…\n\n(black robes, falling)\n\n…blood spills out in litres, and someone screams a word.\n\nEvery inch of wall space is covered by a bookcase. Each bookcase has six shelves, going almost to the ceiling. Some bookshelves are stacked to the brim with hardback books: science, maths, history, and everything else. Other shelves have two layers of paperback science fiction, with the back layer of books propped up on old tissue boxes or lengths of wood, so that you can see the back layer of books above the books in front. And it still isn’t enough. Books are overflowing onto the tables and the sofas and making little heaps under the windows.\n\nThis is the living-room of the house occupied by the eminent Professor Michael Verres-Evans, and his wife, Mrs. Petunia Evans-Verres, and their adopted son, Harry James Potter-Evans-Verres.\n\nThere is a letter lying on th

In [None]:
len(data)

1

In [None]:
print(data[0].page_content)

A Day of Very Low Probability

Beneath the moonlight glints a tiny fragment of silver, a fraction of a line…

(black robes, falling)

…blood spills out in litres, and someone screams a word.

Every inch of wall space is covered by a bookcase. Each bookcase has six shelves, going almost to the ceiling. Some bookshelves are stacked to the brim with hardback books: science, maths, history, and everything else. Other shelves have two layers of paperback science fiction, with the back layer of books propped up on old tissue boxes or lengths of wood, so that you can see the back layer of books above the books in front. And it still isn’t enough. Books are overflowing onto the tables and the sofas and making little heaps under the windows.

This is the living-room of the house occupied by the eminent Professor Michael Verres-Evans, and his wife, Mrs. Petunia Evans-Verres, and their adopted son, Harry James Potter-Evans-Verres.

There is a letter lying on the living-room table, and an unstampe

## PDF Loader

In [None]:
from langchain.document_loaders import PyPDFLoader

pdf_filepath = "../datasets/harry_potter_pdf/hpmor-trade-classic.pdf"

loader = PyPDFLoader(file_path=pdf_filepath)

data = loader.load()

data

[Document(page_content='Harry Potter and the Methods of Rationality', metadata={'source': '../datasets/harry_potter_pdf/hpmor-trade-classic.pdf', 'page': 0}),
 Document(page_content='', metadata={'source': '../datasets/harry_potter_pdf/hpmor-trade-classic.pdf', 'page': 1}),
 Document(page_content='Harry Potter and the Methods of Rationality\nBy Eliezer Ydkowsky\niii', metadata={'source': '../datasets/harry_potter_pdf/hpmor-trade-classic.pdf', 'page': 2}),
 Document(page_content='', metadata={'source': '../datasets/harry_potter_pdf/hpmor-trade-classic.pdf', 'page': 3}),
 Document(page_content='Contents\n1 A Day of Very Low Probability 1\n2 Everything I Believe Is False 9\n3 Comparing Reality To Its Alternatives 15\n4 The Efﬁcient Market Hypothesis 21\n5 The Fundamental Attribution Error 27\n6 The Planning Fallacy 37\n7 Reciprocation 61\n8 Positive Bias 87\n9 Title Redacted, Part I 101\n10 Self-Awareness, Part II 107\n11 Omake Files 1, 2, 3 117\n12 Impulse Control 127\n13 Asking the Wron

In [None]:
len(data)

1357

In [None]:
print(data[1].page_content)




In [None]:
data[0].metadata

{'source': '../datasets/harry_potter_pdf/hpmor-trade-classic.pdf', 'page': 0}

## Integrations

In [None]:
# Wikipedia

In [None]:
from langchain.document_loaders import WikipediaLoader

In [None]:
loader = WikipediaLoader(query='India', load_max_docs=2)

In [None]:
data = loader.load()

In [None]:
print(data[0].page_content)

India, officially the Republic of India (ISO: Bhārat Gaṇarājya), is a country in South Asia.  It is the seventh-largest country by area; the most populous country as of June 2023; and from the time of its independence in 1947, the world's most populous democracy. Bounded by the Indian Ocean on the south, the Arabian Sea on the southwest, and the Bay of Bengal on the southeast, it shares land borders with Pakistan to the west; China, Nepal, and Bhutan to the north; and Bangladesh and Myanmar to the east. In the Indian Ocean, India is in the vicinity of Sri Lanka and the Maldives; its Andaman and Nicobar Islands share a maritime border with Thailand, Myanmar, and Indonesia.
Modern humans arrived on the Indian subcontinent from Africa no later than 55,000 years ago.
Their long occupation, initially in varying forms of isolation as hunter-gatherers, has made the region highly diverse, second only to Africa in human genetic diversity. Settled life emerged on the subcontinent in the western 

In [None]:
data[0].metadata

{'title': 'India',
 'summary': "India, officially the Republic of India (ISO: Bhārat Gaṇarājya), is a country in South Asia.  It is the seventh-largest country by area; the most populous country as of June 2023; and from the time of its independence in 1947, the world's most populous democracy. Bounded by the Indian Ocean on the south, the Arabian Sea on the southwest, and the Bay of Bengal on the southeast, it shares land borders with Pakistan to the west; China, Nepal, and Bhutan to the north; and Bangladesh and Myanmar to the east. In the Indian Ocean, India is in the vicinity of Sri Lanka and the Maldives; its Andaman and Nicobar Islands share a maritime border with Thailand, Myanmar, and Indonesia.\nModern humans arrived on the Indian subcontinent from Africa no later than 55,000 years ago.\nTheir long occupation, initially in varying forms of isolation as hunter-gatherers, has made the region highly diverse, second only to Africa in human genetic diversity. Settled life emerged o

## ArXiv

In [None]:
from langchain_community.document_loaders import ArxivLoader

In [None]:
loader = ArxivLoader(query='2201.03916', load_max_docs=1)

data = loader.load()

In [None]:
len(data)

1

In [None]:
print(data[0].page_content)

Journal of Artiﬁcial Intelligence Research 74 (2022) 517-568
Submitted 01/2022; published 06/2022
Automated Reinforcement Learning (AutoRL):
A Survey and Open Problems
Jack Parker-Holder∗
jackph@robots.ox.ac.uk
University of Oxford
Raghu Rajan∗
rajanr@cs.uni-freiburg.de
University of Freiburg
Xingyou Song∗
xingyousong@google.com
Google Research, Brain Team
André Biedenkapp
biedenka@cs.uni-freiburg.de
University of Freiburg
Yingjie Miao
yingjiemiao@google.com
Google Research, Brain Team
Theresa Eimer
eimer@tnt.uni-hannover.de
Leibniz University Hannover
Baohe Zhang
zhangb@cs.uni-freiburg.de
University of Freiburg
Vu Nguyen
vutngn@amazon.com
Amazon Australia
Roberto Calandra
rcalandra@fb.com
Meta AI
Aleksandra Faust†
sandrafaust@google.com
Google Research, Brain Team
Frank Hutter†
fh@cs.uni-freiburg.de
University of Freiburg & Bosch Center for Artiﬁcial Intelligence
Marius Lindauer†
lindauer@tnt.uni-hannover.de
Leibniz University Hannover
Abstract
The combination of Reinforcement Learnin

In [None]:
data[0].metadata

{'Published': '2022-06-02',
 'Title': 'Automated Reinforcement Learning (AutoRL): A Survey and Open Problems',
 'Authors': 'Jack Parker-Holder, Raghu Rajan, Xingyou Song, André Biedenkapp, Yingjie Miao, Theresa Eimer, Baohe Zhang, Vu Nguyen, Roberto Calandra, Aleksandra Faust, Frank Hutter, Marius Lindauer',
 'Summary': 'The combination of Reinforcement Learning (RL) with deep learning has led to\na series of impressive feats, with many believing (deep) RL provides a path\ntowards generally capable agents. However, the success of RL agents is often\nhighly sensitive to design choices in the training process, which may require\ntedious and error-prone manual tuning. This makes it challenging to use RL for\nnew problems, while also limits its full potential. In many other areas of\nmachine learning, AutoML has shown it is possible to automate such design\nchoices and has also yielded promising initial results when applied to RL.\nHowever, Automated Reinforcement Learning (AutoRL) involve

In [None]:
# Loading the chat model

import os
from langchain_openai import ChatOpenAI
from langchain.globals import set_llm_cache
from langchain.cache import InMemoryCache

with open("../openai_api_key.txt", 'r') as f:
    os.environ['OPENAI_API_KEY'] = f.read()

chat = ChatOpenAI()
set_llm_cache(InMemoryCache())

In [None]:
from langchain.prompts.chat import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate

system_template = "You are a Peer Reviewer"
human_template = "Read the paper with the title: '{title}'\n\nAnd Content: {content} and critically list down all the issues in the paper"

systemp_message_prompt = SystemMessagePromptTemplate.from_template(system_template)
human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

chat_prompt = ChatPromptTemplate.from_messages([systemp_message_prompt, human_message_prompt])
prompt = chat_prompt.format_prompt(title=data[0].metadata['Title'], content=data[0].metadata['Summary'])

response = chat(messages = prompt.to_messages())

print(response.content)

The paper titled 'Automated Reinforcement Learning (AutoRL): A Survey and Open Problems' addresses the intersection of reinforcement learning (RL) and deep learning, highlighting the potential of (deep) RL in developing generally capable agents. The paper acknowledges that the success of RL agents is heavily dependent on design choices in the training process, which can be time-consuming and error-prone when done manually. The authors argue that this hinders the broader application of RL and limits its full potential.

The paper introduces the concept of Automated Reinforcement Learning (AutoRL) as a way to automate the design choices involved in RL training, similar to how AutoML has automated tasks in other areas of machine learning. AutoRL is described as a burgeoning research area that not only leverages existing AutoML techniques but also addresses unique challenges specific to RL, leading to the development of novel methods.

The authors suggest that AutoRL has shown promise in v

In [None]:
def peer_review(article_id):
    chat = ChatOpenAI()
    loader = ArxivLoader(query=article_id, load_max_docs=2)
    data = loader.load()
    first_record = data[0]
    page_content = first_record.page_content
    title = first_record.metadata['Title']
    summary = first_record.metadata['Summary']
    
    summary_list = []
    for record in data:
        summary_list.append(record.metadata['Summary'])
    full_summary = "\n\n".join(summary_list)
    
    system_template = "You are a Peer Reviewer"
    human_template = "Read the paper with the title: '{title}'\n\nAnd Content: {content} and critically list down all the issues in the paper"

    systemp_message_prompt = SystemMessagePromptTemplate.from_template(system_template)
    human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

    chat_prompt = ChatPromptTemplate.from_messages([systemp_message_prompt, human_message_prompt])
    prompt = chat_prompt.format_prompt(title=title, content=page_content)

    response = chat(messages = prompt.to_messages())

    return response.content

In [None]:
print(peer_review('1706.03762'))

Overall, the paper "Attention Is All You Need" by Vaswani et al. presents a novel model architecture, the Transformer, which is based solely on attention mechanisms and does not use recurrent or convolutional layers. The authors demonstrate the effectiveness of the Transformer model on machine translation tasks and English constituency parsing, achieving state-of-the-art results.

Here are some key points and issues identified in the paper:

1. **Strengths**:
   - The paper clearly introduces the Transformer model architecture and provides a comprehensive overview of the model components, including attention mechanisms, encoder-decoder stacks, and positional encodings.
   - The experimental results show significant improvements in translation quality and training efficiency compared to existing models, highlighting the effectiveness of the Transformer model.
   - Attention visualizations are provided to demonstrate the model's ability to capture long-range dependencies and perform task

In [None]:
print(peer_review('2201.03514'))

BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 16385 tokens. However, your messages resulted in 18470 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}

In [None]:
# Create a bot that can answer questions based on wikipedia articles

# Text Splitter

In [None]:
filepath = "../datasets/Harry Potter 1 - Sorcerer's Stone.txt"

with open(filepath, 'r') as f:
    hp_book = f.read()
    
print(hp_book)

Harry Potter and the Sorcerer's Stone


CHAPTER ONE

THE BOY WHO LIVED

Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say
that they were perfectly normal, thank you very much. They were the last
people you'd expect to be involved in anything strange or mysterious,
because they just didn't hold with such nonsense.

Mr. Dursley was the director of a firm called Grunnings, which made
drills. He was a big, beefy man with hardly any neck, although he did
have a very large mustache. Mrs. Dursley was thin and blonde and had
nearly twice the usual amount of neck, which came in very useful as she
spent so much of her time craning over garden fences, spying on the
neighbors. The Dursleys had a small son called Dudley and in their
opinion there was no finer boy anywhere.

The Dursleys had everything they wanted, but they also had a secret, and
their greatest fear was that somebody would discover it. They didn't
think they could bear it if anyone found out about the Potters. Mr

In [None]:
len(hp_book)

439742

In [None]:
len(hp_book.split())

78451

In [None]:
len(hp_book.split("\n"))

10703

In [None]:
len(hp_book.split("\n\n"))

3032

In [None]:
from collections import Counter

In [None]:
line_len_list = []

for line in hp_book.split('\n\n'):
    curr_line_len = len(line)
    line_len_list.append(curr_line_len)

Counter(line_len_list)

Counter({37: 18,
         12: 17,
         17: 13,
         262: 5,
         454: 1,
         748: 1,
         377: 1,
         64: 17,
         331: 2,
         833: 1,
         982: 1,
         696: 1,
         391: 3,
         70: 16,
         144: 8,
         745: 1,
         195: 6,
         513: 1,
         68: 15,
         311: 4,
         265: 6,
         261: 1,
         320: 3,
         532: 1,
         427: 3,
         191: 13,
         226: 2,
         119: 11,
         30: 19,
         137: 5,
         27: 22,
         88: 8,
         254: 5,
         42: 26,
         43: 26,
         78: 18,
         289: 9,
         183: 7,
         512: 1,
         22: 23,
         394: 2,
         482: 1,
         458: 3,
         801: 1,
         46: 24,
         324: 2,
         40: 26,
         59: 18,
         90: 8,
         110: 12,
         476: 4,
         106: 14,
         221: 4,
         310: 3,
         109: 9,
         9: 12,
         65: 8,
         158: 4,
         459: 

In [None]:
# Character level splitting

from langchain.text_splitter import CharacterTextSplitter

In [None]:
def len_func(text):
    return len(text)

In [None]:
(100 + 100 + 900) + 300

In [None]:
text_splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=1200,
    chunk_overlap=100,
    length_function=len_func,
    is_separator_regex=False
)

In [None]:
para_list = text_splitter.create_documents(texts=[hp_book])

para_list

[Document(page_content="Harry Potter and the Sorcerer's Stone\n\n\nCHAPTER ONE\n\nTHE BOY WHO LIVED\n\nMr. and Mrs. Dursley, of number four, Privet Drive, were proud to say\nthat they were perfectly normal, thank you very much. They were the last\npeople you'd expect to be involved in anything strange or mysterious,\nbecause they just didn't hold with such nonsense.\n\nMr. Dursley was the director of a firm called Grunnings, which made\ndrills. He was a big, beefy man with hardly any neck, although he did\nhave a very large mustache. Mrs. Dursley was thin and blonde and had\nnearly twice the usual amount of neck, which came in very useful as she\nspent so much of her time craning over garden fences, spying on the\nneighbors. The Dursleys had a small son called Dudley and in their\nopinion there was no finer boy anywhere."),
 Document(page_content="The Dursleys had everything they wanted, but they also had a secret, and\ntheir greatest fear was that somebody would discover it. They didn

In [None]:
manual_character_split_chunks = []

for para in hp_book.split("\n\n"):
    manual_character_split_chunks.append(para)
    
len(manual_character_split_chunks)

3032

In [None]:
langchain_character_split_chunks = []

for para in para_list:
    langchain_character_split_chunks.append(para)
    
len(langchain_character_split_chunks)

419

In [None]:
first_chunk = para_list[0]

In [None]:
first_chunk.metadata = {"source": filepath}

In [None]:
first_chunk.metadata

{'source': "../datasets/Harry Potter 1 - Sorcerer's Stone.txt"}

In [None]:
res_para_list = []

cnt = 0
for para in para_list:
    para.metadata = {"source": filepath, "chunk_number": cnt}
    cnt += 1
    res_para_list.append(para)

In [None]:
res_para_list[100].metadata

{'source': "../datasets/Harry Potter 1 - Sorcerer's Stone.txt",
 'chunk_number': 100}

In [None]:
extra_line = " ".join(['word']*500)

len(text_splitter.create_documents(texts = [extra_line + hp_book])[0].page_content)

Created a chunk of size 2536, which is longer than the specified 1200


2536

In [None]:
text_splitter.create_documents(texts = [extra_line + hp_book])[0]

Created a chunk of size 2536, which is longer than the specified 1200


Document(page_content="word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word wo

## Recursive Character Splitter

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ' '],
    chunk_size = 200,
    chunk_overlap = 100,
    length_function = len_func,
    is_separator_regex=False
)

In [None]:
print("\n\n".join(["\n".join([" ".join(['word']*100)]*20)]*10))

word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word
word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word


In [None]:
chunk_list = text_splitter.create_documents(texts = [extra_line + hp_book])

chunk_list

[Document(page_content='word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word'),
 Document(page_content='word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word'),
 Document(page_content='word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word'),
 Document(page_content='word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word word'),
 Document(page_content='word word word word word word word word word word word word word wor

## Split by tokens

In [None]:
!pip install tiktoken

In [None]:
sample_sent = "This is a sample sentence for you to tell me how the tokens are split in this sentence"

sample_sent.split(" ")

["Thi", "s", "is", "sample"]

['This',
 'is',
 'a',
 'sample',
 'sentence',
 'for',
 'you',
 'to',
 'tell',
 'me',
 'how',
 'the',
 'tokens',
 'are',
 'split',
 'in',
 'this',
 'sentence']

In [None]:
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n\n",
    chunk_size = 1200,
    chunk_overlap=100,
    model_name = "text-embedding-3-small",
    encoding_name= "text-embedding-3-small"
)

In [None]:
doc_list = text_splitter.create_documents([hp_book])

doc_list

[Document(page_content='Harry Potter and the Sorcerer\'s Stone\n\n\nCHAPTER ONE\n\nTHE BOY WHO LIVED\n\nMr. and Mrs. Dursley, of number four, Privet Drive, were proud to say\nthat they were perfectly normal, thank you very much. They were the last\npeople you\'d expect to be involved in anything strange or mysterious,\nbecause they just didn\'t hold with such nonsense.\n\nMr. Dursley was the director of a firm called Grunnings, which made\ndrills. He was a big, beefy man with hardly any neck, although he did\nhave a very large mustache. Mrs. Dursley was thin and blonde and had\nnearly twice the usual amount of neck, which came in very useful as she\nspent so much of her time craning over garden fences, spying on the\nneighbors. The Dursleys had a small son called Dudley and in their\nopinion there was no finer boy anywhere.\n\nThe Dursleys had everything they wanted, but they also had a secret, and\ntheir greatest fear was that somebody would discover it. They didn\'t\nthink they could

In [None]:
[len(doc.page_content) for doc in doc_list]

[4137,
 4535,
 4275,
 4586,
 4652,
 4111,
 4754,
 4633,
 4948,
 4483,
 4899,
 4667,
 4936,
 4787,
 4683,
 4178,
 4170,
 4109,
 4512,
 4379,
 4507,
 4629,
 4526,
 4465,
 4646,
 4429,
 4406,
 4433,
 4696,
 4532,
 4658,
 4465,
 4204,
 4394,
 4268,
 4604,
 4701,
 4820,
 4549,
 4282,
 4386,
 4686,
 4510,
 4757,
 4804,
 4636,
 4495,
 4720,
 4191,
 4654,
 4331,
 4505,
 4410,
 4837,
 4740,
 4586,
 4737,
 4719,
 4562,
 4918,
 4424,
 4478,
 4732,
 4628,
 4780,
 4699,
 4752,
 4738,
 4812,
 4723,
 4851,
 4595,
 4836,
 4670,
 4297,
 4366,
 4276,
 4402,
 4486,
 4484,
 4881,
 4661,
 4443,
 4222,
 4714,
 4689,
 4682,
 4662,
 4214,
 4560,
 4509,
 4624,
 4722,
 4828,
 4392,
 4682,
 4626,
 4092,
 4642,
 4588,
 4596,
 4709,
 606]

In [None]:
doc_list = text_splitter.split_text(hp_book)

doc_list

['Harry Potter and the Sorcerer\'s Stone\n\n\nCHAPTER ONE\n\nTHE BOY WHO LIVED\n\nMr. and Mrs. Dursley, of number four, Privet Drive, were proud to say\nthat they were perfectly normal, thank you very much. They were the last\npeople you\'d expect to be involved in anything strange or mysterious,\nbecause they just didn\'t hold with such nonsense.\n\nMr. Dursley was the director of a firm called Grunnings, which made\ndrills. He was a big, beefy man with hardly any neck, although he did\nhave a very large mustache. Mrs. Dursley was thin and blonde and had\nnearly twice the usual amount of neck, which came in very useful as she\nspent so much of her time craning over garden fences, spying on the\nneighbors. The Dursleys had a small son called Dudley and in their\nopinion there was no finer boy anywhere.\n\nThe Dursleys had everything they wanted, but they also had a secret, and\ntheir greatest fear was that somebody would discover it. They didn\'t\nthink they could bear it if anyone fou

In [None]:
from langchain.docstore.document import Document

res_doc_list = []

for doc_txt in doc_list:
    curr_doc = Document(page_content=doc_txt, metadata={"source": filepath})
    res_doc_list.append(curr_doc)
    
res_doc_list

[Document(page_content='Harry Potter and the Sorcerer\'s Stone\n\n\nCHAPTER ONE\n\nTHE BOY WHO LIVED\n\nMr. and Mrs. Dursley, of number four, Privet Drive, were proud to say\nthat they were perfectly normal, thank you very much. They were the last\npeople you\'d expect to be involved in anything strange or mysterious,\nbecause they just didn\'t hold with such nonsense.\n\nMr. Dursley was the director of a firm called Grunnings, which made\ndrills. He was a big, beefy man with hardly any neck, although he did\nhave a very large mustache. Mrs. Dursley was thin and blonde and had\nnearly twice the usual amount of neck, which came in very useful as she\nspent so much of her time craning over garden fences, spying on the\nneighbors. The Dursleys had a small son called Dudley and in their\nopinion there was no finer boy anywhere.\n\nThe Dursleys had everything they wanted, but they also had a secret, and\ntheir greatest fear was that somebody would discover it. They didn\'t\nthink they could

In [None]:
python_code = """def peer_review(article_id):
    chat = ChatOpenAI()
    loader = ArxivLoader(query=article_id, load_max_docs=2)
    data = loader.load()
    first_record = data[0]
    page_content = first_record.page_content
    title = first_record.metadata['Title']
    summary = first_record.metadata['Summary']
    
    summary_list = []
    for record in data:
        summary_list.append(record.metadata['Summary'])
    full_summary = "\n\n".join(summary_list)
    
    system_template = "You are a Peer Reviewer"
    human_template = "Read the paper with the title: '{title}'\n\nAnd Content: {content} and critically list down all the issues in the paper"

    systemp_message_prompt = SystemMessagePromptTemplate.from_template(system_template)
    human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

    chat_prompt = ChatPromptTemplate.from_messages([systemp_message_prompt, human_message_prompt])
    prompt = chat_prompt.format_prompt(title=title, content=page_content)

    response = chat(messages = prompt.to_messages())

    return response.content"""

In [None]:
# code splitter

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, Language

text_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=50,
    chunk_overlap=10
)

In [None]:
text_splitter.create_documents(texts = [python_code])

[Document(page_content='def peer_review(article_id):'),
 Document(page_content='chat = ChatOpenAI()'),
 Document(page_content='loader = ArxivLoader(query=article_id,'),
 Document(page_content='load_max_docs=2)'),
 Document(page_content='data = loader.load()'),
 Document(page_content='first_record = data[0]'),
 Document(page_content='page_content = first_record.page_content'),
 Document(page_content="title = first_record.metadata['Title']"),
 Document(page_content="summary = first_record.metadata['Summary']"),
 Document(page_content='summary_list = []'),
 Document(page_content='for record in data:'),
 Document(page_content="summary_list.append(record.metadata['Summary'])"),
 Document(page_content='full_summary = "'),
 Document(page_content='".join(summary_list)'),
 Document(page_content='system_template = "You are a Peer Reviewer"'),
 Document(page_content='human_template = "Read the paper with the'),
 Document(page_content="with the title: '{title}'"),
 Document(page_content='And Conte

## Embeddings

In [None]:
import numpy as np

In [None]:
# Let's start with OpenAI models

import os
from langchain.embeddings import OpenAIEmbeddings

In [None]:
with open("../openai_api_key.txt", 'r') as f:
    os.environ['OPENAI_API_KEY'] = f.read()

In [None]:
embedding_function = OpenAIEmbeddings()

In [None]:
text = "The scar had not pained Harry for nineteen years. All was well"

embedded_text = embedding_function.embed_query(text)

In [None]:
np.array(embedded_text).shape

(1536,)

In [None]:
from langchain.docstore.document import Document

doc_lines = [
    Document(page_content="It is our choices, Harry, that show what we truly are, far more than our abilities", metadata = {"source": "Harry Potter"}),
    Document(page_content=text, metadata = {"source": "Harry Potter"}),
]

doc_lines

[Document(page_content='It is our choices, Harry, that show what we truly are, far more than our abilities', metadata={'source': 'Harry Potter'}),
 Document(page_content='The scar had not pained Harry for nineteen years. All was well', metadata={'source': 'Harry Potter'})]

In [None]:
# Extract the page_content

line_list = [doc.page_content for doc in doc_lines]

line_list

['It is our choices, Harry, that show what we truly are, far more than our abilities',
 'The scar had not pained Harry for nineteen years. All was well']

In [None]:
embedded_docs = [embedding_function.embed_query(line) for line in line_list]

np.array(embedded_docs).shape

(2, 1536)

In [None]:
embedded_docs = embedding_function.embed_documents(line_list)

np.array(embedded_docs).shape

(2, 1536)

In [None]:
# MTEB leaderboard

In [None]:
!pip install sentence_transformers



In [None]:
from langchain.embeddings import HuggingFaceBgeEmbeddings

model_name = "BAAI/bge-base-en-v1.5"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}

embedding_function = HuggingFaceBgeEmbeddings(
    model_name = model_name,
    model_kwargs = model_kwargs,
    encode_kwargs = encode_kwargs
)

In [None]:
bge_embed_record = embedding_function.embed_query("This is some random text")
bge_embed_records = embedding_function.embed_documents(["This is some random text"])

print(np.array(bge_embed_record).shape)
print(np.array(bge_embed_records).shape)

(768,)
(1, 768)


In [None]:
from langchain_community.embeddings import FakeEmbeddings

embedding_function = FakeEmbeddings(size=300)

fake_embed_record = embedding_function.embed_query("This is some random text")
fake_embed_records = embedding_function.embed_documents(["This is some random text"])

In [None]:
np.array(fake_embed_record).shape

(300,)

In [None]:
np.array(fake_embed_records).shape

(1, 300)

# Vectorstores

In [None]:
!pip install "chromadb==0.4.24" "faiss-cpu==1.8.0"



In [None]:
!pip show chromadb

Name: chromadb
Version: 0.4.24
Summary: Chroma.
Home-page: 
Author: 
Author-email: Jeff Huber <jeff@trychroma.com>, Anton Troynikov <anton@trychroma.com>
License: 
Location: d:\codework\github\langchain_training\.conda\lib\site-packages
Requires: bcrypt, build, chroma-hnswlib, fastapi, grpcio, importlib-resources, kubernetes, mmh3, numpy, onnxruntime, opentelemetry-api, opentelemetry-exporter-otlp-proto-grpc, opentelemetry-instrumentation-fastapi, opentelemetry-sdk, orjson, overrides, posthog, pulsar-client, pydantic, pypika, PyYAML, requests, tenacity, tokenizers, tqdm, typer, typing-extensions, uvicorn
Required-by: 


In [None]:
!pip show faiss-cpu

Name: faiss-cpu
Version: 1.8.0
Summary: A library for efficient similarity search and clustering of dense vectors.
Home-page: 
Author: 
Author-email: Kota Yamaguchi <yamaguchi_kota@cyberagent.co.jp>
License: MIT License
Location: d:\codework\github\langchain_training\.conda\lib\site-packages
Requires: numpy
Required-by: 


In [None]:
from langchain_community.document_loaders import WikipediaLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceBgeEmbeddings, FakeEmbeddings
from langchain_community.embeddings import FakeEmbeddings

In [None]:
loader = WikipediaLoader(query='Elon Musk', load_max_docs=5)
documents = loader.load()
documents

[Document(page_content='Elon Reeve Musk (; EE-lon; born June 28, 1971) is a businessman and investor. He is the founder, chairman, CEO, and CTO of SpaceX; angel investor, CEO, product architect, and former chairman of Tesla, Inc.; owner, executive chairman, and CTO of X Corp.; founder of the Boring Company and xAI; co-founder of Neuralink and OpenAI; and president of the Musk Foundation. He is one of the wealthiest people in the world, with an estimated net worth of US$190 billion as of March 2024, according to the Bloomberg Billionaires Index, and $195 billion according to Forbes, primarily from his ownership stakes in Tesla and SpaceX.A member of the wealthy South African Musk family, Elon was born in Pretoria and briefly attended the University of Pretoria before immigrating to Canada at age 18, acquiring citizenship through his Canadian-born mother. Two years later, he matriculated at Queen\'s University at Kingston in Canada. Musk later transferred to the University of Pennsylvani

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=100)
docs = text_splitter.split_documents(documents=documents)
print(len(docs))
docs

76


[Document(page_content='Elon Reeve Musk (; EE-lon; born June 28, 1971) is a businessman and investor. He is the founder, chairman, CEO, and CTO of SpaceX; angel investor, CEO, product architect, and former chairman of Tesla, Inc.; owner, executive chairman, and CTO of X Corp.; founder of the Boring Company and xAI; co-founder of Neuralink and OpenAI; and president of the Musk Foundation. He is one of the wealthiest', metadata={'title': 'Elon Musk', 'summary': "Elon Reeve Musk (; EE-lon; born June 28, 1971) is a businessman and investor. He is the founder, chairman, CEO, and CTO of SpaceX; angel investor, CEO, product architect, and former chairman of Tesla, Inc.; owner, executive chairman, and CTO of X Corp.; founder of the Boring Company and xAI; co-founder of Neuralink and OpenAI; and president of the Musk Foundation. He is one of the wealthiest people in the world, with an estimated net worth of US$190 billion as of March 2024, according to the Bloomberg Billionaires Index, and $195

In [None]:
model_name = "BAAI/bge-large-en-v1.5"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {"normalize_embeddings": True}

embedding_function = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

######### 

# embedding_function = FakeEmbeddings(size=300)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
query = "Who is Elon Musk's Father?"

In [None]:
# Process
# Query -> Query Embeddings
# Chunks -> Chunk Embeddings -> Vectorstore
# Query Embeddings and Chunk Embeddings will be matched to get the results

In [None]:
# FAISS (in memory database)

from langchain.vectorstores import FAISS

In [None]:
db = FAISS.from_documents(docs, embedding_function)

# 'document in text' - embeddings
# Query -> query embeddings -> match with the embeddings in the vector store -> return the text connected to those embeddings

In [None]:
# Querying

matched_docs = db.similarity_search(query=query, k=5)

matched_docs

[Document(page_content="Elon Musk's paternal great-grandmother was a Dutchwoman descended from the Dutch Free Burghers, while one of his maternal great-grandparents came from Switzerland. His paternal grandmother was English from Liverpool and his paternal grandfather Walter Henry J. Musk was South African. Elon Musk's father, Errol Musk, is a South African former electrical and mechanical engineer consultant and", metadata={'title': 'Musk family', 'summary': 'The Musk family is a wealthy family of South African origin that is largely active in the United States and Canada. The Musks are of English, Anglo-Canadian, Pennsylvania Dutch, and Swiss descent. The family is known for its entrepreneurial endeavours. Elon Musk was formerly the wealthiest person in the world, with an estimated net worth of US$232 billion as of December 2023, according to the Bloomberg Billionaires Index.', 'source': 'https://en.wikipedia.org/wiki/Musk_family'}),
 Document(page_content="Elon Reeve Musk was born o

In [None]:
["errol musk" in doc.page_content.lower() for doc in matched_docs]

[True, True, False, True, False]

In [None]:
from langchain.vectorstores import Chroma

In [None]:
db = Chroma.from_documents(docs, embedding_function, persist_directory="../output/elon_musk_db")

In [None]:
# Loading the existing database

loaded_db = Chroma(persist_directory="../output/elon_musk_db", embedding_function=embedding_function)

In [None]:
# Query

print(query)

matched_docs = db.similarity_search(query=query, k=3)

matched_docs

Who is Elon Musk's Father?


[Document(page_content="Elon Musk's paternal great-grandmother was a Dutchwoman descended from the Dutch Free Burghers, while one of his maternal great-grandparents came from Switzerland. His paternal grandmother was English from Liverpool and his paternal grandfather Walter Henry J. Musk was South African. Elon Musk's father, Errol Musk, is a South African former electrical and mechanical engineer consultant and", metadata={'source': 'https://en.wikipedia.org/wiki/Musk_family', 'summary': 'The Musk family is a wealthy family of South African origin that is largely active in the United States and Canada. The Musks are of English, Anglo-Canadian, Pennsylvania Dutch, and Swiss descent. The family is known for its entrepreneurial endeavours. Elon Musk was formerly the wealthiest person in the world, with an estimated net worth of US$232 billion as of December 2023, according to the Bloomberg Billionaires Index.', 'title': 'Musk family'}),
 Document(page_content="Elon Musk's paternal great

In [None]:
# Adding the family information

family_data_loader = WikipediaLoader(query="Musk Family", load_max_docs=1)
family_documents = family_data_loader.load()
family_documents

[Document(page_content='The Musk family is a wealthy family of South African origin that is largely active in the United States and Canada. The Musks are of English, Anglo-Canadian, Pennsylvania Dutch, and Swiss descent. The family is known for its entrepreneurial endeavours. Elon Musk was formerly the wealthiest person in the world, with an estimated net worth of US$232 billion as of December 2023, according to the Bloomberg Billionaires Index.\n\n\n== History ==\nElon Musk\'s paternal great-grandmother was a Dutchwoman descended from the Dutch Free Burghers, while one of his maternal great-grandparents came from Switzerland. His paternal grandmother was English from Liverpool and his paternal grandfather Walter Henry J. Musk was South African. Elon Musk\'s father, Errol Musk, is a South African former electrical and mechanical engineer consultant and property developer, who was involved in the emerald business at some point in the 1980s, and was a member of the South African Progress

In [None]:
family_docs = text_splitter.split_documents(documents=family_documents)
print(len(family_docs))
family_docs

11


[Document(page_content='The Musk family is a wealthy family of South African origin that is largely active in the United States and Canada. The Musks are of English, Anglo-Canadian, Pennsylvania Dutch, and Swiss descent. The family is known for its entrepreneurial endeavours. Elon Musk was formerly the wealthiest person in the world, with an estimated net worth of US$232 billion as of December 2023, according to the', metadata={'title': 'Musk family', 'summary': 'The Musk family is a wealthy family of South African origin that is largely active in the United States and Canada. The Musks are of English, Anglo-Canadian, Pennsylvania Dutch, and Swiss descent. The family is known for its entrepreneurial endeavours. Elon Musk was formerly the wealthiest person in the world, with an estimated net worth of US$232 billion as of December 2023, according to the Bloomberg Billionaires Index.', 'source': 'https://en.wikipedia.org/wiki/Musk_family'}),
 Document(page_content='in the world, with an e

In [None]:
# Adding new information

db = Chroma.from_documents(family_docs, embedding_function, persist_directory="../output/elon_musk_db")

In [None]:
matched_docs = db.similarity_search(query=query, k=4)

matched_docs

[Document(page_content="Elon Musk's paternal great-grandmother was a Dutchwoman descended from the Dutch Free Burghers, while one of his maternal great-grandparents came from Switzerland. His paternal grandmother was English from Liverpool and his paternal grandfather Walter Henry J. Musk was South African. Elon Musk's father, Errol Musk, is a South African former electrical and mechanical engineer consultant and", metadata={'source': 'https://en.wikipedia.org/wiki/Musk_family', 'summary': 'The Musk family is a wealthy family of South African origin that is largely active in the United States and Canada. The Musks are of English, Anglo-Canadian, Pennsylvania Dutch, and Swiss descent. The family is known for its entrepreneurial endeavours. Elon Musk was formerly the wealthiest person in the world, with an estimated net worth of US$232 billion as of December 2023, according to the Bloomberg Billionaires Index.', 'title': 'Musk family'}),
 Document(page_content="Elon Musk's paternal great

In [None]:
# Deleting the information
# Updating the information

# Retrievers

In [None]:
retriever = db.as_retriever()

retriever

VectorStoreRetriever(tags=['Chroma', 'HuggingFaceBgeEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x0000021BC6676D60>)

In [None]:
matched_docs = retriever.get_relevant_documents(query=query)

matched_docs

[Document(page_content="Elon Musk's paternal great-grandmother was a Dutchwoman descended from the Dutch Free Burghers, while one of his maternal great-grandparents came from Switzerland. His paternal grandmother was English from Liverpool and his paternal grandfather Walter Henry J. Musk was South African. Elon Musk's father, Errol Musk, is a South African former electrical and mechanical engineer consultant and", metadata={'source': 'https://en.wikipedia.org/wiki/Musk_family', 'summary': 'The Musk family is a wealthy family of South African origin that is largely active in the United States and Canada. The Musks are of English, Anglo-Canadian, Pennsylvania Dutch, and Swiss descent. The family is known for its entrepreneurial endeavours. Elon Musk was formerly the wealthiest person in the world, with an estimated net worth of US$232 billion as of December 2023, according to the Bloomberg Billionaires Index.', 'title': 'Musk family'}),
 Document(page_content="Elon Musk's paternal great

In [None]:
# How these retrievers should retreiver, how many items to retriever
# MMR - Maximum marginal relevance

retriever = db.as_retriever(search_type="mmr", search_kwargs = {"k": 5})

matched_docs = retriever.get_relevant_documents(query=query)

matched_docs

[Document(page_content="Elon Musk's paternal great-grandmother was a Dutchwoman descended from the Dutch Free Burghers, while one of his maternal great-grandparents came from Switzerland. His paternal grandmother was English from Liverpool and his paternal grandfather Walter Henry J. Musk was South African. Elon Musk's father, Errol Musk, is a South African former electrical and mechanical engineer consultant and", metadata={'source': 'https://en.wikipedia.org/wiki/Musk_family', 'summary': 'The Musk family is a wealthy family of South African origin that is largely active in the United States and Canada. The Musks are of English, Anglo-Canadian, Pennsylvania Dutch, and Swiss descent. The family is known for its entrepreneurial endeavours. Elon Musk was formerly the wealthiest person in the world, with an estimated net worth of US$232 billion as of December 2023, according to the Bloomberg Billionaires Index.', 'title': 'Musk family'}),
 Document(page_content="Elon Reeve Musk was born o

In [None]:
retriever = db.as_retriever(search_type="similarity_score_threshold", search_kwargs = {"score_threshold": 0.5})

matched_docs = retriever.get_relevant_documents(query=query)

matched_docs

[Document(page_content="Elon Musk's paternal great-grandmother was a Dutchwoman descended from the Dutch Free Burghers, while one of his maternal great-grandparents came from Switzerland. His paternal grandmother was English from Liverpool and his paternal grandfather Walter Henry J. Musk was South African. Elon Musk's father, Errol Musk, is a South African former electrical and mechanical engineer consultant and", metadata={'source': 'https://en.wikipedia.org/wiki/Musk_family', 'summary': 'The Musk family is a wealthy family of South African origin that is largely active in the United States and Canada. The Musks are of English, Anglo-Canadian, Pennsylvania Dutch, and Swiss descent. The family is known for its entrepreneurial endeavours. Elon Musk was formerly the wealthiest person in the world, with an estimated net worth of US$232 billion as of December 2023, according to the Bloomberg Billionaires Index.', 'title': 'Musk family'}),
 Document(page_content="Elon Musk's paternal great

In [None]:
db._collection.delete(ids=["1"])

Delete of nonexisting embedding ID: 1
Delete of nonexisting embedding ID: 1


In [None]:
len(docs)

76

In [None]:
retriever = db.as_retriever(search_type="similarity_score_threshold", search_kwargs = {"score_threshold": 0.5})

matched_docs = retriever.get_relevant_documents(query=query)

matched_docs

[Document(page_content="Elon Musk's paternal great-grandmother was a Dutchwoman descended from the Dutch Free Burghers, while one of his maternal great-grandparents came from Switzerland. His paternal grandmother was English from Liverpool and his paternal grandfather Walter Henry J. Musk was South African. Elon Musk's father, Errol Musk, is a South African former electrical and mechanical engineer consultant and", metadata={'source': 'https://en.wikipedia.org/wiki/Musk_family', 'summary': 'The Musk family is a wealthy family of South African origin that is largely active in the United States and Canada. The Musks are of English, Anglo-Canadian, Pennsylvania Dutch, and Swiss descent. The family is known for its entrepreneurial endeavours. Elon Musk was formerly the wealthiest person in the world, with an estimated net worth of US$232 billion as of December 2023, according to the Bloomberg Billionaires Index.', 'title': 'Musk family'}),
 Document(page_content="Elon Musk's paternal great

In [None]:
docs

[Document(page_content='Elon Reeve Musk (; EE-lon; born June 28, 1971) is a businessman and investor. He is the founder, chairman, CEO, and CTO of SpaceX; angel investor, CEO, product architect, and former chairman of Tesla, Inc.; owner, executive chairman, and CTO of X Corp.; founder of the Boring Company and xAI; co-founder of Neuralink and OpenAI; and president of the Musk Foundation. He is one of the wealthiest', metadata={'title': 'Elon Musk', 'summary': "Elon Reeve Musk (; EE-lon; born June 28, 1971) is a businessman and investor. He is the founder, chairman, CEO, and CTO of SpaceX; angel investor, CEO, product architect, and former chairman of Tesla, Inc.; owner, executive chairman, and CTO of X Corp.; founder of the Boring Company and xAI; co-founder of Neuralink and OpenAI; and president of the Musk Foundation. He is one of the wealthiest people in the world, with an estimated net worth of US$190 billion as of March 2024, according to the Bloomberg Billionaires Index, and $195

# Other Retrievers

In [1]:
import chromadb
from dotenv import load_dotenv
from langchain.chat_models import ChatOpenAI
from langchain_community.document_loaders import WikipediaLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import Chroma

chunk_size = 400
chunk_overlap = 100

# Loading the environment variables
load_dotenv()

# Loading the chat model
chat = ChatOpenAI()

# Loading data
loader = WikipediaLoader(query='Steve Jobs', load_max_docs=5)
documents = loader.load()

# Text splitting
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
docs = text_splitter.split_documents(documents=documents)

# Embedding function
embedding_function = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-large-en-v1.5",
    model_kwargs={'device': 'cpu'},
    encode_kwargs={"normalize_embeddings": True}
)

# Vector store
db = Chroma.from_documents(docs, embedding_function, persist_directory="../output/steve_jobs_db")

retriever = db.as_retriever()

query = "When was Steve Jobs fired from Apple?"

  warn_deprecated(


  lis = BeautifulSoup(html).find_all('li')


# BM25 Retriever

In [None]:
# !pip install rank_bm25



In [None]:
from langchain.retrievers import BM25Retriever

In [None]:
bm25_retriever = BM25Retriever.from_documents(docs)

In [None]:
matched_docs = bm25_retriever.get_relevant_documents("Musk")

In [None]:
matched_docs

[Document(page_content="October 17. Weeks before the trial was set to begin, Musk reversed course, announcing that he would move forward with the acquisition. The deal was closed on October 27, with Musk immediately becoming Twitter's new owner and CEO. Twitter was taken private and merged into a new parent company named X Corp. Musk promptly fired several top executives, including previous CEO Parag Agrawal. Musk has", metadata={'title': 'Acquisition of Twitter by Elon Musk', 'summary': 'Business magnate Elon Musk initiated an acquisition of American social media company Twitter, Inc. on April 14, 2022, and concluded it on October 27, 2022. Musk had begun buying shares of the company in January 2022, becoming its largest shareholder by April with a 9.1 percent ownership stake. Twitter invited Musk to join its board of directors, an offer he initially accepted before declining. On April 14, Musk made an unsolicited offer to purchase the company, to which Twitter\'s board responded with

# Semantic Retrievers

In [None]:
!pip install python-dotenv



In [None]:
# chunk_size = 400
# chunk_overlap = 100

# # Loading the environment variables
# load_dotenv()

# # Loading the chat model
# chat = ChatOpenAI()

# # Loading data
# loader = WikipediaLoader(query='Steve Jobs', load_max_docs=5)
# documents = loader.load()

# # Text splitting
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
# docs = text_splitter.split_documents(documents=documents)

# # Embedding function
# embedding_function = HuggingFaceBgeEmbeddings(
#     model_name="BAAI/bge-large-en-v1.5",
#     model_kwargs={'device': 'cpu'},
#     encode_kwargs={"normalize_embeddings": True}
# )

# # Vector store
# db = Chroma.from_documents(docs, embedding_function, persist_directory="../output/steve_jobs_db")



  lis = BeautifulSoup(html).find_all('li')


In [None]:
retriever = db.as_retriever()

In [None]:
query = "When was Steve Jobs fired from Apple?"

## MultiQuery Retriever

In [None]:
from langchain.retrievers.multi_query import MultiQueryRetriever

In [None]:
mq_retriever = MultiQueryRetriever.from_llm(retriever=retriever, llm=chat)

In [None]:
import logging
logging.basicConfig()
logging.getLogger('langchain.retrievers.multi_query').setLevel(logging.INFO)

In [None]:
mq_retriever.get_relevant_documents(query=query)

INFO:langchain.retrievers.multi_query:Generated queries: ["1. What year did Apple terminate Steve Jobs' employment?", '2. At what point in time was Steve Jobs ousted from his position at Apple?', '3. When did Steve Jobs experience his departure from Apple through termination?']


[Document(page_content="In 1985, Jobs departed Apple after a long power struggle with the company's board and its then-CEO, John Sculley. That same year, Jobs took some Apple employees with him to found NeXT, a computer platform development company that specialized in computers for higher-education and business markets, serving as its CEO. In 1986, he helped develop the visual effects industry by funding the computer", metadata={'source': 'https://en.wikipedia.org/wiki/Steve_Jobs', 'summary': 'Steven Paul Jobs (February 24, 1955 – October 5, 2011) was an American businessman, inventor, and investor best known for co-founding the technology giant Apple Inc. Jobs was also the founder of NeXT and chairman and majority shareholder of Pixar. He was a pioneer of the personal computer revolution of the 1970s and 1980s, along with his early business partner and fellow Apple co-founder Steve Wozniak.\nJobs was born in San Francisco in 1955 and adopted shortly afterwards. He attended Reed Colleg

In [None]:
print(*["1. What year did Apple terminate Steve Jobs' employment?", '2. At what point in time was Steve Jobs ousted from his position at Apple?',
      '3. When did Steve Jobs experience his departure from Apple through termination?'], sep='\n')

1. What year did Apple terminate Steve Jobs' employment?
2. At what point in time was Steve Jobs ousted from his position at Apple?
3. When did Steve Jobs experience his departure from Apple through termination?


In [None]:
retrieved_docs = mq_retriever.get_relevant_documents(query=query)

INFO:langchain.retrievers.multi_query:Generated queries: ["1. What was the date of Steve Jobs' termination from Apple?", '2. At what time was Steve Jobs dismissed from his position at Apple?', '3. When did Apple part ways with Steve Jobs?']


In [None]:
['1985' in doc.page_content for doc in retrieved_docs]

[False, True, False, False, False, False]

In [None]:
print(retrieved_docs[1].page_content)

In 1985, Jobs departed Apple after a long power struggle with the company's board and its then-CEO, John Sculley. That same year, Jobs took some Apple employees with him to found NeXT, a computer platform development company that specialized in computers for higher-education and business markets, serving as its CEO. In 1986, he helped develop the visual effects industry by funding the computer


# Contextual Compression

In [None]:
# Retrieval

# Query -> get the responses

# Query + responses -> LLM

# Extract the relevant from the responses

In [None]:
db = Chroma(persist_directory="../output/steve_jobs.db", embedding_function=embedding_function)

In [None]:
sim_docs = retriever.get_relevant_documents(query=query)

In [None]:
sim_docs

[Document(page_content="In 1985, Jobs departed Apple after a long power struggle with the company's board and its then-CEO, John Sculley. That same year, Jobs took some Apple employees with him to found NeXT, a computer platform development company that specialized in computers for higher-education and business markets, serving as its CEO. In 1986, he helped develop the visual effects industry by funding the computer", metadata={'source': 'https://en.wikipedia.org/wiki/Steve_Jobs', 'summary': 'Steven Paul Jobs (February 24, 1955 – October 5, 2011) was an American businessman, inventor, and investor best known for co-founding the technology giant Apple Inc. Jobs was also the founder of NeXT and chairman and majority shareholder of Pixar. He was a pioneer of the personal computer revolution of the 1970s and 1980s, along with his early business partner and fellow Apple co-founder Steve Wozniak.\nJobs was born in San Francisco in 1955 and adopted shortly afterwards. He attended Reed Colleg

In [None]:
chat = ChatOpenAI(temperature=0)

In [None]:
# Document compressor

from langchain.retrievers.document_compressors import LLMChainExtractor

compressor = LLMChainExtractor.from_llm(llm=chat)

compressor

LLMChainExtractor(llm_chain=LLMChain(prompt=PromptTemplate(input_variables=['context', 'question'], output_parser=NoOutputParser(), template='Given the following question and context, extract any part of the context *AS IS* that is relevant to answer the question. If none of the context is relevant return NO_OUTPUT. \n\nRemember, *DO NOT* edit the extracted parts of the context.\n\n> Question: {question}\n> Context:\n>>>\n{context}\n>>>\nExtracted relevant parts:'), llm=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x0000025B530BAD90>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x0000025B535FB550>, temperature=0.0, openai_api_key='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', openai_proxy='')), get_input=<function default_get_input at 0x0000025B51FEC820>)

In [None]:
print(compressor.llm_chain.prompt.template)

Given the following question and context, extract any part of the context *AS IS* that is relevant to answer the question. If none of the context is relevant return NO_OUTPUT. 

Remember, *DO NOT* edit the extracted parts of the context.

> Question: {question}
> Context:
>>>
{context}
>>>
Extracted relevant parts:


In [None]:
# Compression Retriever

from langchain.retrievers import ContextualCompressionRetriever

compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=retriever)

compression_retriever

ContextualCompressionRetriever(base_compressor=LLMChainExtractor(llm_chain=LLMChain(prompt=PromptTemplate(input_variables=['context', 'question'], output_parser=NoOutputParser(), template='Given the following question and context, extract any part of the context *AS IS* that is relevant to answer the question. If none of the context is relevant return NO_OUTPUT. \n\nRemember, *DO NOT* edit the extracted parts of the context.\n\n> Question: {question}\n> Context:\n>>>\n{context}\n>>>\nExtracted relevant parts:'), llm=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x0000025B530BAD90>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x0000025B535FB550>, temperature=0.0, openai_api_key='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', openai_proxy='')), get_input=<function default_get_input at 0x0000025B51FEC820>), base_retriever=VectorStoreRetriever(tags=['Chroma', 'HuggingFaceBgeEmbeddings'], vectorstore=<langchain_community.vec

In [None]:
matched_docs = compression_retriever.get_relevant_documents(query=query)



In [None]:
matched_docs

[Document(page_content="In 1985, Jobs departed Apple after a long power struggle with the company's board and its then-CEO, John Sculley.", metadata={'source': 'https://en.wikipedia.org/wiki/Steve_Jobs', 'summary': 'Steven Paul Jobs (February 24, 1955 – October 5, 2011) was an American businessman, inventor, and investor best known for co-founding the technology giant Apple Inc. Jobs was also the founder of NeXT and chairman and majority shareholder of Pixar. He was a pioneer of the personal computer revolution of the 1970s and 1980s, along with his early business partner and fellow Apple co-founder Steve Wozniak.\nJobs was born in San Francisco in 1955 and adopted shortly afterwards. He attended Reed College in 1972 before withdrawing that same year. In 1974, he traveled through India, seeking enlightenment before later studying Zen Buddhism. He and Wozniak co-founded Apple in 1976 to further develop and sell Wozniak\'s Apple I personal computer. Together, the duo gained fame and weal

In [None]:
[len(doc.page_content) for doc in matched_docs]

[113, 135]

# Parent Document Retriever

In [None]:
# Split paragraph
# split sentence
# match sentences with query
# get the paragraph with most matching sentences.

In [None]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore

In [None]:
parent_splitter = CharacterTextSplitter(separator="\n\n", chunk_size=1000, chunk_overlap=100)
child_splitter = CharacterTextSplitter(separator="\n", chunk_size=200, chunk_overlap=50)

store = InMemoryStore() # parent documents

In [None]:
par_doc_retriever = ParentDocumentRetriever(vectorstore=db, docstore=store, child_splitter=child_splitter, parent_splitter=parent_splitter)

In [None]:
par_doc_retriever.add_documents(docs)

In [None]:
par_doc_retriever.get_relevant_documents(query=query)

[Document(page_content="In 1985, Jobs departed Apple after a long power struggle with the company's board and its then-CEO, John Sculley. That same year, Jobs took some Apple employees with him to found NeXT, a computer platform development company that specialized in computers for higher-education and business markets, serving as its CEO. In 1986, he helped develop the visual effects industry by funding the computer", metadata={'title': 'Steve Jobs', 'summary': 'Steven Paul Jobs (February 24, 1955 – October 5, 2011) was an American businessman, inventor, and investor best known for co-founding the technology giant Apple Inc. Jobs was also the founder of NeXT and chairman and majority shareholder of Pixar. He was a pioneer of the personal computer revolution of the 1970s and 1980s, along with his early business partner and fellow Apple co-founder Steve Wozniak.\nJobs was born in San Francisco in 1955 and adopted shortly afterwards. He attended Reed College in 1972 before withdrawing th