# Requirements

In [7]:
! pip install -q langchain
! pip install -q pypdf
! pip install -q yt_dlp
! pip install -q pydub
! pip install -q ffmpeg
! pip install -q ffprobe
! pip install -q chromadb
! pip install -q pysqlite3-binary

In [9]:
#latest version of chroma has some issues with sqlite3

# these three lines swap the stdlib sqlite3 lib with the pysqlite3 package
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

## API KEYS

In [10]:
import os
import openai
import sys
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

openai.api_key  = os.environ['OPENAI_API_KEY']

# Document Loading

#### pdf

In [4]:
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader("docs/MachineLearning-Lecture01.pdf")
pages = loader.load()

#### youtube

In [5]:
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import OpenAIWhisperParser
from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader

In [6]:
url="https://www.youtube.com/watch?v=jGwO_UgTS7I"
save_dir="docs/youtube/"
loader = GenericLoader(
    YoutubeAudioLoader([url],save_dir),
    OpenAIWhisperParser()
)
docs = loader.load()

[youtube] Extracting URL: https://www.youtube.com/watch?v=jGwO_UgTS7I
[youtube] jGwO_UgTS7I: Downloading webpage
[youtube] jGwO_UgTS7I: Downloading ios player API JSON
[youtube] jGwO_UgTS7I: Downloading android player API JSON
[youtube] jGwO_UgTS7I: Downloading m3u8 information
[info] jGwO_UgTS7I: Downloading 1 format(s): 140
[download] docs/youtube//Stanford CS229： Machine Learning Course, Lecture 1 - Andrew Ng (Autumn 2018).m4a has already been downloaded
[download] 100% of   69.76MiB


ERROR: Postprocessing: ffprobe and ffmpeg not found. Please install or provide the path using --ffmpeg-location


DownloadError: ERROR: Postprocessing: ffprobe and ffmpeg not found. Please install or provide the path using --ffmpeg-location

#### urls

In [7]:
from langchain.document_loaders import WebBaseLoader

loader = WebBaseLoader("https://github.com/basecamp/handbook/blob/master/37signals-is-you.md")

In [8]:
docs = loader.load()

In [9]:
print(docs[0].page_content[:500])










































































handbook/37signals-is-you.md at master · basecamp/handbook · GitHub

















































Skip to content







Toggle navigation










            Sign up
          


 













        Product
        












Actions
        Automate any workflow
      







Packages
        Host and manage packages
      







Security
        Find and fix vulnerabilities
      







Codesp


#### notion

In [12]:
from langchain.document_loaders import NotionDirectoryLoader
loader = NotionDirectoryLoader("docs/Notion_DB")
docs = loader.load()

In [11]:
print(docs[0].page_content[0:200])
docs[0].metadata

IndexError: list index out of range

# Document Splitter

#### character

In [13]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

In [15]:
chunk_size =26
chunk_overlap = 4
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)
c_splitter = CharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)

In [16]:
text1 = 'abcdefghijklmnopqrstuvwxyz'
r_splitter.split_text(text1)

['abcdefghijklmnopqrstuvwxyz']

In [17]:
text2 = 'abcdefghijklmnopqrstuvwxyzabcdefg'
r_splitter.split_text(text2)

['abcdefghijklmnopqrstuvwxyz', 'wxyzabcdefg']

In [19]:
text3 = "a b c d e f g h i j k l m n o p q r s t u v w x y z"
r_splitter.split_text(text3)

['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']

In [20]:
c_splitter.split_text(text3)

['a b c d e f g h i j k l m n o p q r s t u v w x y z']

In [24]:
c_splitter = CharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separator = ' '
)
c_splitter.split_text(text3)

['a b c d e f g h i j k l m', 'l m n o p q r s t u v w x', 'w x y z']

In [22]:
some_text = """When writing documents, writers will use document structure to group content. \
This can convey to the reader, which idea's are related. For example, closely related ideas \
are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n  \
Paragraphs are often delimited with a carriage return or two carriage returns. \
Carriage returns are the "backslash n" you see embedded in this string. \
Sentences have a period at the end, but also, have a space.\
and words are separated by space."""
len(some_text)

496

In [25]:
c_splitter = CharacterTextSplitter(
    chunk_size=450,
    chunk_overlap=0,
    separator = ' '
)
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=450,
    chunk_overlap=0, 
    separators=["\n\n", "\n", " ", ""]
)
c_splitter.split_text(some_text)

['When writing documents, writers will use document structure to group content. This can convey to the reader, which idea\'s are related. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this string. Sentences have a period at the end, but also,',
 'have a space.and words are separated by space.']

In [26]:
r_splitter.split_text(some_text)

["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.",
 'Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this string. Sentences have a period at the end, but also, have a space.and words are separated by space.']

In [27]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    separators=["\n\n", "\n", "(?<=\. )", " ", ""]
)
r_splitter.split_text(some_text)

["When writing documents, writers will use document structure to group content. This can convey to the reader, which idea's are related.",
 'For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document.',
 'Paragraphs are often delimited with a carriage return or two carriage returns.',
 'Carriage returns are the "backslash n" you see embedded in this string.',
 'Sentences have a period at the end, but also, have a space.and words are separated by space.']

In [29]:
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader("docs/MachineLearning-Lecture01.pdf")
pages = loader.load()

In [30]:
from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=150,
    length_function=len
)

In [31]:
docs = text_splitter.split_documents(pages)

#### token

In [32]:
from langchain.text_splitter import TokenTextSplitter

In [33]:
text_splitter = TokenTextSplitter(chunk_size=1, chunk_overlap=0)

In [34]:
text1 = "foo bar bazzyfoo"
text_splitter.split_text(text1)

['foo', ' bar', ' b', 'az', 'zy', 'foo']

In [35]:
text_splitter = TokenTextSplitter(chunk_size=10, chunk_overlap=0)

In [36]:
docs = text_splitter.split_documents(pages)
docs[0]

Document(page_content='MachineLearning-Lecture01  \n', metadata={'source': 'docs/MachineLearning-Lecture01.pdf', 'page': 0})

#### context aware

In [37]:
from langchain.text_splitter import MarkdownHeaderTextSplitter

In [38]:
markdown_document = """# Title\n\n \
## Chapter 1\n\n \
Hi this is Jim\n\n Hi this is Joe\n\n \
### Section \n\n \
Hi this is Lance \n\n 
## Chapter 2\n\n \
Hi this is Molly"""

In [40]:
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

In [41]:
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on
)
md_header_splits = markdown_splitter.split_text(markdown_document)

In [42]:
md_header_splits[0]

Document(page_content='Hi this is Jim  \nHi this is Joe', metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1'})

In [43]:
md_header_splits[1]

Document(page_content='Hi this is Lance', metadata={'Header 1': 'Title', 'Header 2': 'Chapter 1', 'Header 3': 'Section'})

# VectorStore & Embeddings

In [71]:
from langchain.document_loaders import PyPDFLoader

# Load PDF
loaders = [
    # Duplicate documents on purpose - messy data
    # PyPDFLoader("docs/MachineLearning-Lecture01.pdf"),
    # PyPDFLoader("docs/MachineLearning-Lecture01.pdf"),
    # PyPDFLoader("docs/MachineLearning-Lecture02.pdf"),
    # PyPDFLoader("docs/MachineLearning-Lecture03.pdf")
    PyPDFLoader("docs/Gadgeon.pdf"),
    PyPDFLoader("docs/doc.pdf"),
]
docs = []
for loader in loaders:
    docs.extend(loader.load())

In [72]:
# Split
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap = 100
)

In [73]:
splits = text_splitter.split_documents(docs)
len(splits)

102

#### Embeddings

In [74]:
from langchain.embeddings.openai import OpenAIEmbeddings
embedding = OpenAIEmbeddings()

In [75]:
sentence1 = "i like dogs"
sentence2 = "i like canines"
sentence3 = "the weather is ugly outside"

embedding1 = embedding.embed_query(sentence1)
embedding2 = embedding.embed_query(sentence2)
embedding3 = embedding.embed_query(sentence3)

Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-M9CVthH53cZTlzWYQuk6Ug3B on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-M9CVthH53cZTlzWYQuk6Ug3B on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/

In [76]:
import numpy as np

In [77]:
print(np.dot(embedding1, embedding2))
print(np.dot(embedding1, embedding3))
print(np.dot(embedding2, embedding3))

0.9631973137440455
0.771004645868524
0.7596054193696385


#### Vectorstores

In [78]:
from langchain.vectorstores import Chroma

In [79]:
persist_directory = 'docs/chroma/'
!rm -rf ./docs/chroma  # remove old database files if any

In [80]:
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory
)

Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-M9CVthH53cZTlzWYQuk6Ug3B on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-text-embedding-ada-002 in organization org-M9CVthH53cZTlzWYQuk6Ug3B on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/

In [81]:
print(vectordb._collection.count())

102


#### Similarity Search

In [91]:
question = "what is does webcardio do?"

In [92]:
docs = vectordb.similarity_search(question,k=3)

In [94]:
def pretty_print_docs(docs):
    print(f"\n{'-' * 100}\n".join([f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]))


In [95]:
pretty_print_docs(docs)

Document 1:

or
any
other
buildings
from
a
mobile
or
any
internet
accessible
device
from
anywhere
in
the
world.
Gadgeon
Medical
Systems
Pvt.
Ltd
a
fully
owned
subsidiary
of
Gadgeon
Smart
Systems
Private
Limited,
with
a
vision
to
be
the
leader
in
engineering
a
healthier
future
where
digitally
enabled,
personalized,
predictive,
preventive,
and
participatory
healthcare
helps
people
live
healthier
and
longer
lives.
The
company's
mission
is
to
focus
on
commercializing
innovative
technologies
to
create
clinical
outcomes
for
the
patients.
Our
flagship
healthcare
platform
–
WebCardio
is
a
connected
care
platform
that
is
already
transforming
the
way
Ambulatory
Cardiac
monitoring
is
delivered
and
has
become
the
new
normal
among
the
leading
cardiologists
and
neurologists
in
hospitals
across
India
and
abroad.
Our
Product
engineering
services
use
hardware,
embedded,
software,
and
IT
solutions
for
faster
design,
development,
and
launching
of
products.
Today,
lack
of
the
right
domain
experience
or
te

In [97]:
docs[0].page_content

"or\nany\nother\nbuildings\nfrom\na\nmobile\nor\nany\ninternet\naccessible\ndevice\nfrom\nanywhere\nin\nthe\nworld.\nGadgeon\nMedical\nSystems\nPvt.\nLtd\na\nfully\nowned\nsubsidiary\nof\nGadgeon\nSmart\nSystems\nPrivate\nLimited,\nwith\na\nvision\nto\nbe\nthe\nleader\nin\nengineering\na\nhealthier\nfuture\nwhere\ndigitally\nenabled,\npersonalized,\npredictive,\npreventive,\nand\nparticipatory\nhealthcare\nhelps\npeople\nlive\nhealthier\nand\nlonger\nlives.\nThe\ncompany's\nmission\nis\nto\nfocus\non\ncommercializing\ninnovative\ntechnologies\nto\ncreate\nclinical\noutcomes\nfor\nthe\npatients.\nOur\nflagship\nhealthcare\nplatform\n–\nWebCardio\nis\na\nconnected\ncare\nplatform\nthat\nis\nalready\ntransforming\nthe\nway\nAmbulatory\nCardiac\nmonitoring\nis\ndelivered\nand\nhas\nbecome\nthe\nnew\nnormal\namong\nthe\nleading\ncardiologists\nand\nneurologists\nin\nhospitals\nacross\nIndia\nand\nabroad.\nOur\nProduct\nengineering\nservices\nuse\nhardware,\nembedded,\nsoftware,\nand\nIT\nso

In [87]:
vectordb.persist()

# Retrieval