### 01. Document loader example

In [None]:
# PDF document loader: read 'policy-copy.pdf' and return its contents as a
# list of LangChain Document objects (the cell output shows the page number
# in each Document's metadata).
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader(file_path='policy-copy.pdf')
document = loader.load()

# Bare last expression so the notebook renders the loaded Documents.
document

[Document(metadata={'source': 'policy-copy.pdf', 'page': 0}, page_content='Reference No.: W385291527\nDate: Jun 02, 2024\nSOUMAJIT BISWAS\nFLAT 26077, SOBHA DREAM ACRES, BALAGERE, BANGALORE\nBANGALORE\nKARNATAKA 560087\nMobile No: 7501552890\n \nSub: Risk Assumption Letter\n \nDear SOUMAJIT BISWAS,\n \nWe value your relationship with ICICI Lombard General Insurance Company Limited and thank you for choosing us as your preferred insurance provider.\n \nPlease find enclosedPolicy No. 3005/O/346569624/00/000, The same has been issued based on below mentioned details, provided by you at the time of policy \npurchase.\n \nInsured & Vehicle Details\nName of the Insured\nPeriod of Insurance - Own Damage\nVehicle Make / Model\nRTO City\nVehicle Registration No.\nVehicle Registration Date\nEngine No.\nChassis No.\nCurrent Year NCB(%)\nSOUMAJIT BISWAS\nJun 04, 2024 to Jun 03, 2025\nROYAL ENFIELD / CLASSIC 350 SIGNALS ABS\nKARNATAKA-BANGALORE\nKA01JX1155\nJun 14, 2023\nJ3A5FEP2975579\nME3J3C5FEP2

In [11]:
# Web-based loader: fetch the NSE FII/DII report page and return it as
# LangChain Document objects.
import bs4
from langchain_community.document_loaders import WebBaseLoader

loader = WebBaseLoader(web_paths=("https://www.nseindia.com/reports/fii-dii",))

# Disabled alternative: limit parsing to the element with id="fiidiiTable"
# by constructing the loader with the extra argument
#     bs_kwargs=dict(parse_only=bs4.SoupStrainer(id="fiidiiTable"))
# (this is the only use of the bs4 import above).

# Bare last expression so the notebook renders the loaded Documents.
loader.load()


[Document(metadata={'source': 'https://www.nseindia.com/reports/fii-dii', 'title': '\r\n    FII/FPI & DII trading activity on NSE, BSE and MSEI Reports - NSE India\r\n', 'description': 'Get the latest Additional Surveillance Measure Reports at NSE India (National Stock Exchange of India). Combined FII/FPI trading data across NSE, BSE and MSEI collated on the basis of trades executed by FIIs/FPIs.', 'language': 'en'}, page_content="\n\n\n\n\n\n\n\r\n    FII/FPI & DII trading activity on NSE, BSE and MSEI Reports - NSE India\r\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nOption Chain\nListings\nIPO\nCirculars\nDaily Reports\nHolidays\nPress Releases\nContact Us\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\r\n                \xa0 \xa0\r\n                \nEnglish\nहिन्दी (Hindi)\nमराठी (Marathi)\nગુજરાતી (Gujarati)\nবাংলা (Bengali)\nಕನ್ನಡ (Kannada)\nதமிழ் (Tamil)\nతెలుగు 

### 02. Document Splitter

#### 02.01 PDF splitter example

In [12]:
# Load the policy PDF, then break it into small overlapping chunks.
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Step 1: load the document.
loader = PyPDFLoader(file_path='policy-copy.pdf')
document = loader.load()

# Step 2: split it. chunk_size / chunk_overlap are measured with the
# splitter's length function (characters unless overridden -- see the
# LangChain docs); the small values keep the example output easy to read.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
)
final_document = text_splitter.split_documents(document)

# Bare last expression so the notebook renders the resulting chunks.
final_document

[Document(metadata={'source': 'policy-copy.pdf', 'page': 0}, page_content='Reference No.: W385291527\nDate: Jun 02, 2024\nSOUMAJIT BISWAS'),
 Document(metadata={'source': 'policy-copy.pdf', 'page': 0}, page_content='SOUMAJIT BISWAS\nFLAT 26077, SOBHA DREAM ACRES, BALAGERE, BANGALORE\nBANGALORE\nKARNATAKA 560087'),
 Document(metadata={'source': 'policy-copy.pdf', 'page': 0}, page_content='KARNATAKA 560087\nMobile No: 7501552890\n \nSub: Risk Assumption Letter\n \nDear SOUMAJIT BISWAS,'),
 Document(metadata={'source': 'policy-copy.pdf', 'page': 0}, page_content='We value your relationship with ICICI Lombard General Insurance Company Limited and thank you for'),
 Document(metadata={'source': 'policy-copy.pdf', 'page': 0}, page_content='and thank you for choosing us as your preferred insurance provider.'),
 Document(metadata={'source': 'policy-copy.pdf', 'page': 0}, page_content='Please find enclosedPolicy No. 3005/O/346569624/00/000, The same has been issued based on below'),
 Document(me

#### 02.02 Document splitter for HTML example

In [16]:
from langchain_text_splitters import HTMLHeaderTextSplitter

# (HTML tag, label) pairs naming the header levels to split on.
header_to_split_on = [("h1", "Header 1"), ("h2", "Header 2"), ("h3", "Header 3")]

html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=header_to_split_on)

# Download the page and split it; the returned Documents are the cell output.
html_splitter.split_text_from_url("https://en.wikipedia.org/wiki/Sun")

[Document(metadata={}, page_content='Main menu  \nmove to sidebar hide  \nMain menu  \nNavigation  \nMain pageContentsCurrent eventsRandom articleAbout WikipediaContact us  \nContribute  \nHelpLearn to editCommunity portalRecent changesUpload file  \nSearch  \nSearch  \nAppearance  \nDonate Create account Log in  \nPersonal tools  \nDonate Create account Log in  \nPages for logged out editors learn more  \nContributionsTalk  \nContents move to sidebar hide  \nToggle General characteristics subsection Toggle Structure and fusion subsection Toggle Magnetic activity subsection Toggle Life phases subsection Toggle Location subsection Toggle Observational history subsection Toggle Observation by eyes subsection  \n(Top)  \n1 Etymology  \n2 General characteristics  \n2.1 Rotation  \n3 Composition  \n4 Structure and fusion  \n4.1 Core  \n4.2 Radiative zone  \n4.3 Tachocline  \n4.4 Convective zone  \n4.5 Photosphere  \n4.6 Atmosphere  \n5 Solar radiation  \n6 Magnetic activity  \n6.1 Sunspot  

### 03. Embedding

#### Ollama embedding

In [19]:
from langchain_community.embeddings import OllamaEmbeddings

# Ollama-backed embedding client configured to use the "llama3.2" model.
embedding = OllamaEmbeddings(model="llama3.2")

In [None]:
# Embed two sample sentences in one call; the result is kept in r1.
r1 = embedding.embed_documents([
    "My name is Soumajit",
    "My surname is Biswas",
])

### 04. VectorStore DB

#### FAISS

In [23]:
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import CharacterTextSplitter