In [1]:
import os
import bs4

from langchain_community.document_loaders import TextLoader, WebBaseLoader, PyPDFLoader
from dotenv import load_dotenv

# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# os.environ only accepts str values, so assigning os.getenv(...) directly
# raises TypeError when the variable is not defined. Re-export the key only
# when it is actually present; otherwise leave the environment untouched.
langchain_api_key = os.getenv('LANGCHAIN_API_KEY')
if langchain_api_key is not None:
    os.environ['LANGCHAIN_API_KEY'] = langchain_api_key

In [17]:
# simple text loader
# Decode the file as UTF-8 explicitly. With no encoding given, TextLoader uses
# the platform default; the recorded output of this cell contains 'â€¦', which
# is presumably a UTF-8 ellipsis that was decoded with a Windows code page.
text_loader = TextLoader("./test_speech.txt", encoding="utf-8")
text_documents = text_loader.load()
# Show the raw text of the single loaded document.
text_documents[0].page_content

'The world must be made safe for democracy. Its peace must be planted upon the tested foundations of political liberty. We have no selfish ends to serve. We desire no conquest, no dominion. We seek no indemnities for ourselves, no material compensation for the sacrifices we shall freely make. We are but one of the champions of the rights of mankind. We shall be satisfied when those rights have been made as secure as the faith and the freedom of nations can make them.\n\nJust because we fight without rancor and without selfish object, seeking nothing for ourselves but what we shall wish to share with all free peoples, we shall, I feel confident, conduct our operations as belligerents without passion and ourselves observe with proud punctilio the principles of right and of fair play we profess to be fighting for.\n\nâ€¦\n\nIt will be all the easier for us to conduct ourselves as belligerents in a high spirit of right and fairness because we act without animus, not in enmity toward a peop

In [18]:
# Web page loader: fetch one blog post and parse only the elements whose CSS
# class is 'post-title', 'post-content' or 'post-header'.
post_strainer = bs4.SoupStrainer(
    class_ = ('post-title', 'post-content', 'post-header')
)
web_loader = WebBaseLoader(
    web_paths = ('https://lilianweng.github.io/posts/2023-06-23-agent/', ),
    bs_kwargs = {'parse_only': post_strainer},
)
# NOTE: this rebinds text_documents, replacing whatever it held before.
text_documents = web_loader.load()
# Size (in characters) of the text extracted from the page.
len(text_documents[0].page_content)

In [26]:
# PDF loader (pypdf): read ./article.pdf into a list of documents that the
# text splitter can break into chunks afterwards.
pdf_loader = PyPDFLoader(file_path="./article.pdf")
pdf_docs = pdf_loader.load()
# Number of documents produced for the PDF.
len(pdf_docs)

8

In [29]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Break the loaded PDF documents into chunks of at most 1000 characters,
# with a 200-character overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap = 200,
    chunk_size = 1000,
)
documents = text_splitter.split_documents(documents = pdf_docs)

# Inspect the first chunk.
documents[0]

Document(page_content='IOE Graduate Conference\n[Placeholder for\nPublication\nInformation]\nAutomatic Identification of Monuments in Images using Single-Shot\nDetectors\n[Author name(s) masked for blind review]\n[Author information masked for blind review]\n[Author contact masked for blind review]\nAbstract\nMonuments, embodying historical, archaeological, and cultural significance, serve as gateways to unraveling rich histories,\nparticularly for foreigners. To aid monument identification within images, we fine-tuned the lightweight CNN model, MobileNetV2,\nwith SSD for feature extraction and prediction of monument locations and labels. Subsequently, we trained the more resource\nintensive YOLOv5s model. Our dataset comprised manually collected databases from Kathmandu Valley’s three Durbar Squares:\nKathmandu, Bhaktapur, and Patan. The SSD reached a maximum mAP@0.5 score of 78.68% for test data, while the YOLOv5s\nmodel demonstrated superior performance, with mAP@0.5 scores peaking 

In [39]:
# vector embedding and vector stores

from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma

# Embedding function backed by the "gemma:2b" model served through Ollama.
ollama_emb = OllamaEmbeddings(model="gemma:2b")

# Embed and index only the first 10 chunks in a Chroma vector store.
# NOTE(review): no collection name or persist directory is given, so
# re-running this cell in the same kernel may append duplicate entries to
# Chroma's default in-memory collection — confirm.
db = Chroma.from_documents(documents = documents[:10], embedding = ollama_emb)


In [50]:
# Ask the vector store for the chunks most similar to the query text.
query = 'achievement section'
retrieved_results = db.similarity_search(query = query)

In [51]:
from pprint import pprint

# Pretty-print the text of the first returned result.
best_match = retrieved_results[0]
pprint(best_match.page_content)

('variants (SSD300 and SSD512) but with significantly fewer\n'
 'parameters and lower computation cost. It was found to be 20\n'
 'times more efficient and 10 times smaller than the original SSD\n'
 'while outperforming YOLOv2 on COCO. A study done by [ 6]\n'
 'compared YOLO and MobileNet-SSD for single-stage object\n'
 'detection, finding both suitable for diverse scenarios. YOLO\n'
 'prioritized accuracy but faced localization challenges, while\n'
 'SSD excelled in speed. However, SSD with MobileNetV2 could\n'
 'offer comparable speed to YOLOv5s with less demanding\n'
 'hardware, with a slight accuracy trade-off.\n'
 'The mentioned research advancements in monument\n'
 'classification often struggle with localizing and detecting\n'
 'multiple monuments within a single image, necessitating the use\n'
 'of object detection models. In our exploration, no studies were\n'
 'found on identifying Nepalese monuments, and similar\n'
 'monuments clustered closely together. Object detection thu

In [52]:
## faiss vector database
from langchain_community.vectorstores import FAISS

# Build a FAISS index over the first 5 chunks, reusing the same embedding
# function. NOTE: this rebinds `db`, so it no longer refers to the store
# it pointed to before this cell ran.
db = FAISS.from_documents(documents = documents[:5], embedding = ollama_emb)