<a href="https://colab.research.google.com/github/sanislearning/LangChainProjects/blob/main/SemanticSearchEngine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
# Goal: get familiar with LangChain's document loader, embedding, and vector store abstractions

In [31]:
import pandas as pd
import numpy as np
from google.colab import userdata

In [32]:
import getpass
import os

# Turn on LangSmith tracing for the LangChain calls made in this notebook.
os.environ["LANGSMITH_TRACING"] = "true"

# Read the key from Colab's secret store only when the environment does not
# already hold a non-empty value, the same way GOOGLE_API_KEY is handled in
# the next cell. This avoids clobbering a key that was set deliberately.
if not os.environ.get("LANGSMITH_API_KEY"):
    os.environ["LANGSMITH_API_KEY"] = userdata.get("LANGSMITH_API_KEY")

In [33]:
# Pull the Google API key from Colab's secret store unless the environment
# already has a non-empty GOOGLE_API_KEY.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = userdata.get("GOOGLE_API_KEY")

# Documents

In [34]:
from langchain_core.documents import Document

In [35]:
from langchain_community.document_loaders import PyPDFLoader
from google.colab import drive
# Mount Google Drive so the PDF stored there is reachable from this runtime.
drive.mount("/content/drive")
file_path = "/content/drive/MyDrive/PData/nke-10k-2023.pdf"

# PyPDFLoader loads one Document object per PDF page; a Document holds a unit
# of text together with its associated metadata.
loader = PyPDFLoader(file_path)
docs = loader.load()
print(len(docs))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
107


In [36]:
# Show the text extracted from the first loaded Document.
print(docs[0].page_content)

Table of Contents
UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
FORM 10-K
(Mark One)
☑  ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934
FOR THE FISCAL YEAR ENDED MAY 31, 2023
OR
☐  TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934
FOR THE TRANSITION PERIOD FROM                         TO                         .
Commission File No. 1-10635
NIKE, Inc.
(Exact name of Registrant as specified in its charter)
Oregon 93-0584541
(State or other jurisdiction of incorporation) (IRS Employer Identification No.)
One Bowerman Drive, Beaverton, Oregon 97005-6453
(Address of principal executive offices and zip code)
(503) 671-6453
(Registrant's telephone number, including area code)
SECURITIES REGISTERED PURSUANT TO SECTION 12(B) OF THE ACT:
Class B Common Stock NKE New York Stock Exchange
(Title of each class) (Trading symbol) (Name of each exchange on which registered)
SECURITIES REGISTERED PURSU

In [37]:
# Show the metadata dictionary attached to the first loaded Document.
print(docs[0].metadata)

{'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creator': 'EDGAR Filing HTML Converter', 'creationdate': '2023-07-20T16:22:00-04:00', 'title': '0000320187-23-000039', 'author': 'EDGAR Online, a division of Donnelley Financial Solutions', 'subject': 'Form 10-K filed on 2023-07-20 for the period ending 2023-05-31', 'keywords': '0000320187-23-000039; ; 10-K', 'moddate': '2023-07-20T16:22:08-04:00', 'source': '/content/drive/MyDrive/PData/nke-10k-2023.pdf', 'total_pages': 107, 'page': 0, 'page_label': '1'}


# Splitting

In [41]:
# A full page may contain far more text than is needed to answer a query. The end goal is to retrieve Document objects that answer an input query
# Nearby text shouldn't wash out the meaning of relevant portions, so we use text splitters
# Here we will be splitting documents into chunks of 1000 characters with a 200-character overlap

In [43]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split every page into chunks of at most 1000 characters with a
# 200-character overlap. add_start_index=True tracks where each chunk
# started in the original Document.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True,
)
all_splits = text_splitter.split_documents(docs)
len(all_splits)

516

# Embedding and Vectorizing

In [44]:
#Vector search is a common way to store and search over unstructured data
#VectorStore contains methods for adding text and Document objects to the store
#and querying them using various similarity metrics

In [45]:
from langchain.chat_models import init_chat_model
# Chat model handle: "gemini-2.0-flash" resolved through the google_genai provider.
llm = init_chat_model("gemini-2.0-flash", model_provider="google_genai")

In [46]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings #imports the Embedding model related to the Gemini models
# Embedding model instance used to turn text into vectors.
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

In [47]:
# Creates an in-memory vector store (i.e., backed by a Python dictionary) that uses cosine similarity for matching
from langchain_core.vectorstores import InMemoryVectorStore
# Build the store around the embedding model defined above.
vector_store = InMemoryVectorStore(embedding=embeddings)

In [50]:
# Embed the first two chunks and confirm both vectors have the same length.
vector1 = embeddings.embed_query(all_splits[0].page_content)
vector2 = embeddings.embed_query(all_splits[1].page_content)
assert len(vector1) == len(vector2)

# Report the vector length and preview the first ten components.
print(f"Generated vectors of length {len(vector1)}\n")
print(vector1[:10])

Generated vectors of length 768

[0.003303560661152005, -0.01885664090514183, -0.023528870195150375, 0.013265608809888363, 0.04694835841655731, 0.04489297419786453, 0.030707117170095444, 0.017642803490161896, 0.0011852466268464923, 0.028473228216171265]


In [52]:
# Index every chunk in the vector store. Each chunk gets a deterministic,
# position-based ID so that re-running this cell overwrites the existing
# entries rather than storing a second copy of every chunk (with
# auto-generated IDs, each re-run would add duplicates that later show up
# as repeated search results).
ids = vector_store.add_documents(
    documents=all_splits,  # the split Documents produced by the text splitter
    ids=[f"chunk-{position}" for position in range(len(all_splits))],
)

# Usage