In [None]:
pip install langchain_community faiss-cpu openai

In [None]:
pip install langchain_community

In [None]:
pip install langchain-text-splitters langchain-openai

In [None]:
import os
from getpass import getpass

# Never hardcode the key in the notebook: it would be saved and shared with the file.
# Reuse a key that is already in the environment; otherwise prompt for it without echo.
OPENAI_API_KEY = (os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")).strip()
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY is empty; provide a valid OpenAI API key.")

# Export the key so code that reads it from the environment can find it.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [None]:
#@title Import libraries
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter


In [None]:
#@title load the data

# Read the source text file into LangChain Document objects.
loader = TextLoader('/content/speech.txt')
documents = loader.load()

# Break the loaded documents into overlapping chunks for embedding.
# NOTE(review): the chunks displayed in the next cell are longer than 200 characters,
# so chunk_size is not acting as a hard limit here — presumably because
# CharacterTextSplitter only cuts at its separator; confirm against the splitter docs.
text_splitter = CharacterTextSplitter(
    chunk_overlap=20,
    chunk_size=200,
)
docs = text_splitter.split_documents(documents)



In [None]:
# Display the chunks produced by the text splitter.
docs

[Document(metadata={'source': '/content/speech.txt'}, page_content='The paper "Efficient Estimation of Word Representations in Vector Space" by Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean (2013) is a foundational paper that introduced Word2Vec, a highly efficient and effective method for learning word embeddings (also known as word representations or word vectors).'),
 Document(metadata={'source': '/content/speech.txt'}, page_content="Before this paper, various methods existed for creating word representations, often based on complex neural network language models (NNLMs) or statistical methods like Latent Semantic Analysis (LSA). These methods were computationally expensive, especially for large datasets and vocabularies. Word2Vec revolutionized the field by demonstrating that high-quality word vectors could be learned much more efficiently.\nb) Continuous Skip-gram Model\nIdea: Predicts the surrounding context words given the current word. It's essentially the inverse of 

In [None]:
#@title Embedding

# Embedding client; it is used to index the chunks here and to embed the query later on.
embeddings = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY,
    model="text-embedding-3-small",
)

# Embed every chunk and build the FAISS vector store from the result.
db = FAISS.from_documents(docs, embeddings)

In [None]:
# Display the vector store object (its repr) to confirm it was created.
db

<langchain_community.vectorstores.faiss.FAISS at 0x7d14d741ee50>

In [None]:
#@title querying the vector database

query ="what is the main idea behind Continuous Skip-gram Model"

# Search the vector store for the chunks closest to the query text.
# NOTE: this rebinds `docs`, which until now held the split chunks passed to
# FAISS.from_documents. Re-running the embedding cell after this one would
# index only these search results instead of the full set of chunks.
docs = db.similarity_search(query)
docs

[Document(id='7a603cfe-0571-4afc-bd55-ce7e90d48617', metadata={'source': '/content/speech.txt'}, page_content="Before this paper, various methods existed for creating word representations, often based on complex neural network language models (NNLMs) or statistical methods like Latent Semantic Analysis (LSA). These methods were computationally expensive, especially for large datasets and vocabularies. Word2Vec revolutionized the field by demonstrating that high-quality word vectors could be learned much more efficiently.\nb) Continuous Skip-gram Model\nIdea: Predicts the surrounding context words given the current word. It's essentially the inverse of CBOW."),
 Document(id='38cfdc88-0241-4972-9b9e-ab7b50e5a72d', metadata={'source': '/content/speech.txt'}, page_content='Performance: The paper found that Skip-gram generally works better for capturing semantic relationships, especially with large datasets, while CBOW can be faster to train.'),
 Document(id='43fe5acf-3269-42ea-9dcc-5e5c938

In [None]:
# Number of documents returned by the similarity search.
len(docs)

4

In [None]:
# Text of the first returned match.
docs[0].page_content

"Before this paper, various methods existed for creating word representations, often based on complex neural network language models (NNLMs) or statistical methods like Latent Semantic Analysis (LSA). These methods were computationally expensive, especially for large datasets and vocabularies. Word2Vec revolutionized the field by demonstrating that high-quality word vectors could be learned much more efficiently.\nb) Continuous Skip-gram Model\nIdea: Predicts the surrounding context words given the current word. It's essentially the inverse of CBOW."

In [None]:
# Text of the second returned match.
docs[1].page_content

'Performance: The paper found that Skip-gram generally works better for capturing semantic relationships, especially with large datasets, while CBOW can be faster to train.'

In [None]:
#@title Convert vector store into Retriever class
# A retriever acts as an interface for fetching documents from the vector store via a query.

retriever = db.as_retriever() # wrap the vector store in a retriever

# Run the same query through the retriever and show the text of the first returned document.
docs= retriever.invoke(query)
docs[0].page_content

"Before this paper, various methods existed for creating word representations, often based on complex neural network language models (NNLMs) or statistical methods like Latent Semantic Analysis (LSA). These methods were computationally expensive, especially for large datasets and vocabularies. Word2Vec revolutionized the field by demonstrating that high-quality word vectors could be learned much more efficiently.\nb) Continuous Skip-gram Model\nIdea: Predicts the surrounding context words given the current word. It's essentially the inverse of CBOW."

In [None]:
#@title Similarity score with FAISS (returns L2 / Euclidean distance, not Manhattan)

# Each result is a (Document, score) tuple; show the second one.
# NOTE(review): with an L2 distance a smaller score means a closer match — confirm the
# distance strategy configured for this index.
docs_and_score = db.similarity_search_with_score(query)
docs_and_score[1]

(Document(id='38cfdc88-0241-4972-9b9e-ab7b50e5a72d', metadata={'source': '/content/speech.txt'}, page_content='Performance: The paper found that Skip-gram generally works better for capturing semantic relationships, especially with large datasets, while CBOW can be faster to train.'),
 np.float32(0.9252607))

In [None]:
#@title passing vectors instead of sentences

# Embed the query text once so the raw vector can be used to search the store directly.
embedding_vector = embeddings.embed_query(query)

# Show only the dimensionality and the first few components; echoing the whole
# vector floods the notebook output with a very long list of floats.
len(embedding_vector), embedding_vector[:5]

[0.040307268500328064,
 0.01383289322257042,
 0.006966611370444298,
 -0.002762189134955406,
 -0.04765638709068298,
 0.029496794566512108,
 0.060398198664188385,
 -0.010145793668925762,
 -0.06446153670549393,
 0.028518585488200188,
 -0.0041542574763298035,
 -0.03892774134874344,
 -0.01842295564711094,
 0.03147829696536064,
 0.009562630206346512,
 0.003787428606301546,
 0.014133880846202374,
 0.016516700387001038,
 0.05939490720629692,
 0.03338455408811569,
 0.022122595459222794,
 0.02118200995028019,
 -0.007857033051550388,
 0.03373570367693901,
 0.01654178276658058,
 -7.20626485417597e-05,
 0.03127763792872429,
 0.0696786567568779,
 0.02209751307964325,
 -0.010296287946403027,
 -0.027214305475354195,
 -0.018347708508372307,
 -0.04341747611761093,
 -0.037748873233795166,
 0.007010505069047213,
 0.005913154222071171,
 0.0022009725216776133,
 0.03679574653506279,
 -0.014472492039203644,
 0.022636784240603447,
 0.021884314715862274,
 0.01659194752573967,
 -0.0027951097581535578,
 0.0342875

In [None]:
# similarity_search_by_vector returns plain Documents (no scores), so keep them under
# their own name instead of overwriting the (Document, score) pairs in `docs_and_score`.
docs_by_vector = db.similarity_search_by_vector(embedding_vector)
docs_by_vector[2]

Document(id='43fe5acf-3269-42ea-9dcc-5e5c9384f322', metadata={'source': '/content/speech.txt'}, page_content='In summary, this paper introduced Word2Vec, a set of highly efficient algorithms (CBOW and Skip-gram) for learning high-quality word embeddings. Its effectiveness stems from simplified model architectures and clever optimization techniques like hierarchical softmax and negative sampling, which enabled training on unprecedentedly large text corpora.')

In [None]:
#@title save vector db in local - pickle file

# Persist the vector store under the local path "faiss_index" so it can be reloaded later.
db.save_local("faiss_index")

In [None]:
# Reload the saved store from "faiss_index", passing the same embeddings object.
# SECURITY: allow_dangerous_deserialization=True opts in to unpickling the saved data,
# which can execute arbitrary code. Only load index files that you created yourself
# or that come from a source you fully trust.
new_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)

In [None]:
# Run the same query against the reloaded store and show the text of the second hit,
# which can be compared with the second hit returned by the original store.
docs = new_db.similarity_search(query)
docs[1].page_content

'Performance: The paper found that Skip-gram generally works better for capturing semantic relationships, especially with large datasets, while CBOW can be faster to train.'