## 1. Install Required Libraries

In [None]:
!pip install \
langchain_community \
unstructured \
langchain-text-splitters

Collecting langchain_community
  Downloading langchain_community-0.2.14-py3-none-any.whl.metadata (2.7 kB)
Collecting unstructured
  Downloading unstructured-0.15.8-py3-none-any.whl.metadata (29 kB)
Collecting langchain-text-splitters
  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl.metadata (2.1 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting langchain<0.3.0,>=0.2.15 (from langchain_community)
  Downloading langchain-0.2.15-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core==0.2.36 (from langchain_community)
  Downloading langchain_core-0.2.36-py3-none-any.whl.metadata (6.2 kB)
Collecting langsmith<0.2.0,>=0.1.0 (from langchain_community)
  Downloading langsmith-0.1.107-py3-none-any.whl.metadata (13 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain_community)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting jsonpatch<2.0,

In [None]:
!pip install faiss-cpu sentence-transformers pypdf


Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Downloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu, sentence-transformers
Successfully installed faiss-cpu-1.8.0.post1 sentence-transformers-3.0.1


## 2. Import Required Libraries

In [None]:
from sentence_transformers import SentenceTransformer
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import faiss
import numpy as np


  from tqdm.autonotebook import tqdm, trange


## 3. Load and Split Your PDF Document

In [None]:
# Read the PDF into LangChain Document objects.
# load() is used instead of load_and_split(): load_and_split() already chunks
# its result with a default RecursiveCharacterTextSplitter, so splitting again
# below with the same defaults repeated that work.
loader = PyPDFLoader("/content/Nepal.pdf")
docs = loader.load()

# Split documents into smaller chunks (splitter defaults are used as-is)
text_splitter = RecursiveCharacterTextSplitter()
split_docs = text_splitter.split_documents(docs)


## 4. Generate Embeddings Using Sentence-Transformers

In [None]:
# Sentence-embedding model; the same instance is reused later to embed the query
model = SentenceTransformer('all-MiniLM-L6-v2')

# Collect the text of every chunk, then embed them all in a single encode() call
chunk_texts = [chunk.page_content for chunk in split_docs]
doc_embeddings = model.encode(chunk_texts)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## 5. Set Up FAISS for Vector Search

In [13]:
# Build an inner-product index. On unit-length vectors the inner product
# equals cosine similarity, which is why the vectors are normalized below.
embedding_dimension = doc_embeddings.shape[1]
index = faiss.IndexFlatIP(embedding_dimension)

# Normalize a float32, C-contiguous copy rather than `doc_embeddings` itself,
# so this cell does not mutate a variable produced by an earlier cell.
doc_vectors = np.array(doc_embeddings, dtype=np.float32, order="C")
faiss.normalize_L2(doc_vectors)

# Add the normalized vectors to the FAISS index
index.add(doc_vectors)


## 6. Perform Similarity Search Using FAISS

In [27]:
# Define your query and generate its embedding
query = "waht is world highest mountain?"
query_embedding = model.encode([query])
faiss.normalize_L2(query_embedding)  # Unit length, to match the normalized vectors in the index

# Perform similarity search
k = 2  # Number of similar documents to retrieve
# With an inner-product index the first return value holds similarity scores
# (higher is closer); the name `distances` is kept for compatibility.
distances, indices = index.search(query_embedding, k)

# Retrieve the matched documents.
# FAISS pads the result with -1 when the index holds fewer than k vectors;
# skip those so that -1 is not used as a Python index (it would silently pick
# the last chunk).
similar_docs = [split_docs[idx] for idx in indices[0] if idx != -1]

# Function to dynamically generate a response based on the most relevant sentence
def generate_best_sentence_response(docs, query, threshold=0.5):
    """Return a reply built from the sentence in `docs` that best matches `query`.

    The page contents of `docs` are joined, split into sentences on '. ', and
    each sentence is embedded with the notebook-level `model`. The sentence
    with the highest cosine similarity to the query is returned inside a fixed
    reply template.

    Args:
        docs: Iterable of objects exposing a `page_content` string.
        query: The user's question. It is embedded inside this function, so
            the result does not depend on a notebook-level `query_embedding`.
        threshold: Minimum cosine similarity the best sentence must reach;
            below it a fixed "not found" message is returned. Defaults to 0.5.

    Returns:
        The reply string: either the best-matching sentence (stripped) behind
        a fixed prefix, or the "not found" message.
    """
    not_found = "I'm sorry, I couldn't find any specific information related to your query in the provided documents."

    combined_content = " ".join(doc.page_content for doc in docs)
    # Naive sentence split; blank fragments are dropped so they are never
    # embedded or selected.
    sentences = [part.strip() for part in combined_content.split('. ') if part.strip()]
    if not sentences:
        return not_found

    # Embed the query and the sentences with the same model, then scale both
    # to unit length so the dot product below is the cosine similarity.
    query_vector = np.ascontiguousarray(model.encode([query]), dtype=np.float32)
    sentence_vectors = np.ascontiguousarray(model.encode(sentences), dtype=np.float32)
    faiss.normalize_L2(query_vector)
    faiss.normalize_L2(sentence_vectors)

    # One similarity score per sentence, flattened to 1-D so the threshold
    # comparison operates on a scalar.
    similarities = (sentence_vectors @ query_vector.T).ravel()
    most_relevant_idx = int(np.argmax(similarities))

    if similarities[most_relevant_idx] < threshold:
        return not_found
    return f"Here is the most relevant information regarding your query:\n\n{sentences[most_relevant_idx]}"

# Build the answer from the retrieved chunks and display it
chatbot_response = generate_best_sentence_response(docs=similar_docs, query=query)
print(chatbot_response)

Here is the most relevant information regarding your query:

The allure of the Himalayas, including Mount Everest, the world's highest 
peak, and the Annapurna Circuit, one of the most challenging trekking routes, is a major draw for 
adventure seekers
