In [1]:
from langchain_community.llms import Ollama
from langchain_community.document_loaders import WebBaseLoader
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings # import hf embedding
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
import re
import pandas as pd
from datetime import date, timedelta
import datetime
import os
from pinecone import Pinecone
import time
import dotenv
from langchain.embeddings.openai import OpenAIEmbeddings
import uuid
dotenv.load_dotenv()

  from tqdm.autonotebook import tqdm


True

In [2]:
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2")

OAI_embed_model = OpenAIEmbeddings(model="text-embedding-ada-002")

  warn_deprecated(


In [4]:
loader = WebBaseLoader("https://travel.state.gov/content/travel/en/traveladvisories/traveladvisories/israel-west-bank-and-gaza-travel-advisory.html")
data = loader.load()
len(data[0].page_content)

10714

In [5]:
data



In [6]:
text = data[0].page_content\
    .replace('-','')\
    .replace(' – ','')\
    .replace('-','')\
    .replace('\n',' ')\
    .replace('\xa0',' ')\
    .replace('\u202f',' ')\
    .replace('  ',' ')\
    .replace('Log in','')\
    .replace('Do Not Sell My Personal Information','')\
    .replace('Subscribe','')\
    .replace('Follow Us','')\
    .strip().split('Updated with information on travel')[1]\
    .split('× External Link You are')[0]
text



In [7]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=650,
    chunk_overlap=35, 
    length_function = len,
    separators=["\n\n","\n\t\t\n\n*","\n"," "]
)

In [8]:
all_splits = r_splitter.split_text(text)
len(all_splits)

12

In [12]:
import itertools
combined_splits = list(itertools.chain.from_iterable(all_splits))

In [13]:
len(all_splits),len(combined_splits)

(12, 7299)

# FAISS

In [14]:
db = FAISS.from_documents(data, embeddings)
db.save_local(f"israel_dos_db")

In [19]:
# llm = Ollama(model='mistral')
# chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=db.as_retriever(),return_source_documents=True,
#                                               rephrase_question=True)

In [37]:
# query="""is it safe to visit israel?"""
# result = chain({"question": query,"chat_history": []})
# result

# Pinecone

In [21]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY") )

In [23]:
from pinecone import ServerlessSpec

spec = ServerlessSpec(
    cloud="aws", region="us-east-1"
)

In [24]:
index_name = 'wanderchat-israel-rag'
existing_indexes = [
    index_info["name"] for index_info in pc.list_indexes()
]

# check if index already exists (it shouldn't if this is first time)
if index_name not in existing_indexes:
    # if does not exist, create index
    pc.create_index(
        index_name,
        dimension=1536,  # dimensionality of ada 002
        metric='dotproduct',
        spec=spec
    )
    # wait for index to be initialized
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# connect to index
index = pc.Index(index_name)
time.sleep(1)
# view index stats
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [29]:
for i, text in enumerate(all_splits):
    embeds = OAI_embed_model.embed_documents(text)
    metadata = {'text':text,
                'source': 'https://travel.state.gov/content/travel/en/traveladvisories/traveladvisories/israel-west-bank-and-gaza-travel-advisory.html',
                'date': datetime.datetime.now().date()}
    index.upsert(vectors=[(str(i), embeds[0], metadata)])

In [31]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 12}},
 'total_vector_count': 12}

In [32]:
from langchain.vectorstores import Pinecone

text_field = "text"  # the metadata field that contains our text

# initialize the vector store object
vectorstore = Pinecone(
    index, OAI_embed_model.embed_query, text_field
)

  warn_deprecated(


In [33]:
query = "is israel safe to visit?"

vectorstore.similarity_search(query, k=3)

[Document(page_content='and terrorist attacks. Additional travel restrictions may be imposed on U.S. government employees under Chief of Mission security responsibility with little to no notice due to increased security issues or threats. Visit our website for Travel to High Risk Areas.     Travel Advisory Levels       Assistance for U.S. Citizens U.S. Embassy Jerusalem 14 David Flusser StreetJerusalem 93392For Embassy Branch Office Tel Aviv, email TelAvivACS@state.gov. For additional contact information for the Embassy Branch Office, see the Embassies and Consulates section on this page.   Telephone + (972) (2) 6304000  Emergency + (972) (3) 5197551  Fax + (972)', metadata={'date': '2024-05-05', 'source': 'https://travel.state.gov/content/travel/en/traveladvisories/traveladvisories/israel-west-bank-and-gaza-travel-advisory.html'}),
 Document(page_content='are currently restricted from personal travel to the following locations: Within seven miles of the Gaza demarcation line, as well 

In [41]:
query = "what is the best time to visit thailand??"

vectorstore.similarity_search(query, k=3)

[Document(page_content='there. The Israel Defense Forces (IDF) are conducting largescale military operations in Gaza against Hamas, a U.S. governmentdesignated foreign terrorist organization, which was responsible for the October 7 attack on Israel. As a result of the armed conflict, the security environment within Gaza and on its borders is extremely dangerous and volatile. The pedestrian crossing between Gaza and Israel was damaged on October 7 and remains closed, and the pedestrian crossing between Egypt and Gaza may close without advance notice depending on the security situation. There are sporadic telecommunication and internet outages within Gaza further', metadata={'date': '2024-05-05', 'source': 'https://travel.state.gov/content/travel/en/traveladvisories/traveladvisories/israel-west-bank-and-gaza-travel-advisory.html'}),
 Document(page_content='do not cover mental health related illnesses/care.  Enroll in the Smart Traveler Enrollment Program (STEP) to receive Alerts and make

In [34]:
def augment_prompt(query: str):
    # get top 3 results from knowledge base
    results = vectorstore.similarity_search(query, k=3)
    # get the text from the results
    source_knowledge = "\n".join([x.page_content for x in results])
    # feed into an augmented prompt
    augmented_prompt = f"""Using the contexts below, answer the query.

    Contexts:
    {source_knowledge}

    Query: {query}"""
    #print(augmented_prompt)
    return augmented_prompt

In [35]:
p = augment_prompt(query)
p

'Using the contexts below, answer the query.\n\n    Contexts:\n    and terrorist attacks. Additional travel restrictions may be imposed on U.S. government employees under Chief of Mission security responsibility with little to no notice due to increased security issues or threats. Visit our website for Travel to High Risk Areas.     Travel Advisory Levels       Assistance for U.S. Citizens U.S. Embassy Jerusalem 14 David Flusser StreetJerusalem 93392For Embassy Branch Office Tel Aviv, email TelAvivACS@state.gov. For additional contact information for the Embassy Branch Office, see the Embassies and Consulates section on this page.   Telephone + (972) (2) 6304000  Emergency + (972) (3) 5197551  Fax + (972)\nare currently restricted from personal travel to the following locations: Within seven miles of the Gaza demarcation line, as well as the cities of Ashdod and Ashkelon; Within 2.5 miles of the Lebanese and Syrian borders; and Within 1.5 miles of the IsraelEgypt border. Additional tra

In [38]:
# llm.invoke(augment_prompt(query))

In [40]:
# llm.invoke(sample_questions[0])