<a href="https://colab.research.google.com/github/prabhakaran-s-code/genai-python/blob/main/Semantic_Search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Abstract
This notebook performs semantic search over text extracted from a set of web pages and returns the most relevant URLs. The search uses two BERT-based models: a bi-encoder first retrieves candidate pages by embedding similarity, and a cross-encoder then re-ranks those candidates to improve the quality of the results.

In [None]:
!pip install sentence_transformers

In [None]:
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer, CrossEncoder, util
from sklearn.metrics.pairwise import cosine_similarity

  from tqdm.autonotebook import tqdm, trange


In [None]:
# Extract all the urls from sitemap.xml for a website

import requests
from bs4 import BeautifulSoup

def extract_urls_from_sitemap(sitemap_url, timeout=30):
    """Return the URLs listed in the ``<loc>`` elements of a sitemap.

    Args:
        sitemap_url: Address of the sitemap XML document.
        timeout: Seconds to wait for the server; without a timeout
            ``requests`` can block indefinitely on an unresponsive host.

    Returns:
        list[str]: The text of every ``<loc>`` element in document order,
        with surrounding whitespace removed and empty entries skipped.

    Raises:
        requests.exceptions.RequestException: If the request fails, times out,
            or the server answers with an HTTP error status.
    """
    response = requests.get(sitemap_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page into an empty list.
    response.raise_for_status()

    # Hand over the raw bytes so the parser applies the encoding declared by the
    # document itself rather than the one requests guessed from the headers.
    soup = BeautifulSoup(response.content, 'xml')

    # Pretty-printed sitemaps often pad <loc> text with newlines and spaces.
    stripped = (link.text.strip() for link in soup.find_all('loc'))
    return [url for url in stripped if url]

# `urls` assigned here is read by later cells (document extraction and search),
# so this cell must run before them.
if __name__ == '__main__':
    # NOTE(review): 'www.abc.com' looks like a placeholder domain -- confirm or
    # replace with the site that should be indexed.
    sitemap_url = 'https://www.abc.com/sitemap.xml'
    urls = extract_urls_from_sitemap(sitemap_url)

    # Print every discovered URL, one per line.
    for url in urls:
        print(url)


In [None]:
# Function to extract content from URL
def extract_content(url, timeout=30):
    """Download a web page and return its searchable text as one string.

    The result is the space-joined concatenation of, in this order: the
    ``content`` of ``<meta>`` tags named title/description/path/tags, the text
    of every ``<h1>``-``<h6>`` heading, and the text of every ``<p>`` paragraph.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server; without a timeout
            ``requests`` can block indefinitely on an unresponsive host.

    Returns:
        str: The extracted text (empty if the page has none of those elements).

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    response = requests.get(url, timeout=timeout)
    # HTTP error statuses are intentionally not raised: the response body is
    # parsed like any other page, so one bad URL does not abort a whole crawl.

    # Parse the raw bytes so BeautifulSoup detects the page encoding itself.
    # `response.text` falls back to ISO-8859-1 when the Content-Type header has
    # no charset, which garbles UTF-8 pages.
    soup = BeautifulSoup(response.content, 'html.parser')

    meta = [
        tag.get('content')
        for tag in soup.find_all('meta', {'name': ['title', 'description', 'path', 'tags']})
        if tag.get('content') is not None
    ]
    headers = [header.get_text() for header in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])]
    paragraphs = [p.get_text() for p in soup.find_all('p')]
    return ' '.join(meta + headers + paragraphs)

In [None]:
# Number of candidate passages the bi-encoder retrieves for the re-ranker
top_k = 32

# Bi-encoder: embeds passages and queries so that retrieval becomes a vector search
bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
bi_encoder.max_seq_length = 256     # Passages longer than 256 tokens are truncated

# Cross-encoder: re-scores the (query, passage) pairs retrieved by the bi-encoder
# to improve the quality of the final ranking
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

# Fetch the text of every page; documents[i] belongs to urls[i]
documents = list(map(extract_content, urls))

# Embed the whole corpus once up front (run time depends on corpus size and hardware)
corpus_embeddings = bi_encoder.encode(documents, convert_to_tensor=True, show_progress_bar=True)

In [None]:
def search(query, num_results=10):
    """Print the most relevant URLs for ``query``, before and after re-ranking.

    Candidates are retrieved with the bi-encoder (at most ``top_k`` of them) and
    each candidate is then scored against the query with the cross-encoder. Two
    rankings are printed: one by bi-encoder score, one by cross-encoder score.

    Reads the notebook-level names ``bi_encoder``, ``cross_encoder``,
    ``corpus_embeddings``, ``documents``, ``urls`` and ``top_k``.

    Args:
        query: The search question as a string.
        num_results: Maximum number of rows printed per ranking.

    Returns:
        None. The rankings are written to standard output.
    """
    print("Input question:", query)

    ##### Semantic Search #####
    # Encode the query using the bi-encoder and find potentially relevant passages
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
    hits = hits[0]  # Get the hits for the first (and only) query

    ##### Re-Ranking #####
    # Score every retrieved passage against the query with the cross-encoder;
    # skip the model call when nothing was retrieved.
    cross_inp = [[query, documents[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp) if cross_inp else []

    # Attach each cross-encoder score to its hit
    for hit, cross_score in zip(hits, cross_scores):
        hit['cross-score'] = cross_score

    # Print both rankings; the header states how many rows actually follow.
    limit = max(num_results, 0)
    rankings = (
        ("Bi-Encoder Retrieval", 'score'),
        ("Cross-Encoder Re-ranker", 'cross-score'),
    )
    for label, score_key in rankings:
        hits = sorted(hits, key=lambda hit: hit[score_key], reverse=True)
        shown = hits[:limit]
        print("\n-------------------------\n")
        print("Top-{} {} hits".format(len(shown), label))
        for hit in shown:
            print("\t{:.3f}\t{}".format(hit[score_key], urls[hit['corpus_id']]))


In [None]:
# NOTE(review): the query string looks like a placeholder -- replace it with a real question.
search(query="# user query #")

The commented-out code below performs semantic search using plain cosine similarity, which was the first approach tried.

In [None]:
# List of URLs for content extraction
#urls = ["https://www.abc.com/", "https://www.abc.com/about-us"]

In [None]:
# This cell and the next one use Sentence Transformer to embed the documents/query and perform semantic search
# Extract content from all URLs
#documents = [extract_content(url) for url in urls]

# Load pre-trained Sentence Transformer model
#model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for documents
#document_embeddings = model.encode(documents)

In [None]:
# Perform a semantic search
#query = 'tell me something about AI WisdomNext'
#query_embedding = model.encode([query])

# Compute cosine similarity between query and document embeddings
#cosine_scores = cosine_similarity(query_embedding, document_embeddings)

#best_match = cosine_scores.argmax()

# Print the document and its similarity score
#for document_number, score in sorted(enumerate(cosine_scores[0]), key=lambda x: x[1], reverse=True):
#    print(document_number, score)
#print(urls[best_match])