<a href="https://colab.research.google.com/github/premkrishn/bert-hands-on-nlp/blob/main/web_pages_similar_content_on_url.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [47]:
import requests
from bs4 import BeautifulSoup
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to extract text from HTML
def extract_text_from_html(html):
    soup = BeautifulSoup(html, 'html.parser')
    text = ' '.join([elem.get_text().strip() for elem in soup.find_all(text=True)])
    return text

# Function to tokenize and embed text using BERT
def embed_text(text):
    tokenized_input = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    with torch.no_grad():
        outputs = model(**tokenized_input)
        embeddings = outputs.last_hidden_state[:, 0, :]  # CLS token embeddings
    return embeddings

# Function to calculate cosine similarity between two sets of embeddings
def calculate_similarity(embeddings1, embeddings2):
    return cosine_similarity(embeddings1, embeddings2)

# Function to compare two web pages and find similar sections
def compare_web_pages(url1, url2):
    # Download HTML content of both pages
    html1 = requests.get(url1).text
    html2 = requests.get(url2).text

    # Extract text from HTML
    text1 = extract_text_from_html(html1)
    text2 = extract_text_from_html(html2)

    # Remove excess white spaces
    text1 = ' '.join(text1.split())
    text2 = ' '.join(text2.split())

    # Split text into smaller chunks (max 50 words per chunk)
    chunks1 = [text1[i:i+250] for i in range(0, len(text1), 250)]
    chunks2 = [text2[i:i+250] for i in range(0, len(text2), 250)]

    # Embed text chunks using BERT
    embeddings1 = embed_text(' '.join(chunks1))
    embeddings2 = embed_text(' '.join(chunks2))

    # Calculate similarity between embeddings
    similarity_matrix = calculate_similarity(embeddings1, embeddings2)

    # Find similar sections
    similar_sections = []
    for i in range(similarity_matrix.shape[0]):
        for j in range(similarity_matrix.shape[1]):
            if similarity_matrix[i][j] > 0.5:  # Threshold for similarity
                similar_sections.append((i, j, similarity_matrix[i][j]))

    return similar_sections, chunks1, chunks2

# Function to print similar sections with content
def print_similar_sections(similar_sections, chunks1, chunks2):
    for section in similar_sections:
        section_content1 = chunks1[section[0]]
        section_content2 = chunks2[section[1]]

        print(f"Similarity: {section[2]}")
        print("Section from page 1:")
        print("-" * 30)
        print(section_content1)
        print("-" * 30)
        print("Section from page 2:")
        print("-" * 30)
        print(section_content2)
        print("-" * 30)
        print()

# Example usage
url1 = "https://am.jpmorgan.com/us/en/asset-management/adv/products/fund-explorer/mutual-fund"
url2 = "https://am.jpmorgan.com/tools/dt-ic/"
similar_sections, chunks1, chunks2 = compare_web_pages(url1, url2)

# Print similar sections
print_similar_sections(similar_sections, chunks1, chunks2)


  text = ' '.join([elem.get_text().strip() for elem in soup.find_all(text=True)])


Similarity: 0.6563221216201782
Section from page 1:
------------------------------
Fund Explorer | J.P. Morgan Asset Management Skip to main content Financial Professional Login Welcome Log in for exclusive access and a personalized experience Log in Sign up Benefits of creating a free account Customize our Guide to the Markets and
------------------------------
Section from page 2:
------------------------------
Investment Comparison Skip to main content You need to enable JavaScript to run this app. Loading...
------------------------------



In [46]:
ls

[0m[01;34mdownloads[0m/  [01;34metfs[0m/  FS-LCG-A.pdf  [01;34mmutual_funds[0m/  [01;34mprospectuses[0m/  webpage_text.txt


In [5]:
rm -r *

In [22]:
pip install fitz frontend pymupdf

Collecting pymupdf
  Downloading PyMuPDF-1.24.4-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
Collecting PyMuPDFb==1.24.3 (from pymupdf)
  Downloading PyMuPDFb-1.24.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.8/15.8 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: PyMuPDFb, pymupdf
Successfully installed PyMuPDFb-1.24.3 pymupdf-1.24.4
