<a href="https://colab.research.google.com/github/premkrishn/bert-hands-on-nlp/blob/main/web_pages_similar_content_on_url.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [62]:
import requests
from bs4 import BeautifulSoup
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to extract text from HTML
def extract_text_from_html(html):
    soup = BeautifulSoup(html, 'html.parser')
    paragraphs = soup.find_all('p')
    text = ' '.join([p.get_text().strip() for p in paragraphs])
    return text

# Function to tokenize and embed text using BERT
def embed_text(text):
    tokenized_input = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)
    with torch.no_grad():
        outputs = model(**tokenized_input)
        embeddings = outputs.last_hidden_state[:, 0, :]  # CLS token embeddings
    return embeddings

# Function to calculate cosine similarity between two sets of embeddings
def calculate_similarity(embeddings1, embeddings2):
    return cosine_similarity(embeddings1, embeddings2)

import re

def split_into_chunks(text):
    # Define a regular expression pattern to split text into sentences
    sentence_pattern = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s'

    # Split the text into sentences using the pattern
    sentences = re.split(sentence_pattern, text)

    # Group sentences into chunks of approximately 5 sentences each
    chunk_size = 5
    chunks = []
    for i in range(0, len(sentences), chunk_size):
        chunk = ' '.join(sentences[i:i + chunk_size])
        chunks.append(chunk)

    return chunks





# Function to compare two web pages and find similar sections
def compare_web_pages(url1, url2):
    # Download HTML content of both pages
    html1 = requests.get(url1).text
    html2 = requests.get(url2).text

    # Extract text from HTML
    text1 = extract_text_from_html(html1)
    text2 = extract_text_from_html(html2)

    # Remove excess white spaces
    text1 = ' '.join(text1.split())
    text2 = ' '.join(text2.split())

    # Split text into smaller chunks (approximately 5 sentences per chunk)
    chunks1 = split_into_chunks(text1)
    chunks2 = split_into_chunks(text2)

    # Embed text chunks using BERT
    embeddings1 = embed_text(' '.join(chunks1))
    embeddings2 = embed_text(' '.join(chunks2))

    # Calculate similarity between embeddings
    similarity_matrix = calculate_similarity(embeddings1, embeddings2)

    # Find similar sections
    similar_sections = []
    for i in range(similarity_matrix.shape[0]):
        for j in range(similarity_matrix.shape[1]):
            if similarity_matrix[i][j] > 0.9:  # Threshold for similarity
                similar_sections.append((i, j, similarity_matrix[i][j]))

    return similar_sections, chunks1, chunks2

# Function to print similar sections with content
def print_similar_sections(similar_sections, chunks1, chunks2):
    for section in similar_sections:
        section_content1 = chunks1[section[0]]
        section_content2 = chunks2[section[1]]

        print(f"Similarity: {section[2]}")
        print("Section from page 1:")
        print("-" * 30)
        print(section_content1)
        print("-" * 30)
        print("Section from page 2:")
        print("-" * 30)
        print(section_content2)
        print("-" * 30)
        print()

# Function to print number of chunks in each webpage
def print_number_of_chunks(chunks1, chunks2):
    print(f"Number of chunks in webpage 1: {len(chunks1)}")
    print(f"Number of chunks in webpage 2: {len(chunks2)}")
    print()

# Example usage
url1 = "https://am.jpmorgan.com/us/en/asset-management/adv/tools/portfolio-tools/portfolio-analysis/"
url2 = "https://am.jpmorgan.com/us/en/asset-management/adv/investment-strategies/529-college-savings-plan/"
similar_sections, chunks1, chunks2 = compare_web_pages(url1, url2)

# Print number of chunks in each webpage
print_number_of_chunks(chunks1, chunks2)

# Print similar sections
print_similar_sections(similar_sections, chunks1, chunks2)

Number of chunks in webpage 1: 8
Number of chunks in webpage 2: 9

Similarity: 0.9360485076904297
Section from page 1:
------------------------------
Run multiple portfolios at a time through our online analytics to instantly stress test and compare performance under different equity markets, rate environments and future scenarios. Customize your report, select the pages you want to include and add your company logo to make it uniquely yours for your next client meeting. This website is a general communication being provided for informational purposes only. It is educational in nature and not designed to be a recommendation for any specific investment product, strategy, plan feature or other purposes. By receiving this communication you agree with the intended purpose described above.
------------------------------
Section from page 2:
------------------------------
Why 529 plans? Why J.P. Morgan Asset Management? Explore our 529 plan options: College planning tools and resources 1 J.P

In [5]:
rm -r *

In [22]:
pip install fitz frontend pymupdf spacy

Collecting pymupdf
  Downloading PyMuPDF-1.24.4-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
Collecting PyMuPDFb==1.24.3 (from pymupdf)
  Downloading PyMuPDFb-1.24.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.8/15.8 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: PyMuPDFb, pymupdf
Successfully installed PyMuPDFb-1.24.3 pymupdf-1.24.4


In [64]:
import requests
from bs4 import BeautifulSoup

# Function to extract text from HTML, excluding footer content
def extract_text_from_html(html):
    soup = BeautifulSoup(html, 'html.parser')

    # Remove footer elements
    footer_elements = soup.find_all("footer")
    for footer_element in footer_elements:
        footer_element.extract()

    # Extract text from remaining elements
    paragraphs = soup.find_all('p')
    text = ' '.join([p.get_text().strip() for p in paragraphs])
    return text

# URL of the webpage
url = "https://am.jpmorgan.com/gb/en/asset-management/adv/investment-themes/sustainable-investing/climate-change/"

# Make a request to the URL
response = requests.get(url)
html_content = response.text

# Extract text from HTML
text_content = extract_text_from_html(html_content)

# Print the extracted text
print(text_content)


 Climate change  Climate change is the paradigm shift of our century. It brings significant risks to our society, but also creates unparalleled opportunity to invest for the long term and prosper.
Momentum is growing for ambitious carbon transition policies that will create winners and losers across companies, sectors and markets. Climate change investment risks: What investors need to focus on now Climate change remains a dominant theme in sustainable investing, as investors look to take account of climate risk in portfolios and contribute to a more sustainable future. There are three key climate change investment risks that investors can focus on now if they want to support long-term climate change solutions. Find out more  Climate change is one of our six investment stewardship priorities. Find out how we use active ownership to address risks and opportunities through direct engagement with companies on climate change. Striving for a new level of climate transparency and disclosure