<a href="https://colab.research.google.com/github/premkrishn/bert-hands-on-nlp/blob/main/SWP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
from bs4 import BeautifulSoup
import requests
import os
import re
from urllib.parse import urljoin

def download_website(url, folder):
    """Mirror a site into ``folder``, starting the crawl at ``url``.

    Args:
        url: Address of the first page to fetch. It is handed to
            ``download_page``, which also follows internal links.
        folder: Directory that receives the saved pages. It is created,
            including missing parents, when it does not exist yet.
    """
    # exist_ok=True replaces the separate exists() test, which could race
    # with another process creating the same directory in between.
    os.makedirs(folder, exist_ok=True)

    # Shared across the whole crawl so that no URL is fetched twice.
    visited_urls = set()

    # Download the main page and follow internal links.
    download_page(url, folder, visited_urls)

def download_page(url, folder, visited_urls):
    """Save ``url`` and every page reachable from it through internal links.

    A link counts as internal when, once resolved against the page it appears
    on, it starts with that page's own URL. A page is saved only when the
    server answers with HTTP 200. Failures are reported on stdout and the
    crawl carries on with the remaining pages.

    Args:
        url: Absolute URL of the page to start from.
        folder: Existing directory that receives the saved pages.
        visited_urls: Set of URLs already handled; updated in place.
    """
    # Local import: the file-level imports only provide ``urljoin``.
    from urllib.parse import urldefrag, urljoin

    # An explicit stack replaces recursion so that long link chains cannot
    # exhaust the interpreter's recursion limit.
    pending = [url]
    while pending:
        # "#fragment" variants address the same document; fetch it only once.
        current = urldefrag(pending.pop()).url
        if current in visited_urls:
            continue
        visited_urls.add(current)

        try:
            # Without a timeout a single stalled server blocks the crawl.
            response = requests.get(current, timeout=30)
            if response.status_code != 200:
                continue
            save_page(response.content, folder, get_filename(current))
            soup = BeautifulSoup(response.content, 'html.parser')
        except Exception as e:
            # Best-effort crawl: report the failure and move on.
            print(f"Error downloading {current}: {str(e)}")
            continue

        internal_links = []
        for link in soup.find_all('a', href=True):
            href = urldefrag(urljoin(current, link['href'])).url
            if href.startswith(current) and href not in visited_urls:
                internal_links.append(href)
        # Reversed so that pop() visits links in document order, depth first.
        pending.extend(reversed(internal_links))

def save_page(content, folder, filename):
    """Write the bytes in ``content`` to ``folder/filename``.

    The file is opened in binary write mode, so an existing file with the
    same name is overwritten.
    """
    target = os.path.join(folder, filename)
    with open(target, 'wb') as out:
        out.write(content)

def get_filename(url):
    """Turn a URL into a flat, filesystem-safe ``.html`` file name.

    The ``http://`` or ``https://`` prefix is dropped, a URL ending in ``/``
    is named after that directory's ``index`` page, and every character
    other than ASCII letters, digits, ``-``, ``.`` and ``_`` becomes ``_``.

    Args:
        url: The page URL.

    Returns:
        The file name, always ending in ``.html``.
    """
    filename = re.sub(r'^https?://', '', url)
    # This check must run before sanitising: afterwards the trailing '/'
    # has already been rewritten to '_' and can no longer be detected.
    if filename.endswith('/'):
        filename += "index"
    filename = re.sub(r'[^a-zA-Z0-9\-._]', '_', filename)
    return filename + ".html"

if __name__ == "__main__":
    # Crawl entry point and the directory the mirrored pages are written to
    url, folder = (
        "https://am.jpmorgan.com/us/en/asset-management/liq/",
        "jpmorgan_website",
    )

    download_website(url=url, folder=folder)
    print("Website downloaded successfully!")


Website downloaded successfully!


In [8]:
from bs4 import BeautifulSoup
import requests
import os
import re
from urllib.parse import urlparse, unquote

def download_website(url, folder):
    """Crawl a website from ``url`` and store its pages in ``folder``.

    Args:
        url: First page to fetch; passed on to ``download_page`` together
            with a fresh set of visited URLs.
        folder: Output directory, created (with any missing parents) if it
            is not there yet.
    """
    # One call instead of exists()+makedirs(): no window in which another
    # process can create the directory between the check and the creation.
    os.makedirs(folder, exist_ok=True)

    # Tracks every URL handled during this crawl to prevent refetching.
    visited_urls = set()

    # Download the main page and follow internal links.
    download_page(url, folder, visited_urls)

def download_page(url, folder, visited_urls):
    """Save ``url`` and every page reachable from it through internal links.

    Each ``href`` is first resolved against the page it was found on and only
    then passed to ``is_internal_link``, so relative links are followed too.
    A page is saved only when the server answers with HTTP 200. Failures are
    reported on stdout and the crawl continues with the remaining pages.

    Args:
        url: Absolute URL of the page to start from.
        folder: Existing directory that receives the saved pages.
        visited_urls: Set of URLs already handled; updated in place.
    """
    # Local import: the file-level imports of this cell do not include
    # ``urljoin`` or ``urldefrag``.
    from urllib.parse import urldefrag, urljoin

    # An explicit stack replaces recursion so that long link chains cannot
    # exhaust the interpreter's recursion limit.
    pending = [url]
    while pending:
        # "#fragment" variants address the same document; fetch it only once.
        current = urldefrag(pending.pop()).url
        if current in visited_urls:
            continue
        visited_urls.add(current)

        try:
            # Without a timeout a single stalled server blocks the crawl.
            response = requests.get(current, timeout=30)
            if response.status_code != 200:
                continue
            filename = get_filename(current)
            save_page(response.content, folder, filename)
            soup = BeautifulSoup(response.content, 'html.parser')
        except Exception as e:
            # Best-effort crawl: report the failure and move on.
            print(f"Error downloading {current}: {str(e)}")
            continue

        internal_links = []
        for link in soup.find_all('a', href=True):
            # Convert relative URLs to absolute URLs before the internal
            # check; a raw "/a/b" or "page.html" never starts with the
            # page URL and would otherwise always be rejected.
            href = urldefrag(urljoin(current, link['href'])).url
            if is_internal_link(current, href) and href not in visited_urls:
                internal_links.append(href)
        # Reversed so that pop() visits links in document order, depth first.
        pending.extend(reversed(internal_links))

def save_page(content, folder, filename):
    """Store ``content`` (bytes) as the file ``filename`` inside ``folder``.

    Opens the destination in binary write mode, replacing any file already
    present under that name.
    """
    destination = os.path.join(folder, filename)
    with open(destination, mode='wb') as handle:
        handle.write(content)

def get_filename(url):
    """Derive a flat ``.html`` file name from the path and query of ``url``.

    The path is percent-decoded and stripped of leading/trailing slashes.
    A query string, when present, is appended so that URLs differing only in
    their query do not overwrite each other. Slashes and the characters
    ``\\ ? % * : | " < >`` are replaced by ``_``. Names that do not already
    end in ``.html`` get that extension, so every saved page is picked up by
    code that filters the folder on ``.html``.

    Args:
        url: The page URL.

    Returns:
        The file name; ``"index.html"`` when the URL has neither path nor
        query.
    """
    parsed_url = urlparse(url)
    # Unquote special characters in the path, then drop surrounding slashes.
    filename = unquote(parsed_url.path).strip('/')
    if parsed_url.query:
        # '?' is rewritten to '_' by the substitution below.
        filename = f"{filename}?{unquote(parsed_url.query)}"
    # Replace slashes and special characters with underscores.
    filename = re.sub(r'[/\\?%*:|"<>]', '_', filename)
    if not filename:
        return "index.html"
    if not filename.endswith(".html"):
        filename += ".html"
    return filename

def is_internal_link(base_url, link):
    """Report whether ``link`` begins with ``base_url``.

    This is a plain string-prefix comparison; neither argument is parsed or
    normalised.
    """
    shares_prefix = link.startswith(base_url)
    return shares_prefix

if __name__ == "__main__":
    # Start page of the crawl and the directory that receives the pages
    url, folder = (
        "https://am.jpmorgan.com/us/en/asset-management/liq/",
        "jpmorgan_website1",
    )

    download_website(url=url, folder=folder)
    print("Website downloaded successfully!")


Website downloaded successfully!


In [9]:
from bs4 import BeautifulSoup
import os
import re
from urllib.parse import urlparse, unquote
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def load_pages(folder):
    """Read every ``.html`` file in ``folder`` and return the contents.

    Args:
        folder: Directory to scan (not recursive).

    Returns:
        A list of strings, one per ``.html`` file, in ``os.listdir`` order.
    """
    pages = []
    for filename in os.listdir(folder):
        if not filename.endswith(".html"):
            continue
        # errors="replace": the pages are written as raw response bytes
        # elsewhere in this file and are not guaranteed to be valid UTF-8;
        # one undecodable byte should not abort the whole load.
        with open(os.path.join(folder, filename), 'r',
                  encoding="utf-8", errors="replace") as f:
            pages.append(f.read())
    return pages

def preprocess_page(page):
    """Reduce an HTML document to one line of plain text.

    Args:
        page: HTML markup as a string.

    Returns:
        The document's text with tags removed and every run of whitespace
        collapsed to a single space.
    """
    soup = BeautifulSoup(page, 'html.parser')
    # Drop <script>/<style> elements so code and CSS never end up in the text.
    for tag in soup(["script", "style"]):
        tag.decompose()
    # The separator keeps words from adjacent elements apart; without it
    # "<b>Home</b><i>About</i>" would become "HomeAbout".
    text = soup.get_text(separator=' ', strip=True)
    return re.sub(r'\s+', ' ', text)

def encode_pages(pages):
    """Embed ``pages`` with the ``distilbert-base-nli-mean-tokens`` model.

    A new ``SentenceTransformer`` is created on every call; the result of its
    ``encode`` method is returned unchanged.
    """
    encoder = SentenceTransformer("distilbert-base-nli-mean-tokens")
    return encoder.encode(pages)

def calculate_similarity(embeddings):
    """Return the pairwise cosine-similarity matrix of ``embeddings``.

    The diagonal is overwritten with 0 so that an item's similarity to
    itself is not reported as its best match.
    """
    pairwise = cosine_similarity(embeddings)
    # Blank out self-similarity in place.
    np.fill_diagonal(pairwise, 0)
    return pairwise

def find_top_similar_pages(similarities, page_urls, top_k=5):
    """Map every page to the pages most similar to it.

    Args:
        similarities: Square matrix; ``similarities[i][j]`` is the similarity
            between page ``i`` and page ``j``.
        page_urls: Page labels, in the same order as the matrix rows.
        top_k: Maximum number of similar pages to return per page.

    Returns:
        A dict from each label to a list of up to ``top_k`` other labels,
        most similar first. A page is never listed as similar to itself.
    """
    top_similar_pages = {}
    for i, url in enumerate(page_urls):
        ranked = np.argsort(similarities[i])[::-1]
        # Skip the page's own index explicitly. A zeroed diagonal alone does
        # not keep it out when there are fewer than top_k other pages or
        # when the other similarities are negative.
        others = [page_urls[idx] for idx in ranked if idx != i]
        top_similar_pages[url] = others[:top_k]
    return top_similar_pages

if __name__ == "__main__":
    # Folder containing downloaded web pages
    folder = "jpmorgan_website"

    # Load and preprocess pages
    pages = load_pages(folder)
    preprocessed_pages = [preprocess_page(page) for page in pages]

    # Encode pages
    embeddings = encode_pages(preprocessed_pages)

    # Calculate similarity between pages
    similarities = calculate_similarity(embeddings)

    # Label every page with its file name. The names are used as they are:
    # they are already file names, so passing them through a URL-to-filename
    # helper is unnecessary and makes this cell depend on another cell.
    # The listing uses the same directory and the same ".html" filter as
    # load_pages so that labels line up with the loaded pages.
    page_urls = [name for name in os.listdir(folder) if name.endswith(".html")]

    # Find top similar pages for each page
    top_similar_pages = find_top_similar_pages(similarities, page_urls)
    for url, similar_pages in top_similar_pages.items():
        print(f"Top 5 similar pages for: {url}")
        for i, page in enumerate(similar_pages, start=1):
            print(f"{i}. {page}")
        print()


Website downloaded successfully!


In [None]:
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import re
import numpy as np

def load_pages(folder):
    """Return the text of every ``.html`` file found directly in ``folder``.

    Args:
        folder: Directory to scan (not recursive).

    Returns:
        A list of strings, one per ``.html`` file, in ``os.listdir`` order.
    """
    pages = []
    for filename in os.listdir(folder):
        if not filename.endswith(".html"):
            continue
        # errors="replace": the pages are written as raw response bytes
        # elsewhere in this file and may not be valid UTF-8. Undecodable
        # bytes become U+FFFD instead of raising UnicodeDecodeError.
        with open(os.path.join(folder, filename), 'r',
                  encoding="utf-8", errors="replace") as f:
            pages.append(f.read())
    return pages

def preprocess_page(page):
    """Strip the markup from an HTML document and normalise its whitespace.

    Args:
        page: HTML markup as a string.

    Returns:
        The document's text on a single line, with every run of whitespace
        collapsed to one space.
    """
    soup = BeautifulSoup(page, 'html.parser')
    # Remove <script>/<style> elements so that code and CSS cannot leak into
    # the text that is later vectorised.
    for tag in soup(["script", "style"]):
        tag.decompose()
    # Join text fragments with a space; with no separator the text of
    # adjacent elements is concatenated into a single fused token.
    text = soup.get_text(separator=' ', strip=True)
    return re.sub(r'\s+', ' ', text)

def calculate_similarity(pages):
    """Return pairwise cosine similarities between TF-IDF vectors of ``pages``.

    A ``TfidfVectorizer`` with default settings is fitted on ``pages``. The
    diagonal of the resulting matrix is overwritten with 0 so that a page's
    similarity to itself is not reported as its best match.
    """
    # Vectorise the texts
    tfidf_matrix = TfidfVectorizer().fit_transform(pages)

    # Compare every page with every other page
    pairwise = cosine_similarity(tfidf_matrix, tfidf_matrix)
    np.fill_diagonal(pairwise, 0)
    return pairwise

def find_top_similar_pages(similarities, page_urls, top_k=5):
    """Return, for each page, the labels of its most similar other pages.

    Args:
        similarities: Square matrix; row ``i`` holds the similarity of page
            ``i`` to every page.
        page_urls: Page labels, in the same order as the matrix rows.
        top_k: Maximum number of similar pages to return per page.

    Returns:
        A dict from each label to a list of up to ``top_k`` other labels,
        ordered from most to least similar. The page itself is excluded.
    """
    top_similar_pages = {}
    for i, url in enumerate(page_urls):
        ranked = np.argsort(similarities[i])[::-1]
        # Filter out the page's own index before truncating. With a zero
        # diagonal the page would otherwise show up in its own list whenever
        # fewer than top_k other pages exist.
        others = [page_urls[idx] for idx in ranked if idx != i]
        top_similar_pages[url] = others[:top_k]
    return top_similar_pages

if __name__ == "__main__":
    # Folder containing downloaded web pages
    folder = "jpmorgan_website"

    # Load and preprocess pages
    pages = load_pages(folder)
    preprocessed_pages = [preprocess_page(page) for page in pages]

    # Calculate similarity between pages
    similarities = calculate_similarity(preprocessed_pages)

    # Label every page with its file name. The names are used as they are:
    # they are already file names, and the URL-to-filename helper that was
    # applied here is neither defined nor imported in this cell.
    # The listing uses the same directory and the same ".html" filter as
    # load_pages so that labels line up with the loaded pages.
    page_urls = [name for name in os.listdir(folder) if name.endswith(".html")]

    # Find top similar pages for each page
    top_similar_pages = find_top_similar_pages(similarities, page_urls)
    for url, similar_pages in top_similar_pages.items():
        print(f"Top 5 similar pages for: {url}")
        for i, page in enumerate(similar_pages, start=1):
            print(f"{i}. {page}")
        print()
