<a href="https://colab.research.google.com/github/premkrishn/bert-hands-on-nlp/blob/main/Similarity_on_downloaded.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import glob
from bs4 import BeautifulSoup
import re
import numpy as np
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Load HTML files.
# glob returns matches in arbitrary, OS-dependent order; sorting keeps the
# document order (and therefore the indices used for ranking) reproducible.
file_paths = sorted(glob.glob('SWP/jpmorgan_website_FA/*.html'))

# Initialize BERT tokenizer and model from the same checkpoint.
# Naming the checkpoint once keeps the tokenizer and the model in sync.
MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME)

# Function to preprocess text
def preprocess_text(text):
    """Convert raw HTML into a normalized plain-text string.

    The markup is parsed, ``<footer>`` elements are dropped, the remaining
    text is extracted, punctuation is removed, whitespace is collapsed and
    the result is lowercased.

    Args:
        text: Raw HTML markup as a string.

    Returns:
        Lowercased text without punctuation, with words separated by single
        spaces.
    """
    soup = BeautifulSoup(text, 'html.parser')

    # Remove footer elements so they do not contribute to the page text.
    for footer_element in soup.find_all("footer"):
        footer_element.decompose()

    # Join text nodes with a space: without a separator, words from adjacent
    # elements ("<p>foo</p><p>bar</p>") are fused into "foobar". The trade-off
    # is that a word split by inline markup ("<b>fo</b>o") is also separated.
    text = soup.get_text(separator=' ')

    # Remove punctuation (anything that is neither a word character nor
    # whitespace).
    text = re.sub(r'[^\w\s]', '', text)

    # Collapse whitespace after stripping punctuation: deleting a
    # free-standing mark ("a - b") would otherwise leave a double space.
    text = ' '.join(text.split())

    # Convert to lowercase
    return text.lower()

# Function to get BERT embeddings
def get_bert_embeddings(text):
    """Embed text with BERT by mean-pooling the last hidden states.

    Pooling is weighted by the tokenizer's attention mask, so padding tokens
    (present when several texts of different lengths are passed at once) do
    not dilute the average. For a single string the mask is all ones and the
    result matches a plain mean over tokens.

    Args:
        text: Input accepted by the module-level ``tokenizer`` (a string, or
            a list of strings). Inputs longer than 512 tokens are truncated.

    Returns:
        A NumPy array holding the pooled embedding(s), with singleton
        dimensions squeezed out.
    """
    inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding=True)
    with torch.no_grad():  # inference only; no gradients needed
        outputs = model(**inputs)

    hidden_states = outputs.last_hidden_state
    # Broadcast the mask over the hidden dimension: 1 for real tokens,
    # 0 for padding.
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
    summed = (hidden_states * mask).sum(dim=1)
    # clamp guards against division by zero for a fully masked row.
    token_counts = mask.sum(dim=1).clamp(min=1)
    embeddings = (summed / token_counts).squeeze().numpy()
    return embeddings

# Load, preprocess HTML files, and compute BERT embeddings
if not file_paths:
    # Fail early with an actionable message: an empty list would otherwise
    # become an empty 1-D array that cannot be compared against a query.
    raise FileNotFoundError(
        'No HTML files found to embed; check the glob pattern used to build file_paths.'
    )

embeddings = []
for file_path in file_paths:
    # errors='replace' keeps one badly encoded page from aborting the whole
    # run; undecodable bytes become U+FFFD instead of raising.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        html_content = file.read()
    # The file is closed before the (slow) preprocessing and model inference.
    text = preprocess_text(html_content)
    embeddings.append(get_bert_embeddings(text))

# Convert list of embeddings to numpy array (one entry per file, in the same
# order as file_paths)
embeddings = np.array(embeddings)

# Function to compute similarity
def compute_similarity(query_embedding, document_embeddings):
    """Score one query embedding against every document embedding.

    Args:
        query_embedding: A single embedding vector for the query.
        document_embeddings: A collection of document embedding vectors.

    Returns:
        The first row of the cosine-similarity matrix, i.e. one score per
        document, in the order of ``document_embeddings``.
    """
    # cosine_similarity works on 2-D inputs, so wrap the query as a
    # single-row batch.
    query_batch = [query_embedding]
    score_matrix = cosine_similarity(query_batch, document_embeddings)
    return score_matrix[0]

# Random HTML document (you need to load and preprocess it similar to the others)
# NOTE: `random_html_content` is not assigned anywhere in this section; it
# must hold the raw HTML string of the query page before this code runs.
if 'random_html_content' not in globals():
    raise NameError(
        "random_html_content is not defined: assign the raw HTML of the "
        "query page to it before running the similarity search"
    )
random_html_text = preprocess_text(random_html_content)
random_html_embedding = get_bert_embeddings(random_html_text)

# Compute similarity with all documents
similarities = compute_similarity(random_html_embedding, embeddings)

# Find top similar pages: never ask for more results than there are documents
top_k = min(3, len(similarities))
# argsort is ascending, so reverse it and keep the first top_k indices
top_indices = similarities.argsort()[::-1][:top_k]
top_similar_pages = [file_paths[i] for i in top_indices]

# The header reports the number of pages actually listed
print(f"Top {top_k} similar pages:")
for page in top_similar_pages:
    print(page)
