In [47]:
from bs4 import BeautifulSoup
from bs4.element import Tag
import spacy
import markdown

# Load spaCy's small English pipeline ('en_core_web_sm') into a module-level
# `nlp` object so that later cells can share it.
# NOTE(review): none of the cells visible in this section reference `nlp`;
# confirm it is used elsewhere before keeping this load.
nlp = spacy.load('en_core_web_sm')

In [52]:
def _render_tag(tag):
    """Return the Markdown fragment for a single tag, or '' if it yields none."""
    name = tag.name

    # Headers: the digit in the tag name is the number of leading '#'.
    if name in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
        return f"\n{'#' * int(name[1:])} {tag.text}\n"

    # Paragraphs
    if name == 'p':
        return f"\n{tag.text}\n"

    # Preformatted text (code). Only the <pre> itself is rendered: its text
    # already includes any nested <code>, so rendering that <code> as well
    # would emit the same fenced block twice.
    if name == 'pre':
        return f"\n```\n{tag.text}\n```\n"

    # Images
    if name == 'img':
        return f"\n![{tag.get('alt', '')}]({tag.get('src', '')})\n"

    # Math content rendered by MathJax
    if name == 'span':
        classes = tag.get('class', [])
        if 'MathJax' not in classes:
            return ""
        math_content = tag.get('data-mathml')
        if math_content is None:
            # No source markup on this span; skip it rather than writing the
            # literal text "None" into the document.
            return ""
        if 'MathJax_FullWidth' in classes:
            # Block equation
            return f"\n$$\n{math_content}\n$$\n"
        # Inline equation
        return f"${math_content}$"

    return ""


def html_to_markdown(html_content):
    """Convert the main content of an HTML document to Markdown text.

    The "main content" is the <div> whose text has the most
    whitespace-separated tokens (the first such div on a tie). Text of nested
    elements counts toward every enclosing div. When the document contains no
    <div> at all, the whole document is used instead of raising an error.

    Within that container, headers, paragraphs, <pre> blocks, images and
    MathJax spans are rendered in document order. Processing stops at the
    first <footer> tag encountered.

    Args:
        html_content: HTML markup, passed as-is to BeautifulSoup.

    Returns:
        The generated Markdown as a single string (possibly empty).
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Pick the div with the most tokens; fall back to the whole document when
    # there are no divs so that max() never sees an empty sequence.
    main_content = max(
        soup.find_all('div'),
        key=lambda div: len(div.text.split()),
        default=soup,
    )

    # Only tags that are rendered (plus 'footer', which ends processing) are
    # collected; find_all returns them in document order, nested tags included.
    relevant_tags = main_content.find_all(
        ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'pre', 'img', 'footer', 'span']
    )

    fragments = []
    for tag in relevant_tags:
        # Stop processing if we encounter a footer tag
        if tag.name == 'footer':
            break
        fragments.append(_render_tag(tag))

    return "".join(fragments)

In [53]:
# Read the HTML content from a file. The encoding is stated explicitly so the
# result does not depend on the platform's default locale encoding.
with open('test.html', 'r', encoding='utf-8') as file:
    html_content = file.read()

# Convert the HTML content to Markdown
markdown_content = html_to_markdown(html_content)

# Write the Markdown content to a file, using the same explicit encoding so
# non-ASCII characters are written the same way on every platform.
with open('output.md', 'w', encoding='utf-8') as file:
    file.write(markdown_content)

