In [None]:
import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent header; fetch_investopedia_article passes this dict to requests.get.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    )
}

def fetch_investopedia_article(url, timeout=10):
    """Download a page and return the text of its main article content.

    Three lookups are tried in order, and the first one that matches wins:

    1. ``<div class="article-body">``
    2. ``<div id="mntl-sc-page_1-0">``
    3. every ``<div class="section-content">``, joined with newlines

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so a single
            stalled connection cannot hang the whole run.

    Returns:
        The extracted text, or ``""`` when the request fails or none of the
        lookups match (a message is printed in both cases).
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return ""

    soup = BeautifulSoup(response.text, "html.parser")

    # Method 1: Find article body div
    main_content = soup.find("div", {"class": "article-body"})

    # Method 2: Fall back to the page container id
    if not main_content:
        main_content = soup.find("div", {"id": "mntl-sc-page_1-0"})

    if main_content:
        return main_content.get_text(separator=" ", strip=True)

    # Method 3: Find all content sections. The text is extracted per section here;
    # the joined result is a plain string, so it must not be passed to .get_text().
    sections = soup.find_all("div", class_="section-content")
    if sections:
        return "\n".join(
            section.get_text(separator=" ", strip=True) for section in sections
        )

    print(f"Could not find main content in {url}")
    return ""


# Investopedia term pages to download.
investopedia_urls = [
    "https://www.investopedia.com/terms/p/price-earningsratio.asp",
    "https://www.investopedia.com/terms/t/trailingpe.asp",
    "https://www.investopedia.com/terms/f/forwardpe.asp",
    "https://www.investopedia.com/terms/p/pegratio.asp",
    "https://www.investopedia.com/terms/p/price-to-bookratio.asp",
    "https://www.investopedia.com/terms/e/eps.asp",
    "https://www.investopedia.com/terms/b/bookvalue.asp",
    "https://www.investopedia.com/terms/f/freecashflow.asp",
    "https://www.investopedia.com/terms/e/ebitda.asp",
    "https://www.investopedia.com/terms/e/enterprisevalue.asp",
    "https://www.investopedia.com/terms/m/marketcapitalization.asp",
    "https://www.investopedia.com/terms/r/returnonequity.asp",
    "https://www.investopedia.com/terms/r/returnoninvestmentcapital.asp",
    "https://www.investopedia.com/terms/w/wacc.asp",
    "https://www.investopedia.com/terms/r/returnonassets.asp",
    "https://www.investopedia.com/terms/e/ebitda-margin.asp",
    "https://www.investopedia.com/terms/o/operatingmargin.asp",
    "https://www.investopedia.com/terms/n/net_margin.asp",
    "https://www.investopedia.com/terms/g/grossmargin.asp"
]

# Map each URL to its extracted text. A failed fetch is still recorded (as "")
# so every requested URL has an entry.
investopedia_articles = {}
for url in investopedia_urls:
    article_text = fetch_investopedia_article(url)
    investopedia_articles[url] = article_text
    # fetch_investopedia_article returns "" on failure; do not report that as a success.
    if article_text:
        print(f"Fetched Investopedia article: {url}")
    else:
        print(f"No content retrieved for Investopedia article: {url}")


In [None]:
!pip install wikipedia

In [None]:
import wikipedia

# Wikipedia page titles to download.
pages = ["Stock market", "Bond (finance)", "Mutual fund", "Portfolio (finance)",
         "Exchange-traded fund", "Financial statement", "Investing", "Retirement planning"]

# Map each page title to its plain-text content; pages that fail to load are skipped.
wiki_articles = {}
for page in pages:
    # Only the lookup sits inside the try, so a mistake in the bookkeeping below
    # cannot be swallowed and reported as a fetch error.
    try:
        page_content = wikipedia.page(page).content
    except Exception as e:
        print(f"Error fetching {page}:", e)
    else:
        wiki_articles[page] = page_content

In [None]:
import json

# Bundle both scraped collections under their source names.
all_articles = dict(wikipedia=wiki_articles, investopedia=investopedia_articles)

# Persist the raw text; ensure_ascii=False writes non-ASCII characters as-is.
with open("finance_articles.json", mode="w", encoding="utf-8") as f:
    f.write(json.dumps(all_articles, ensure_ascii=False, indent=2))

print("finance_articles.json")

In [None]:
import json
import re

def clean_finance_article(text: str, source: str) -> str:
    """Clean article text with common rules plus rules tailored to its source.

    Args:
        text: Raw article text. Empty or ``None`` input yields ``""``.
        source: ``'wikipedia'`` or ``'investopedia'``. Any other value applies
            only the common rules.

    Returns:
        The text with whitespace collapsed to single spaces, citation markers
        and known boilerplate removed, and surrounding whitespace stripped.
    """
    if not text:
        return ""

    # Common cleaning for all sources
    text = re.sub(r'\s+', ' ', text)  # Replace multiple whitespace
    text = re.sub(r'\[\d+\]', '', text)  # Remove citation numbers

    # Source-specific cleaning
    if source == 'wikipedia':
        # Remove edit section links and templates
        text = re.sub(r'\[edit\]', '', text)
        text = re.sub(r'\{\{.*?\}\}', '', text, flags=re.DOTALL)  # Remove templates

        # Remove table of contents section
        text = re.sub(r'== Contents ==.*?==', '==', text, flags=re.DOTALL)

        # Cut the article at the first non-content section heading
        sections_to_remove = [
            '== See also ==', '== References ==', '== External links ==',
            '== Further reading ==', '== Notes ==', '== Bibliography =='
        ]
        for section in sections_to_remove:
            text = text.split(section)[0]

    elif source == 'investopedia':
        # Remove disclaimer and ad-related text
        text = re.sub(r'(Read our|View) editorial (policies|standards).*?\.', '', text)
        # Remove dates. The gap is capped at 40 characters: the text is a single
        # line at this point, so an unbounded gap could delete everything between
        # an "As of" and the next year mentioned anywhere later in the article.
        text = re.sub(r'(As of|Updated).{0,40}?20\d{2}', '', text)
        text = re.sub(r'Disclosure:.*?\.', '', text)

        # Remove author/contributor information
        text = re.sub(r'By [A-Z][a-z]+ [A-Z][a-z]+', '', text)
        text = re.sub(r'Reviewed by .*?\.', '', text)

        # Remove social media prompts
        text = re.sub(r'Follow (us|Investopedia) on.*?\.', '', text)

    # Common pattern removal.
    # All-caps words are deliberately NOT stripped: doing so deletes the finance
    # acronyms (EPS, ROE, WACC, the "E" of "P/E Ratio") the articles are about.
    patterns_to_remove = [
        r'This article (was|is) .*?\.',  # Article metadata
        r'Please (read|review) our.*?\.',  # Policy links
        r'Terms of Use apply',
        r'Partner Links.*?\.',
        r'Advertisement( - Continue Reading Below)?',
        r'Cookie (Policy|Settings)',
        r'var\s+\w+\s+=.*?;',  # JavaScript variables
        # Remaining HTML tags. A tag name must follow "<" so that comparisons
        # such as "P/E < 15 and PEG > 1" are left intact.
        r'<[/!]?[A-Za-z][^<>]*>',
    ]

    for pattern in patterns_to_remove:
        text = re.sub(pattern, '', text)

    # Final cleanup
    text = text.strip()
    text = re.sub(r'\s+([.,!?])', r'\1', text)  # Fix punctuation spacing
    text = re.sub(r'\s+', ' ', text)  # Final whitespace cleanup

    return text

# Read back the raw scrape from disk.
with open("finance_articles.json", mode="r", encoding="utf-8") as f:
    raw_data = json.loads(f.read())

# Run every article through the cleaner, grouped by source.
cleaned_data = {"wikipedia": {}, "investopedia": {}}
for source, articles in raw_data.items():
    for url, content in articles.items():
        cleaned_data[source][url] = clean_finance_article(content, source)

# Write the cleaned corpus to its own file.
with open("cleaned_finance_articles.json", mode="w", encoding="utf-8") as f:
    f.write(json.dumps(cleaned_data, ensure_ascii=False, indent=2))

print("Data cleaning complete. Saved as cleaned_finance_articles.json")