# Investopedia Dictionary Downloader

In [2]:
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import os
import re
import unicodedata
from pathlib import Path

## Define the URL and headers for scraping

In [3]:
# Landing page of the Investopedia dictionary; the listing-page links are scraped from it.
url = "https://www.investopedia.com/financial-term-dictionary-4769738"

# Headers passed to every requests.get() call in this notebook.
headers = {
    "User-Agent": "Mozilla/5.0",
}

## Scrape Data

### 1. Scrape Dictionary Term Links 
We first scrape all `<a>` tags with class `dictionary-top24-list__complete-listing-link` in the `div` with ID `dictionary-top24-list_1-0` and save the `href` attributes in `main_links`.

In [4]:
# Fetch the dictionary landing page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() prevents an HTTP error page from being parsed
# as if it were the real listing (which would silently yield zero links).
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Collect the href of every "complete listing" anchor inside the top-24 container.
# Anchors without an href are skipped instead of raising KeyError.
main_links = [
    a_tag['href']
    for a_tag in soup.select("div#dictionary-top24-list_1-0 a.dictionary-top24-list__complete-listing-link")
    if a_tag.has_attr('href')
]
print(f'Found {len(main_links)} main links')

Found 27 main links


### 2. Scrape Links for Words From Each Term Page
Define `scrape_dictionary_links()` to handle the parsing of each secondary page, collecting all relevant links within `<div id="dictionary-top300-list_1-0">`.

In [5]:
# scrape links from each dictionary term page
def scrape_dictionary_links(url, timeout=30):
    """Collect the term links found on one listing page.

    Every ``<a>`` inside ``div#dictionary-top300-list_1-0`` is recorded in the
    module-level ``dictionary_links`` dict as ``{link text: href}``. A later link
    with the same text overwrites the earlier entry.

    Args:
        url: Address of the listing page to download.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot block a worker thread indefinitely.

    Returns:
        None. Results are written to ``dictionary_links``; any failure is
        printed and swallowed so one bad page does not stop the whole run.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of parsing the error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract links from the specified div in each page
        for link in soup.select('div#dictionary-top300-list_1-0 a'):
            href = link.get('href')
            if not href:
                # An anchor without a target must not abort the rest of the page.
                continue
            text = link.get_text(strip=True)
            dictionary_links[text] = href
    except Exception as e:
        print(f"Failed to scrape {url}: {e}")

# Shared result dict filled by scrape_dictionary_links(); reset on every run of this cell.
dictionary_links = {}

# Submit one task per listing page and let up to 8 threads work through them.
with ThreadPoolExecutor(max_workers=8) as executor:
    futures = [
        executor.submit(scrape_dictionary_links, page_url)
        for page_url in main_links
    ]
    # Draining as_completed() only drives the progress bar; the results are not used.
    progress = tqdm(as_completed(futures), total=len(futures), desc="Scraping initial links")
    for future in progress:
        continue

Scraping initial links: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 27/27 [00:01<00:00, 13.68it/s]


### 3. Scrape Article Content From Each Term's Page
Define `scrape_article_content()` to fetch the actual article text in each term's page, saving the content to a .txt file.

In [6]:
# Helper function to clean text
def clean_text(text):
    """Reduce scraped text to tidy, printable ASCII.

    Steps, in order:
      1. Map Unicode hyphens/dashes (U+2010..U+2015) to ``-``. This must happen
         before the ASCII conversion, which would otherwise delete them.
      2. NFKD-normalize and drop every character with no ASCII equivalent.
      3. Turn each non-newline whitespace character (tab, CR, ...) into a space.
      4. Remove remaining non-printable characters.
      5. Collapse runs of spaces to one space, and collapse any run of
         newlines (with surrounding spaces) to a single newline.
      6. Strip leading and trailing whitespace.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string, containing only characters 0x20-0x7E and ``\\n``.
    """
    # Replace any kind of hyphen, dash, or special dash characters with a standard dash
    text = re.sub(r'[\u2010-\u2015]', '-', text)
    # Normalize Unicode to ASCII
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
    # Convert horizontal whitespace to plain spaces so the next step does not glue words together
    text = re.sub(r'[^\S\n]', ' ', text)
    # Remove any non-printable characters
    text = re.sub(r'[^\x20-\x7E\n]', '', text)
    # Replace multiple spaces/newlines with single spaces/newlines
    text = re.sub(r' {2,}', ' ', text)
    text = re.sub(r' ?\n[ \n]*', '\n', text)
    # Trim leading/trailing whitespace
    return text.strip()

# Helper function to turn a term name into a string usable as a file name
def clean_filename(name):
    """Return ``name`` with unsafe characters replaced by underscores.

    Word characters, whitespace and ``-`` are kept; every other character
    becomes ``_``. Leading and trailing whitespace is removed afterwards.
    """
    sanitized = re.sub(r'[^\w\s-]', '_', name)
    return sanitized.strip()

# Function to scrape article content from each term's page
def scrape_article_content(name, url, timeout=30):
    """Download one term page and save its article text as a ``.txt`` file.

    The text of the first ``<div class="article-content">`` is passed through
    ``clean_text`` and written to
    ``./investopedia-dictionary/<clean_filename(name)>.txt``. Pages without
    such a div produce no file.

    Args:
        name: Term name; used (after ``clean_filename``) as the file name.
        url: Address of the term's article page.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot block a worker thread indefinitely.

    Returns:
        None. Any failure is printed and swallowed so one bad page does not
        stop the whole run.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of parsing the error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the article content div
        article_div = soup.find('div', class_='article-content')
        if article_div:
            # Extract and clean text content.
            # NOTE(review): get_text(strip=True) joins text fragments with no separator,
            # so adjacent elements run together; the later cleaning step works around that.
            article_text = article_div.get_text(strip=True)
            cleaned_text = clean_text(article_text)

            # Save cleaned text to file (clean_text output is ASCII-only by construction)
            filename = f"./investopedia-dictionary/{clean_filename(name)}.txt"
            with open(filename, 'w', encoding='ascii') as file:
                file.write(cleaned_text)
    except Exception as e:
        print(f"Failed to scrape article {name}: {e}")

# Make sure the output folder exists before any worker tries to write into it.
output_dir = Path("./investopedia-dictionary")
output_dir.mkdir(parents=True, exist_ok=True)

# One task per (term, link) pair, spread over up to 8 threads.
with ThreadPoolExecutor(max_workers=8) as executor:
    futures = [
        executor.submit(scrape_article_content, term, link)
        for term, link in dictionary_links.items()
    ]
    # Draining as_completed() only drives the progress bar; the results are not used.
    progress = tqdm(as_completed(futures), total=len(futures), desc="Scraping article content")
    for future in progress:
        continue

Scraping article content: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 6317/6317 [08:11<00:00, 12.85it/s]


## Data Cleaning
1. If the content starts with a leftover "Close" or "Definition" label, removes that label and any whitespace that follows it.
2. Adds a space after the first `?` when it is not already followed by one.

In [3]:
# Folder whose .txt files are read, cleaned and rewritten in place by the loop below.
DIRECTORY = "./investopedia-dictionary"

# Function to clean the content
def clean_file_content(content):
    """Strip a leftover leading label and fix spacing after the first ``?``.

    * If the text starts with ``Close`` or ``Definition``, that label and any
      whitespace after it are removed (at most one label is removed). The
      label is left alone when it is immediately followed by a lowercase
      letter, so real words such as "Closed-End" or "Definitions" are not
      truncated.
    * If the first ``?`` in the text is directly followed by a non-whitespace
      character, a single space is inserted after it. Only the first ``?`` is
      ever considered, so later ones (e.g. inside URLs) are untouched and
      running the function again on its own output changes nothing more.

    Args:
        content: Full text of one scraped file.

    Returns:
        The cleaned text.
    """
    # Remove a "Close"/"Definition" label at the beginning of the file
    label = re.match(r'(?:Close|Definition)(?![a-z])', content)
    if label:
        content = content[label.end():].lstrip()

    # Ensure there's a space after the first question mark if missing
    first_q = content.find('?')
    has_follower = first_q != -1 and first_q + 1 < len(content)
    if has_follower and not content[first_q + 1].isspace():
        content = content[:first_q + 1] + ' ' + content[first_q + 1:]

    return content

# Every entry of DIRECTORY whose name ends in ".txt" is a cleaning candidate.
text_files = [entry for entry in os.listdir(DIRECTORY) if entry.endswith(".txt")]

# Rewrite each file in place with its cleaned content, showing progress as we go.
for filename in tqdm(text_files, desc="Cleaning files", unit="file"):
    target = Path(DIRECTORY) / filename

    # The cleaned text is computed in full before the file is reopened for writing.
    original_text = target.read_text(encoding='ascii')
    target.write_text(clean_file_content(original_text), encoding='ascii')

Cleaning files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 6286/6286 [00:00<00:00, 16244.31file/s]
