In [7]:
#pip install requests beautifulsoup4

In [5]:
import pandas as pd

In [2]:
import requests
from bs4 import BeautifulSoup

def extract_glossary_terms(url, timeout=10):
    """
    Download a glossary page and return the terms found in its <dt> tags.

    The request is sent with a browser-like User-Agent header. Only <dt>
    tags located inside the element with id="bodyContent" are considered.
    Each term is reduced to its first line, with anything from the first
    '[' onwards (e.g. reference markers) removed.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; without it a stalled connection
            would block forever.

    Returns:
        A list of cleaned, non-empty term strings in document order.
        Returns an empty list if the request fails (the error is printed)
        or if the page has no element with id="bodyContent".
    """
    # Define a standard User-Agent header to mimic a web browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # 1. Download the webpage content, passing the headers.
        #    Timeouts raise a RequestException subclass, so they are
        #    reported by the handler below like any other fetch error.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
        print("Successfully fetched the page content.")
    except requests.RequestException as e:
        print(f"Error fetching URL: {e}")
        return []

    # 2. Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # 3. Find and extract all glossary terms from the main content area
    body_content = soup.find(id='bodyContent')
    if not body_content:
        return []

    glossary_terms = []
    for dt_tag in body_content.find_all('dt'):
        # Basic cleaning: take the first line and drop reference brackets
        term_clean = dt_tag.get_text().strip().split('\n')[0].split('[')[0].strip()

        # Check the *cleaned* value: text such as "[edit]" is non-empty
        # before cleaning but collapses to "" afterwards.
        if term_clean:
            glossary_terms.append(term_clean)

    return glossary_terms

# The URL of the agricultural glossary
wikipedia_url = "https://en.wikipedia.org/wiki/Glossary_of_agriculture"

# Execute the function and get the list of terms.
# dict.fromkeys drops repeated terms while keeping first-seen order, so the
# "unique" count reported below is actually true.
terms_list = list(dict.fromkeys(extract_glossary_terms(wikipedia_url)))

# Print the results
if terms_list:
    print("-" * 40)
    print(f"✅ Successfully extracted {len(terms_list)} unique agricultural terms.")
    print("-" * 40)
    # Print the first 15 terms as an example
    print("First 15 Extracted Terms:")
    for term in terms_list[:15]:
        print(f"* {term}")
    print("...")
else:
    # Make an empty result visible instead of finishing the cell silently
    print("No glossary terms were extracted.")

Successfully fetched the page content.
----------------------------------------
✅ Successfully extracted 856 unique agricultural terms.
----------------------------------------
First 15 Extracted Terms:
* abattoir
* aboiteau
* acaricide
* acre (ac)
* acreage
* acre-foot
* adjuvant
* aerial seeding
* aeroponics
* agrarian system
* agrarianism
* agribusiness
* agricultural aircraft
* agricultural cooperative
* agricultural cycle
...


In [6]:
# Name of the CSV file the extracted terms are written to
csv_filename = 'agricultural_terms.csv'

# Single-column table ('Term') with one entry of terms_list per row
df = pd.DataFrame(terms_list, columns=['Term'])

# Write the table as UTF-8; index=False leaves out the pandas row numbers
df.to_csv(path_or_buf=csv_filename, encoding='utf-8', index=False)