In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from datetime import datetime, timedelta

In [7]:
# 1. Function to generate ET archive URLs for a given date range

def generate_archive_urls(start_date, end_date):
    """Build one Economic Times archive URL per calendar day in a date range.

    Args:
        start_date: First day to include; a ``datetime`` or a ``date``.
        end_date: Last day to include (inclusive); a ``datetime`` or a ``date``.

    Returns:
        A list of dicts in chronological order, one per day, each holding
        ``"date"`` (formatted ``%d-%b-%Y``) and ``"url"``. The list is empty
        when ``start_date`` falls after ``end_date``.
    """
    base_url = "https://economictimes.indiatimes.com/archivelist/year-{year},month-{month},starttime-{starttime}.cms"

    def as_day(value):
        # Work on whole calendar days: a time-of-day on either bound must not
        # drop the inclusive end date, and plain ``date`` inputs must work too.
        return value.date() if isinstance(value, datetime) else value

    first_day = as_day(start_date)
    last_day = as_day(end_date)
    epoch = datetime(1900, 1, 1).date()

    url_list = []
    # range() yields nothing for a negative span, so a reversed range gives [].
    for day_index in range((last_day - first_day).days + 1):
        current_day = first_day + timedelta(days=day_index)
        # ET's starttime = number of days since 01-Jan-1900, plus an offset of 2
        starttime = (current_day - epoch).days + 2
        url = base_url.format(year=current_day.year, month=current_day.month, starttime=starttime)
        url_list.append({"date": current_day.strftime("%d-%b-%Y"), "url": url})

    return url_list

In [8]:
# 2. Function to scrape headlines from a given archive URL

def scrape_archive_page(url, keywords, timeout=30):
    """Fetch an archive page and return the link texts that mention a keyword.

    Only anchors carrying an ``href`` inside the first ``<ul class="content">``
    element are considered. Matching is a case-insensitive substring test.

    Args:
        url: Address of the archive page to download.
        keywords: Iterable of strings; a headline is kept when it contains
            at least one of them.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without one, a stalled connection would block the caller
            indefinitely.

    Returns:
        A list of matching headline strings in page order; empty when the
        page has no ``<ul class="content">`` element or nothing matches.

    Raises:
        requests.RequestException: Propagated from ``requests.get`` (this
            includes timeouts).
    """
    response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")

    ul = soup.find("ul", class_="content")
    if ul is None:
        return []

    # Lower-case the keywords once rather than once per link; materialising
    # them also lets a one-shot iterable be reused for every headline.
    needles = [keyword.lower() for keyword in keywords]

    headlines = []
    for link in ul.find_all("a", href=True):
        text = link.get_text(strip=True)
        if not text:
            continue
        lowered = text.lower()
        if any(needle in lowered for needle in needles):
            headlines.append(text)

    return headlines

In [11]:
# 3. Main script

if __name__ == "__main__":
    # Date range to scrape (both ends inclusive).
    start_date = datetime(2024, 1, 1)
    end_date = datetime(2025, 6, 30)

    # Case-insensitive substrings a headline must contain to be kept.
    keywords = ["sensex", "nifty", "nifty50", "stock market"]

    urls = generate_archive_urls(start_date, end_date)

    all_data = []     # one {"date", "headline", "url"} record per matching headline
    failed_urls = []  # archive pages that could not be fetched, kept for a later retry

    for item in urls:
        date_text = item["date"]
        url = item["url"]

        print(f"Scraping {date_text} → {url}")
        try:
            headlines = scrape_archive_page(url, keywords)
        except requests.RequestException as exc:
            # One bad page must not abort the whole run and discard the
            # rows already collected; log it and move on.
            print(f"  ⚠️ Skipped {date_text}: {exc}")
            failed_urls.append(item)
            headlines = []

        all_data.extend(
            {"date": date_text, "headline": h, "url": url} for h in headlines
        )

        time.sleep(2)  # delay between requests

Scraping 01-Jan-2024 → https://economictimes.indiatimes.com/archivelist/year-2024,month-1,starttime-45292.cms
Scraping 02-Jan-2024 → https://economictimes.indiatimes.com/archivelist/year-2024,month-1,starttime-45293.cms
Scraping 03-Jan-2024 → https://economictimes.indiatimes.com/archivelist/year-2024,month-1,starttime-45294.cms
Scraping 04-Jan-2024 → https://economictimes.indiatimes.com/archivelist/year-2024,month-1,starttime-45295.cms
Scraping 05-Jan-2024 → https://economictimes.indiatimes.com/archivelist/year-2024,month-1,starttime-45296.cms
Scraping 06-Jan-2024 → https://economictimes.indiatimes.com/archivelist/year-2024,month-1,starttime-45297.cms
Scraping 07-Jan-2024 → https://economictimes.indiatimes.com/archivelist/year-2024,month-1,starttime-45298.cms
Scraping 08-Jan-2024 → https://economictimes.indiatimes.com/archivelist/year-2024,month-1,starttime-45299.cms
Scraping 09-Jan-2024 → https://economictimes.indiatimes.com/archivelist/year-2024,month-1,starttime-45300.cms
Scraping 1

In [12]:
    # Save results
    # Naming the columns explicitly keeps the CSV header row even when no
    # headline matched (an empty list would otherwise give a column-less frame).
    df = pd.DataFrame(all_data, columns=["date", "headline", "url"])
    # "utf-8-sig" writes a byte-order mark at the start of the file.
    df.to_csv("et_stock_headlines.csv", index=False, encoding="utf-8-sig")
    print("✅ Scraping complete! Data saved to et_stock_headlines.csv")

✅ Scraping complete! Data saved to et_stock_headlines.csv
