In [1]:
import os
import requests
import zipfile
import io
import pandas as pd
import time
import random
from datetime import datetime, timedelta

# Pool of desktop-browser User-Agent strings that a request header can be drawn from.
USER_AGENTS = [
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    (
        "Mozilla/5.0 (X11; Linux x86_64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/119.0.0.0 Safari/537.36"
    ),
]

# Default request headers.
# NOTE: random.choice() runs once, when this statement is executed, so HEADERS
# holds a single fixed User-Agent until the module/cell is run again.
HEADERS = dict(
    [
        ("User-Agent", random.choice(USER_AGENTS)),
        ("Referer", "https://www.nseindia.com"),
        ("Accept-Language", "en-US,en;q=0.9"),
        ("Accept-Encoding", "gzip, deflate, br"),
        ("Connection", "keep-alive"),
    ]
)

# Output locations (relative to the current working directory)
CSV_FILE = "Nifty50_Options_10Y.csv"  # accumulated option rows
LOG_FILE = "download_log.txt"  # holds the date of the most recently saved day

# Function to download and extract NSE Bhavcopy for Nifty 50 options
def download_nse_data(date):
    """Download one day's NSE F&O bhavcopy and return its NIFTY index-option rows.

    The archive URL is built from ``date``; the ZIP is read in memory and the
    first member is parsed as CSV.

    Args:
        date: Datetime-like object (must provide ``year`` and ``strftime``)
            identifying the day to fetch.

    Returns:
        A pandas DataFrame holding the rows where ``SYMBOL == "NIFTY"`` and
        ``INSTRUMENT == "OPTIDX"``, or ``None`` when the day could not be
        fetched (non-200 response, repeated 403 / network failures, empty or
        corrupt ZIP, no matching rows, or any unexpected error).
    """
    day = date.strftime('%Y-%m-%d')
    url = f"https://archives.nseindia.com/content/historical/DERIVATIVES/{date.year}/{date.strftime('%b').upper()}/fo{date.strftime('%d%b%Y').upper()}bhav.csv.zip"

    try:
        print(f"🔍 Fetching data for {day}...")

        # Retry mechanism in case of failures
        for attempt in range(3):  # Try 3 times
            try:
                # No stream=True: the whole body is needed anyway, and a
                # streamed response whose body is never read (403/404 paths)
                # would keep its connection checked out of the pool.
                response = requests.get(url, headers=HEADERS, timeout=10)
            except (requests.ConnectionError, requests.Timeout) as e:
                # Transient network problems get the same exponential backoff
                # as a 403 instead of aborting the retry loop.
                print(f"⚠️ Network error for {day}: {e}. Retrying in {2**attempt} seconds...")
                time.sleep(2**attempt)
                continue

            if response.status_code == 403:
                print(f"❌ 403 Forbidden - NSE might be blocking GitHub. Retrying in {2**attempt} seconds...")
                time.sleep(2**attempt)
                continue

            if response.status_code != 200:
                print(f"❌ No data available for {day} (HTTP {response.status_code})")
                return None

            try:
                with zipfile.ZipFile(io.BytesIO(response.content)) as z:
                    zip_contents = z.namelist()
                    if not zip_contents:
                        print(f"⚠️ Empty ZIP file for {day}")
                        return None

                    # Only the first archive member is read.
                    with z.open(zip_contents[0]) as f:
                        df = pd.read_csv(f)
            except zipfile.BadZipFile:
                print(f"⚠️ Corrupt ZIP file for {day}")
                return None

            # **FILTER ONLY NIFTY 50 OPTIONS**
            df = df[(df["SYMBOL"] == "NIFTY") & (df["INSTRUMENT"] == "OPTIDX")]

            if df.empty:
                print(f"⚠️ No Nifty 50 options data found for {day}")
                return None

            print(f"✅ Data fetched for {day}")
            return df

        # All attempts ended in a 403 or a network error.
        return None

    except Exception as e:
        # Deliberate best-effort boundary: one bad day (e.g. unexpected CSV
        # layout) must not stop the surrounding multi-day download loop.
        print(f"⚠️ Error fetching data for {day}: {e}")
        return None

# Function to read back the date stored in the log file
def get_last_downloaded_date():
    """Return the date recorded in LOG_FILE as a datetime, or None.

    None is returned when the log file does not exist or contains only
    whitespace. The stored text is parsed with the "%Y-%m-%d" format.
    """
    if not os.path.exists(LOG_FILE):
        return None

    with open(LOG_FILE, "r") as log:
        stamp = log.read().strip()

    return datetime.strptime(stamp, "%Y-%m-%d") if stamp else None

# Function to record a date in the log file
def update_last_downloaded_date(date):
    """Overwrite LOG_FILE with ``date`` formatted as "%Y-%m-%d" (no trailing newline)."""
    with open(LOG_FILE, "w") as log:
        print(date.strftime("%Y-%m-%d"), end="", file=log)

# Fetch data from the last 10 years
end_date = datetime.today()
start_date = end_date - timedelta(days=365 * 10)
date_range = pd.date_range(start=start_date, end=end_date, freq='B')  # Business days only

# Resume from last downloaded date.
# The loop below walks newest -> oldest and rewrites the log after every saved
# day, so the logged date is the OLDEST day saved so far. The work still to do
# is therefore the days strictly before it; keeping the later days instead
# would re-download and re-append everything that is already in the CSV.
# NOTE(review): a single logged date cannot also track business days newer
# than the previous run's start; those are not picked up on resume.
last_downloaded_date = get_last_downloaded_date()
if last_downloaded_date is not None:
    date_range = [d for d in date_range if d < last_downloaded_date]

# Start downloading
counter = 0
for date in reversed(date_range):  # Start from recent dates
    df = download_nse_data(date)

    if df is not None:
        # Save data: write the header only when creating the file
        if os.path.exists(CSV_FILE):
            df.to_csv(CSV_FILE, mode="a", header=False, index=False)
        else:
            df.to_csv(CSV_FILE, index=False)

        # Record progress only for days that were actually saved
        update_last_downloaded_date(date)

    # **Rate limiting to avoid detection**
    time.sleep(random.uniform(2, 6))

    # **Avoid mass requests**: Pause after 20 requests
    counter += 1
    if counter % 20 == 0:
        print("⏳ Taking a longer break to avoid detection...")
        time.sleep(random.uniform(15, 30))

print("✅ Data extraction completed! Saved as 'Nifty50_Options_10Y.csv'.")


🔍 Fetching data for 2025-02-28...
❌ No data available for 2025-02-28 (HTTP 404)
🔍 Fetching data for 2025-02-27...
❌ No data available for 2025-02-27 (HTTP 404)


KeyboardInterrupt: 