In [None]:
import os
import requests
import zipfile
import io
import pandas as pd
import time
import random
from datetime import datetime, timedelta

# Request headers sent with every download so the request resembles one
# coming from a desktop Chrome browser.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Referer": "https://www.nseindia.com",
    "Accept-Language": "en-US,en;q=0.9",
}

# Output locations (relative paths):
#   CSV_FILE - accumulated data, one appended chunk per downloaded day
#   LOG_FILE - holds the date of the most recently saved download
CSV_FILE = "NSE_Options_10Y_Safe.csv"
LOG_FILE = "download_log.txt"

# Function to download and extract NSE Bhavcopy
def download_nse_data(date):
    """Download and parse the NSE derivatives bhavcopy for a single day.

    Args:
        date: Date-like object exposing ``year`` and ``strftime`` (for
            example ``datetime`` or ``pandas.Timestamp``).

    Returns:
        A ``pandas.DataFrame`` read from the first member of the downloaded
        zip archive, or ``None`` when the server does not answer with
        HTTP 200 or when fetching/unpacking/parsing fails for any reason.
    """
    day_label = date.strftime('%Y-%m-%d')
    url = f"https://archives.nseindia.com/content/historical/DERIVATIVES/{date.year}/{date.strftime('%b').upper()}/fo{date.strftime('%d%b%Y').upper()}bhav.csv.zip"

    try:
        # With stream=True the body is fetched lazily, so a non-200 response
        # whose body is never read would keep its connection checked out.
        # The ``with`` block closes the response on every path.
        with requests.get(url, headers=HEADERS, timeout=10, stream=True) as response:
            if response.status_code != 200:
                print(f"❌ No data available for {day_label}")
                return None
            payload = response.content

        with zipfile.ZipFile(io.BytesIO(payload)) as z:
            file_name = z.namelist()[0]  # The archive's first member is the CSV
            with z.open(file_name) as f:
                df = pd.read_csv(f)
    except Exception as e:
        # Deliberately broad: the download is best-effort and the caller
        # treats None as "skip this day" (network, zip and CSV errors alike).
        print(f"⚠️ Error fetching data for {day_label}: {e}")
        return None

    print(f"✅ Data fetched for {day_label}")
    return df

# Function to get last downloaded date from log file
def get_last_downloaded_date():
    """Return the date stored in LOG_FILE as a datetime.

    Returns None when the log file does not exist or contains only
    whitespace. The content is expected in ``YYYY-MM-DD`` form.
    """
    if not os.path.exists(LOG_FILE):
        return None

    with open(LOG_FILE, "r") as log:
        stamp = log.read().strip()

    return datetime.strptime(stamp, "%Y-%m-%d") if stamp else None

# Function to update log file with last downloaded date
def update_last_downloaded_date(date):
    """Overwrite LOG_FILE with ``date`` formatted as ``YYYY-MM-DD``."""
    with open(LOG_FILE, "w") as log:
        stamp = date.strftime("%Y-%m-%d")
        log.write(stamp)

# Candidate days: every business day in the 10 years up to today.
# normalize=True pins each timestamp to midnight so it compares exactly
# against the midnight datetime parsed back from the log file.
end_date = datetime.today()
start_date = end_date - timedelta(days=365 * 10)
date_range = pd.date_range(start=start_date, end=end_date, freq='B', normalize=True)

# Check where to resume.
# The loop below walks from today backwards and logs every day it saves, so
# the logged date is the OLDEST day already stored. Days on or after it are
# done; only strictly older days are still missing.
# NOTE: the log holds a single date, so business days newer than the ones
# covered by the first run are not picked up on resume.
last_downloaded_date = get_last_downloaded_date()
if last_downloaded_date:
    date_range = [d for d in date_range if d < last_downloaded_date]

# Start downloading
counter = 0
for date in reversed(date_range):  # Reverse order: start from the newest day
    df = download_nse_data(date)

    if df is not None:
        # Append to the CSV; write the header only when creating the file
        # (mode="a" creates it if it does not exist yet).
        df.to_csv(CSV_FILE, mode="a", header=not os.path.exists(CSV_FILE), index=False)

        # Record progress only after the data has been written.
        update_last_downloaded_date(date)

    # **RATE LIMITING**: Sleep for a random time (2-6 seconds) to avoid detection
    time.sleep(random.uniform(2, 6))

    # **AVOID MASS REQUESTS**: Pause after every 20 requests
    counter += 1
    if counter % 20 == 0:
        print("⏳ Taking a longer break to avoid detection...")
        time.sleep(random.uniform(15, 30))

print("✅ Data downloading process completed (or paused). Resume anytime!")
