In [1]:
import argparse
import time
import os
from datetime import datetime, timedelta
import yfinance as yf
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import joblib

import warnings
warnings.filterwarnings('ignore')

In [None]:
def fetch_gld_data(live_mode=False, force_refresh=False):
    """Fetch GLD (Gold ETF) historical data with proper validation and numpy storage"""
    cache_file = 'gld_data_1h.npy'  # Different cache for 1h data
    cache_dates_file = 'gld_dates_1h.npy'
    
    print("=== GLD DATA FETCHING DEBUG ===")
    print(f"Cache files: {cache_file}, {cache_dates_file}")
    print(f"Cache exists: {os.path.exists(cache_file) and os.path.exists(cache_dates_file)}")
    
    # Always fetch fresh data to avoid corruption issues
    force_refresh = True  # Force fresh data until we verify integrity
    
    if not force_refresh and os.path.exists(cache_file) and os.path.exists(cache_dates_file):
        try:
            cache_time = os.path.getmtime(cache_file)
            current_time = time.time()
            cache_age = current_time - cache_time
            print(f"Cache age: {cache_age:.0f} seconds ({cache_age/3600:.2f} hours)")
            
            # If cache is less than 1 hour old, use it
            if cache_age < 3600:  # 3600 seconds = 1 hour
                print("Loading cached data...")
                data_array = np.load(cache_file)
                dates_array = np.load(cache_dates_file, allow_pickle=True)  # Fix: allow pickle for string arrays
                
                # Convert back to DataFrame
                dates = pd.to_datetime(dates_array)
                cached_data = pd.DataFrame(data_array, 
                                         columns=['Open', 'High', 'Low', 'Close', 'Volume'],
                                         index=dates)
                
                print(f"✓ Using cached data: {len(cached_data)} rows")
                print(f"✓ Cache date range: {cached_data.index[0]} to {cached_data.index[-1]}")
                print(f"✓ Cache price range: ${cached_data['Close'].min():.2f} - ${cached_data['Close'].max():.2f}")
                
                # Validate current price against real market
                current_gld = yf.Ticker('GLD')
                current_price = current_gld.info.get('regularMarketPrice', 0)
                cached_latest = cached_data['Close'].iloc[-1]
                price_diff_pct = abs((current_price - cached_latest) / current_price) * 100
                
                if price_diff_pct > 10:  # If cached price differs by more than 10%
                    print(f"✗ Cache validation failed: cached ${cached_latest:.2f} vs current ${current_price:.2f} ({price_diff_pct:.1f}% diff)")
                    force_refresh = True
                else:
                    print(f"✓ Cache validation passed: price difference {price_diff_pct:.1f}%")
                    return cached_data
            else:
                print("Cache is too old, fetching fresh data...")
        except Exception as e:
            print(f"✗ Cache load error: {e}, fetching fresh data...")
    
    if force_refresh or not os.path.exists(cache_file):
        print("Fetching fresh data or cache validation failed...")
    
    try:
        print("\n=== FETCHING FRESH GLD DATA ===")
        print("Connecting to Yahoo Finance for GLD...")
        
        # OPTIMIZED: Use 1-hour intervals for extended dataset (Better for GLD analysis)
        end_date = datetime.now()
        start_date = end_date - timedelta(days=1095)  # 3 years of 1h data = ~26k samples
        
        print(f"Requesting 1-HOUR GLD data from {start_date.date()} to {end_date.date()} (3 years)...")
        print("Calling yf.download() with 1h interval for GLD (optimal for gold trading)...")
        
        # Download 1-hour data with error handling (no 60-day limit)
        import time as time_module
        fetch_start = time_module.time()
        gld = yf.download('GLD', start=start_date, end=end_date, interval='1h', progress=False)
        fetch_duration = time_module.time() - fetch_start
        
        print(f"Download completed in {fetch_duration:.2f} seconds")
        print(f"Downloaded data type: {type(gld)}")
        print(f"Downloaded data empty: {gld.empty if hasattr(gld, 'empty') else 'No empty attr'}")
        
        if gld.empty:
            raise Exception("Downloaded GLD data is empty")
        
        print(f"✓ Raw GLD data shape: {gld.shape}")
        print(f"✓ Raw columns: {gld.columns.tolist()}")
        print(f"✓ Raw index type: {type(gld.index)}")
        print(f"✓ First few dates: {gld.index[:3].tolist()}")
        
        # Handle multi-level columns from yfinance
        if isinstance(gld.columns, pd.MultiIndex):
            print("Flattening multi-level columns...")
            gld.columns = [col[0] if col[1] == 'GLD' else f"{col[0]}_{col[1]}" for col in gld.columns]
        
        print(f"✓ Processed columns: {gld.columns.tolist()}")
        
        # Validate required columns
        required_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
        missing_cols = [col for col in required_cols if col not in gld.columns]
        if missing_cols:
            raise Exception(f"Missing required columns: {missing_cols}")
        
        print(f"✓ All required columns present: {required_cols}")
        
        # Validate data quality for 1h data (3 years)
        if len(gld) < 5000:  # Need substantial data for 1h intervals
            raise Exception(f"Insufficient GLD data: only {len(gld)} rows. Need at least 5000 for 1h training.")
        
        nan_count = gld['Close'].isna().sum()
        if nan_count > len(gld) * 0.1:
            raise Exception(f"Too many missing Close prices: {nan_count} out of {len(gld)}")
        
        print(f"✓ Data quality check passed: {len(gld)} rows, {nan_count} NaN values")
        
        # Remove any rows with NaN in critical columns
        original_len = len(gld)
        gld = gld.dropna(subset=['Open', 'High', 'Low', 'Close', 'Volume'])
        dropped_rows = original_len - len(gld)
        
        print(f"✓ Cleaned GLD data shape: {gld.shape} (dropped {dropped_rows} rows)")
        print(f"✓ Date range: {gld.index[0]} to {gld.index[-1]}")
        print(f"✓ Price range: ${gld['Close'].min():.2f} - ${gld['Close'].max():.2f}")
        print(f"✓ Sample recent prices: {gld['Close'].tail(3).values}")
        
        # Final validation for 1h data (3 years)
        if len(gld) < 8000:
            raise Exception(f"After cleaning, insufficient GLD data: {len(gld)} rows. Need at least 8000.")
        
        # Save to cache using numpy arrays for better reliability
        try:
            print(f"Saving to numpy cache: {cache_file}")
            
            # Convert to numpy arrays with proper datetime handling for 1h data
            data_array = gld[['Open', 'High', 'Low', 'Close', 'Volume']].values
            dates_array = gld.index.strftime('%Y-%m-%d %H:%M:%S').values  # Include time for 1h data
            
            # Save arrays
            np.save(cache_file, data_array)
            np.save(cache_dates_file, dates_array)
            
            cache_size = os.path.getsize(cache_file) + os.path.getsize(cache_dates_file)
            print(f"✓ GLD Data cached as numpy arrays! ({len(gld)} rows, {cache_size} bytes)")
            
            # Verify cache was written correctly
            print("Verifying numpy cache integrity...")
            test_data = np.load(cache_file)
            test_dates = np.load(cache_dates_file, allow_pickle=True)  # Fix: allow pickle for string arrays
            
            if len(test_data) != len(gld) or len(test_dates) != len(gld):
                os.remove(cache_file)
                os.remove(cache_dates_file)
                print("✗ Cache verification failed, removed cache files")
            else:
                print("✓ Numpy cache verification successful")
        except Exception as cache_error:
            print(f"✗ Warning: Could not cache GLD data: {cache_error}")
            # Continue without caching - not critical for training
            for f in [cache_file, cache_dates_file]:
                if os.path.exists(f):
                    os.remove(f)
        
        print("=== GLD DATA FETCH COMPLETE ===\n")
        return gld
    
    except Exception as e:
        print(f"GLD data fetch failed: {e}. Checking for cached data...")
        
        # Try to use old cached data as last resort
        if os.path.exists(cache_file) and os.path.exists(cache_dates_file):
            try:
                print("Attempting to use older cached GLD data...")
                data_array = np.load(cache_file)
                dates_array = np.load(cache_dates_file)
                dates = pd.to_datetime(dates_array)
                cached_data = pd.DataFrame(data_array, 
                                         columns=['Open', 'High', 'Low', 'Close', 'Volume'],
                                         index=dates)
                
                if len(cached_data) > 100:
                    print(f"Using old GLD cache: {len(cached_data)} rows")
                    return cached_data
                else:
                    print("Old GLD cache is also invalid")
            except Exception as cache_error:
                print(f"GLD cache load failed: {cache_error}")
            
            # Remove corrupted cache files
            for f in [cache_file, cache_dates_file]:
                if os.path.exists(f):
                    os.remove(f)
        
        print("Using synthetic GLD data as fallback...")
        # Generate synthetic gold price data (3 years)
        dates = pd.date_range(start=datetime.now() - timedelta(days=1095), 
                             end=datetime.now(), freq='H')  # Hourly frequency
        
        np.random.seed(42)
        prices = [180]  # Start around typical GLD price
        volumes = []
        
        for i in range(len(dates) - 1):
            # Simulate realistic gold price movements (lower volatility, market hours effect)
            hour = dates[i].hour
            if 9 <= hour <= 16:  # Market hours - more activity
                change = np.random.normal(0, 1.5)  
            else:  # After hours - less volatility
                change = np.random.normal(0, 0.5)
            
            prices.append(max(prices[-1] + change, 50))  # Min price of $50
            
            # Volume varies by market hours
            if 9 <= hour <= 16:
                volumes.append(np.random.randint(8000000, 25000000))
            else:
                volumes.append(np.random.randint(1000000, 5000000))
        
        volumes.append(np.random.randint(5000000, 20000000))
        
        synthetic_data = pd.DataFrame({
            'Open': prices,
            'High': [p * (1 + abs(np.random.normal(0, 0.008))) for p in prices],  # Lower vol 
            'Low': [p * (1 - abs(np.random.normal(0, 0.008))) for p in prices],   # Lower vol
            'Close': prices,
            'Volume': volumes
        }, index=dates)
        
        return synthetic_data

In [7]:
df = fetch_gld_data()
df.to_csv('initial_dataset.csv', index=False)

=== GLD DATA FETCHING DEBUG ===
Cache files: gld_data_1h.npy, gld_dates_1h.npy
Cache exists: False
Fetching fresh data or cache validation failed...

=== FETCHING FRESH GLD DATA ===
Connecting to Yahoo Finance for GLD...
Requesting 1-HOUR GLD data from 2022-09-18 to 2025-09-17 (3 years)...
Calling yf.download() with 1h interval for GLD (optimal for gold trading)...



1 Failed download:
['GLD']: YFPricesMissingError('possibly delisted; no price data found  (1h 2022-09-18 20:04:46.301210 -> 2025-09-17 20:04:46.301210) (Yahoo error = "1h data not available for startTime=1663545886 and endTime=1758132288. The requested range must be within the last 730 days.")')


Download completed in 0.69 seconds
Downloaded data type: <class 'pandas.core.frame.DataFrame'>
Downloaded data empty: True
error occured: Downloaded GLD data is empty
