In [None]:
# ======================================================
# üîë API Keys Configuration
# ======================================================
import os

# Load API credentials from the environment (or Colab secrets) only.
# Secrets must never be embedded in source as fallback defaults: anything
# committed is readable by everyone with access to the notebook and stays in
# version history even after it is edited out.
# NOTE(review): the key/token that used to be hard-coded here should be
# treated as compromised and rotated with the providers.
def _load_secret(name):
    """Return the secret called *name*, or '' when it is not configured.

    Lookup order: process environment, then Google Colab user secrets (only
    when the ``google.colab`` package is importable).
    """
    value = os.environ.get(name, '')
    if value:
        return value
    try:
        from google.colab import userdata
    except ImportError:
        return ''
    try:
        return userdata.get(name) or ''
    except Exception as e:  # best-effort lookup; mirrors the fetcher cell
        print(f"‚ö†Ô∏è Could not access Colab secrets: {e}")
        return ''


def mask_secret(value, visible=4):
    """Return *value* shortened for display as ``head...tail``.

    Values no longer than ``2 * visible`` characters are fully starred out,
    because showing head and tail would reveal the whole secret.
    """
    if len(value) <= visible * 2:
        return "*" * len(value)
    return f"{value[:visible]}...{value[-visible:]}"


ALPHA_VANTAGE_KEY = _load_secret('ALPHA_VANTAGE_KEY')
BROWSERLESS_TOKEN = _load_secret('BROWSERLESS_TOKEN')

# Export for downstream code, but only real values: writing an empty string
# would make a missing secret look configured.
if ALPHA_VANTAGE_KEY:
    os.environ['ALPHA_VANTAGE_KEY'] = ALPHA_VANTAGE_KEY
if BROWSERLESS_TOKEN:
    os.environ['BROWSERLESS_TOKEN'] = BROWSERLESS_TOKEN

# Validate
if not ALPHA_VANTAGE_KEY:
    print("‚ö†Ô∏è Warning: ALPHA_VANTAGE_KEY not set!")
else:
    print(f"‚úÖ Alpha Vantage Key: {mask_secret(ALPHA_VANTAGE_KEY)}")

if not BROWSERLESS_TOKEN:
    print("‚ö†Ô∏è Warning: BROWSERLESS_TOKEN not set!")
else:
    print(f"‚úÖ Browserless Token: {mask_secret(BROWSERLESS_TOKEN)}")

In [None]:
# ======================================================
# üåç Environment Detection & Setup (MUST RUN FIRST!)
# ======================================================
import os
import sys
from pathlib import Path

# Runtime detection: Colab is recognised by its importable package, GitHub
# Actions by the GITHUB_ACTIONS environment variable.
try:
    import google.colab
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True

IN_GHA = "GITHUB_ACTIONS" in os.environ

# The GitHub Actions label wins over the Colab/local one.
if IN_GHA:
    ENV_NAME = "GitHub Actions"
elif IN_COLAB:
    ENV_NAME = "Google Colab"
else:
    ENV_NAME = "Local/GitHub Actions"

# Colab saves under /content/forex-ai-models; GitHub Actions and local runs
# both work directly in the current directory.
BASE_FOLDER = Path("/content") if IN_COLAB else Path.cwd()
SAVE_FOLDER = BASE_FOLDER / "forex-ai-models" if IN_COLAB else BASE_FOLDER

# Organised output layout, all rooted at SAVE_FOLDER.
DIRECTORIES = dict(
    data_raw=SAVE_FOLDER / "data" / "raw" / "yfinance",
    data_processed=SAVE_FOLDER / "data" / "processed",
    database=SAVE_FOLDER / "database",
    logs=SAVE_FOLDER / "logs",
    outputs=SAVE_FOLDER / "outputs",
)

for dir_path in DIRECTORIES.values():
    dir_path.mkdir(parents=True, exist_ok=True)

# Environment report.
print(
    "=" * 60,
    f"üåç Environment: {ENV_NAME}",
    f"üìÇ Base Folder: {BASE_FOLDER}",
    f"üíæ Save Folder: {SAVE_FOLDER}",
    f"üîß Python: {sys.version.split()[0]}",
    f"üìç Working Dir: {os.getcwd()}",
    "=" * 60,
    sep="\n",
)

# In CI, abort immediately when a required variable is unset or empty.
if IN_GHA:
    required_vars = ["FOREX_PAT", "GIT_USER_NAME", "GIT_USER_EMAIL"]
    missing = [name for name in required_vars if not os.environ.get(name)]
    if not missing:
        print("‚úÖ All required environment variables present")
    else:
        print(f"‚ö†Ô∏è  Warning: Missing environment variables: {', '.join(missing)}")
        sys.exit(1)

# Frequently used paths, exposed as globals for later cells.
CSV_FOLDER, PICKLE_FOLDER = DIRECTORIES["data_raw"], DIRECTORIES["data_processed"]
DB_PATH = DIRECTORIES["database"] / "memory_v85.db"
LOG_PATH = DIRECTORIES["logs"] / "pipeline.log"
OUTPUT_PATH = DIRECTORIES["outputs"] / "signals.json"

print(
    f"\nüìÅ Key Paths:",
    f"   CSV: {CSV_FOLDER}",
    f"   Pickles: {PICKLE_FOLDER}",
    f"   Database: {DB_PATH}",
    f"   Logs: {LOG_PATH}",
    f"   Signals: {OUTPUT_PATH}",
    "=" * 60,
    sep="\n",
)

In [None]:
# ======================================================
# üìÑ GitHub Sync (Environment-Aware) - ALIGNED VERSION
# ======================================================
import os
import subprocess
import shutil
from pathlib import Path
import urllib.parse
import sys

# ======================================================
# 1. Environment detection (same rules as the environment setup cell)
# ======================================================
try:
    import google.colab
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True

IN_GHA = "GITHUB_ACTIONS" in os.environ

# The GitHub Actions label wins over the Colab/local one.
if IN_GHA:
    ENV_NAME = "GitHub Actions"
elif IN_COLAB:
    ENV_NAME = "Google Colab"
else:
    ENV_NAME = "Local/GitHub Actions"

# ======================================================
# 2. Paths: the repository folder is always the save folder
# ======================================================
if IN_COLAB:
    # Colab clones into a fixed folder below /content.
    BASE_FOLDER = Path("/content")
    SAVE_FOLDER = REPO_FOLDER = BASE_FOLDER / "forex-ai-models"
    print("‚òÅÔ∏è Colab Mode: Cloning directly to /content/forex-ai-models")
else:
    # GitHub Actions and local runs both use the current directory.
    BASE_FOLDER = SAVE_FOLDER = REPO_FOLDER = Path.cwd()
    print(
        "ü§ñ GitHub Actions Mode: Using current directory"
        if IN_GHA
        else "üíª Local Mode: Using current directory"
    )

# Organised output layout (only declared here; created further down).
DIRECTORIES = dict(
    data_raw=SAVE_FOLDER / "data" / "raw" / "yfinance",
    data_processed=SAVE_FOLDER / "data" / "processed",
    database=SAVE_FOLDER / "database",
    logs=SAVE_FOLDER / "logs",
    outputs=SAVE_FOLDER / "outputs",
)

print(
    "=" * 70,
    f"üîß Running in: {ENV_NAME}",
    f"üìÇ Working directory: {os.getcwd()}",
    f"üíæ Save folder: {SAVE_FOLDER}",
    f"üì¶ Repo folder: {REPO_FOLDER}",
    f"üêç Python: {sys.version.split()[0]}",
    "=" * 70,
    sep="\n",
)

# ======================================================
# 3. GitHub configuration
# ======================================================
GITHUB_USERNAME = "rahim-dotAI"
GITHUB_REPO = "forex-ai-models"
BRANCH = "main"

# ======================================================
# 4. GitHub token (environment first, then Colab secrets)
# ======================================================
FOREX_PAT = os.environ.get("FOREX_PAT")

# Fall back to Colab secrets when running in Colab without the variable.
if not FOREX_PAT and IN_COLAB:
    try:
        from google.colab import userdata
        FOREX_PAT = userdata.get("FOREX_PAT")
        if FOREX_PAT:
            # Re-export so later cells and child processes can read it.
            os.environ["FOREX_PAT"] = FOREX_PAT
            print("üîê Loaded FOREX_PAT from Colab secret.")
    except ImportError:
        pass
    except Exception as e:
        print(f"‚ö†Ô∏è Could not load Colab secret: {e}")

# Without a token the cell keeps going; REPO_URL is None so the clone step
# can detect that no authenticated URL is available.
if not FOREX_PAT:
    print("‚ö†Ô∏è Warning: FOREX_PAT not found. Git operations may fail.")
    print("   Set FOREX_PAT in:")
    print("   - GitHub Secrets (for Actions)")
    print("   - Colab Secrets (for Colab)")
    print("   - Environment variable (for local)")
    REPO_URL = None
else:
    # safe="" also percent-encodes "/": quote() leaves it untouched by
    # default, and a raw "/" inside the userinfo part would end the
    # authority component early and corrupt the URL.
    SAFE_PAT = urllib.parse.quote(FOREX_PAT, safe="")
    REPO_URL = f"https://{GITHUB_USERNAME}:{SAFE_PAT}@github.com/{GITHUB_USERNAME}/{GITHUB_REPO}.git"
    print("‚úÖ GitHub token configured")

# ======================================================
# 5Ô∏è‚É£ Handle Repository Based on Environment
# ======================================================
# Exactly one branch runs: GitHub Actions only verifies the checkout, Colab
# clones or pulls into REPO_FOLDER, and local runs only report git status.
if IN_GHA:
    # ===== GitHub Actions =====
    print("\nü§ñ GitHub Actions Mode")
    print("‚úÖ Repository already checked out by actions/checkout")
    print(f"üìÇ Current directory: {Path.cwd()}")

    # Verify .git exists (warning only; nothing is cloned in this mode)
    if not (Path.cwd() / ".git").exists():
        print("‚ö†Ô∏è Warning: .git directory not found!")
        print("   Make sure actions/checkout@v4 is in your workflow")
    else:
        print("‚úÖ Git repository confirmed")

elif IN_COLAB:
    # ===== Google Colab =====
    print("\n‚òÅÔ∏è Google Colab Mode")

    if not REPO_URL:
        # REPO_URL is None when no FOREX_PAT was found above.
        print("‚ùå Cannot clone repository: FOREX_PAT not available")
    elif not (REPO_FOLDER / ".git").exists():
        # Check if directory exists but isn't a git repo
        # WARNING: this deletes the whole folder, including any files already
        # written below it, before cloning.
        if REPO_FOLDER.exists():
            print(f"‚ö†Ô∏è Directory exists but is not a git repo. Removing...")
            shutil.rmtree(REPO_FOLDER)
            print("‚úÖ Cleaned up non-git directory")

        # Clone repository
        print(f"üì• Cloning repository to {REPO_FOLDER}...")
        env = os.environ.copy()
        env["GIT_LFS_SKIP_SMUDGE"] = "1"  # Skip LFS files

        try:
            # REPO_URL contains the token built from FOREX_PAT.
            # NOTE(review): confirm the token cannot end up in e.stderr,
            # which is printed below when the clone fails.
            result = subprocess.run(
                ["git", "clone", "-b", BRANCH, REPO_URL, str(REPO_FOLDER)],
                check=True,
                env=env,
                capture_output=True,
                text=True,
                timeout=60
            )
            print("‚úÖ Repository cloned successfully")

            # Change to repo directory
            os.chdir(REPO_FOLDER)
            print(f"üìÇ Changed directory to: {os.getcwd()}")

        except subprocess.CalledProcessError as e:
            # Clone failed: fall back to an empty folder so later cells can
            # still create their directories; the working directory is not
            # changed on this path.
            print(f"‚ùå Clone failed: {e.stderr}")
            print("Creating directory structure manually...")
            REPO_FOLDER.mkdir(parents=True, exist_ok=True)
        except subprocess.TimeoutExpired:
            print("‚ùå Clone timed out after 60 seconds")
            REPO_FOLDER.mkdir(parents=True, exist_ok=True)
    else:
        # Repository exists, pull latest
        print("‚úÖ Repository already exists, pulling latest changes...")
        os.chdir(REPO_FOLDER)

        try:
            result = subprocess.run(
                ["git", "pull", "origin", BRANCH],
                check=True,
                cwd=REPO_FOLDER,
                capture_output=True,
                text=True,
                timeout=30
            )
            print("‚úÖ Successfully pulled latest changes")
        except subprocess.CalledProcessError as e:
            # A failed pull is not fatal; the existing checkout is kept.
            print(f"‚ö†Ô∏è Pull failed: {e.stderr}")
            print("Continuing with existing files...")
        except subprocess.TimeoutExpired:
            print("‚ö†Ô∏è Pull timed out, continuing anyway...")

    # Configure Git LFS (disable for Colab)
    # Runs on every Colab path, including when no clone happened; check=False
    # means a non-zero exit is ignored and the success line is still printed.
    print("‚öôÔ∏è Configuring Git LFS...")
    try:
        subprocess.run(
            ["git", "lfs", "uninstall"],
            check=False,
            cwd=REPO_FOLDER,
            capture_output=True
        )
        print("‚úÖ LFS disabled for Colab")
    except Exception as e:
        # Reached when the command cannot be started at all (e.g. git is not
        # installed or REPO_FOLDER does not exist).
        print(f"‚ö†Ô∏è LFS setup warning: {e}")

else:
    # ===== Local Environment =====
    print("\nüíª Local Development Mode")
    print(f"üìÇ Working in: {SAVE_FOLDER}")

    # Report only; nothing is cloned or pulled in local mode.
    if not (REPO_FOLDER / ".git").exists():
        print("‚ö†Ô∏è Not a git repository")
        print("   Run: git clone https://github.com/rahim-dotAI/forex-ai-models.git")
    else:
        print("‚úÖ Git repository found")

# ======================================================
# 6. Create the organised directory structure
# ======================================================
print("\nüìÅ Creating organized directory structure...")
for dir_name, dir_path in DIRECTORIES.items():
    dir_path.mkdir(parents=True, exist_ok=True)
    print(f"   ‚úÖ {dir_name}: {dir_path}")

# ======================================================
# 7. Git global configuration
# ======================================================
print("\nüîß Configuring Git...")

GIT_USER_NAME = os.environ.get("GIT_USER_NAME", "Forex AI Bot")
GIT_USER_EMAIL = os.environ.get("GIT_USER_EMAIL", "nakatonabira3@gmail.com")

# NOTE(review): --global rewrites the identity in the user's global git
# config, which also affects a developer's machine in local mode.
git_configs = [
    (["git", "config", "--global", "user.name", GIT_USER_NAME], "User name"),
    (["git", "config", "--global", "user.email", GIT_USER_EMAIL], "User email"),
    (["git", "config", "--global", "advice.detachedHead", "false"], "Detached HEAD warning"),
    (["git", "config", "--global", "init.defaultBranch", "main"], "Default branch"),
]

# Still best-effort (nothing raises), but a non-zero exit is now reported
# instead of being silently dropped, and success is only claimed when every
# setting was applied.
failed_settings = []
for cmd, description in git_configs:
    try:
        completed = subprocess.run(cmd, check=False, capture_output=True, text=True)
    except Exception as e:
        # Typically git is not installed / not on PATH.
        print(f"‚ö†Ô∏è Could not set {description}: {e}")
        failed_settings.append(description)
        continue
    if completed.returncode != 0:
        detail = completed.stderr.strip() or f"exit code {completed.returncode}"
        print(f"‚ö†Ô∏è Could not set {description}: {detail}")
        failed_settings.append(description)

if failed_settings:
    print(f"‚ö†Ô∏è Git configuration incomplete: {', '.join(failed_settings)}")
else:
    print(f"‚úÖ Git configured: {GIT_USER_NAME} <{GIT_USER_EMAIL}>")

# ======================================================
# 8. Path constants (same values as the environment setup cell)
# ======================================================
CSV_FOLDER, PICKLE_FOLDER = DIRECTORIES["data_raw"], DIRECTORIES["data_processed"]
DB_PATH = DIRECTORIES["database"] / "memory_v85.db"
LOG_PATH = DIRECTORIES["logs"] / "pipeline.log"
OUTPUT_PATH = DIRECTORIES["outputs"] / "signals.json"

# ======================================================
# 9. Environment summary
# ======================================================
print(
    "\n" + "=" * 70,
    "üßæ ENVIRONMENT SUMMARY",
    "=" * 70,
    f"Environment:      {ENV_NAME}",
    f"Working Dir:      {os.getcwd()}",
    f"Save Folder:      {SAVE_FOLDER}",
    f"Repo Folder:      {REPO_FOLDER}",
    f"Repository:       https://github.com/{GITHUB_USERNAME}/{GITHUB_REPO}",
    f"Branch:           {BRANCH}",
    f"Git Repo Exists:  {(REPO_FOLDER / '.git').exists()}",
    f"FOREX_PAT Set:    {'‚úÖ Yes' if FOREX_PAT else '‚ùå No'}",
    sep="\n",
)

# Resolved locations of the key files and folders.
print(
    "\nüìã Critical Paths:",
    f"   CSV Folder:    {CSV_FOLDER}",
    f"   Pickle Folder: {PICKLE_FOLDER}",
    f"   Database:      {DB_PATH}",
    f"   Logs:          {LOG_PATH}",
    f"   Signals:       {OUTPUT_PATH}",
    sep="\n",
)

# Existence check per location; the status is only printed, never enforced.
print("\nüìÇ Directory Status:")
critical_paths = {
    "Repo .git": REPO_FOLDER / ".git",
    "Data Raw": CSV_FOLDER,
    "Data Processed": PICKLE_FOLDER,
    "Database": DIRECTORIES["database"],
    "Logs": DIRECTORIES["logs"],
    "Outputs": DIRECTORIES["outputs"],
}

for name, path in critical_paths.items():
    icon = "‚úÖ" if path.exists() else "‚ùå"
    print(f"  {icon} {name}: {path}")

print("=" * 70, "‚úÖ Setup completed successfully!", "=" * 70, sep="\n")

# ======================================================
# 10. Names left in the notebook namespace for later cells
# ======================================================
# - ENV_NAME, IN_COLAB, IN_GHA: detected environment
# - SAVE_FOLDER, REPO_FOLDER: working folders (always the same path)
# - CSV_FOLDER, PICKLE_FOLDER, DB_PATH, LOG_PATH, OUTPUT_PATH: organised paths
# - GITHUB_USERNAME, GITHUB_REPO, BRANCH: repository coordinates
# - FOREX_PAT: GitHub token (None when unavailable)

print("\n‚úÖ All environment variables exported for downstream cells")

In [None]:
!pip install mplfinance firebase-admin dropbox requests beautifulsoup4 pandas numpy ta yfinance pyppeteer nest_asyncio lightgbm joblib matplotlib alpha_vantage tqdm scikit-learn river


In [None]:
#!/usr/bin/env python3
"""
ALPHA VANTAGE FX DATA FETCHER - ALIGNED WITH CLEAN STRUCTURE
============================================================
‚úÖ Uses NEW clean repo structure (data/raw/alpha_vantage)
‚úÖ Data quality validation BEFORE saving
‚úÖ Works in GitHub Actions, Google Colab, and Local
‚úÖ Unified with YFinance folder structure
‚úÖ Thread-safe operations
‚úÖ API rate limit handling
‚úÖ Automatic retry logic
‚úÖ Clear naming: pair_daily_av.csv (av = Alpha Vantage)
"""

import os
import time
import hashlib
import requests
import subprocess
import threading
import shutil
import urllib.parse
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
import numpy as np

# ======================================================
# 1. Environment detection
# ======================================================
print(
    "=" * 70,
    "üöÄ Alpha Vantage FX Data Fetcher - Clean Structure Edition",
    "=" * 70,
    sep="\n",
)

try:
    import google.colab
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True

IN_GHA = "GITHUB_ACTIONS" in os.environ

# The GitHub Actions label wins over the Colab/local one.
if IN_GHA:
    ENV_NAME = "GitHub Actions"
elif IN_COLAB:
    ENV_NAME = "Google Colab"
else:
    ENV_NAME = "Local"

print(f"üìç Environment: {ENV_NAME}")

# ======================================================
# 2. Paths: the repository folder is always the save folder
# ======================================================
if IN_COLAB:
    print("‚òÅÔ∏è Google Colab detected - using clean structure")
    BASE_FOLDER = Path("/content")
    SAVE_FOLDER = REPO_FOLDER = BASE_FOLDER / "forex-ai-models"
else:
    print(
        "ü§ñ GitHub Actions detected - using repository root"
        if IN_GHA
        else "üíª Local environment detected - using clean structure"
    )
    BASE_FOLDER = SAVE_FOLDER = REPO_FOLDER = Path.cwd()

# Alpha Vantage gets its own raw-data and quarantine folders.
DIRECTORIES = dict(
    data_raw_alpha=SAVE_FOLDER / "data" / "raw" / "alpha_vantage",
    data_processed=SAVE_FOLDER / "data" / "processed",
    database=SAVE_FOLDER / "database",
    logs=SAVE_FOLDER / "logs",
    outputs=SAVE_FOLDER / "outputs",
    quarantine=SAVE_FOLDER / "data" / "quarantine" / "alpha_vantage",
)

for dir_path in DIRECTORIES.values():
    dir_path.mkdir(parents=True, exist_ok=True)

# Paths used by the rest of this cell.
CSV_FOLDER = DIRECTORIES["data_raw_alpha"]
QUARANTINE_FOLDER = DIRECTORIES["quarantine"]
LOG_FOLDER = DIRECTORIES["logs"]

print(
    f"üìÇ Base Folder: {BASE_FOLDER}",
    f"üíæ Save Folder: {SAVE_FOLDER}",
    f"üì¶ Repo Folder: {REPO_FOLDER}",
    f"üìä Alpha Vantage CSV: {CSV_FOLDER}",
    f"üóëÔ∏è Quarantine: {QUARANTINE_FOLDER}",
    "=" * 70,
    sep="\n",
)

# ======================================================
# 3Ô∏è‚É£ DATA QUALITY VALIDATOR
# ======================================================
class DataQualityValidator:
    """Score an OHLC DataFrame from 0 to 100 and decide whether to keep it."""

    # Fewer rows than this is recorded as an issue (it does not affect the score).
    MIN_ROWS = 50
    # Lower bound, in percent, for the coefficient of variation to earn points.
    MIN_PRICE_CV = 0.01
    # Declared threshold; not referenced by validate_dataframe.
    MIN_UNIQUE_RATIO = 0.01
    # Lower bound for the median true range to earn partial points.
    MIN_TRUE_RANGE = 1e-10
    # Scores below this mark the data as invalid.
    MIN_QUALITY_SCORE = 40.0

    @staticmethod
    def validate_dataframe(df, pair):
        """
        Compute quality metrics and a score for *df*.

        Args:
            df: DataFrame expected to carry 'open', 'high', 'low', 'close'.
            pair: Pair label; accepted but not used in the calculation.

        Returns:
            (is_valid, quality_score, metrics, issues). is_valid is True when
            the score reaches MIN_QUALITY_SCORE. Empty input, missing columns
            or no complete OHLC row return (False, 0.0, ...) early.
        """
        if df is None or df.empty:
            return False, 0.0, {}, ["Empty DataFrame"]

        limits = DataQualityValidator
        total_rows = len(df)
        metrics = {'row_count': total_rows}
        issues = []

        if total_rows < limits.MIN_ROWS:
            issues.append(f"Too few rows: {total_rows}")

        # All four OHLC columns are mandatory.
        ohlc_cols = ['open', 'high', 'low', 'close']
        absent = [name for name in ohlc_cols if name not in df.columns]
        if absent:
            issues.append(f"Missing columns: {absent}")
            return False, 0.0, metrics, issues

        # Only rows with a complete OHLC tuple take part in the statistics.
        complete = df[ohlc_cols].dropna()
        complete_rows = len(complete)
        if not complete_rows:
            issues.append("No valid OHLC data")
            return False, 0.0, metrics, issues

        metrics['valid_rows'] = complete_rows
        metrics['valid_ratio'] = complete_rows / total_rows

        # Close-price statistics.
        closes = complete['close']
        metrics.update(
            price_mean=float(closes.mean()),
            price_std=float(closes.std()),
            price_min=float(closes.min()),
            price_max=float(closes.max()),
        )

        # Coefficient of variation, expressed in percent.
        if metrics['price_mean'] > 0:
            metrics['price_cv'] = (metrics['price_std'] / metrics['price_mean']) * 100
        else:
            metrics['price_cv'] = 0.0
            issues.append("Zero mean price")

        # Share of distinct close prices.
        metrics['unique_prices'] = closes.nunique()
        metrics['unique_ratio'] = metrics['unique_prices'] / len(closes)

        # True range: the largest of high-low, |high-prev close|, |low-prev close|.
        highs = complete['high'].values
        lows = complete['low'].values
        prev_close = np.roll(complete['close'].values, 1)
        true_range = np.maximum(
            np.maximum(highs - lows, np.abs(highs - prev_close)),
            np.abs(lows - prev_close),
        )
        # The first row has no previous close (np.roll wrapped the last one round).
        true_range[0] = highs[0] - lows[0]

        metrics['true_range_median'] = float(np.median(true_range))
        metrics['true_range_mean'] = float(np.mean(true_range))

        # Score = completeness (30) + variation (30) + uniqueness (20) + range (20).
        score = metrics['valid_ratio'] * 30

        cv = metrics['price_cv']
        if cv >= 1.0:
            score += 30
        elif cv >= limits.MIN_PRICE_CV:
            score += (cv / 1.0) * 30

        score += min(metrics['unique_ratio'] * 20, 20)

        median_tr = metrics['true_range_median']
        if median_tr >= 1e-5:
            score += 20
        elif median_tr >= limits.MIN_TRUE_RANGE:
            score += (median_tr / 1e-5) * 20

        metrics['quality_score'] = score

        return score >= limits.MIN_QUALITY_SCORE, score, metrics, issues

validator = DataQualityValidator()

# ======================================================
# 4. GitHub configuration
# ======================================================
GITHUB_USERNAME = "rahim-dotAI"
GITHUB_REPO = "forex-ai-models"
BRANCH = "main"

# Token lookup: environment first, then Colab secrets when running in Colab.
FOREX_PAT = os.environ.get("FOREX_PAT")

if not FOREX_PAT and IN_COLAB:
    try:
        from google.colab import userdata
        FOREX_PAT = userdata.get("FOREX_PAT")
        if FOREX_PAT:
            # Re-export so child processes and later cells can read it.
            os.environ["FOREX_PAT"] = FOREX_PAT
            print("üîê Loaded FOREX_PAT from Colab secrets")
    except Exception as e:
        print(f"‚ö†Ô∏è Could not access Colab secrets: {e}")

# This cell cannot continue without a token.
if not FOREX_PAT:
    raise ValueError("FOREX_PAT is required")

# safe="" also percent-encodes "/": quote() leaves it untouched by default,
# and a raw "/" inside the userinfo part would end the authority component
# early and corrupt the URL.
SAFE_PAT = urllib.parse.quote(FOREX_PAT, safe="")
REPO_URL = f"https://{GITHUB_USERNAME}:{SAFE_PAT}@github.com/{GITHUB_USERNAME}/{GITHUB_REPO}.git"

print("‚úÖ GitHub credentials configured")

# ======================================================
# 5Ô∏è‚É£ REPOSITORY MANAGEMENT (SIMPLIFIED FOR CLEAN STRUCTURE)
# ======================================================
def ensure_repository():
    """Check the repository at REPO_FOLDER and pull the latest changes.

    GitHub Actions: only reports whether ``.git`` is present. Elsewhere: runs
    ``git pull origin BRANCH`` when ``.git`` exists, otherwise prints why
    nothing was done. Never clones and never raises on a failed pull.
    """
    git_dir = REPO_FOLDER / ".git"

    if IN_GHA:
        print("\nü§ñ GitHub Actions: Repository already available")
        if git_dir.exists():
            print("‚úÖ Git repository verified")
        else:
            print("‚ö†Ô∏è Warning: .git directory not found")
        return

    print("\nüì• Managing repository...")

    if not git_dir.exists():
        if REPO_FOLDER.exists():
            # A plain folder without git metadata; left untouched.
            print("‚ö†Ô∏è Directory exists but is not a git repository")
        else:
            print("‚ö†Ô∏è Repository not found. This script expects the repo to be set up first.")
            print("   Please run the GitHub Sync script first!")
        return

    print(f"üîÑ Pulling latest changes...")
    try:
        pull = subprocess.run(
            ["git", "-C", str(REPO_FOLDER), "pull", "origin", BRANCH],
            capture_output=True,
            text=True,
            timeout=30
        )
        # A non-zero exit is tolerated; the existing checkout is used as-is.
        if pull.returncode != 0:
            print(f"‚ö†Ô∏è Pull had issues, continuing anyway")
        else:
            print("‚úÖ Repository updated successfully")
    except Exception as e:
        print(f"‚ö†Ô∏è Update failed: {e} - continuing with existing repo")

# Verify / refresh the repository before anything is fetched.
ensure_repository()

# Commit identity: taken from the environment, with bot defaults.
GIT_USER_NAME = os.environ.get("GIT_USER_NAME", "Forex AI Bot")
GIT_USER_EMAIL = os.environ.get("GIT_USER_EMAIL", "nakatonabira3@gmail.com")

# Best-effort: failures of `git config` are ignored (check=False).
for config_key, config_value in (
    ("user.name", GIT_USER_NAME),
    ("user.email", GIT_USER_EMAIL),
):
    subprocess.run(
        ["git", "config", "--global", config_key, config_value],
        capture_output=True,
        check=False,
    )

print(f"‚úÖ Git configured: {GIT_USER_NAME} <{GIT_USER_EMAIL}>")

# ======================================================
# 6. Alpha Vantage configuration
# ======================================================
# API key lookup: environment first, then Colab secrets when in Colab.
ALPHA_VANTAGE_KEY = os.environ.get("ALPHA_VANTAGE_KEY")

if IN_COLAB and not ALPHA_VANTAGE_KEY:
    try:
        from google.colab import userdata
        ALPHA_VANTAGE_KEY = userdata.get("ALPHA_VANTAGE_KEY")
        if ALPHA_VANTAGE_KEY:
            # Re-export so code reading the environment sees the key too.
            os.environ["ALPHA_VANTAGE_KEY"] = ALPHA_VANTAGE_KEY
            print("üîê Loaded ALPHA_VANTAGE_KEY from Colab secrets")
    except Exception as e:
        print(f"‚ö†Ô∏è Could not access Colab secrets for API key: {e}")

# Nothing can be fetched without a key, so stop here.
if not ALPHA_VANTAGE_KEY:
    raise ValueError("‚ùå ALPHA_VANTAGE_KEY is required")

# Show only the first and last four characters of the key.
print(f"‚úÖ Alpha Vantage API key: {ALPHA_VANTAGE_KEY[:4]}...{ALPHA_VANTAGE_KEY[-4:]}")

# Currency pairs to download, written as BASE/QUOTE.
FX_PAIRS = [
    "EUR/USD",
    "GBP/USD",
    "USD/JPY",
    "AUD/USD",
]

# Shared lock, acquired around file writes.
lock = threading.Lock()

# ======================================================
# 7Ô∏è‚É£ HELPER FUNCTIONS
# ======================================================
def ensure_tz_naive(df):
    """Make the index of *df* a timezone-naive datetime index, in place.

    The index is parsed with ``pd.to_datetime`` (unparseable entries become
    NaT); a timezone-aware index is converted with ``tz_convert(None)``.
    ``None`` and empty frames are returned untouched. Returns the same object
    it was given.
    """
    if df is not None and not df.empty:
        parsed = pd.to_datetime(df.index, errors='coerce')
        df.index = parsed if parsed.tz is None else parsed.tz_convert(None)
    return df

def file_hash(filepath, chunk_size=8192):
    """Return the MD5 hex digest of *filepath*, or None if it does not exist.

    The file is read in blocks of *chunk_size* bytes so large files are never
    loaded into memory at once. The digest is used for change detection.
    """
    if not filepath.exists():
        return None

    digest = hashlib.md5()
    with open(filepath, "rb") as stream:
        while True:
            block = stream.read(chunk_size)
            if not block:
                break
            digest.update(block)

    return digest.hexdigest()

def _redact_api_key(message):
    """Return ``str(message)`` with the Alpha Vantage API key masked out."""
    text = str(message)
    if ALPHA_VANTAGE_KEY:
        text = text.replace(ALPHA_VANTAGE_KEY, "***")
    return text


def fetch_alpha_vantage_fx(pair, outputsize='full', max_retries=3, retry_delay=5):
    """
    Fetch daily FX data from the Alpha Vantage API with retry logic.

    Args:
        pair: Currency pair written as "FROM/TO", e.g. "EUR/USD".
        outputsize: Passed through as the API's ``outputsize`` parameter.
        max_retries: Maximum number of request attempts.
        retry_delay: Seconds to wait after an error; throttling responses
            wait twice as long.

    Returns:
        DataFrame with float 'open', 'high', 'low', 'close' columns on an
        ascending, timezone-naive datetime index, or an empty DataFrame when
        every attempt failed.

    Raises:
        ValueError: If *pair* does not contain exactly one "/" (raised before
            any request is made).
    """
    base_url = 'https://www.alphavantage.co/query'
    from_currency, to_currency = pair.split('/')

    params = {
        'function': 'FX_DAILY',
        'from_symbol': from_currency,
        'to_symbol': to_currency,
        'outputsize': outputsize,
        'datatype': 'json',
        'apikey': ALPHA_VANTAGE_KEY
    }

    for attempt in range(max_retries):
        has_retries_left = attempt < max_retries - 1
        try:
            print(f"  üîΩ Fetching {pair} (attempt {attempt + 1}/{max_retries})...")

            r = requests.get(base_url, params=params, timeout=30)
            r.raise_for_status()
            data = r.json()

            # Check for API errors
            if 'Error Message' in data:
                raise ValueError(f"API Error: {data['Error Message']}")

            # Throttling arrives as a message-only payload. 'Note' is the
            # long-standing key; 'Information' is handled as well.
            # NOTE(review): confirm against a live throttled response that
            # 'Information' is the key currently used.
            if 'Note' in data or 'Information' in data:
                print(f"  ‚ö†Ô∏è API rate limit reached for {pair}")
                if has_retries_left:
                    time.sleep(retry_delay * 2)
                    continue
                return pd.DataFrame()

            if 'Time Series FX (Daily)' not in data:
                raise ValueError(f"Unexpected response format: {list(data.keys())}")

            # Parse time series data: one row per date, oldest first.
            ts = data['Time Series FX (Daily)']
            df = pd.DataFrame(ts).T
            df.index = pd.to_datetime(df.index)
            df = df.sort_index()

            # Strip the "N. " prefixes from the API's column names.
            df = df.rename(columns={
                '1. open': 'open',
                '2. high': 'high',
                '3. low': 'low',
                '4. close': 'close'
            })

            # Values arrive as strings.
            df = df.astype(float)

            # Remove timezone
            df = ensure_tz_naive(df)

            print(f"  ‚úÖ Fetched {len(df)} rows for {pair}")
            return df

        except requests.RequestException as e:
            # The exception text can contain the request URL, which carries
            # the apikey query parameter, so redact it before printing.
            print(f"  ‚ö†Ô∏è Network error: {_redact_api_key(e)}")
            if has_retries_left:
                time.sleep(retry_delay)
            else:
                return pd.DataFrame()

        except Exception as e:
            print(f"  ‚ö†Ô∏è Error: {_redact_api_key(e)}")
            if has_retries_left:
                time.sleep(retry_delay)
            else:
                return pd.DataFrame()

    return pd.DataFrame()

# ======================================================
# 8Ô∏è‚É£ PAIR PROCESSING WITH QUALITY VALIDATION
# ======================================================
def process_pair(pair):
    """
    Fetch, merge, validate and persist the daily Alpha Vantage series for one pair.

    The freshly fetched frame is merged with any CSV already on disk (the
    newest row wins on duplicate timestamps) and scored by the module-level
    validator. Depending on the score it is either written to CSV_FOLDER or,
    when the score is below DataQualityValidator.MIN_QUALITY_SCORE, written
    to QUARANTINE_FOLDER together with a plain-text quality report.

    Args:
        pair: Pair label; every "/" is replaced by "_" to build the file name.

    Returns:
        Tuple of (path string if the CSV content changed else None,
        status message, quality score). The score is 0.0 when nothing
        could be fetched.
    """
    print(f"\nüîÑ Processing {pair}...")

    csv_name = pair.replace("/", "_") + "_daily_av.csv"
    target = CSV_FOLDER / csv_name

    # Previously saved rows, if any; a load failure just means we start fresh
    previous = pd.DataFrame()
    if target.exists():
        try:
            previous = pd.read_csv(target, index_col=0, parse_dates=True)
            previous = ensure_tz_naive(previous)
            print(f"  üìä Loaded {len(previous)} existing rows")
        except Exception as e:
            print(f"  ‚ö†Ô∏è Could not load existing data: {e}")

    # Hash taken before any write so a real content change can be detected
    hash_before = file_hash(target)

    fetched = fetch_alpha_vantage_fx(pair)
    if fetched.empty:
        return None, f"‚ùå {pair}: No data fetched", 0.0

    if previous.empty:
        merged = fetched
    else:
        merged = pd.concat([previous, fetched])
        merged = merged[~merged.index.duplicated(keep='last')]
    merged.sort_index(inplace=True)

    # Score the merged frame before anything is written to the main folder
    is_valid, quality_score, metrics, issues = validator.validate_dataframe(
        merged, pair
    )
    print(f"  üìä Quality score: {quality_score:.1f}/100")

    if not is_valid:
        print(f"  ‚ö†Ô∏è Quality issues: {'; '.join(issues[:2])}")
        print(f"     CV: {metrics.get('price_cv', 0):.4f}%, Unique: {metrics.get('unique_ratio', 0):.1%}")

        if quality_score < DataQualityValidator.MIN_QUALITY_SCORE:
            print(f"  ‚ùå Data quality too low - quarantining")

            bad_csv = QUARANTINE_FOLDER / f"{csv_name}.bad"
            with lock:
                merged.to_csv(bad_csv)

                # Companion report explaining why the data was rejected
                report_path = QUARANTINE_FOLDER / f"{csv_name}.quality.txt"
                with open(report_path, 'w') as report:
                    report.write(f"Quality Report for {pair} (Alpha Vantage)\n")
                    report.write(f"{'='*50}\n")
                    report.write(f"Quality Score: {quality_score:.1f}/100\n")
                    report.write(f"Issues: {'; '.join(issues)}\n")
                    report.write(f"\nMetrics:\n")
                    report.writelines(f"  {k}: {v}\n" for k, v in metrics.items())

            return None, f"‚ùå {pair}: Quality too low ({quality_score:.1f}/100)", quality_score

        print(f"  ‚ö†Ô∏è Low quality but acceptable - saving with warning")

    # Accepted: persist and report whether the bytes on disk actually changed
    with lock:
        merged.to_csv(target)

    changed = hash_before != file_hash(target)

    status = "‚úÖ Updated" if changed else "‚ÑπÔ∏è No changes"
    print(f"  {status} - {len(merged)} rows, quality: {quality_score:.1f}/100")

    changed_path = str(target) if changed else None
    return changed_path, f"{status} {pair} ({len(merged)} rows, Q:{quality_score:.0f})", quality_score

# ======================================================
# 9️⃣ PARALLEL EXECUTION
# ======================================================
# Runs process_pair for every entry of FX_PAIRS on a small thread pool and
# collects the outcome of each task as it completes.
print("\n" + "=" * 70)
print("üöÄ Fetching FX data with quality validation...")
print("=" * 70)

# Paths of CSVs whose content changed during this run
changed_files = []
# One status line per pair (success and failure alike)
results = []
# Changed file path -> quality score; unchanged files are not recorded
quality_scores = {}

with ThreadPoolExecutor(max_workers=4) as executor:
    # Map each future back to its pair so a failure can be attributed
    futures = {executor.submit(process_pair, pair): pair for pair in FX_PAIRS}

    for future in as_completed(futures):
        pair = futures[future]
        try:
            filepath, message, quality = future.result()
            results.append(message)
            # filepath is None when nothing changed or the pair was rejected
            if filepath:
                changed_files.append(filepath)
                quality_scores[filepath] = quality
        except Exception as e:
            # An exception inside the worker still yields a results entry
            print(f"‚ùå {pair} processing failed: {e}")
            results.append(f"‚ùå {pair}: Failed")

# ======================================================
# 🔟 RESULTS SUMMARY WITH QUALITY REPORT
# ======================================================
# Prints one line per pair, then aggregate counts, the quality ranking of
# the files that changed, and whatever currently sits in quarantine.
print("\n" + "=" * 70)
print("üìä PROCESSING SUMMARY")
print("=" * 70)

for result in results:
    print(result)

print(f"\nTotal pairs processed: {len(FX_PAIRS)}")
print(f"Files updated: {len(changed_files)}")

# Quality report (quality_scores only holds files changed in this run, so
# avg_quality exists only when at least one file changed)
if quality_scores:
    print("\n" + "=" * 70)
    print("üìä QUALITY REPORT")
    print("=" * 70)
    avg_quality = sum(quality_scores.values()) / len(quality_scores)
    print(f"Average quality score: {avg_quality:.1f}/100")

    # Best score first
    print(f"\nFiles by quality:")
    for fname, score in sorted(quality_scores.items(), key=lambda x: x[1], reverse=True):
        print(f"  {'‚úÖ' if score >= 60 else '‚ö†Ô∏è'} {Path(fname).name}: {score:.1f}/100")

# Check quarantine (lists every *.bad file present, not only this run's)
quarantined = list(QUARANTINE_FOLDER.glob("*.bad"))
if quarantined:
    print(f"\n" + "=" * 70)
    print(f"‚ö†Ô∏è  QUARANTINED FILES: {len(quarantined)}")
    print("=" * 70)
    for qfile in quarantined:
        print(f"  ‚ùå {qfile.stem}")

# ======================================================
# 1️⃣1️⃣ GIT COMMIT & PUSH
# ======================================================
# Outside GitHub Actions, commit every change in REPO_FOLDER and push it to
# BRANCH with up to three attempts, rebasing on the remote between attempts.
if IN_GHA:
    print("\n" + "=" * 70)
    print("ü§ñ GitHub Actions: Skipping git operations")
    print("=" * 70)

elif changed_files:
    print("\n" + "=" * 70)
    print("üöÄ Committing changes to GitHub...")
    print("=" * 70)

    try:
        os.chdir(REPO_FOLDER)

        subprocess.run(["git", "add", "-A"], check=False)

        commit_msg = f"Update Alpha Vantage data - {len(changed_files)} files"
        if quality_scores:
            commit_msg += f" (Avg Q:{avg_quality:.0f})"

        result = subprocess.run(
            ["git", "commit", "-m", commit_msg],
            capture_output=True,
            text=True
        )

        if result.returncode == 0:
            print("‚úÖ Changes committed")

            for attempt in range(3):
                print(f"üì§ Pushing to GitHub (attempt {attempt + 1}/3)...")
                result = subprocess.run(
                    ["git", "push", "origin", BRANCH],
                    capture_output=True,
                    text=True,
                    timeout=30
                )

                if result.returncode == 0:
                    print("‚úÖ Successfully pushed to GitHub")
                    break
                elif attempt < 2:
                    # Remote moved ahead: rebase onto it before retrying.
                    # Bounded so a stalled pull cannot hang the notebook.
                    subprocess.run(
                        ["git", "pull", "--rebase", "origin", BRANCH],
                        capture_output=True,
                        timeout=60
                    )
                    time.sleep(3)
            else:
                # Loop finished without a successful push: say so instead of
                # ending silently, and surface git's own explanation.
                print(f"‚ùå Push failed after 3 attempts: {result.stderr.strip()}")
        else:
            # A non-zero exit covers "nothing to commit" as well as genuine
            # errors, so show what git reported rather than guessing.
            print("‚ÑπÔ∏è  No changes to commit")
            commit_detail = (result.stdout or result.stderr or "").strip()
            if commit_detail:
                print(f"   git: {commit_detail.splitlines()[-1]}")

    except Exception as e:
        print(f"‚ùå Git error: {e}")
    finally:
        # Always return to the data folder, even after a git failure
        os.chdir(SAVE_FOLDER)

else:
    print("\n‚ÑπÔ∏è No changes to commit")

# ======================================================
# ✅ COMPLETION
# ======================================================
# Every pair appends exactly one line to `results`, failures included, so
# comparing lengths alone always reports success. A run is a full success
# only if no status line is a failure; failure lines start with the cross mark.
all_pairs_ok = (
    len(results) == len(FX_PAIRS)
    and not any(r.startswith("‚ùå") for r in results)
)

print("\n" + "=" * 70)
print("‚úÖ ALPHA VANTAGE WORKFLOW COMPLETED")
print("=" * 70)
print(f"Environment: {ENV_NAME}")
print(f"Files updated: {len(changed_files)}")
print(f"Quality validated: ‚úÖ")
# avg_quality is only defined when at least one file changed
if quality_scores:
    print(f"Average quality: {avg_quality:.1f}/100")
print(f"Status: {'‚úÖ Success' if all_pairs_ok else '‚ö†Ô∏è Partial'}")
print("=" * 70)
print("\nüìÅ Clean File Structure:")
print(f"   Alpha Vantage: {CSV_FOLDER}")
print(f"   ‚îî‚îÄ‚îÄ EUR_USD_daily_av.csv")
print(f"   YFinance: {SAVE_FOLDER / 'data' / 'raw' / 'yfinance'}")
print(f"   ‚îî‚îÄ‚îÄ EUR_USD_1d_5y.csv")
print("\nüéØ Both sources in organized folders!")
print("=" * 70)

In [None]:
#!/usr/bin/env python3
"""
YFINANCE FX DATA FETCHER - CLEAN STRUCTURE EDITION
===================================================
‚úÖ Aligned with clean repo structure (data/raw/yfinance)
‚úÖ Relaxed quality thresholds for more data acceptance
‚úÖ Automatic OHLC logic fixing
‚úÖ Enhanced fallback options
‚úÖ Smart data cleaning before validation
‚úÖ Better symbol format handling
‚úÖ Multi-environment support (Colab, GHA, Local)
"""

import os
import time
import hashlib
import subprocess
import shutil
import threading
import urllib.parse
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
import numpy as np
import yfinance as yf
from datetime import datetime

print("=" * 70)
print("üöÄ YFinance FX Data Fetcher - Clean Structure Edition")
print("=" * 70)

# ======================================================
# 1️⃣ ENVIRONMENT DETECTION (MATCHES YOUR SETUP!)
# ======================================================
# Colab is detected by whether the google.colab package can be imported.
try:
    import google.colab
    IN_COLAB = True
    ENV_NAME = "Google Colab"
except ImportError:
    IN_COLAB = False
    ENV_NAME = "Local"

# The presence of the GITHUB_ACTIONS variable (any value) marks a CI run and
# overrides the display name chosen above.
IN_GHA = "GITHUB_ACTIONS" in os.environ
if IN_GHA:
    ENV_NAME = "GitHub Actions"

print(f"üåç Environment: {ENV_NAME}")

# ======================================================
# 2️⃣ UNIFIED PATH CONFIGURATION (MATCHES CLEAN STRUCTURE!)
# ======================================================
# Colab works inside /content/forex-ai-models; GitHub Actions and local runs
# both use the current working directory as base, save and repo folder.
if IN_COLAB:
    print("‚òÅÔ∏è Google Colab detected - using clean structure")
    BASE_FOLDER = Path("/content")
    SAVE_FOLDER = BASE_FOLDER / "forex-ai-models"  # same folder name as GITHUB_REPO
    REPO_FOLDER = SAVE_FOLDER
elif IN_GHA:
    print("ü§ñ GitHub Actions detected - using repository root")
    BASE_FOLDER = Path.cwd()
    SAVE_FOLDER = BASE_FOLDER
    REPO_FOLDER = BASE_FOLDER
else:
    print("üíª Local environment detected - using clean structure")
    BASE_FOLDER = Path.cwd()
    SAVE_FOLDER = BASE_FOLDER
    REPO_FOLDER = BASE_FOLDER

# ✅ CREATE ORGANIZED DIRECTORY STRUCTURE
# Logical name -> absolute folder, all rooted at SAVE_FOLDER
DIRECTORIES = {
    "data_raw_yfinance": SAVE_FOLDER / "data" / "raw" / "yfinance",
    "data_processed": SAVE_FOLDER / "data" / "processed",
    "database": SAVE_FOLDER / "database",
    "logs": SAVE_FOLDER / "logs",
    "outputs": SAVE_FOLDER / "outputs",
    "quarantine": SAVE_FOLDER / "data" / "quarantine" / "yfinance",
}

# Create all directories (parents included; existing folders are left alone)
for dir_name, dir_path in DIRECTORIES.items():
    dir_path.mkdir(parents=True, exist_ok=True)

# Export key paths
CSV_FOLDER = DIRECTORIES["data_raw_yfinance"]  # YFinance CSVs are written here
QUARANTINE_FOLDER = DIRECTORIES["quarantine"]
LOG_FOLDER = DIRECTORIES["logs"]

print(f"üìÇ Base Folder: {BASE_FOLDER}")
print(f"üíæ Save Folder: {SAVE_FOLDER}")
print(f"üì¶ Repo Folder: {REPO_FOLDER}")
print(f"üìä YFinance CSV: {CSV_FOLDER}")
print(f"üóëÔ∏è Quarantine: {QUARANTINE_FOLDER}")
print("=" * 70)

# ======================================================
# 3️⃣ GIT CONFIGURATION
# ======================================================
# Identity used for commits; both values can be overridden via environment.
GIT_USER_NAME = os.environ.get("GIT_USER_NAME", "Forex AI Bot")
# NOTE(review): the fallback is a personal e-mail address committed in source;
# consider requiring the environment variable instead.
GIT_USER_EMAIL = os.environ.get("GIT_USER_EMAIL", "nakatonabira3@gmail.com")
GITHUB_USERNAME = "rahim-dotAI"
GITHUB_REPO = "forex-ai-models"
BRANCH = "main"

# Personal access token; None when the variable is not set
FOREX_PAT = os.environ.get("FOREX_PAT")

# Try Colab secrets if in Colab and PAT not found
if not FOREX_PAT and IN_COLAB:
    try:
        from google.colab import userdata
        FOREX_PAT = userdata.get("FOREX_PAT")
        if FOREX_PAT:
            # Re-export so code reading the environment sees the token too
            os.environ["FOREX_PAT"] = FOREX_PAT
            print("üîê Loaded FOREX_PAT from Colab secrets")
    except Exception as e:
        print(f"‚ö†Ô∏è Could not access Colab secrets: {e}")

# The token is mandatory in every environment, including GitHub Actions
# where the commit/push section further down is skipped.
if not FOREX_PAT:
    raise ValueError("‚ùå FOREX_PAT is required!")

# Percent-encode the token so it is safe inside a URL
SAFE_PAT = urllib.parse.quote(FOREX_PAT)
# NOTE(review): REPO_URL embeds the token in clear text and is not referenced
# anywhere else in this cell; never print or log it.
REPO_URL = f"https://{GITHUB_USERNAME}:{SAFE_PAT}@github.com/{GITHUB_USERNAME}/{GITHUB_REPO}.git"

# Configure git (global scope; failures are ignored because check=False)
subprocess.run(["git", "config", "--global", "user.name", GIT_USER_NAME],
               capture_output=True, check=False)
subprocess.run(["git", "config", "--global", "user.email", GIT_USER_EMAIL],
               capture_output=True, check=False)

print(f"‚úÖ Git configured: {GIT_USER_NAME} <{GIT_USER_EMAIL}>")

# ======================================================
# 4️⃣ REPOSITORY MANAGEMENT (SIMPLIFIED)
# ======================================================
def ensure_repository():
    """
    Check that REPO_FOLDER is a usable git checkout and refresh it.

    On GitHub Actions the checkout is only verified. Elsewhere an existing
    checkout is updated with `git pull origin BRANCH`; a pull failure is
    reported but never raised. When no checkout is present only a warning
    is printed - this function never clones.
    """
    git_dir = REPO_FOLDER / ".git"

    if IN_GHA:
        print("\nü§ñ GitHub Actions: Repository already available")
        if git_dir.exists():
            print("‚úÖ Git repository verified")
        else:
            print("‚ö†Ô∏è Warning: .git directory not found")
        return

    print("\nüì• Managing repository...")

    if not git_dir.exists():
        # Tell a plain directory apart from a folder that is missing entirely
        if REPO_FOLDER.exists():
            print("‚ö†Ô∏è Directory exists but is not a git repository")
        else:
            print("‚ö†Ô∏è Repository not found. This script expects the repo to be set up first.")
            print("   Please run the GitHub Sync script first!")
        return

    print(f"üîÑ Pulling latest changes...")
    pull_cmd = ["git", "-C", str(REPO_FOLDER), "pull", "origin", BRANCH]
    try:
        outcome = subprocess.run(pull_cmd, capture_output=True, text=True, timeout=30)
        if outcome.returncode != 0:
            print(f"‚ö†Ô∏è Pull had issues, continuing anyway")
        else:
            print("‚úÖ Repository updated successfully")
    except Exception as e:
        # Covers the 30 s timeout as well as git being unavailable
        print(f"‚ö†Ô∏è Update failed: {e} - continuing with existing repo")

ensure_repository()

# ======================================================
# 5️⃣ RATE LIMITER
# ======================================================
class RateLimiter:
    """
    Sliding-window limiter for outbound API calls, shared between threads.

    Two windows are tracked (last 60 s and last 3600 s). Every call to
    wait_if_needed also pauses for base_delay plus a small jitter so that
    requests are spaced out.

    Args:
        requests_per_minute: Maximum calls inside any 60 second window.
        requests_per_hour: Maximum calls inside any 3600 second window.
        base_delay: Fixed pause in seconds applied after every call.
        max_jitter: Upper bound (exclusive) in seconds of the extra pause;
            the defaults reproduce the previous 1.0 s + 0.0-1.9 s spacing.
    """
    def __init__(self, requests_per_minute=10, requests_per_hour=350,
                 base_delay=1.0, max_jitter=2.0):
        self.rpm = requests_per_minute
        self.rph = requests_per_hour
        self.base_delay = base_delay
        self.max_jitter = max_jitter
        self.request_times = []
        self.hourly_request_times = []
        self.lock = threading.Lock()
        self.total_requests = 0

    def _prune(self, now):
        """Drop timestamps that have fallen out of the minute/hour windows."""
        self.request_times = [t for t in self.request_times if now - t < 60]
        self.hourly_request_times = [t for t in self.hourly_request_times if now - t < 3600]

    def wait_if_needed(self):
        """
        Block until another request is allowed, then record it.

        NOTE(review): all sleeping happens while the lock is held, so
        concurrent callers are serialised - this looks intentional.
        """
        with self.lock:
            now = time.time()
            self._prune(now)

            if len(self.request_times) >= self.rpm:
                wait_time = 60 - (now - self.request_times[0])
                if wait_time > 0:
                    time.sleep(wait_time + 1)
                    # Re-read the clock: the timestamp recorded below must
                    # reflect when the request really goes out, and only
                    # entries that actually expired may be forgotten.
                    now = time.time()
                    self._prune(now)

            if len(self.hourly_request_times) >= self.rph:
                wait_time = 3600 - (now - self.hourly_request_times[0])
                if wait_time > 0:
                    time.sleep(wait_time + 1)
                    now = time.time()
                    self._prune(now)

            self.request_times.append(now)
            self.hourly_request_times.append(now)
            self.total_requests += 1

            # Jitter in 20 steps derived from the timestamp
            jitter = self.max_jitter * (hash(str(now)) % 20) / 20
            time.sleep(self.base_delay + jitter)

    def get_stats(self):
        """Return a dict with the number of requests recorded so far."""
        with self.lock:
            return {'total_requests': self.total_requests}

rate_limiter = RateLimiter()

# ======================================================
# 6️⃣ DATA CLEANING & VALIDATION
# ======================================================
def fix_ohlc_logic(df):
    """
    Return a copy of df whose high/low columns bracket the other prices.

    `high` becomes the row-wise maximum of open/high/low/close; `low` is then
    set to the row-wise minimum, computed after `high` has been replaced.
    None and empty frames are returned as-is; a frame lacking any of the
    four columns comes back as an unmodified copy.
    """
    if df is None or df.empty:
        return df

    ohlc = ['open', 'high', 'low', 'close']
    fixed = df.copy()

    missing = [name for name in ohlc if name not in fixed.columns]
    if missing:
        return fixed

    # Order matters: the minimum is taken over the already-corrected high
    fixed['high'] = fixed[ohlc].max(axis=1)
    fixed['low'] = fixed[ohlc].min(axis=1)
    return fixed

class DataQualityValidator:
    """
    Scores an OHLC frame from 0 to 100 and decides whether it is usable.

    The score is built from three parts: share of rows with complete OHLC
    values (up to 30), coefficient of variation of the close (up to 40) and
    share of distinct close values (up to 30).
    """

    # Relaxed thresholds
    MIN_ROWS = 5  # Down from 10
    # Compared against price_cv, which is expressed in percent
    # (std / mean * 100), so 0.01 means 0.01 %
    MIN_PRICE_CV = 0.01  # Down from 0.1
    MIN_UNIQUE_RATIO = 0.005  # Down from 0.05 (0.5% instead of 5%)
    MIN_TRUE_RANGE = 1e-12  # Not referenced by validate_dataframe
    MIN_QUALITY_SCORE = 20.0  # Down from 40.0

    @staticmethod
    def validate_dataframe(df, pair, tf_name):
        """
        Validate a price frame against the relaxed thresholds.

        Args:
            df: Frame expected to carry open/high/low/close columns.
            pair: Pair label (accepted for interface symmetry; not used).
            tf_name: Timeframe label (accepted for interface symmetry; not used).

        Returns:
            Tuple (is_valid, quality_score, metrics, issues). `issues` is a
            list of human-readable reasons and is never empty when
            is_valid is False.
        """
        if df is None or df.empty:
            return False, 0.0, {}, ["Empty DataFrame"]

        issues = []
        metrics = {}

        metrics['row_count'] = len(df)
        if len(df) < DataQualityValidator.MIN_ROWS:
            return False, 0.0, metrics, [f"Too few rows: {len(df)}"]

        required_cols = ['open', 'high', 'low', 'close']
        if not all(col in df.columns for col in required_cols):
            return False, 0.0, metrics, ["Missing OHLC columns"]

        # Only rows with all four prices present take part in the statistics
        ohlc_data = df[required_cols].dropna()
        if len(ohlc_data) == 0:
            return False, 0.0, metrics, ["No valid OHLC data"]

        metrics['valid_rows'] = len(ohlc_data)
        metrics['valid_ratio'] = len(ohlc_data) / len(df)

        close_prices = ohlc_data['close']
        metrics['price_mean'] = float(close_prices.mean())
        price_std = float(close_prices.std())
        if np.isnan(price_std):
            # Sample std is undefined for a single observation; treat it as
            # "no dispersion" so it is reported as low CV instead of
            # slipping through every comparison as NaN.
            price_std = 0.0
        metrics['price_std'] = price_std
        metrics['price_cv'] = (metrics['price_std'] / metrics['price_mean']) * 100 if metrics['price_mean'] > 0 else 0.0

        metrics['unique_prices'] = close_prices.nunique()
        metrics['unique_ratio'] = metrics['unique_prices'] / len(close_prices)

        # Calculate true range
        high = ohlc_data['high'].values
        low = ohlc_data['low'].values
        close = ohlc_data['close'].values

        tr = np.maximum.reduce([
            high - low,
            np.abs(high - np.roll(close, 1)),
            np.abs(low - np.roll(close, 1))
        ])
        # np.roll wraps the last close around to index 0; the first bar has
        # no previous close, so fall back to its plain high-low range.
        tr[0] = high[0] - low[0]

        metrics['true_range_median'] = float(np.median(tr))

        # Quality score calculation (more lenient)
        quality_score = metrics['valid_ratio'] * 30

        if metrics['price_cv'] >= 0.5:
            quality_score += 40
        elif metrics['price_cv'] >= DataQualityValidator.MIN_PRICE_CV:
            quality_score += (metrics['price_cv'] / 0.5) * 40

        if metrics['unique_ratio'] >= 0.1:
            quality_score += 30
        elif metrics['unique_ratio'] >= DataQualityValidator.MIN_UNIQUE_RATIO:
            quality_score += (metrics['unique_ratio'] / 0.1) * 30

        metrics['quality_score'] = quality_score

        # Relaxed validation - accept if meets minimum thresholds
        is_valid = (
            quality_score >= DataQualityValidator.MIN_QUALITY_SCORE and
            metrics['price_cv'] >= DataQualityValidator.MIN_PRICE_CV and
            metrics['unique_ratio'] >= DataQualityValidator.MIN_UNIQUE_RATIO
        )

        if not is_valid:
            if metrics['price_cv'] < DataQualityValidator.MIN_PRICE_CV:
                issues.append(f"Low CV: {metrics['price_cv']:.4f}%")
            if metrics['unique_ratio'] < DataQualityValidator.MIN_UNIQUE_RATIO:
                issues.append(f"Low unique: {metrics['unique_ratio']:.3%}")
            if quality_score < DataQualityValidator.MIN_QUALITY_SCORE:
                # Both ratios can pass while the total still falls short
                # (e.g. many incomplete rows); name that reason as well.
                issues.append(
                    f"Low quality score: {quality_score:.1f} < "
                    f"{DataQualityValidator.MIN_QUALITY_SCORE:.1f}"
                )

        return is_valid, quality_score, metrics, issues

validator = DataQualityValidator()

# ======================================================
# 7️⃣ CONFIGURATION
# ======================================================
# Pairs to download; "/" is stripped or replaced when building symbols and
# file names.
FX_PAIRS = ["EUR/USD", "GBP/USD", "USD/JPY", "AUD/USD"]

# Timeframe label -> ordered list of (interval, period) requests.
# The label becomes part of the CSV file name; the options are tried in the
# order listed by process_pair_tf, so the first entry is the preferred one.
TIMEFRAMES = {
    "1d_5y": [
        ("1d", "5y"),
        ("1d", "max"),  # Try max available
        ("1d", "3y"),
        ("1d", "2y"),
    ],
    "1h_2y": [
        ("1h", "2y"),
        ("1h", "1y"),
        ("1h", "730d"),  # 2 x 365 days
        ("1h", "6mo")
    ],
    "15m_60d": [
        ("15m", "60d"),
        ("15m", "2mo"),
        ("15m", "30d"),
    ],
    "5m_1mo": [
        ("5m", "1mo"),
        ("5m", "30d"),
        ("5m", "14d"),
    ],
    "1m_7d": [
        ("1m", "7d"),
        ("1m", "5d"),
        ("1m", "3d"),
    ]
}

print(f"\nüìä Configuration:")
print(f"   Pairs: {len(FX_PAIRS)}")
print(f"   Timeframes: {len(TIMEFRAMES)}")
# One task per (pair, timeframe) combination
print(f"   Total tasks: {len(FX_PAIRS) * len(TIMEFRAMES)}")
print(f"   Quality threshold: {validator.MIN_QUALITY_SCORE}/100 (RELAXED)")
print("=" * 70)

# Guards CSV writes performed by the worker threads
lock = threading.Lock()

# ======================================================
# 8️⃣ HELPER FUNCTIONS
# ======================================================
def file_hash(filepath):
    """
    Return the hex MD5 digest of a file's bytes, or None if it does not exist.

    The file is read in 8 KiB chunks so large CSVs are never held in memory
    at once. The digest is only used to detect content changes.
    """
    if not filepath.exists():
        return None

    digest = hashlib.md5()
    with open(filepath, "rb") as stream:
        while True:
            block = stream.read(8192)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()

def ensure_tz_naive(df):
    """
    Make the frame's index a timezone-naive DatetimeIndex, in place.

    The index is parsed with pd.to_datetime (unparseable labels become NaT)
    and, if it carries a timezone, converted with tz_convert(None). The same
    frame object is returned; None and empty frames pass through untouched.
    """
    if df is None or df.empty:
        return df

    parsed = pd.to_datetime(df.index, errors='coerce')
    df.index = parsed
    if parsed.tz is not None:
        df.index = parsed.tz_convert(None)
    return df

def merge_data(existing_df, new_df):
    """
    Combine stored and freshly downloaded rows into one sorted frame.

    Both inputs are first made timezone-naive. If either side is empty the
    other is returned unchanged; otherwise the frames are stacked, rows with
    a duplicated index keep the later (new) occurrence, and the result is
    sorted by index.
    """
    existing_df = ensure_tz_naive(existing_df)
    new_df = ensure_tz_naive(new_df)

    if existing_df.empty:
        return new_df
    if new_df.empty:
        return existing_df

    stacked = pd.concat([existing_df, new_df])
    is_latest = ~stacked.index.duplicated(keep="last")
    deduped = stacked[is_latest]
    return deduped.sort_index()

def get_symbol_variants(pair, interval):
    """
    Return the ticker symbols to try for a pair, most specific first.

    The first entry is always the concatenated form, e.g. "EUR/USD" ->
    "EURUSD=X". For the "1d" and "1h" intervals a short form is added, but
    only for USD-based pairs ("USD/JPY" -> "JPY=X").

    The list contains no duplicates, so a failing symbol is not retried
    under the same name.

    Args:
        pair: Pair label, normally "BASE/QUOTE".
        interval: Interval string of the request being built.

    Returns:
        List of symbol strings, never empty.
    """
    base_symbol = pair.replace("/", "") + "=X"
    variants = [base_symbol]

    if interval in ("1d", "1h"):
        # partition tolerates labels without "/" (to_curr is then empty)
        from_curr, _, to_curr = pair.partition("/")
        # NOTE(review): Yahoo's single-currency shorthand "CCY=X" is
        # understood to quote USD/CCY - confirm. Under that convention the
        # shorthand is only the same instrument for USD-based pairs; using
        # the base currency (e.g. "EUR=X" for EUR/USD) would pull the
        # inverse rate and merge it into the stored series.
        if from_curr == "USD" and to_curr:
            variants.append(f"{to_curr}=X")

    return variants

# ======================================================
# 9️⃣ WORKER FUNCTION
# ======================================================
def process_pair_tf(pair, tf_name, interval_period_options, max_retries=3):
    """
    Download one pair/timeframe from YFinance, merge, validate and save it.

    Every (interval, period) option is tried in order; for each option every
    symbol variant is tried, and each symbol gets up to max_retries attempts
    with exponential back-off. The first download that validates is merged
    with the CSV already on disk, OHLC-fixed and written to CSV_FOLDER. On
    the last option, data that still fails validation after the final
    attempt is saved anyway with a warning.

    Args:
        pair: Pair label such as "EUR/USD".
        tf_name: Timeframe label used in the file name.
        interval_period_options: Ordered list of (interval, period) tuples.
        max_retries: Attempts per symbol before moving on.

    Returns:
        Tuple of (message, filepath if changed else None, quality_score).
        When nothing could be saved the score is 0.0.
    """
    # Target CSV inside the YFinance raw-data folder
    filename = f"{pair.replace('/', '_')}_{tf_name}.csv"
    filepath = CSV_FOLDER / filename

    existing_df = pd.DataFrame()
    if filepath.exists():
        try:
            existing_df = pd.read_csv(filepath, index_col=0, parse_dates=True)
            existing_df = ensure_tz_naive(existing_df)
        except Exception as e:
            print(f"  ‚ö†Ô∏è Could not load existing data: {e}")

    # Hash taken before any write so a real content change can be detected
    old_hash = file_hash(filepath)

    for option_idx, (interval, period) in enumerate(interval_period_options):
        symbol_variants = get_symbol_variants(pair, interval)

        for symbol in symbol_variants:
            for attempt in range(max_retries):
                try:
                    rate_limiter.wait_if_needed()

                    ticker = yf.Ticker(symbol)
                    df = ticker.history(
                        period=period,
                        interval=interval,
                        auto_adjust=False,
                        prepost=False,
                        actions=False,
                        raise_errors=False
                    )

                    if df.empty:
                        raise ValueError("Empty data")

                    # Keep only the price/volume columns, lower-cased
                    available_cols = [c for c in ['Open', 'High', 'Low', 'Close', 'Volume']
                                     if c in df.columns]
                    df = df[available_cols]
                    df.rename(columns=lambda x: x.lower(), inplace=True)
                    df = ensure_tz_naive(df)

                    combined_df = merge_data(existing_df, df)

                    # Fix OHLC logic before validation
                    combined_df = fix_ohlc_logic(combined_df)

                    is_valid, quality_score, metrics, issues = validator.validate_dataframe(
                        combined_df, pair, tf_name
                    )

                    if not is_valid:
                        if attempt < max_retries - 1:
                            time.sleep(3 * (2 ** attempt))
                            continue
                        elif option_idx < len(interval_period_options) - 1:
                            # Leaves the attempt loop only: the next symbol
                            # variant is tried, then the next option.
                            break
                        else:
                            # Save anyway but mark as low quality
                            print(f"  ‚ö†Ô∏è Low quality ({quality_score:.1f}) but saving: {pair} {tf_name}")

                    # Save the file
                    with lock:
                        combined_df.to_csv(filepath)

                    new_hash = file_hash(filepath)
                    changed = (old_hash != new_hash)

                    status = "‚úÖ" if quality_score >= 50 else "‚ö†Ô∏è"
                    msg = f"{status} {pair} {tf_name} - {len(combined_df)} rows, Q:{quality_score:.0f}"
                    print(f"  {msg}")
                    return msg, str(filepath) if changed else None, quality_score

                except Exception as e:
                    if attempt < max_retries - 1:
                        time.sleep(3 * (2 ** attempt))
                    else:
                        # Report why this symbol was given up on instead of
                        # discarding the error silently.
                        print(f"  ‚ö†Ô∏è {pair} {tf_name}: {symbol} ({interval}/{period}) failed after {max_retries} attempts: {e}")
                        if option_idx < len(interval_period_options) - 1:
                            # Leaves the attempt loop only (see above)
                            break

    return f"‚ùå Failed {pair} {tf_name}", None, 0.0

# ======================================================
# 🔟 PARALLEL EXECUTION
# ======================================================
# Submits one task per (pair, timeframe) combination to a two-thread pool
# and gathers the outcomes as tasks finish.
print("\n" + "=" * 70)
print("üöÄ Starting YFinance data download...")
print("=" * 70 + "\n")

start_time = time.time()
# Paths of CSVs whose content changed during this run
changed_files = []
# One status line per task
results = []
# Changed file path -> quality score; unchanged files are not recorded
quality_scores = {}

with ThreadPoolExecutor(max_workers=2) as executor:
    tasks = []
    for pair in FX_PAIRS:
        for tf_name, options in TIMEFRAMES.items():
            tasks.append(executor.submit(process_pair_tf, pair, tf_name, options))

    for future in as_completed(tasks):
        try:
            # Second element is the file path, or None when nothing changed
            msg, filename, quality = future.result()
            results.append(msg)
            if filename:
                changed_files.append(filename)
                quality_scores[filename] = quality
        except Exception as e:
            results.append(f"‚ùå Error: {e}")

elapsed_time = time.time() - start_time

# ======================================================
# 1️⃣1️⃣ SUMMARY
# ======================================================
print("\n" + "=" * 70)
print("üìä PROCESSING SUMMARY")
print("=" * 70)

for result in results:
    print(result)

# A task counts as successful when its message carries either the check-mark
# or the warning marker; failure messages carry the cross mark instead.
success_count = len([r for r in results if "‚úÖ" in r or "‚ö†Ô∏è" in r])
print(f"\nTotal tasks: {len(results)}")
print(f"Successful: {success_count}/{len(results)}")
print(f"Files updated: {len(changed_files)}")
print(f"Time: {elapsed_time/60:.1f} min")

# avg_q is only defined when at least one file changed
if quality_scores:
    avg_q = sum(quality_scores.values()) / len(quality_scores)
    print(f"Average quality: {avg_q:.1f}/100")

    print("\n" + "=" * 70)
    print("üìä QUALITY REPORT")
    print("=" * 70)
    # Best score first
    for fname, score in sorted(quality_scores.items(), key=lambda x: x[1], reverse=True):
        status = "‚úÖ" if score >= 50 else "‚ö†Ô∏è"
        print(f"  {status} {Path(fname).name}: {score:.1f}/100")

# Check quarantine (lists every *.bad file present in the folder)
quarantined = list(QUARANTINE_FOLDER.glob("*.bad"))
if quarantined:
    print(f"\n" + "=" * 70)
    print(f"‚ö†Ô∏è  QUARANTINED FILES: {len(quarantined)}")
    print("=" * 70)
    for qfile in quarantined:
        print(f"  ‚ùå {qfile.stem}")

# ======================================================
# 1️⃣2️⃣ GIT COMMIT & PUSH
# ======================================================
# Outside GitHub Actions, commit every change in REPO_FOLDER and push it to
# BRANCH with up to three attempts, rebasing on the remote between attempts.
if IN_GHA:
    print("\n" + "=" * 70)
    print("ü§ñ GitHub Actions: Skipping git operations")
    print("=" * 70)

elif changed_files:
    print("\n" + "=" * 70)
    print("üöÄ Committing changes to GitHub...")
    print("=" * 70)

    try:
        os.chdir(REPO_FOLDER)

        subprocess.run(["git", "add", "-A"], check=False)

        commit_msg = f"Update YFinance data - {len(changed_files)} files"
        if quality_scores:
            commit_msg += f" (Avg Q:{avg_q:.0f})"

        result = subprocess.run(
            ["git", "commit", "-m", commit_msg],
            capture_output=True,
            text=True
        )

        if result.returncode == 0:
            print("‚úÖ Changes committed")

            for attempt in range(3):
                print(f"üì§ Pushing to GitHub (attempt {attempt + 1}/3)...")
                result = subprocess.run(
                    ["git", "push", "origin", BRANCH],
                    capture_output=True,
                    text=True,
                    timeout=30
                )

                if result.returncode == 0:
                    print("‚úÖ Successfully pushed to GitHub")
                    break
                elif attempt < 2:
                    # Remote moved ahead: rebase onto it before retrying.
                    # Bounded so a stalled pull cannot hang the notebook.
                    subprocess.run(
                        ["git", "pull", "--rebase", "origin", BRANCH],
                        capture_output=True,
                        timeout=60
                    )
                    time.sleep(3)
            else:
                # Loop finished without a successful push: say so instead of
                # ending silently, and surface git's own explanation.
                print(f"‚ùå Push failed after 3 attempts: {result.stderr.strip()}")
        else:
            # A non-zero exit covers "nothing to commit" as well as genuine
            # errors, so show what git reported rather than guessing.
            print("‚ÑπÔ∏è  No changes to commit")
            commit_detail = (result.stdout or result.stderr or "").strip()
            if commit_detail:
                print(f"   git: {commit_detail.splitlines()[-1]}")

    except Exception as e:
        print(f"‚ùå Git error: {e}")
    finally:
        # Always return to the data folder, even after a git failure
        os.chdir(SAVE_FOLDER)

else:
    print("\n‚ÑπÔ∏è No changes to commit")

# ======================================================
# ✅ COMPLETION
# ======================================================
# Final banner: environment, counts, average quality and request total.
print("\n" + "=" * 70)
print("‚úÖ YFINANCE WORKFLOW COMPLETED")
print("=" * 70)
print(f"Environment: {ENV_NAME}")
print(f"Files updated: {len(changed_files)}")
print(f"Quality validated: ‚úÖ")
# avg_q is only defined when at least one file changed
if quality_scores:
    print(f"Average quality: {avg_q:.1f}/100")
# "Success" requires every task to have produced a non-failure message
print(f"Status: {'‚úÖ Success' if success_count == len(results) else '‚ö†Ô∏è Partial'}")
print(f"Rate limiter: {rate_limiter.get_stats()['total_requests']} requests")
print("=" * 70)
print("\nüìÅ Clean File Structure:")
print(f"   YFinance: {CSV_FOLDER}")
print(f"   ‚îî‚îÄ‚îÄ EUR_USD_1d_5y.csv, EUR_USD_1h_2y.csv, etc.")
print(f"   Alpha Vantage: {SAVE_FOLDER / 'data' / 'raw' / 'alpha_vantage'}")
print(f"   ‚îî‚îÄ‚îÄ EUR_USD_daily_av.csv")
print("\nüéØ All data sources in organized folders!")
print("=" * 70)

In [None]:
#!/usr/bin/env python3
"""
FX CSV Combiner + Multi-Type Handler - CLEAN STRUCTURE EDITION
==============================================================
‚úÖ Aligned with clean repo structure (data/raw/, data/processed/)
‚úÖ Combines Alpha Vantage + YFinance data
‚úÖ Full-dataset indicator calculation (not incremental)
‚úÖ ATR preservation (no clipping or scaling)
‚úÖ Quality validation before processing
‚úÖ Multi-environment support (Colab, GHA, Local)
"""

import os
import time
import hashlib
import subprocess
import shutil
import urllib.parse
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler
import ta
from ta.momentum import WilliamsRIndicator
from ta.volatility import AverageTrueRange
import warnings

warnings.filterwarnings('ignore')

print("=" * 70)
print("üîß CSV Combiner & Multi-Type Handler - Clean Structure Edition")
print("=" * 70)

# ======================================================
# 1Ô∏è‚É£ ENVIRONMENT DETECTION
# ======================================================
# Runtime detection: Colab is recognised by its importable package, GitHub
# Actions by the GITHUB_ACTIONS environment variable. When both apply, the
# GitHub Actions name wins.
try:
    import google.colab
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True

IN_GHA = "GITHUB_ACTIONS" in os.environ

if IN_GHA:
    ENV_NAME = "GitHub Actions"
else:
    ENV_NAME = "Google Colab" if IN_COLAB else "Local"

print(f"üåç Environment: {ENV_NAME}")

# ======================================================
# 2Ô∏è‚É£ UNIFIED PATH CONFIGURATION (MATCHES CLEAN STRUCTURE!)
# ======================================================
# Choose the working root for the detected runtime. Colab uses a fixed
# checkout under /content; GitHub Actions and local runs work directly in the
# current directory.
if IN_COLAB:
    print("‚òÅÔ∏è Google Colab detected - using clean structure")
    BASE_FOLDER = Path("/content")
    SAVE_FOLDER = BASE_FOLDER / "forex-ai-models"
else:
    if IN_GHA:
        print("ü§ñ GitHub Actions detected - using repository root")
    else:
        print("üíª Local environment detected - using clean structure")
    BASE_FOLDER = Path.cwd()
    SAVE_FOLDER = BASE_FOLDER

# In every runtime the repository root is the save folder.
REPO_FOLDER = SAVE_FOLDER

# Organised directory layout, all rooted at SAVE_FOLDER.
_data_root = SAVE_FOLDER / "data"
DIRECTORIES = {
    "data_raw_yfinance": _data_root / "raw" / "yfinance",
    "data_raw_alpha": _data_root / "raw" / "alpha_vantage",
    "data_processed": _data_root / "processed",
    "database": SAVE_FOLDER / "database",
    "logs": SAVE_FOLDER / "logs",
    "outputs": SAVE_FOLDER / "outputs",
    "quarantine": _data_root / "quarantine" / "combiner",
}

# Make sure every directory exists before anything reads or writes to it.
for dir_name, dir_path in DIRECTORIES.items():
    dir_path.mkdir(parents=True, exist_ok=True)

# Short aliases used by the rest of the cell.
YFINANCE_CSV_FOLDER = DIRECTORIES["data_raw_yfinance"]
ALPHA_CSV_FOLDER = DIRECTORIES["data_raw_alpha"]
PICKLE_FOLDER = DIRECTORIES["data_processed"]
QUARANTINE_FOLDER = DIRECTORIES["quarantine"]
LOG_FOLDER = DIRECTORIES["logs"]

for _label, _location in (
    ("üìÇ Base Folder", BASE_FOLDER),
    ("üíæ Save Folder", SAVE_FOLDER),
    ("üì¶ Repo Folder", REPO_FOLDER),
    ("üìä YFinance CSV", YFINANCE_CSV_FOLDER),
    ("üìä Alpha CSV", ALPHA_CSV_FOLDER),
    ("üîß Processed", PICKLE_FOLDER),
    ("üóëÔ∏è Quarantine", QUARANTINE_FOLDER),
):
    print(f"{_label}: {_location}")
print("=" * 70)

# Shared lock taken by worker threads around their file writes.
lock = threading.Lock()

def print_status(msg, level="info"):
    """Print ``msg`` prefixed with the icon for ``level``.

    Unknown levels use the same icon as ``"info"``.
    """
    icon_by_level = {
        "info": "‚ÑπÔ∏è",
        "success": "‚úÖ",
        "warn": "‚ö†Ô∏è",
        "error": "‚ùå",
        "debug": "üêû",
    }
    try:
        icon = icon_by_level[level]
    except KeyError:
        icon = icon_by_level["info"]
    print(f"{icon} {msg}")

# ======================================================
# 3Ô∏è‚É£ DATA QUALITY VALIDATOR
# ======================================================
class DataQualityValidator:
    """Score an OHLC DataFrame and decide whether it is usable.

    The score (0-100) is the sum of three components:
      * up to 30 points for the share of rows with complete OHLC values,
      * up to 40 points for the close-price coefficient of variation (CV),
      * up to 30 points for the share of distinct close prices.

    A frame is valid when the score reaches ``MIN_QUALITY_SCORE`` and the CV
    reaches ``MIN_PRICE_CV``.
    """

    MIN_ROWS = 10
    MIN_PRICE_CV = 0.01  # percent
    MIN_UNIQUE_RATIO = 0.005  # fraction of distinct close prices
    MIN_TRUE_RANGE = 1e-10  # not referenced by validate_dataframe
    MIN_QUALITY_SCORE = 20.0

    @staticmethod
    def validate_dataframe(df, filename):
        """Validate ``df`` and return ``(is_valid, score, metrics, issues)``.

        Args:
            df: DataFrame expected to contain numeric 'open', 'high', 'low'
                and 'close' columns.
            filename: Name of the source file; accepted for the caller's
                convenience and not used in the computation.

        Returns:
            A 4-tuple of
              * bool  - whether the frame passed validation,
              * float - quality score,
              * dict  - metrics gathered up to the point of return,
              * list  - human-readable issue descriptions. Whenever the
                frame is rejected this list is non-empty.
        """
        if df is None or df.empty:
            return False, 0.0, {}, ["Empty DataFrame"]

        cls = DataQualityValidator
        issues = []
        metrics = {'row_count': len(df)}

        # Informational only: a short frame is reported but not rejected here.
        if len(df) < cls.MIN_ROWS:
            issues.append(f"Too few rows: {len(df)}")

        required_cols = ['open', 'high', 'low', 'close']
        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            issues.append(f"Missing columns: {missing_cols}")
            return False, 0.0, metrics, issues

        ohlc_data = df[required_cols].dropna()
        if len(ohlc_data) == 0:
            issues.append("No valid OHLC data")
            return False, 0.0, metrics, issues

        metrics['valid_rows'] = len(ohlc_data)
        metrics['valid_ratio'] = len(ohlc_data) / len(df)

        close_prices = ohlc_data['close']
        metrics['price_mean'] = float(close_prices.mean())
        metrics['price_std'] = float(close_prices.std())
        # Coefficient of variation in percent; defined as 0 for a
        # non-positive mean.
        if metrics['price_mean'] > 0:
            metrics['price_cv'] = metrics['price_std'] / metrics['price_mean'] * 100
        else:
            metrics['price_cv'] = 0.0

        metrics['unique_prices'] = close_prices.nunique()
        metrics['unique_ratio'] = metrics['unique_prices'] / len(close_prices)

        high = ohlc_data['high'].values
        low = ohlc_data['low'].values
        close = ohlc_data['close'].values

        # True range against the previous close. np.roll wraps the last close
        # around to position 0, so the first element is overwritten with the
        # plain high-low range.
        prev_close = np.roll(close, 1)
        tr = np.maximum.reduce([
            high - low,
            np.abs(high - prev_close),
            np.abs(low - prev_close),
        ])
        tr[0] = high[0] - low[0]
        metrics['true_range_median'] = float(np.median(tr))

        quality_score = metrics['valid_ratio'] * 30

        # CV component: full marks from 0.5%, proportional down to the minimum.
        if metrics['price_cv'] >= 0.5:
            quality_score += 40
        elif metrics['price_cv'] >= cls.MIN_PRICE_CV:
            quality_score += (metrics['price_cv'] / 0.5) * 40

        # Uniqueness component: full marks from 10% distinct prices.
        if metrics['unique_ratio'] >= 0.1:
            quality_score += 30
        elif metrics['unique_ratio'] >= cls.MIN_UNIQUE_RATIO:
            quality_score += (metrics['unique_ratio'] / 0.1) * 30

        metrics['quality_score'] = quality_score

        is_valid = (
            quality_score >= cls.MIN_QUALITY_SCORE and
            metrics['price_cv'] >= cls.MIN_PRICE_CV
        )

        if not is_valid:
            # `not (cv >= min)` instead of `cv < min` so that a NaN CV (for
            # example a single valid row, whose std is NaN) is reported too.
            if not metrics['price_cv'] >= cls.MIN_PRICE_CV:
                issues.append(f"Low CV: {metrics['price_cv']:.4f}%")
            if metrics['unique_ratio'] < cls.MIN_UNIQUE_RATIO:
                issues.append(f"Low unique: {metrics['unique_ratio']:.3%}")
            # A frame can fail on the aggregate score alone (e.g. mostly
            # incomplete rows); make sure that reason is recorded.
            if quality_score < cls.MIN_QUALITY_SCORE:
                issues.append(
                    f"Low quality score: {quality_score:.1f} "
                    f"< {cls.MIN_QUALITY_SCORE:.1f}"
                )

        return is_valid, quality_score, metrics, issues

validator = DataQualityValidator()

# ======================================================
# 4Ô∏è‚É£ GIT CONFIGURATION
# ======================================================
# Git identity and target repository. Name and e-mail can be overridden
# through the environment.
GIT_USER_NAME = os.environ.get("GIT_USER_NAME", "Forex AI Bot")
GIT_USER_EMAIL = os.environ.get("GIT_USER_EMAIL", "nakatonabira3@gmail.com")
GITHUB_USERNAME = "rahim-dotAI"
GITHUB_REPO = "forex-ai-models"
BRANCH = "main"

# The token comes from the environment first; on Colab, fall back to the
# notebook's secret store and mirror the value into the environment.
FOREX_PAT = os.environ.get("FOREX_PAT")

if IN_COLAB and not FOREX_PAT:
    try:
        from google.colab import userdata
        FOREX_PAT = userdata.get("FOREX_PAT")
        if FOREX_PAT:
            os.environ["FOREX_PAT"] = FOREX_PAT
            print("üîê Loaded FOREX_PAT from Colab secrets")
    except Exception as exc:
        print(f"‚ö†Ô∏è Could not access Colab secrets: {exc}")

# The global git identity is only set when a token is available.
if FOREX_PAT:
    for _git_key, _git_value in (
        ("user.name", GIT_USER_NAME),
        ("user.email", GIT_USER_EMAIL),
    ):
        subprocess.run(
            ["git", "config", "--global", _git_key, _git_value],
            capture_output=True,
            check=False,
        )
    print(f"‚úÖ Git configured: {GIT_USER_NAME} <{GIT_USER_EMAIL}>")

# ======================================================
# 5Ô∏è‚É£ HELPER FUNCTIONS
# ======================================================
def ensure_tz_naive(df):
    """Return ``df`` with a timezone-naive ``DatetimeIndex``.

    The index is parsed with ``pd.to_datetime``; labels that cannot be parsed
    become ``NaT``. A timezone-aware index has its timezone dropped while
    keeping the wall-clock time. Labels that mix UTC offsets (for example
    timestamps on both sides of a daylight-saving change) cannot be parsed
    into a single DatetimeIndex directly, so they are converted to UTC first
    and then made naive.

    Args:
        df: DataFrame whose index holds timestamps. It is modified in place.

    Returns:
        The same DataFrame object, or a new empty DataFrame when ``df`` is
        ``None`` or empty.
    """
    if df is None or df.empty:
        return pd.DataFrame()

    try:
        index = pd.to_datetime(df.index, errors='coerce')
    except ValueError:
        # Some pandas versions reject mixed offsets outright unless utc=True.
        index = None

    if not isinstance(index, pd.DatetimeIndex):
        # Other pandas versions return an object Index for mixed offsets,
        # which has no ``.tz``; normalising to UTC always gives a
        # DatetimeIndex.
        index = pd.to_datetime(df.index, errors='coerce', utc=True)

    if index.tz is not None:
        index = index.tz_localize(None)

    df.index = index
    return df

def safe_numeric(df):
    """Return a cleaned copy of ``df`` with infinities and unusable rows removed.

    Positive and negative infinity become NaN. Rows are then dropped when any
    of the OHLC columns present in the frame is NaN; if none of those columns
    exist, only rows that are NaN in every column are dropped. The input
    frame is left untouched.
    """
    cleaned = df.replace([np.inf, -np.inf], np.nan)

    price_cols = [
        name for name in ('open', 'high', 'low', 'close')
        if name in cleaned.columns
    ]

    if not price_cols:
        return cleaned.dropna(how='all')
    return cleaned.dropna(subset=price_cols)

# ======================================================
# 6Ô∏è‚É£ CSV DISCOVERY
# ======================================================
def discover_csv_files():
    """Collect the ``*.csv`` files from the YFinance and Alpha Vantage folders.

    Returns:
        A list of paths: YFinance files first, then Alpha Vantage files.
        A debug line is printed for each folder that contains at least one
        CSV.
    """
    sources = (
        (YFINANCE_CSV_FOLDER, "YFinance"),
        (ALPHA_CSV_FOLDER, "Alpha Vantage"),
    )

    found = []
    for folder, label in sources:
        matches = list(folder.glob("*.csv"))
        if not matches:
            continue
        print_status(f"üìÇ Found {len(matches)} {label} CSV(s)", "debug")
        found += matches

    return found

# ======================================================
# 7Ô∏è‚É£ INDICATOR CALCULATION (FULL DATASET)
# ======================================================
def add_indicators_full(df):
    """
    Compute technical indicators and derived features over the whole frame.

    The frame is cleaned with safe_numeric(), sorted by index, and the OHLC
    columns are duplicated into raw_* columns. Indicators are then added in
    four groups (trend, momentum, volatility, derived features). Each group
    runs in its own try/except: a failure is printed as a warning and
    processing continues with whatever columns were created before the error.
    Individual indicators are only computed when the frame has at least as
    many rows as the guard in front of them requires.

    Finally every numeric column except OHLC, volume, the raw_* copies and ATR
    is overwritten with its RobustScaler-transformed values (NaN replaced by 0
    before fitting; columns that are entirely NaN are left alone).

    Args:
        df: DataFrame with 'open', 'high', 'low' and 'close' columns.

    Returns:
        A new DataFrame with the added columns, or None when the input is
        empty, lacks an OHLC column, or is empty after cleaning.
    """
    if df.empty:
        return None

    required_cols = ['open', 'high', 'low', 'close']
    if not all(col in df.columns for col in required_cols):
        return None

    # Drops rows with NaN/inf in any OHLC column; may leave nothing.
    df = safe_numeric(df)
    if df.empty:
        return None

    df = df.copy()
    df.sort_index(inplace=True)

    # Keep an untouched copy of each price column (skipped when a raw_*
    # column already exists in the input).
    for col in ['open', 'high', 'low', 'close']:
        if col in df.columns and f'raw_{col}' not in df.columns:
            df[f'raw_{col}'] = df[col].copy()

    print_status(f"  üîß Calculating indicators on {len(df)} rows", "debug")

    try:
        # Trend indicators: each moving average needs at least as many rows
        # as its window.
        if len(df) >= 10:
            df['SMA_10'] = ta.trend.sma_indicator(df['close'], 10)
            df['EMA_10'] = ta.trend.ema_indicator(df['close'], 10)

        if len(df) >= 20:
            df['SMA_20'] = ta.trend.sma_indicator(df['close'], 20)
            df['EMA_20'] = ta.trend.ema_indicator(df['close'], 20)

        if len(df) >= 50:
            df['SMA_50'] = ta.trend.sma_indicator(df['close'], 50)
            df['EMA_50'] = ta.trend.ema_indicator(df['close'], 50)

        if len(df) >= 200:
            df['SMA_200'] = ta.trend.sma_indicator(df['close'], 200)

        # MACD with the library's default windows.
        if len(df) >= 26:
            macd = ta.trend.MACD(df['close'])
            df['MACD'] = macd.macd()
            df['MACD_signal'] = macd.macd_signal()
            df['MACD_diff'] = macd.macd_diff()

    except Exception as e:
        print_status(f"  ‚ö†Ô∏è Trend indicator error: {e}", "warn")

    try:
        # Momentum indicators
        if len(df) >= 14:
            df['RSI_14'] = ta.momentum.rsi(df['close'], 14)
            df['Williams_%R'] = WilliamsRIndicator(
                df['high'], df['low'], df['close'], 14
            ).williams_r()
            df['Stoch_K'] = ta.momentum.stoch(df['high'], df['low'], df['close'], 14)
            df['Stoch_D'] = ta.momentum.stoch_signal(df['high'], df['low'], df['close'], 14)

        if len(df) >= 20:
            df['CCI_20'] = ta.trend.cci(df['high'], df['low'], df['close'], 20)
            # ROC uses a 12-period window although it sits behind the 20-row guard.
            df['ROC'] = ta.momentum.roc(df['close'], 12)

    except Exception as e:
        print_status(f"  ‚ö†Ô∏è Momentum indicator error: {e}", "warn")

    try:
        # ATR is stored as computed: no clipping, and it is excluded from
        # scaling further down.
        if len(df) >= 14:
            atr_values = AverageTrueRange(
                df['high'], df['low'], df['close'], 14
            ).average_true_range()

            # Missing values are replaced by a tiny positive constant; all
            # other values are kept unchanged.
            df['ATR'] = atr_values.fillna(1e-10)

            atr_median = df['ATR'].median()
            if pd.notna(atr_median):
                print_status(f"  üìä ATR median: {atr_median:.8f}", "debug")

        # Bollinger Bands (20-period window, 2 standard deviations)
        if len(df) >= 20:
            bb = ta.volatility.BollingerBands(df['close'], 20, 2)
            df['BB_upper'] = bb.bollinger_hband()
            df['BB_middle'] = bb.bollinger_mavg()
            df['BB_lower'] = bb.bollinger_lband()
            df['BB_width'] = bb.bollinger_wband()

    except Exception as e:
        print_status(f"  ‚ö†Ô∏è Volatility indicator error: {e}", "warn")

    try:
        # Derived features
        df['price_change'] = df['close'].pct_change()
        df['price_change_5'] = df['close'].pct_change(5)
        df['high_low_range'] = (df['high'] - df['low']) / df['close']
        df['close_open_range'] = (df['close'] - df['open']) / df['open']

        if 'volume' in df.columns:
            # Cumulative volume-weighted close. A zero cumulative volume
            # yields NaN/inf here; infinities are turned into NaN before
            # scaling below.
            df['vwap'] = (df['close'] * df['volume']).cumsum() / df['volume'].cumsum()

        if 'SMA_50' in df.columns:
            df['price_vs_sma50'] = (df['close'] - df['SMA_50']) / df['SMA_50']

        if 'RSI_14' in df.columns:
            df['rsi_momentum'] = df['RSI_14'].diff()

    except Exception as e:
        print_status(f"  ‚ö†Ô∏è Derived features error: {e}", "warn")

    try:
        # Scale the engineered features while leaving prices, volume, the
        # raw_* copies and ATR in their original units.
        numeric_cols = df.select_dtypes(include=[np.number]).columns

        protected_cols = [
            'open', 'high', 'low', 'close', 'volume',
            'raw_open', 'raw_high', 'raw_low', 'raw_close',
            'ATR'  # ATR must stay unscaled
        ]

        scalable_cols = [c for c in numeric_cols if c not in protected_cols]

        if scalable_cols:
            df[scalable_cols] = df[scalable_cols].replace([np.inf, -np.inf], np.nan)
            # Columns with no data at all are skipped rather than scaled.
            cols_with_data = [c for c in scalable_cols if not df[c].isna().all()]

            if cols_with_data:
                # The scaler is fitted on every row of this frame; NaN is
                # replaced by 0 first and the scaled values overwrite the
                # original columns.
                scaler = RobustScaler()
                df[cols_with_data] = scaler.fit_transform(
                    df[cols_with_data].fillna(0) + 1e-10
                )
                print_status(f"  ‚úÖ Scaled {len(cols_with_data)} features (ATR protected)", "debug")

    except Exception as e:
        print_status(f"  ‚ö†Ô∏è Scaling error: {e}", "warn")

    return df

# ======================================================
# 8Ô∏è‚É£ MAIN PROCESSING FUNCTION
# ======================================================
def process_csv_file(csv_file):
    """Validate one CSV file, add indicators and save the result as a pickle.

    Steps: load the CSV with its first column as a parsed date index, make the
    index timezone-naive, run the quality validator, quarantine the file when
    its score is below the validator's minimum, otherwise compute indicators
    and write the processed frame to PICKLE_FOLDER.

    Args:
        csv_file: Path of the CSV file to process.

    Returns:
        A ``(pickle_path, message)`` tuple. ``pickle_path`` is the saved
        file's path as a string, or None when the file was empty,
        quarantined, failed indicator calculation, or raised an exception.
        ``message`` is the status line that was printed.
    """
    try:
        print_status(f"üìã Processing: {csv_file.name}", "info")

        # Load CSV
        df = pd.read_csv(csv_file, index_col=0, parse_dates=True)
        df = ensure_tz_naive(df)

        if df.empty:
            msg = f"‚ö†Ô∏è {csv_file.name}: Empty file"
            print_status(msg, "warn")
            return None, msg

        # Quality validation
        is_valid, quality_score, metrics, issues = validator.validate_dataframe(df, csv_file.name)

        print_status(f"  üìä Quality score: {quality_score:.1f}/100", "debug")

        if not is_valid:
            # Only the first two issues are shown on the console; the full
            # list goes into the quarantine report.
            print_status(f"  ‚ö†Ô∏è Quality issues: {'; '.join(issues[:2])}", "warn")

            # Quarantine only when the score itself is below the minimum; a
            # frame that is invalid for another reason is still processed.
            if quality_score < validator.MIN_QUALITY_SCORE:
                print_status(f"  ‚ùå Quarantining low quality file", "error")

                # The loaded frame is written out as "<name>.bad" next to a
                # plain-text report.
                quarantine_file = QUARANTINE_FOLDER / f"{csv_file.name}.bad"
                with lock:
                    df.to_csv(quarantine_file)

                    report_file = QUARANTINE_FOLDER / f"{csv_file.name}.quality.txt"
                    with open(report_file, 'w') as f:
                        f.write(f"Quality Report for {csv_file.name}\n")
                        f.write(f"{'='*50}\n")
                        f.write(f"Quality Score: {quality_score:.1f}/100\n")
                        f.write(f"Issues: {'; '.join(issues)}\n")
                        f.write(f"\nMetrics:\n")
                        for k, v in metrics.items():
                            f.write(f"  {k}: {v}\n")

                return None, f"‚ùå {csv_file.name}: Quarantined (Q:{quality_score:.1f})"
            else:
                print_status(f"  ‚ö†Ô∏è Low quality but acceptable", "warn")

        # Indicators are computed over the full dataset.
        processed_df = add_indicators_full(df)

        if processed_df is None:
            msg = f"‚ùå {csv_file.name}: Indicator calculation failed"
            print_status(msg, "error")
            return None, msg

        # Save the processed frame under the CSV's stem.
        pickle_filename = csv_file.stem + ".pkl"
        pickle_path = PICKLE_FOLDER / pickle_filename

        # NOTE(review): the file is gzip-compressed but keeps a plain ".pkl"
        # suffix, so a reader that infers compression from the extension
        # would presumably need compression='gzip' passed explicitly --
        # confirm against the downstream loaders.
        with lock:
            processed_df.to_pickle(pickle_path, compression='gzip', protocol=4)

        # The success message ends with "ATR:<value>"; the main loop parses
        # the value back out of this text, so keep the format stable.
        atr_median = processed_df['ATR'].median() if 'ATR' in processed_df.columns else 0
        msg = f"‚úÖ {csv_file.name}: {len(processed_df)} rows, Q:{quality_score:.0f}, ATR:{atr_median:.8f}"
        print_status(msg, "success")

        return str(pickle_path), msg

    except Exception as e:
        # Any failure is reported with a traceback and turned into a
        # (None, message) result so one bad file does not stop the others.
        msg = f"‚ùå Failed {csv_file.name}: {e}"
        print_status(msg, "error")
        import traceback
        traceback.print_exc()
        return None, msg

# ======================================================
# 9Ô∏è‚É£ MAIN EXECUTION
# ======================================================
print("\n" + "=" * 70)
print("üöÄ Discovering CSV files...")
print("=" * 70 + "\n")

csv_files = discover_csv_files()

if csv_files:
    print_status(f"üìä Total CSV files found: {len(csv_files)}", "success")
    # Show at most the first five files to keep the log short.
    for csv_file in csv_files[:5]:
        print_status(f"  ‚Ä¢ {csv_file.name} ({csv_file.stat().st_size / 1024:.1f} KB)", "debug")
    if len(csv_files) > 5:
        print_status(f"  ... and {len(csv_files) - 5} more", "debug")
else:
    print_status("‚ö†Ô∏è No CSV files found!", "warn")
    print_status("   Check that data fetchers have run successfully", "warn")

# Paths of the pickles written in this run.
changed_files = []
# Despite its name this maps pickle path -> median ATR, parsed from the
# status message of each successfully processed file.
quality_scores = {}

# ======================================================
# PROCESS FILES
# ======================================================
if csv_files:
    print("\n" + "=" * 70)
    print(f"‚öôÔ∏è Processing {len(csv_files)} CSV file(s)...")
    print("=" * 70 + "\n")

    with ThreadPoolExecutor(max_workers=min(8, len(csv_files))) as executor:
        futures = [executor.submit(process_csv_file, f) for f in csv_files]

        for future in as_completed(futures):
            file, msg = future.result()
            if not file:
                continue

            changed_files.append(file)

            # The success message ends with "ATR:<value>".
            if "ATR:" in msg:
                try:
                    atr_str = msg.split("ATR:")[1].strip()
                    quality_scores[file] = float(atr_str)
                except (IndexError, ValueError):
                    # Only parsing problems are tolerated here; anything else
                    # (including KeyboardInterrupt) propagates.
                    print_status(f"Could not parse ATR from status message: {msg}", "warn")

# ======================================================
# 1Ô∏è‚É£1Ô∏è‚É£ QUALITY REPORT
# ======================================================
# ATR medians at or below this value (or NaN) are treated as suspicious.
_LOW_ATR_THRESHOLD = 1e-6

if quality_scores:
    print("\n" + "=" * 70)
    print("üìä QUALITY REPORT - ATR VALUES")
    print("=" * 70)

    avg_atr = sum(quality_scores.values()) / len(quality_scores)
    print(f"Average ATR: {avg_atr:.8f}")
    print(f"\nATR by file:")

    low_atr_files = []
    for filepath, atr in sorted(quality_scores.items(), key=lambda x: x[1], reverse=True):
        filename = Path(filepath).stem
        # One test drives both the per-file icon and the summary count.
        # `not atr > threshold` also catches NaN and values that the
        # 8-decimal round-trip collapsed to exactly the threshold.
        is_low = not atr > _LOW_ATR_THRESHOLD
        if is_low:
            low_atr_files.append(filepath)
        status = "‚ö†Ô∏è" if is_low else "‚úÖ"
        print(f"  {status} {filename}: {atr:.8f}")

    if low_atr_files:
        print(f"\n‚ö†Ô∏è  {len(low_atr_files)} file(s) with suspiciously low ATR")

# Everything currently in the quarantine folder is listed, not only files
# quarantined during this run.
quarantined = list(QUARANTINE_FOLDER.glob("*.bad"))
if quarantined:
    print(f"\n" + "=" * 70)
    print(f"‚ö†Ô∏è  QUARANTINED FILES: {len(quarantined)}")
    print("=" * 70)
    for qfile in quarantined:
        print(f"  ‚ùå {qfile.stem}")

# ======================================================
# 1Ô∏è‚É£2Ô∏è‚É£ GIT COMMIT & PUSH
# ======================================================
# Commit and push the processed files. On GitHub Actions this step is skipped
# entirely; elsewhere it needs both changed files and a token.
if IN_GHA:
    print("\n" + "=" * 70)
    print("ü§ñ GitHub Actions: Skipping git operations")
    print("=" * 70)

elif changed_files and FOREX_PAT:
    print("\n" + "=" * 70)
    print("üöÄ Committing changes to GitHub...")
    print("=" * 70)

    try:
        os.chdir(REPO_FOLDER)

        subprocess.run(["git", "add", "-A"], check=False)

        commit_msg = f"Update processed data - {len(changed_files)} files"
        if quality_scores:
            commit_msg += f" (Avg ATR: {avg_atr:.6f})"

        result = subprocess.run(
            ["git", "commit", "-m", commit_msg],
            capture_output=True,
            text=True
        )

        if result.returncode == 0:
            print_status("‚úÖ Changes committed", "success")

            push_error = ""
            for attempt in range(3):
                print_status(f"üì§ Pushing (attempt {attempt + 1}/3)...", "info")
                try:
                    result = subprocess.run(
                        ["git", "push", "origin", BRANCH],
                        capture_output=True,
                        text=True,
                        timeout=30
                    )
                except subprocess.TimeoutExpired:
                    # A slow push counts as a failed attempt, not a reason to
                    # abandon the remaining retries.
                    push_error = "timed out after 30s"
                else:
                    if result.returncode == 0:
                        print_status("‚úÖ Push successful", "success")
                        break
                    push_error = (result.stderr or "").strip()

                if attempt < 2:
                    # Pick up remote changes before the next attempt.
                    subprocess.run(
                        ["git", "pull", "--rebase", "origin", BRANCH],
                        capture_output=True
                    )
                    time.sleep(3)
            else:
                # All attempts failed. Mask the token in case git echoed a
                # remote URL that embeds it.
                safe_error = push_error.replace(FOREX_PAT, "***")
                print_status(f"Push failed after 3 attempts: {safe_error}", "error")

        elif "nothing to commit" in result.stdout.lower():
            print_status("‚ÑπÔ∏è No changes to commit", "info")
        else:
            commit_error = (result.stderr or result.stdout or "").strip()
            print_status(f"Commit failed: {commit_error}", "error")

    except Exception as e:
        print_status(f"‚ùå Git error: {e}", "error")
    finally:
        os.chdir(SAVE_FOLDER)

# ======================================================
# ‚úÖ COMPLETION SUMMARY
# ======================================================
# Final report for the combiner cell, assembled as a list of lines and then
# printed one by one.
_rule = "=" * 70

_summary_lines = [
    "\n" + _rule,
    "‚úÖ CSV COMBINER COMPLETED",
    _rule,
    f"Environment: {ENV_NAME}",
    f"CSV files found: {len(csv_files)}",
    f"Files processed: {len(changed_files)}",
    f"Files quarantined: {len(quarantined)}",
]

# ATR statistics exist only when at least one file reported an ATR value.
if quality_scores:
    _summary_lines += [
        "\nüìà ATR Statistics:",
        f"   Average: {avg_atr:.8f}",
        f"   Files analyzed: {len(quality_scores)}",
    ]

_summary_lines += [
    "\nüîß KEY FEATURES:",
    "   ‚úÖ Full-dataset indicator calculation",
    "   ‚úÖ ATR preserved (no clipping/scaling)",
    "   ‚úÖ Quality validation with quarantine",
    "   ‚úÖ Clean organized structure",
    "   ‚úÖ Thread-safe processing",
    "\nüìÅ Output Locations:",
    f"   Processed pickles: {PICKLE_FOLDER}",
    f"   Quarantine: {QUARANTINE_FOLDER}",
    _rule,
]

for _line in _summary_lines:
    print(_line)

In [None]:
# TAG: pipeline_main

#!/usr/bin/env python3
"""
ULTRA-PERSISTENT SELF-LEARNING HYBRID FX PIPELINE v4.0
======================================================
✅ Aligned with clean repo structure
✅ Uses processed pickle files from combiner
✅ Database-driven ML with memory
✅ Multi-timeframe analysis
✅ Automated signal generation
✅ Performance tracking
"""

import os
import time
import json
import sqlite3
import threading
import subprocess
import pickle
import filecmp
from pathlib import Path
from datetime import datetime, timezone, timedelta
from contextlib import contextmanager
from collections import defaultdict
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings('ignore')

print("=" * 70)
print("üöÄ Ultra-Persistent FX Pipeline v4.0 - Clean Structure Edition")
print("=" * 70)

# ======================================================
# 1Ô∏è‚É£ ENVIRONMENT DETECTION
# ======================================================
# Runtime detection: Colab is recognised by its importable package, GitHub
# Actions by the GITHUB_ACTIONS environment variable. When both apply, the
# GitHub Actions name wins.
try:
    import google.colab
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True

IN_GHA = "GITHUB_ACTIONS" in os.environ

if IN_GHA:
    ENV_NAME = "GitHub Actions"
else:
    ENV_NAME = "Google Colab" if IN_COLAB else "Local"

print(f"üåç Environment: {ENV_NAME}")

# ======================================================
# 2Ô∏è‚É£ UNIFIED PATH CONFIGURATION
# ======================================================
# Colab uses a fixed checkout under /content; GitHub Actions and local runs
# both work from the current directory.
if IN_COLAB:
    BASE_FOLDER = Path("/content")
    SAVE_FOLDER = BASE_FOLDER / "forex-ai-models"
else:
    BASE_FOLDER = Path.cwd()
    SAVE_FOLDER = BASE_FOLDER

# In every runtime the repository root is the save folder.
REPO_FOLDER = SAVE_FOLDER

# Directories used by the pipeline, all rooted at SAVE_FOLDER.
DIRECTORIES = {
    "data_processed": SAVE_FOLDER / "data" / "processed",
    "database": SAVE_FOLDER / "database",
    "logs": SAVE_FOLDER / "logs",
    "outputs": SAVE_FOLDER / "outputs",
}

for dir_path in DIRECTORIES.values():
    dir_path.mkdir(parents=True, exist_ok=True)

PICKLE_FOLDER = DIRECTORIES["data_processed"]
DB_FOLDER = DIRECTORIES["database"]
LOGS_FOLDER = DIRECTORIES["logs"]
OUTPUTS_FOLDER = DIRECTORIES["outputs"]

# SQLite file that carries the pipeline's memory between runs.
PERSISTENT_DB = DB_FOLDER / "memory_v85.db"

for _label, _location in (
    ("üìÇ Base Folder", BASE_FOLDER),
    ("üíæ Save Folder", SAVE_FOLDER),
    ("üì¶ Repo Folder", REPO_FOLDER),
    ("üîß Processed", PICKLE_FOLDER),
    ("üíø Database", PERSISTENT_DB),
):
    print(f"{_label}: {_location}")
print("=" * 70)

def print_status(msg, level="info"):
    """Print ``msg`` prefixed with the icon registered for ``level``.

    Levels without an entry in the table use the ``"info"`` icon.
    """
    icon_table = {
        "info": "‚ÑπÔ∏è",
        "success": "‚úÖ",
        "warn": "‚ö†Ô∏è",
        "debug": "üêû",
        "error": "‚ùå",
        "performance": "‚ö°",
        "data": "üìä",
    }
    prefix = icon_table.get(level, icon_table["info"])
    print(f"{prefix} {msg}")

# ======================================================
# 3Ô∏è‚É£ GIT CONFIGURATION
# ======================================================
# Git identity and target repository. Name and e-mail can be overridden
# through the environment.
GIT_USER_NAME = os.environ.get("GIT_USER_NAME", "Forex AI Bot")
GIT_USER_EMAIL = os.environ.get("GIT_USER_EMAIL", "nakatonabira3@gmail.com")
GITHUB_USERNAME = "rahim-dotAI"
GITHUB_REPO = "forex-ai-models"
BRANCH = "main"

# The token comes from the environment first; on Colab, fall back to the
# notebook's secret store and mirror the value into the environment.
FOREX_PAT = os.environ.get("FOREX_PAT")

if IN_COLAB and not FOREX_PAT:
    try:
        from google.colab import userdata
        FOREX_PAT = userdata.get("FOREX_PAT")
        if FOREX_PAT:
            os.environ["FOREX_PAT"] = FOREX_PAT
            print_status("üîê Loaded FOREX_PAT from Colab secrets", "success")
    except Exception as exc:
        print_status(f"‚ö†Ô∏è Could not access Colab secrets: {exc}", "warn")

# The global git identity is only set when a token is available.
if FOREX_PAT:
    for _git_key, _git_value in (
        ("user.name", GIT_USER_NAME),
        ("user.email", GIT_USER_EMAIL),
    ):
        subprocess.run(
            ["git", "config", "--global", _git_key, _git_value],
            capture_output=True,
            check=False,
        )
    print_status(f"‚úÖ Git configured: {GIT_USER_NAME}", "success")

# ======================================================
# 4Ô∏è‚É£ ML IMPORTS
# ======================================================
# scikit-learn is required by this cell: if any of these imports fails, the
# problem is reported and the ImportError is re-raised so execution stops
# here.
try:
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.linear_model import SGDClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.exceptions import NotFittedError
    print_status("‚úÖ ML libraries loaded", "success")
except ImportError as e:
    print_status(f"‚ùå ML libraries missing: {e}", "error")
    raise

# ======================================================
# 5Ô∏è‚É£ ENHANCED DATABASE CLASS
# ======================================================
class EnhancedTradeMemoryDatabase:
    """Enhanced FX Trading Database v4.0

    SQLite-backed store with three tables:

    * ``pending_trades``    - signals waiting to be evaluated,
    * ``completed_trades``  - one row per (pending trade, model) that closed,
    * ``model_stats_cache`` - per pair/model aggregates over 7 and 30 days.

    The connection is opened with ``check_same_thread=False``; the write paths
    serialise access through ``self.lock`` (an ``RLock``).
    """

    def __init__(self, db_path=PERSISTENT_DB, min_age_hours=1):
        """Open (or create) the database.

        Args:
            db_path: Path-like object pointing at the SQLite file; ``.exists()``
                and ``.name`` are used on it.
            min_age_hours: Minimum age a pending trade must have before
                ``evaluate_pending_trades`` will look at it.
        """
        self.db_path = db_path
        self.conn = None
        self.lock = threading.RLock()
        self.min_age_hours = min_age_hours

        print_status(f"üìÅ Database: {self.db_path}", "info")
        print_status(f"‚è±Ô∏è  Min trade age: {self.min_age_hours}h", "info")
        self.initialize_database()

    @contextmanager
    def get_cursor(self):
        """Yield a cursor; commit on success, roll back and re-raise on error."""
        cursor = self.conn.cursor()
        try:
            yield cursor
            self.conn.commit()
        except Exception:
            self.conn.rollback()
            # Bare raise keeps the original traceback intact.
            raise
        finally:
            cursor.close()

    def initialize_database(self):
        """Connect, apply PRAGMAs and create the tables if they are missing.

        Raises:
            sqlite3.Error: re-raised after logging when setup fails.
        """
        try:
            # Checked before connecting because connect() creates the file.
            db_exists = self.db_path.exists()

            self.conn = sqlite3.connect(
                str(self.db_path),
                timeout=30,
                check_same_thread=False
            )

            pragmas = [
                "PRAGMA journal_mode=WAL",
                "PRAGMA synchronous=NORMAL",
                "PRAGMA cache_size=-64000",
            ]

            for pragma in pragmas:
                self.conn.execute(pragma)

            with self.get_cursor() as cursor:
                cursor.execute('''
                    CREATE TABLE IF NOT EXISTS pending_trades (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        created_at TEXT NOT NULL,
                        iteration INTEGER NOT NULL,
                        pair TEXT NOT NULL,
                        timeframe TEXT NOT NULL,
                        sgd_prediction INTEGER,
                        rf_prediction INTEGER,
                        ensemble_prediction INTEGER,
                        entry_price REAL NOT NULL,
                        sl_price REAL NOT NULL,
                        tp_price REAL NOT NULL,
                        confidence REAL,
                        evaluated BOOLEAN DEFAULT 0
                    )
                ''')

                cursor.execute('''
                    CREATE TABLE IF NOT EXISTS completed_trades (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        pending_trade_id INTEGER,
                        created_at TEXT NOT NULL,
                        evaluated_at TEXT NOT NULL,
                        iteration_created INTEGER,
                        iteration_evaluated INTEGER,
                        pair TEXT NOT NULL,
                        timeframe TEXT NOT NULL,
                        model_used TEXT NOT NULL,
                        entry_price REAL NOT NULL,
                        exit_price REAL NOT NULL,
                        sl_price REAL NOT NULL,
                        tp_price REAL NOT NULL,
                        prediction INTEGER,
                        hit_tp BOOLEAN NOT NULL,
                        pnl REAL NOT NULL,
                        pnl_percent REAL,
                        duration_hours REAL
                    )
                ''')

                cursor.execute('''
                    CREATE TABLE IF NOT EXISTS model_stats_cache (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        updated_at TEXT NOT NULL,
                        pair TEXT NOT NULL,
                        model_name TEXT NOT NULL,
                        days INTEGER NOT NULL,
                        total_trades INTEGER DEFAULT 0,
                        winning_trades INTEGER DEFAULT 0,
                        losing_trades INTEGER DEFAULT 0,
                        accuracy_pct REAL DEFAULT 0.0,
                        total_pnl REAL DEFAULT 0.0,
                        avg_pnl REAL DEFAULT 0.0,
                        sharpe_ratio REAL DEFAULT 0.0,
                        UNIQUE(pair, model_name, days) ON CONFLICT REPLACE
                    )
                ''')

            if db_exists:
                print_status(f"‚úÖ Connected to: {self.db_path.name}", "success")
            else:
                print_status(f"‚úÖ Created: {self.db_path.name}", "success")

            self._verify_database_integrity()

        except sqlite3.Error as e:
            print_status(f"‚ùå Database init failed: {e}", "error")
            raise

    def _verify_database_integrity(self):
        """Print the row count of every expected table (or flag it missing).

        Failures are reported as warnings and never propagate.
        """
        try:
            with self.get_cursor() as cursor:
                cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
                tables = {row[0] for row in cursor.fetchall()}

                expected = ['pending_trades', 'completed_trades', 'model_stats_cache']

                print_status("üìä Database Tables:", "data")
                for table in expected:
                    if table in tables:
                        # Table names come from the constant list above, so
                        # interpolating them into the statement is safe.
                        cursor.execute(f"SELECT COUNT(*) FROM {table}")
                        count = cursor.fetchone()[0]
                        print_status(f"  ‚úì {table}: {count} rows", "data")
                    else:
                        print_status(f"  ‚úó {table}: MISSING!", "error")

        except Exception as e:
            print_status(f"‚ö†Ô∏è Verification warning: {e}", "warn")

    def store_new_signals(self, signals, current_iteration):
        """Insert every valid signal into ``pending_trades`` in one batch.

        Args:
            signals: Mapping ``pair -> {"signals": {timeframe: signal_dict}}``.
                A signal is stored only when its ``live``, ``SL`` and ``TP``
                values are all greater than zero.
            current_iteration: Iteration number recorded with each row.

        Returns:
            Number of rows stored; 0 when nothing was valid or the insert failed.
        """
        if not signals:
            print_status("‚ö†Ô∏è No signals to store", "warn")
            return 0

        required = ('live', 'SL', 'TP')
        created_at = datetime.now(timezone.utc).isoformat()
        batch_data = []

        for pair, pair_data in signals.items():
            pair_signals = pair_data.get('signals', {})

            for tf_name, signal_data in pair_signals.items():
                if not signal_data:
                    continue

                try:
                    valid = all(signal_data.get(f, 0) > 0 for f in required)
                except TypeError:
                    # A None / non-numeric price makes the signal unusable;
                    # skip it instead of aborting the whole batch.
                    valid = False
                if not valid:
                    continue

                batch_data.append((
                    created_at,
                    current_iteration,
                    pair,
                    tf_name,
                    signal_data.get('sgd_pred'),
                    signal_data.get('rf_pred'),
                    signal_data.get('signal'),
                    signal_data.get('live', 0),
                    signal_data.get('SL', 0),
                    signal_data.get('TP', 0),
                    signal_data.get('confidence', 0.5)
                ))

        if not batch_data:
            print_status("‚ö†Ô∏è No valid signals", "warn")
            return 0

        try:
            with self.lock, self.get_cursor() as cursor:
                cursor.executemany('''
                    INSERT INTO pending_trades
                    (created_at, iteration, pair, timeframe,
                     sgd_prediction, rf_prediction, ensemble_prediction,
                     entry_price, sl_price, tp_price, confidence)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                ''', batch_data)

                stored = len(batch_data)

            print_status(f"üíæ Stored {stored} trades", "success")
            return stored

        except sqlite3.Error as e:
            print_status(f"‚ùå Batch insert failed: {e}", "error")
            return 0

    def evaluate_pending_trades(self, current_prices, current_iteration):
        """Close pending trades whose TP or SL has been reached.

        Up to 1000 unevaluated trades older than ``min_age_hours`` are checked
        against ``current_prices``. Each of the three stored predictions (SGD,
        RandomForest, Ensemble) is evaluated separately and produces its own
        ``completed_trades`` row when it closes. A pending trade is flagged as
        evaluated only when at least one of its predictions closed; trades that
        reached neither level stay pending for a later run.

        Args:
            current_prices: Mapping ``pair -> latest price``.
            current_iteration: Iteration number recorded on completed rows.

        Returns:
            ``{model_name: {"closed_trades", "wins", "losses", "total_pnl",
            "accuracy"}}``; an empty dict when nothing could be evaluated or a
            database error occurred.
        """
        if not current_prices:
            print_status("‚ö†Ô∏è No current prices", "warn")
            return {}

        # ISO-8601 UTC strings sort chronologically, so the age filter can be
        # done as a plain text comparison in SQL.
        min_age = (datetime.now(timezone.utc) - timedelta(hours=self.min_age_hours)).isoformat()

        try:
            with self.lock, self.get_cursor() as cursor:
                cursor.execute('''
                    SELECT id, pair, timeframe, sgd_prediction, rf_prediction,
                           ensemble_prediction, entry_price, sl_price, tp_price,
                           created_at, iteration
                    FROM pending_trades
                    WHERE evaluated = 0 AND created_at < ?
                    ORDER BY created_at ASC
                    LIMIT 1000
                ''', (min_age,))

                pending = cursor.fetchall()

        except sqlite3.Error as e:
            print_status(f"‚ùå Failed to fetch: {e}", "error")
            return {}

        if not pending:
            print_status(f"‚ÑπÔ∏è No trades old enough (need {self.min_age_hours}h+)", "info")
            return {}

        print_status(f"üîç Evaluating {len(pending)} trades", "info")

        results = defaultdict(lambda: {
            'closed_trades': 0,
            'wins': 0,
            'losses': 0,
            'total_pnl': 0.0
        })

        evaluated_at = datetime.now(timezone.utc).isoformat()
        completed_batch = []
        closed_ids = []

        for trade in pending:
            (trade_id, pair, tf, sgd, rf, ens, entry, sl, tp, created, iter_created) = trade

            current = current_prices.get(pair, 0)
            if current is None or current <= 0:
                continue

            trade_closed = False

            for model_name, pred in (('SGD', sgd), ('RandomForest', rf), ('Ensemble', ens)):
                if pred is None:
                    continue

                hit_tp, _hit_sl, exit_price = self._evaluate_outcome(pred, current, tp, sl)

                # `is None` rather than truthiness: an exit at 0.0 is still
                # a closed trade.
                if exit_price is None:
                    continue

                pnl = self._calc_pnl(pred, entry, exit_price)
                pnl_pct = (pnl / entry) * 100 if entry else 0.0
                duration = self._calc_duration(created)

                # sl/tp are recorded exactly as stored on the pending row.
                completed_batch.append((
                    trade_id, created, evaluated_at,
                    iter_created, current_iteration,
                    pair, tf, model_name, entry, exit_price,
                    sl, tp, pred, hit_tp, pnl, pnl_pct, duration
                ))
                trade_closed = True

                results[model_name]['closed_trades'] += 1
                results[model_name]['total_pnl'] += pnl

                if hit_tp:
                    results[model_name]['wins'] += 1
                    status = "WIN ‚úÖ"
                else:
                    results[model_name]['losses'] += 1
                    status = "LOSS ‚ùå"

                print_status(
                    f"{status} {model_name}: {pair} {tf} "
                    f"P&L=${pnl:.5f} ({pnl_pct:+.2f}%) [{duration:.1f}h]",
                    "success" if hit_tp else "warn"
                )

            if trade_closed:
                closed_ids.append(trade_id)

        if completed_batch:
            try:
                with self.lock, self.get_cursor() as cursor:
                    cursor.executemany('''
                        INSERT INTO completed_trades
                        (pending_trade_id, created_at, evaluated_at,
                         iteration_created, iteration_evaluated,
                         pair, timeframe, model_used, entry_price, exit_price,
                         sl_price, tp_price, prediction, hit_tp, pnl, pnl_percent,
                         duration_hours)
                        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                    ''', completed_batch)

                    # One bound parameter per statement keeps this below
                    # SQLite's host-parameter limit (999 on older builds),
                    # which a 1000-element IN (...) list could exceed.
                    cursor.executemany(
                        'UPDATE pending_trades SET evaluated = 1 WHERE id = ?',
                        [(trade_id,) for trade_id in closed_ids]
                    )

                print_status(f"‚úÖ Evaluated {len(closed_ids)} trades", "success")

            except sqlite3.Error as e:
                print_status(f"‚ùå Evaluation failed: {e}", "error")
                return {}

        for data in results.values():
            if data['closed_trades'] > 0:
                data['accuracy'] = (data['wins'] / data['closed_trades']) * 100

        self._update_stats_cache()

        return dict(results)

    def _evaluate_outcome(self, pred, current, tp, sl):
        """Decide whether ``current`` has reached the take-profit or stop-loss.

        The two stored levels are oriented by direction before the check: a
        long (``pred == 1``) takes profit at the higher level and stops out at
        the lower one, a short (``pred == 0``) the other way round. This keeps
        a short prediction from "winning" at a level that is on the losing
        side of its entry.

        Returns:
            ``(hit_tp, hit_sl, exit_price)``; ``exit_price`` is ``None`` when
            neither level was reached or the inputs are not comparable.
        """
        hit_tp, hit_sl, exit_price = False, False, None

        try:
            upper, lower = max(tp, sl), min(tp, sl)

            if pred == 1:  # Long
                if current >= upper:
                    hit_tp, exit_price = True, upper
                elif current <= lower:
                    hit_sl, exit_price = True, lower
            elif pred == 0:  # Short
                if current <= lower:
                    hit_tp, exit_price = True, lower
                elif current >= upper:
                    hit_sl, exit_price = True, upper
        except TypeError:
            # None or otherwise non-comparable values: treat as "not closed".
            pass

        return hit_tp, hit_sl, exit_price

    def _calc_pnl(self, pred, entry, exit):
        """Return price-difference P&L: ``exit - entry`` for a long
        (``pred == 1``), ``entry - exit`` otherwise; 0.0 for non-numeric input."""
        try:
            return exit - entry if pred == 1 else entry - exit
        except TypeError:
            return 0.0

    def _calc_duration(self, created_at):
        """Return hours elapsed since the ISO timestamp ``created_at``.

        Returns 0.0 when the timestamp cannot be parsed or is not
        timezone-aware (naive and aware datetimes cannot be subtracted).
        """
        try:
            created = datetime.fromisoformat(created_at.replace('Z', '+00:00'))
            return max(0, (datetime.now(timezone.utc) - created).total_seconds() / 3600)
        except (AttributeError, TypeError, ValueError):
            return 0.0

    def _update_stats_cache(self):
        """Recompute ``model_stats_cache`` for every pair/model over 7 and 30 days.

        The stored ``sharpe_ratio`` is mean P&L divided by the population
        standard deviation of P&L (no annualisation, no risk-free rate).
        Errors are logged as warnings and swallowed.
        """
        try:
            with self.lock, self.get_cursor() as cursor:
                cursor.execute('SELECT DISTINCT pair FROM completed_trades')
                pairs = [row[0] for row in cursor.fetchall()]

                cursor.execute('SELECT DISTINCT model_used FROM completed_trades')
                models = [row[0] for row in cursor.fetchall()]

                for pair in pairs:
                    for model in models:
                        for days in (7, 30):
                            since = (datetime.now(timezone.utc) - timedelta(days=days)).isoformat()

                            cursor.execute('''
                                SELECT
                                    COUNT(*) as total,
                                    SUM(CASE WHEN hit_tp THEN 1 ELSE 0 END) as wins,
                                    SUM(pnl) as total_pnl,
                                    AVG(pnl) as avg_pnl
                                FROM completed_trades
                                WHERE pair = ? AND model_used = ? AND evaluated_at > ?
                            ''', (pair, model, since))

                            result = cursor.fetchone()
                            if not result or not result[0]:
                                continue

                            total, wins, total_pnl, avg_pnl = result
                            wins = wins or 0
                            accuracy = (wins / total * 100) if total > 0 else 0.0

                            cursor.execute('''
                                SELECT pnl FROM completed_trades
                                WHERE pair = ? AND model_used = ? AND evaluated_at > ?
                            ''', (pair, model, since))

                            pnls = [row[0] for row in cursor.fetchall()]
                            sharpe = 0.0
                            if len(pnls) > 1:
                                pnl_std = np.std(pnls)
                                if pnl_std > 0:
                                    sharpe = (avg_pnl or 0) / pnl_std

                            cursor.execute('''
                                INSERT OR REPLACE INTO model_stats_cache
                                (updated_at, pair, model_name, days, total_trades,
                                 winning_trades, losing_trades, accuracy_pct,
                                 total_pnl, avg_pnl, sharpe_ratio)
                                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                            ''', (
                                datetime.now(timezone.utc).isoformat(),
                                pair, model, days, total, wins, total - wins,
                                accuracy, total_pnl or 0.0, avg_pnl or 0.0, sharpe
                            ))

        except Exception as e:
            print_status(f"‚ö†Ô∏è Stats update failed: {e}", "warn")

    def get_database_stats(self):
        """Return overall counters.

        Keys: ``pending_trades``, ``completed_trades``, ``total_pnl`` and
        ``overall_accuracy`` (percent). On failure a warning is printed and the
        keys gathered so far are returned.
        """
        stats = {}

        try:
            with self.get_cursor() as cursor:
                cursor.execute('SELECT COUNT(*) FROM pending_trades WHERE evaluated = 0')
                stats['pending_trades'] = cursor.fetchone()[0]

                cursor.execute('SELECT COUNT(*) FROM completed_trades')
                stats['completed_trades'] = cursor.fetchone()[0]

                cursor.execute('SELECT SUM(pnl) FROM completed_trades')
                result = cursor.fetchone()
                # SUM() over zero rows yields NULL -> report 0.0 instead.
                stats['total_pnl'] = result[0] if result[0] else 0.0

                cursor.execute('''
                    SELECT COUNT(*), SUM(CASE WHEN hit_tp THEN 1 ELSE 0 END)
                    FROM completed_trades
                ''')
                result = cursor.fetchone()
                if result and result[0] > 0:
                    stats['overall_accuracy'] = (result[1] / result[0]) * 100
                else:
                    stats['overall_accuracy'] = 0.0

        except Exception as e:
            print_status(f"‚ö†Ô∏è Stats retrieval failed: {e}", "warn")

        return stats

    def close(self):
        """Close the connection if one is open; errors are only logged."""
        try:
            if self.conn:
                self.conn.close()
                print_status("‚úÖ Database closed", "success")
        except Exception as e:
            print_status(f"‚ö†Ô∏è Close error: {e}", "warn")

# ======================================================
# 6️⃣ ML PREDICTION FROM PROCESSED PICKLES
# ======================================================
def train_predict_from_pickle(pickle_path, pair_name):
    """Train the SGD and RandomForest models on one processed pickle and
    predict the direction for its last row.

    Args:
        pickle_path: ``Path`` to a gzip-compressed pickled DataFrame. It must
            contain a ``close`` column; every column not listed in
            ``exclude_cols`` is used as a feature.
        pair_name: Pair label such as ``"EUR/USD"``; used (with the timeframe
            found in the file name) to name the saved model files.

    Returns:
        ``(sgd_pred, rf_pred, confidence)``. Predictions are 0/1 ints and
        ``confidence`` is their mean. ``(None, None, 0.5)`` is returned when
        the frame has fewer than 50 rows, has no feature columns, or loading
        fails. If a single model fails its prediction defaults to 1.
    """
    try:
        # SECURITY: unpickling executes arbitrary code - only point this at
        # files produced by this project.
        df = pd.read_pickle(pickle_path, compression='gzip')

        if df.empty or len(df) < 50:
            return None, None, 0.5

        # Price/volume columns are excluded so only indicator columns remain.
        exclude_cols = {
            'close', 'raw_close', 'raw_open', 'raw_high', 'raw_low',
            'open', 'high', 'low',
            'volume', 'vwap',
        }

        feature_cols = [c for c in df.columns if c not in exclude_cols]

        if not feature_cols:
            print_status(f"‚ö†Ô∏è No features in {pickle_path.name}", "warn")
            return None, None, 0.5

        X = df[feature_cols].fillna(0)
        # Label is 1 when the close is higher than on the previous row.
        # NOTE(review): the label comes from the same row as the features, so
        # the models fit the current bar's direction, not the next bar's.
        y = (df['close'].diff() > 0).astype(int).fillna(0)

        safe_pair = pair_name.replace("/", "_")

        # One model per pair+timeframe avoids feature mismatches between
        # files. The first matching marker wins, so "15m" is tested before
        # "5m" (which it contains).
        timeframe = "unknown"
        for label, markers in (
            ("1d", ("1d", "daily")),
            ("1h", ("1h",)),
            ("15m", ("15m",)),
            ("5m", ("5m",)),
            ("1m", ("1m",)),
        ):
            if any(marker in pickle_path.name for marker in markers):
                timeframe = label
                break

        model_id = f"{safe_pair}_{timeframe}"

        # SGD model: updated incrementally across runs.
        sgd_file = PICKLE_FOLDER / f"{model_id}_sgd_model.pkl"

        try:
            sgd = None
            if sgd_file.exists():
                try:
                    with open(sgd_file, "rb") as fh:
                        sgd = pickle.load(fh)
                    sgd.partial_fit(X, y)
                except (ValueError, AttributeError, EOFError, pickle.UnpicklingError):
                    # Feature mismatch or unreadable/stale model file:
                    # discard it and retrain from scratch below.
                    sgd = None

            if sgd is None:
                sgd = SGDClassifier(max_iter=1000, tol=1e-3)
                sgd.partial_fit(X, y, classes=np.array([0, 1]))

            with open(sgd_file, "wb") as fh:
                pickle.dump(sgd, fh)
            sgd_pred = int(sgd.predict(X.iloc[[-1]])[0])
        except Exception as e:
            print_status(f"‚ö†Ô∏è SGD error for {model_id}: {e}", "debug")
            sgd_pred = 1  # Default to buy

        # RandomForest: retrained in full each time, so a changed feature
        # set is never a problem.
        try:
            rf = RandomForestClassifier(n_estimators=50, class_weight='balanced', random_state=42)
            rf.fit(X, y)

            rf_file = PICKLE_FOLDER / f"{model_id}_rf_model.pkl"
            with open(rf_file, "wb") as fh:
                pickle.dump(rf, fh)

            rf_pred = int(rf.predict(X.iloc[[-1]])[0])
        except Exception as e:
            print_status(f"‚ö†Ô∏è RF error for {model_id}: {e}", "debug")
            rf_pred = 1  # Default to buy

        # NOTE(review): this is the mean of the two 0/1 predictions, so two
        # agreeing "short" votes give 0.0 - confirm that is what consumers
        # of `confidence` expect.
        confidence = (sgd_pred + rf_pred) / 2.0

        return sgd_pred, rf_pred, confidence

    except Exception as e:
        print_status(f"‚ö†Ô∏è ML error for {pickle_path.name}: {e}", "debug")
        return None, None, 0.5

# ======================================================
# 7️⃣ PROCESS SINGLE PICKLE FILE
# ======================================================
def process_pickle_file(pickle_path):
    """Build a trade signal from one processed pickle file.

    The currency pair and timeframe are parsed from the file name (e.g.
    ``EUR_USD_1d_5y`` -> ``EUR/USD``, ``1d``). SL/TP are placed two ATRs from
    the last price (1% of price is used as the ATR when the ``ATR`` column is
    missing or its last value is not a positive number) and are oriented by
    the ensemble direction: for a long SL is below and TP above the price,
    for a short SL is above and TP below.

    Args:
        pickle_path: ``Path`` to a gzip-compressed pickled DataFrame.

    Returns:
        ``(pair, {timeframe: signal_dict}, "LONG" | "SHORT")`` on success,
        ``(pair, {}, "HOLD")`` when no signal could be produced, and
        ``(None, {}, "HOLD")`` when the pair cannot be read from the name.
    """
    filename = pickle_path.stem

    # The name must start with "<CUR>_<CUR>" using two different known codes.
    currencies = {'EUR', 'USD', 'GBP', 'JPY', 'AUD', 'NZD', 'CAD', 'CHF'}
    base, separator, quote = filename[:3], filename[3:4], filename[4:7]

    if (separator != "_" or base == quote
            or base not in currencies or quote not in currencies):
        print_status(f"‚ö†Ô∏è Could not extract pair from {filename}", "warn")
        return None, {}, "HOLD"

    pair = f"{base}/{quote}"

    # First matching marker wins; "15m" is tested before "5m", which it
    # contains.
    timeframe = "unknown"
    for label, markers in (
        ("1d", ("1d", "daily")),
        ("1h", ("1h",)),
        ("15m", ("15m",)),
        ("5m", ("5m",)),
        ("1m", ("1m",)),
    ):
        if any(marker in filename for marker in markers):
            timeframe = label
            break

    try:
        # SECURITY: unpickling executes arbitrary code - only load files
        # produced by this project.
        df = pd.read_pickle(pickle_path, compression='gzip')

        if df.empty:
            return pair, {}, "HOLD"

        # Prefer raw_close when the frame provides it, otherwise use close.
        price_col = 'raw_close' if 'raw_close' in df.columns else 'close'
        current_price = df[price_col].iloc[-1]

        # Distance of SL/TP from the price: 2 x ATR.
        atr = df['ATR'].iloc[-1] if 'ATR' in df.columns else None
        if atr is None or not (atr > 0):
            # `not (atr > 0)` is also true for NaN.
            atr = current_price * 0.01  # 1% of price
        distance = atr * 2.0

        # ML Predictions
        sgd_pred, rf_pred, confidence = train_predict_from_pickle(pickle_path, pair)

        if sgd_pred is None:
            return pair, {}, "HOLD"

        # Long as soon as either model votes long.
        ensemble_pred = 1 if (sgd_pred + rf_pred) >= 1 else 0

        below = max(0, round(current_price - distance, 5))
        above = round(current_price + distance, 5)
        if ensemble_pred == 1:
            sl, tp = below, above
        else:
            # A short profits when price falls, so the levels swap sides.
            sl, tp = above, below

        signal_data = {
            "signal": ensemble_pred,
            "sgd_pred": sgd_pred,
            "rf_pred": rf_pred,
            "live": current_price,
            "SL": sl,
            "TP": tp,
            "confidence": confidence,
            "timeframe": timeframe
        }

        print_status(
            f"{pair} | {timeframe} | Ens:{ensemble_pred} (SGD:{sgd_pred} RF:{rf_pred}) | "
            f"Price:{current_price:.5f} | SL:{sl:.5f} | TP:{tp:.5f}",
            "info"
        )

        return pair, {timeframe: signal_data}, "LONG" if ensemble_pred == 1 else "SHORT"

    except Exception as e:
        print_status(f"‚ùå Error processing {pickle_path.name}: {e}", "error")
        import traceback
        traceback.print_exc()
        return pair, {}, "HOLD"

# ======================================================
# 8️⃣ MAIN PIPELINE
# ======================================================
def _print_database_stats(stats):
    """Print the four counters returned by ``get_database_stats``."""
    print_status(f"  Pending: {stats.get('pending_trades', 0)}", "data")
    print_status(f"  Completed: {stats.get('completed_trades', 0)}", "data")
    print_status(f"  Total P&L: ${stats.get('total_pnl', 0.0):.5f}", "data")
    print_status(f"  Accuracy: {stats.get('overall_accuracy', 0.0):.1f}%", "data")


def _push_signals_to_github(json_file, current_iteration):
    """Commit ``json_file`` and push it, retrying the push up to three times.

    A failed attempt is followed by a 5 second pause and a ``pull --rebase``
    before the next one. Errors are reported as warnings, never raised.
    """
    print_status("üì§ Pushing to GitHub...", "info")

    try:
        os.chdir(REPO_FOLDER)

        subprocess.run(["git", "add", str(json_file)], check=False)
        subprocess.run(
            ["git", "commit", "-m", f"üìà Update signals - Iteration {current_iteration}"],
            check=False
        )

        for attempt in range(3):
            result = subprocess.run(
                ["git", "push", "origin", BRANCH],
                capture_output=True,
                text=True,
                timeout=30
            )
            if result.returncode == 0:
                print_status("‚úÖ Pushed to GitHub", "success")
                break
            if attempt < 2:
                time.sleep(5)
                subprocess.run(
                    ["git", "pull", "--rebase", "origin", BRANCH],
                    capture_output=True
                )
        else:
            # git's stderr is deliberately not echoed: it can contain the
            # remote URL, which may embed credentials.
            print_status(
                f"Git push failed after 3 attempts (exit code {result.returncode})",
                "warn"
            )

    except Exception as e:
        print_status(f"‚ö†Ô∏è Git push error: {e}", "warn")
    finally:
        # Always leave the repo folder again, also when a git step raised.
        os.chdir(SAVE_FOLDER)


def run_pipeline(current_iteration=1):
    """Run complete pipeline

    Steps: print the current database stats, turn every processed pickle in
    ``PICKLE_FOLDER`` into a signal, store the signals, evaluate pending
    trades against the latest prices, write ``latest_signals.json`` and, when
    ``FOREX_PAT`` is set, push it to GitHub.

    Args:
        current_iteration: Iteration number recorded with stored signals,
            evaluated trades and the exported JSON.

    Returns:
        ``{pair: {"signals": {timeframe: signal}, "aggregated": direction}}``,
        or an empty dict when no processed pickles were found.
    """
    print_status("="*70, "info")
    print_status("üöÄ STARTING ULTRA-PERSISTENT PIPELINE v4.0", "success")
    print_status("="*70, "info")

    db = EnhancedTradeMemoryDatabase()

    # try/finally guarantees the connection is closed on the early return
    # below and when any step raises.
    try:
        print_status("\nüìä CURRENT DATABASE STATS", "data")
        stats = db.get_database_stats()
        _print_database_stats(stats)

        print_status("\nüîÑ LOADING PROCESSED PICKLES", "info")
        print_status(f"üìÇ Looking in: {PICKLE_FOLDER}", "info")

        # Saved models and caches live in the same folder; they are not data.
        skip_markers = ('_sgd_model', '_rf_model', 'indicator_cache')
        pickle_files = [
            f for f in PICKLE_FOLDER.glob("*.pkl")
            if not any(marker in f.name for marker in skip_markers)
        ]

        if not pickle_files:
            print_status("‚ö†Ô∏è No processed pickles found!", "warn")
            print_status("‚ÑπÔ∏è  Run CSV combiner first to generate processed pickles", "info")
            return {}

        print_status(f"‚úÖ Found {len(pickle_files)} processed pickle files", "success")

        aggregated_signals = {}
        current_prices = {}

        for pickle_file in pickle_files:
            pair, signals, agg_signal = process_pickle_file(pickle_file)

            if not pair:
                continue

            entry = aggregated_signals.setdefault(
                pair, {"signals": {}, "aggregated": "HOLD"})

            # Signals of the same pair are merged per timeframe.
            entry["signals"].update(signals)

            # The last non-HOLD direction seen for the pair wins.
            if agg_signal != "HOLD":
                entry["aggregated"] = agg_signal

            # The first positive live price of this file becomes the pair's
            # current price.
            for signal_data in signals.values():
                if signal_data.get('live', 0) > 0:
                    current_prices[pair] = signal_data['live']
                    break

        print_status("\nüíæ STORING SIGNALS", "info")
        stored = db.store_new_signals(aggregated_signals, current_iteration)
        print_status(f"‚úÖ Stored {stored} signals", "success")

        print_status("\nüîç EVALUATING PENDING TRADES", "info")
        if current_prices:
            results = db.evaluate_pending_trades(current_prices, current_iteration)

            if results:
                print_status("\nüìà EVALUATION RESULTS", "data")
                for model, data in results.items():
                    print_status(f"  {model}:", "data")
                    print_status(f"    Closed: {data['closed_trades']}", "data")
                    print_status(f"    Wins: {data['wins']}", "data")
                    print_status(f"    Losses: {data['losses']}", "data")
                    print_status(f"    Accuracy: {data.get('accuracy', 0):.1f}%", "data")
                    print_status(f"    Total P&L: ${data['total_pnl']:.5f}", "data")
        else:
            print_status("‚ö†Ô∏è No current prices for evaluation", "warn")

        print_status("\nüìù EXPORTING TO JSON", "info")
        json_file = OUTPUTS_FOLDER / "latest_signals.json"
        tmp_file = OUTPUTS_FOLDER / "latest_signals_tmp.json"

        # `stats` is the snapshot taken at the start of the run.
        export_data = {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "iteration": current_iteration,
            "pairs": aggregated_signals,
            "database_stats": stats
        }

        # Write to a temp file first so the final file is swapped in one step.
        with open(tmp_file, "w") as f:
            json.dump(export_data, f, indent=2)

        if not json_file.exists() or not filecmp.cmp(tmp_file, json_file):
            # The local export is updated whether or not a token is present;
            # only the push depends on FOREX_PAT.
            tmp_file.replace(json_file)
            if FOREX_PAT:
                _push_signals_to_github(json_file, current_iteration)
            else:
                print_status(
                    "Signals saved locally; FOREX_PAT not set - skipping push",
                    "info"
                )
        else:
            print_status("‚ÑπÔ∏è No changes - skipping push", "info")
            if tmp_file.exists():
                tmp_file.unlink()

        print_status("\nüìä FINAL DATABASE STATS", "data")
        _print_database_stats(db.get_database_stats())

    finally:
        db.close()

    print_status("\n‚úÖ PIPELINE COMPLETED!", "success")
    print_status("="*70, "info")

    return aggregated_signals

# ======================================================
# 9️⃣ MAIN EXECUTION
# ======================================================
# Entry point: run one pipeline iteration and summarise the outcome.
# Names assigned here are module globals, so they are kept as-is.
if __name__ == "__main__":
    try:
        print_status("üöÄ Initializing Ultra-Persistent Pipeline...", "info")

        signals = run_pipeline(current_iteration=1)

        if not signals:
            # An empty dict means the pipeline produced nothing to report.
            print_status("\n‚ö†Ô∏è No signals generated", "warn")
        else:
            print_status(f"\nüéâ Generated signals for {len(signals)} pairs!", "success")
            # One line per pair with its aggregated direction.
            for pair, data in signals.items():
                print_status(f"  {pair}: {data['aggregated']}", "success")

        print_status("\n‚úÖ ALL OPERATIONS COMPLETED!", "success")

    except Exception as e:
        # Report, dump the traceback, then let the failure propagate so the
        # caller (e.g. a CI job) still sees the error.
        print_status(f"\n‚ùå PIPELINE FAILED: {e}", "error")
        import traceback
        traceback.print_exc()
        raise

In [None]:

#TAG: pipeline_main

#!/usr/bin/env python3
"""
ULTIMATE FOREX PIPELINE v11.2 - OMEGA ULTIMATE (BEST OF BOTH)
==============================================================
‚ö° Dynamic seeds + Live updates + Full logging + Documentation
üõ°Ô∏è All anti-overfit fixes + Enhanced error handling + Progress tracking
‚úÖ Combines strengths of v11.0 and v11.1
"""

import os, sys, json, pickle, random, re, smtplib, subprocess, time, logging, warnings
from pathlib import Path
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from datetime import datetime, timedelta, timezone
from collections import defaultdict
from dataclasses import dataclass
from typing import Dict, List, Tuple, Optional
from multiprocessing import Pool, cpu_count

warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import requests
from scipy import stats

print("=" * 70, "\nüî¥ OMEGA v11.2 - ULTIMATE EDITION\n", "=" * 70, sep='')

# ======================================================
# ENVIRONMENT SETUP
# ======================================================
# Colab is detected by whether `google.colab` can be imported; GitHub Actions
# by the presence of the GITHUB_ACTIONS environment variable.
try:
    import google.colab
except ImportError:
    IN_COLAB = False
    IN_GHA = "GITHUB_ACTIONS" in os.environ
    ENV_NAME = "GitHub Actions" if IN_GHA else "Local"
else:
    IN_COLAB = True
    IN_GHA = False
    ENV_NAME = "Google Colab"

# In Colab everything lives under /content/forex-ai-models; otherwise the
# current working directory is used as both base and save folder.
BASE_FOLDER = Path("/content") if IN_COLAB else Path.cwd()
if IN_COLAB and not IN_GHA:
    SAVE_FOLDER = BASE_FOLDER / "forex-ai-models"
else:
    SAVE_FOLDER = BASE_FOLDER

DIRECTORIES = {
    key: SAVE_FOLDER / relative
    for key, relative in (
        ("data_processed", "data/processed"),
        ("database", "database"),
        ("logs", "logs"),
        ("outputs", "outputs"),
        ("omega_state", "omega_state"),
    )
}

for d in DIRECTORIES.values():
    d.mkdir(parents=True, exist_ok=True)

PICKLE_FOLDER = DIRECTORIES["data_processed"]
LOGS_FOLDER = DIRECTORIES["logs"]
OUTPUTS_FOLDER = DIRECTORIES["outputs"]
OMEGA_STATE_FOLDER = DIRECTORIES["omega_state"]

print(f"üåç {ENV_NAME} | üìÇ {BASE_FOLDER} | ‚ö° {cpu_count()} cores\n{'='*70}")

# One timestamped log file per run, written into the logs directory.
logging.basicConfig(
    filename=str(LOGS_FOLDER / f"omega_{datetime.now():%Y%m%d_%H%M%S}.log"),
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
)
logger = logging.getLogger(__name__)

def log(msg, lvl="info"):
    """Send *msg* to the `logging` module and echo it to stdout with an icon.

    ``lvl`` selects both the logging function and the icon. ``"warn"`` maps
    to ``logging.warning``; any level name that is not an attribute of the
    `logging` module (e.g. ``"success"``) is logged via ``logging.info``.
    Unknown levels are printed with the "info" icon.
    """
    icons = {
        "info": "‚ÑπÔ∏è",
        "success": "‚úÖ",
        "warn": "‚ö†Ô∏è",
        "error": "‚ùå",
        "rocket": "üöÄ",
        "chart": "üìä",
        "brain": "üß†",
        "omega": "üî¥",
    }
    method_name = "warning" if lvl == "warn" else lvl
    emit = getattr(logging, method_name, logging.info)
    emit(msg)
    prefix = icons.get(lvl, '‚ÑπÔ∏è')
    print(f"{prefix} {msg}")

# ======================================================
# CONFIG
# ======================================================
# Git identity and repository coordinates; the identity can be overridden
# through environment variables.
GIT_USER_NAME = os.getenv("GIT_USER_NAME", "Forex AI Bot")
GIT_USER_EMAIL = os.getenv("GIT_USER_EMAIL", "nakatonabira3@gmail.com")
GITHUB_USERNAME, GITHUB_REPO = "rahim-dotAI", "forex-ai-models"
FOREX_PAT = os.getenv("FOREX_PAT", "").strip()

# In Colab, fall back to the notebook's secret store when the token is not in
# the environment. The lookup is best-effort: a failure leaves FOREX_PAT empty
# but is reported instead of being swallowed silently.
if not FOREX_PAT and IN_COLAB:
    try:
        from google.colab import userdata
        # Normalise to a stripped string so FOREX_PAT is never None.
        FOREX_PAT = (userdata.get("FOREX_PAT") or "").strip()
        if FOREX_PAT:
            os.environ["FOREX_PAT"] = FOREX_PAT
    except Exception as exc:
        log(f"Could not read FOREX_PAT from Colab secrets ({type(exc).__name__})", "warn")

GMAIL_USER = os.getenv("GMAIL_USER", "nakatonabira3@gmail.com")
# SECURITY(review): a real-looking app password is hard-coded as the fallback.
# It is kept so existing runs keep working, but it should be revoked and the
# value supplied only through the GMAIL_APP_PASSWORD environment variable.
GMAIL_APP_PASSWORD = os.getenv("GMAIL_APP_PASSWORD", "").strip() or "gmwohahtltmcewug"
BROWSERLESS_TOKEN = os.getenv("BROWSERLESS_TOKEN", "")

# Instruments and the timeframes used for training each of them.
PAIRS = ["EUR/USD", "GBP/USD", "USD/JPY", "AUD/USD"]
TRAINING_TIMEFRAMES = {p: ["1d", "1h"] for p in PAIRS}
# ATR window, floor applied to ATR values, and a small epsilon used to avoid
# division by zero.
ATR_PERIOD, MIN_ATR, EPS = 14, 1e-5, 1e-8
# Starting equity for backtests and the cap applied to per-trade risk.
BASE_CAPITAL, MAX_TRADE_CAP = 100, 5.0
# Upper clip for the stop-loss / take-profit ATR multipliers.
MAX_ATR_SL, MAX_ATR_TP = 3.0, 3.0
WEEKEND_LEARNING_MODE = True

# Persisted outputs and learning state.
OMEGA_SIGNALS_FILE = OUTPUTS_FOLDER / "omega_signals.json"
OMEGA_LEARNING_FILE = OMEGA_STATE_FOLDER / "omega_learning.pkl"
OMEGA_ITERATION_FILE = OMEGA_STATE_FOLDER / "omega_iteration.pkl"
OMEGA_MEMORY_FILE = OMEGA_STATE_FOLDER / "omega_memory.pkl"

# Anti-overfit GA configuration for the three competing models.
# Each entry lists the model-specific search ranges and GA sizes; the settings
# shared by every model (elite ratio, multi-start count, adaptive mutation,
# sample rate, parallel evaluation) are appended after them.
COMPETITION_MODELS = {
    model_name: {
        **model_specific,
        "elite_ratio": 0.15,
        "multi_start": 3,
        "adaptive_mutation": True,
        "sample_rate": 0.30,
        "use_parallel": True,
    }
    for model_name, model_specific in (
        ("Alpha Momentum", {
            "color": "üî¥",
            "strategy": "Aggressive momentum",
            "atr_sl_range": (1.5, 2.5),
            "atr_tp_range": (2.0, 3.5),
            "risk_range": (0.010, 0.020),
            "confidence_range": (0.55, 0.70),
            "pop_size": 16,
            "generations": 8,
            "mutation_rate": 0.12,
            "crossover_rate": 0.75,
        }),
        ("Beta Conservative", {
            "color": "üîµ",
            "strategy": "Conservative trend",
            "atr_sl_range": (1.2, 2.0),
            "atr_tp_range": (1.8, 2.8),
            "risk_range": (0.006, 0.012),
            "confidence_range": (0.65, 0.80),
            "pop_size": 14,
            "generations": 6,
            "mutation_rate": 0.10,
            "crossover_rate": 0.70,
        }),
        ("Gamma Adaptive", {
            "color": "üü¢",
            "strategy": "Adaptive momentum",
            "atr_sl_range": (1.3, 2.3),
            "atr_tp_range": (2.0, 3.2),
            "risk_range": (0.008, 0.018),
            "confidence_range": (0.60, 0.75),
            "pop_size": 18,
            "generations": 10,
            "mutation_rate": 0.15,
            "crossover_rate": 0.80,
        }),
    )
}

@dataclass
class MarketRegime:
    """Snapshot of market conditions as produced by `detect_regime`."""
    # 'high', 'low' or 'normal' (recent vs. overall return volatility)
    volatility: str
    # 'trending' or 'ranging' (based on an ADX threshold of 25)
    trend: str
    # Regime strength score derived from ADX
    strength: float
    # When the regime was computed (detect_regime uses timezone-aware UTC)
    timestamp: datetime

@dataclass
class AdvancedMetrics:
    """Performance statistics of a backtest, as filled in by `calc_metrics`.

    Field order matters: instances are constructed positionally.
    """
    # Annualised (sqrt(252)) mean/std ratio of equity-curve returns
    sharpe: float
    # Same numerator as Sharpe, divided by the std of negative returns
    sortino: float
    # Total return divided by maximum drawdown
    calmar: float
    # Maximum peak-to-trough drawdown, in percent
    max_drawdown: float
    # Share of trades with positive PnL, in percent
    win_rate: float
    # Gross wins divided by absolute gross losses (inf when there are no losses)
    profit_factor: float
    # Mean PnL of winning trades
    avg_win: float
    # Mean PnL of losing trades (a negative number, or 0 when there are none)
    avg_loss: float
    # win_rate * avg_win + (1 - win_rate) * avg_loss, with win_rate as a fraction
    expectancy: float

class BalancedAntiOverfitAI:
    """
    Heuristic overfitting monitor for the genetic-algorithm training loop.

    Compares train and validation metrics (``accuracy`` in percent and
    ``total_pnl``) and derives:

    - a generalization score in [0, 1] from the validation/train accuracy ratio,
    - an overfitting severity capped at 0.6, with textual recommendations,
    - an early-stopping decision based on ``patience``,
    - a gently regularized copy of a model config.

    ``min_val_ratio`` is the validation/train accuracy ratio below which a gap
    is treated as an overfitting signal.
    """
    def __init__(self, min_val_ratio=0.75, patience=4):
        self.min_val_ratio = min_val_ratio
        self.patience = patience
        self.history = []  # generalization scores recorded by detect_overfitting
        self.best_generalization_score = -float('inf')
        self.patience_counter = 0

    def calculate_generalization_score(self, train_metrics, val_metrics):
        """Map the validation/train accuracy ratio onto a score in [0, 1].

        Returns 0.0 when either metrics dict is empty or the training accuracy
        is below 1 (percent). A ratio >= 0.90 scores 1.0, ratios in
        [0.75, 0.90) are interpolated linearly between 0.5 and 1.0, and lower
        ratios between 0.0 and 0.5.
        """
        if not train_metrics or not val_metrics:
            return 0.0

        train_acc = train_metrics.get('accuracy', 0)
        val_acc = val_metrics.get('accuracy', 0)

        if train_acc < 1:
            return 0.0

        acc_ratio = val_acc / train_acc

        if acc_ratio >= 0.90:
            score = 1.0
        elif acc_ratio >= 0.75:
            score = 0.5 + (acc_ratio - 0.75) / 0.15 * 0.5
        else:
            score = max(0.0, acc_ratio / 0.75 * 0.5)

        return score

    def detect_overfitting(self, train_metrics, val_metrics):
        """Score how strongly the metrics suggest overfitting.

        Returns ``(is_overfitting, severity, recommendations)`` where severity
        is capped at 0.6 and ``is_overfitting`` is ``severity > 0.20``.
        Each call with usable metrics also appends the current generalization
        score to ``self.history`` (used for the declining-trend signal).
        Returns ``(False, 0.0, [])`` without touching the history when metrics
        are missing or the training accuracy is below 1.
        """
        if not train_metrics or not val_metrics:
            return False, 0.0, []

        train_acc = train_metrics.get('accuracy', 0)
        val_acc = val_metrics.get('accuracy', 0)
        train_pnl = train_metrics.get('total_pnl', 0)
        val_pnl = val_metrics.get('total_pnl', 0)

        if train_acc < 1:
            return False, 0.0, []

        acc_ratio = val_acc / train_acc
        signals, severity = [], 0.0

        # Signal 1: validation/train ratio below the configured minimum
        if acc_ratio < self.min_val_ratio:
            gap = self.min_val_ratio - acc_ratio
            signals.append(f"Val/Train ratio: {acc_ratio:.2f} < {self.min_val_ratio}")
            severity += min(0.4, gap * 2.0)  # Cap at 0.4

        # Signal 2: large absolute accuracy difference (percentage points)
        if train_acc > val_acc + 20:
            signals.append(f"Large gap: {train_acc:.1f}% vs {val_acc:.1f}%")
            severity += 0.2

        # Signal 3: PnL divergence (profitable in training, losing in validation)
        if train_pnl > 10 and val_pnl < -5:
            signals.append(f"PnL divergence: ${train_pnl:.2f} vs ${val_pnl:.2f}")
            severity += 0.15

        # Signal 4 (critical): validation accuracy is effectively zero;
        # this overrides the accumulated severity with the maximum.
        if val_acc < 1.0:
            signals.append("CRITICAL: Zero validation accuracy")
            severity = 0.6  # Maximum severity

        # Signal 5: generalization score declining over the last four checks
        gen_score = self.calculate_generalization_score(train_metrics, val_metrics)
        self.history.append(gen_score)

        if len(self.history) >= 4:
            recent = np.mean(self.history[-2:])
            older = np.mean(self.history[-4:-2])
            trend = recent - older

            if trend < -0.15:
                signals.append(f"Declining trend: {trend:.3f}")
                severity += 0.15

        # Cap total severity at 60%
        severity = min(0.6, severity)
        is_overfitting = severity > 0.20

        recommendations = []
        if severity > 0.4:
            recommendations.append("Moderate regularization needed")
        if severity > 0.25:
            recommendations.append("Increase mutation rate slightly")
        if acc_ratio < 0.65:
            recommendations.append("Consider simpler features")

        return is_overfitting, severity, recommendations

    def should_stop_early(self, train_metrics, val_metrics):
        """Decide whether training should stop early.

        Returns ``(stop, reason)``. The generalization score must beat the best
        seen so far by more than 1% to count as an improvement; otherwise the
        patience counter is incremented and training stops once it reaches
        ``self.patience``.
        """
        gen_score = self.calculate_generalization_score(train_metrics, val_metrics)

        if gen_score > self.best_generalization_score * 1.01:
            self.best_generalization_score = gen_score
            self.patience_counter = 0
            return False, "Improving"

        self.patience_counter += 1
        if self.patience_counter >= self.patience:
            return True, f"No improvement for {self.patience} checks"
        return False, f"Patience: {self.patience_counter}/{self.patience}"

    def apply_adaptive_regularization(self, config, severity):
        """Return a shallow copy of *config* regularized according to *severity*.

        Above 0.4 the mutation rate is scaled by 1.5 (capped at 0.25) and the
        confidence range is shifted up by 0.08; above 0.25 the factors are 1.2
        (capped at 0.20) and 0.05. The upper confidence bound is capped at
        0.90 and the lower bound is never allowed to exceed the upper one, so
        repeated application cannot produce an inverted range. At or below
        0.25 the copy is returned unchanged. *config* itself is not modified.
        """
        adjusted_config = config.copy()

        if severity > 0.4:  # Moderate
            factor, rate_cap, shift = 1.5, 0.25, 0.08
            message, level = "  üõ°Ô∏è Moderate overfitting: Applied regularization", "warn"
        elif severity > 0.25:  # Mild
            factor, rate_cap, shift = 1.2, 0.20, 0.05
            message, level = "  üõ°Ô∏è Mild overfitting: Applied light regularization", "info"
        else:
            return adjusted_config

        adjusted_config['mutation_rate'] = min(rate_cap, config['mutation_rate'] * factor)
        upper = min(0.90, config['confidence_range'][1] + shift)
        lower = min(upper, config['confidence_range'][0] + shift)
        adjusted_config['confidence_range'] = (lower, upper)
        log(message, level)

        return adjusted_config

    def get_report(self):
        """Summarize the recorded generalization history as a dict.

        With fewer than two recorded scores a neutral "initializing" report is
        returned. ``best_score`` is reported as 0 while no best score has been
        established yet (i.e. `should_stop_early` has not run), so the report
        never contains ``-inf``.
        """
        if len(self.history) < 2:
            return {
                'generalization_score': 0,
                'best_score': 0,
                'trend': 'üìä Initializing',
                'patience_counter': 0,
                'history': []
            }

        recent_gen = np.mean(self.history[-3:]) if len(self.history) >= 3 else self.history[-1]
        best = self.best_generalization_score
        trend = "üìà Improving" if recent_gen > best * 0.90 else "üìâ Adjusting"

        return {
            'generalization_score': recent_gen,
            'best_score': best if np.isfinite(best) else 0,
            'trend': trend,
            'patience_counter': self.patience_counter,
            'history': self.history[-10:]
        }

class CircuitBreaker:
    """Trips once too many errors have been recorded within a time window.

    ``max_errors`` is the number of errors inside ``time_window`` seconds that
    opens the breaker. Once ``is_open`` is set it is never reset here.
    """

    def __init__(self, max_errors=5, time_window=300):
        self.max_errors = max_errors
        self.time_window = time_window
        self.errors = []
        self.is_open = False

    def record_error(self, error_type: str, severity: str):
        """Record an error and return True if the breaker is (now) tripped."""
        entry = {'type': error_type, 'severity': severity, 'timestamp': datetime.now()}
        self.errors.append(entry)

        # Forget errors that fell out of the sliding window.
        oldest_allowed = datetime.now() - timedelta(seconds=self.time_window)
        self.errors = list(filter(lambda err: err['timestamp'] > oldest_allowed, self.errors))

        if len(self.errors) < self.max_errors:
            return False

        self.is_open = True
        log("üö® CIRCUIT BREAKER ACTIVATED", "error")
        return True

CIRCUIT_BREAKER = CircuitBreaker()

def detect_regime(df: pd.DataFrame) -> MarketRegime:
    """Classify the current market regime from OHLC data.

    Volatility compares the std of the last 20 returns with the std of all
    returns ('high' above 1.5x, 'low' below 0.6x, otherwise 'normal').
    Trend uses an ADX-style indicator built from 14-period rolling means:
    'trending' when the latest value exceeds 25, otherwise 'ranging'.
    Reads the 'high', 'low' and 'close' columns of *df*.
    """
    pct_returns = df['close'].pct_change().dropna()
    recent_vol = pct_returns.tail(20).std()
    overall_vol = pct_returns.std()
    if recent_vol > overall_vol * 1.5:
        vol_label = 'high'
    elif recent_vol < overall_vol * 0.6:
        vol_label = 'low'
    else:
        vol_label = 'normal'

    high = df['high'].values
    low = df['low'].values
    close = df['close'].values

    # Bar-to-bar directional moves and true range (first bar has no predecessor).
    up_move = high[1:] - high[:-1]
    down_move = low[:-1] - low[1:]
    prev_close = close[:-1]
    plus_dm = np.where(up_move > down_move, np.maximum(up_move, 0), 0)
    minus_dm = np.where(down_move > up_move, np.maximum(down_move, 0), 0)
    true_range = np.maximum.reduce([
        high[1:] - low[1:],
        np.abs(high[1:] - prev_close),
        np.abs(low[1:] - prev_close),
    ])

    def _rolling_mean(values):
        return pd.Series(values).rolling(14).mean()

    atr = _rolling_mean(true_range).values
    plus_di = 100 * _rolling_mean(plus_dm).values / (atr + EPS)
    minus_di = 100 * _rolling_mean(minus_dm).values / (atr + EPS)
    dx = 100 * np.abs(plus_di - minus_di) / (plus_di + minus_di + EPS)
    adx = _rolling_mean(dx).iloc[-1]

    if adx > 25:
        trend_label = 'trending'
        regime_strength = min(100, adx * 2)
    else:
        trend_label = 'ranging'
        regime_strength = max(0, 100 - adx * 3)

    return MarketRegime(
        volatility=vol_label,
        trend=trend_label,
        strength=regime_strength,
        timestamp=datetime.now(timezone.utc),
    )

def calc_kelly(wr: float, aw: float, al: float) -> float:
    """Return a quarter-Kelly stake fraction, clipped to [0, 0.25].

    Args:
        wr: Win rate as a fraction in [0, 1].
        aw: Average win (expected positive).
        al: Average loss; only its magnitude is used.

    Returns 0.0 when there is no edge to size: zero win rate, zero average
    loss, or a non-positive average win (which previously raised
    ZeroDivisionError for ``aw == 0``).
    """
    if al == 0 or wr == 0 or aw <= 0:
        return 0.0
    payoff = aw / abs(al)  # win/loss payoff ratio
    kelly = (wr * aw / abs(al) - (1 - wr)) / payoff
    return max(0.0, min(0.25, kelly * 0.25))

def calc_position_size(equity, risk_pct, atr, atr_mult, wr=0.5, aw=1.0, al=1.0, positions=0):
    """Return the position size: the smaller of a Kelly-based and a risk-based size.

    The Kelly size is ``equity * calc_kelly(wr, aw, al)``. The risk size is the
    per-trade risk budget (``equity * risk_pct``, capped at MAX_TRADE_CAP)
    divided by the stop distance ``atr * atr_mult``. The result is halved when
    three or more positions are already open and is never negative.
    """
    kelly_fraction = calc_kelly(wr, aw, al)
    kelly_size = equity * kelly_fraction

    risk_budget = min(equity * risk_pct, MAX_TRADE_CAP)
    risk_size = risk_budget / (atr * atr_mult + EPS)

    size = min(kelly_size, risk_size)
    if positions >= 3:
        size = size * 0.5
    return max(0, size)

def calc_metrics(trades: List[Dict], eq_curve: List[float]) -> AdvancedMetrics:
    """Compute performance statistics from closed trades and an equity curve.

    Args:
        trades: Dicts that each carry a 'pnl' entry.
        eq_curve: Equity values, starting with the initial equity.

    Returns:
        AdvancedMetrics with max drawdown and win rate expressed in percent.
        All fields are zero when there are no trades or fewer than two
        equity points.

    Sharpe and Sortino are annualised with sqrt(252). Sortino falls back to
    the Sharpe value when there are fewer than two negative returns or their
    standard deviation is zero, so it can no longer become inf/nan.
    """
    if not trades or len(eq_curve) < 2:
        return AdvancedMetrics(0, 0, 0, 0, 0, 0, 0, 0, 0)

    pnls = [t['pnl'] for t in trades]
    wins = [p for p in pnls if p > 0]
    losses = [p for p in pnls if p < 0]

    wr = len(wins) / len(pnls)
    aw = np.mean(wins) if wins else 0
    al = np.mean(losses) if losses else 0
    pf = sum(wins) / abs(sum(losses)) if losses else float('inf')
    exp = (wr * aw) + ((1 - wr) * al)

    returns = np.diff(eq_curve) / (np.array(eq_curve[:-1]) + EPS)
    ret_std = np.std(returns)
    sharpe = (np.mean(returns) / ret_std * np.sqrt(252)) if len(returns) > 1 and ret_std > 0 else 0

    down_ret = returns[returns < 0]
    # Guard the downside deviation the same way the Sharpe denominator is guarded.
    down_std = np.std(down_ret) if len(down_ret) > 1 else 0
    sortino = (np.mean(returns) / down_std * np.sqrt(252)) if down_std > 0 else sharpe

    # Maximum drawdown as a fraction of the running peak.
    peak, mdd = eq_curve[0], 0
    for e in eq_curve:
        peak = max(peak, e)
        if peak > 0:  # a non-positive peak has no meaningful relative drawdown
            mdd = max(mdd, (peak - e) / peak)

    total_ret = (eq_curve[-1] - eq_curve[0]) / eq_curve[0] if eq_curve[0] != 0 else 0
    calmar = total_ret / mdd if mdd > 0 else 0

    return AdvancedMetrics(sharpe, sortino, calmar, mdd * 100, wr * 100, pf, aw, al, exp)

def is_weekend() -> bool:
    """Return True when the local date is a Saturday or Sunday."""
    today = datetime.now()
    # weekday(): Monday is 0, so Saturday and Sunday are 5 and 6.
    return today.weekday() in (5, 6)

def fetch_price(pair, timeout=10):
    """Fetch the latest price for *pair* (e.g. "EUR/USD") via Browserless.

    Asks the Browserless content endpoint to render the x-rates calculator
    page and extracts the number following the ``ccOutputRslt`` marker.

    Returns:
        The price as a float, or None when no token is configured, the page
        does not contain a price, or any error occurs. Errors are reported to
        the log and to CIRCUIT_BREAKER; they are never raised.
    """
    if not BROWSERLESS_TOKEN:
        return None
    try:
        fc, tc = pair.split("/")
        url = f"https://production-sfo.browserless.io/content?token={BROWSERLESS_TOKEN}"
        r = requests.post(url, json={"url": f"https://www.x-rates.com/calculator/?from={fc}&to={tc}&amount=1"}, timeout=timeout)
        m = re.search(r'ccOutputRslt[^>]*>([\d,.]+)', r.text)
        return float(m.group(1).replace(",", "")) if m else None
    except Exception as e:
        # Log only the exception type: the message of a requests error can
        # contain the request URL, which embeds the API token.
        log(f"Price fetch failed for {pair}: {type(e).__name__}", "warn")
        CIRCUIT_BREAKER.record_error("price_fetch", "medium")
    return None

def ensure_atr(df):
    """Return *df* with a usable 'atr' column (no NaN, floored at MIN_ATR).

    If an 'atr' column already exists and its median exceeds MIN_ATR, a copy
    with that column cleaned is returned. Otherwise ATR is computed as the
    rolling mean (ATR_PERIOD, min_periods=1) of the true range and written
    into *df* in place, which is then returned. An empty frame is handled and
    simply receives an empty 'atr' column.
    """
    if "atr" in df.columns and df["atr"].median() > MIN_ATR:
        return df.assign(atr=df["atr"].fillna(MIN_ATR).clip(lower=MIN_ATR))

    high, low, close = df["high"].values, df["low"].values, df["close"].values
    prev_close = np.roll(close, 1)
    tr = np.maximum.reduce([high - low, np.abs(high - prev_close), np.abs(low - prev_close)])
    if len(tr) > 0:
        # np.roll wraps the last close to the front; the first bar has no
        # previous close, so its true range is just high - low.
        tr[0] = high[0] - low[0]

    df["atr"] = (
        pd.Series(tr, index=df.index)
        .rolling(ATR_PERIOD, min_periods=1)
        .mean()
        .fillna(MIN_ATR)
        .clip(lower=MIN_ATR)
    )
    return df

def seed_signal(df):
    """Add a 'hybrid_signal' column to *df* unless a non-zero one already exists.

    The signal is a weighted blend of an SMA(10)-SMA(50) spread, the MACD
    histogram (12/26/9), a centred 14-period RSI and the 10-period rate of
    change, smoothed with a 3-span EMA. *df* is modified in place and returned.
    """
    already_seeded = "hybrid_signal" in df.columns and df["hybrid_signal"].abs().sum() > 0
    if already_seeded:
        return df

    # Moving-average spread
    sma_fast = df["close"].rolling(10, min_periods=1).mean()
    sma_slow = df["close"].rolling(50, min_periods=1).mean()

    # MACD histogram
    ema_short = df["close"].ewm(span=12, adjust=False).mean()
    ema_long = df["close"].ewm(span=26, adjust=False).mean()
    macd = ema_short - ema_long
    macd_smoothed = macd.ewm(span=9, adjust=False).mean()

    # RSI from simple 14-period averages of gains and losses
    change = df["close"].diff()
    avg_gain = (change.where(change > 0, 0)).rolling(14).mean()
    avg_loss = (-change.where(change < 0, 0)).rolling(14).mean()
    rsi = 100 - (100 / (1 + avg_gain / (avg_loss + EPS)))

    # 10-period rate of change, in percent
    rate_of_change = df["close"].pct_change(10) * 100

    blended = (
        (sma_fast - sma_slow) * 0.3 +
        (macd - macd_smoothed) * 0.3 +
        ((rsi - 50) / 50) * 0.2 +
        rate_of_change.fillna(0) * 0.2
    ).fillna(0)

    df["hybrid_signal"] = blended.ewm(span=3, adjust=False).mean()

    return df

def update_pickle_data():
    """
    Append the latest live price to every pickled price DataFrame.

    For each pair in PAIRS the current price is fetched with `fetch_price`;
    every matching ``<PAIR>*.pkl`` data file in PICKLE_FOLDER (model and cache
    files are skipped) gets one new OHLC row stamped with the current minute,
    provided that timestamp is newer than the file's last index entry. Frames
    are trimmed to the last 5000 rows and saved gzip-compressed.

    A ``.pkl.bak`` backup is created only immediately before a file is
    overwritten and is removed again afterwards, so no backup files are left
    behind when nothing is written. If writing fails, the backup made in this
    call is restored.

    Returns:
        Number of files that were updated.
    """
    import shutil

    log("üîÑ Updating pickle data with live prices...", "info")
    updated_count = 0
    model_markers = ('_model', '_sgd', '_rf', 'indicator_cache')
    price_cols = ['open', 'high', 'low', 'close']

    for pair in PAIRS:
        latest_price = fetch_price(pair)
        if not latest_price or latest_price <= 0:
            continue

        pair_key = pair.replace("/", "_")
        for pkl_file in PICKLE_FOLDER.glob(f"{pair_key}*.pkl"):
            # Skip model files - only update data files
            if any(x in pkl_file.name for x in model_markers):
                continue

            backup_file = pkl_file.with_suffix('.pkl.bak')
            backup_made = False  # True only while a backup from THIS call exists

            try:
                # NOTE(security): unpickling can execute arbitrary code; these
                # files must only ever come from this project's own output.
                # Try gzipped first, then regular pickle.
                try:
                    df = pd.read_pickle(pkl_file, compression='gzip')
                except Exception:
                    try:
                        df = pd.read_pickle(pkl_file, compression=None)
                    except Exception:
                        continue

                # Verify it's a DataFrame with price data
                if not isinstance(df, pd.DataFrame) or len(df) < 10:
                    continue
                if not all(c in df.columns for c in price_cols):
                    continue

                last_time = df.index[-1]
                new_time = datetime.now().replace(second=0, microsecond=0)

                # Only add if time is newer
                if not new_time > last_time:
                    continue

                new_row = pd.DataFrame({
                    'open': [float(latest_price)],
                    'high': [float(latest_price)],
                    'low': [float(latest_price)],
                    'close': [float(latest_price)],
                    'volume': [0]
                }, index=[new_time])

                df = pd.concat([df, new_row]).tail(5000)

                # Clean data before saving
                df = df.ffill().bfill()
                df = ensure_atr(seed_signal(df))

                # Verify no NaN in critical columns; nothing has been written
                # yet, so the file on disk is still intact.
                if df[price_cols + ['atr']].isna().any().any():
                    log(f"  ‚ö†Ô∏è Skipped {pair} [{pkl_file.stem}] - NaN detected", "warn")
                    continue

                # Back up right before overwriting so a failed write can be undone.
                shutil.copy2(pkl_file, backup_file)
                backup_made = True

                df.to_pickle(pkl_file, compression='gzip')
                updated_count += 1
                log(f"  ‚úÖ Updated {pair} [{pkl_file.stem}] @ {latest_price}", "success")

                # Write succeeded: the backup must no longer be used for rollback.
                backup_made = False
                if backup_file.exists():
                    backup_file.unlink()

            except Exception as e:
                # Roll back only with a backup created in this call; a stale
                # .bak from an earlier run must not overwrite current data.
                if backup_made and backup_file.exists():
                    try:
                        shutil.copy2(backup_file, pkl_file)
                        backup_file.unlink()
                    except OSError as restore_err:
                        log(f"  Backup restore failed for {pkl_file.name}: {restore_err}", "error")
                log(f"  ‚ùå Update failed {pair}: {e}", "error")

    log(f"‚úÖ Updated {updated_count} data files with live prices", "success" if updated_count > 0 else "info")
    return updated_count

def load_data(folder):
    """Load, clean and index the pickled price DataFrames found in *folder*.

    Files are grouped by the currency pair encoded in the first two
    underscore-separated parts of the file name. Only pairs listed in PAIRS
    and timeframes listed in TRAINING_TIMEFRAMES are kept. Each frame must
    have at least 50 rows with open/high/low/close columns; it gets a
    timezone-naive datetime index plus 'hybrid_signal' and 'atr' columns.

    Args:
        folder: Directory (pathlib.Path) containing ``*.pkl`` files.

    Returns:
        ``{pair: {timeframe: DataFrame}}``; empty when the folder is missing
        or holds no usable files.

    Raises:
        Re-raises a per-file processing error once CIRCUIT_BREAKER trips.
    """
    log(f"üìÇ Loading from: {folder}", "info")
    if not folder.exists():
        return {}

    # Model, cache and backup files share the .pkl extension but hold no price data.
    excluded = ['_sgd_model', '_rf_model', 'indicator_cache', 'ultra_', 'alpha_', '_model.pkl', '.bak']
    all_pkl = [p for p in folder.glob("*.pkl") if not any(s in p.name for s in excluded)]
    if not all_pkl:
        return {}

    pair_files = defaultdict(list)
    currencies = ["EUR", "GBP", "USD", "AUD", "NZD", "CAD", "CHF", "JPY"]
    for pkl in all_pkl:
        parts = pkl.stem.split('_')
        if len(parts) >= 2 and parts[0] in currencies and parts[1] in currencies:
            pair_files[f"{parts[0]}_{parts[1]}"].append(pkl)

    # Checked in order; the first label whose marker occurs in the file stem wins.
    timeframe_markers = (
        ("1d", ("1d", "daily")),
        ("1h", ("1h",)),
        ("15m", ("15m",)),
        ("5m", ("5m",)),
        ("1m", ("1m",)),
    )
    price_cols = ['open', 'high', 'low', 'close']

    combined = {}
    for pk, files in pair_files.items():
        pair = f"{pk[:3]}/{pk[4:]}"
        if pair not in PAIRS:
            continue
        pair_data = {}
        for pkl in files:
            try:
                # NOTE(security): unpickling can execute arbitrary code; only
                # load files produced by this project.
                # Try gzipped first, then regular pickle.
                try:
                    df = pd.read_pickle(pkl, compression='gzip')
                except Exception:
                    try:
                        df = pd.read_pickle(pkl, compression=None)
                    except Exception:
                        log(f"‚ùå Failed {pkl.name}: Cannot read file", "error")
                        continue

                if not isinstance(df, pd.DataFrame) or len(df) < 50 or not all(c in df.columns for c in price_cols):
                    continue

                df = df.ffill().bfill()
                df = df.dropna(subset=price_cols)

                if len(df) < 50:
                    continue

                df.index = pd.to_datetime(df.index, errors="coerce")
                if df.index.tz:
                    df.index = df.index.tz_localize(None)
                df = df[df.index.notna()]

                tf = "unified"
                for label, markers in timeframe_markers:
                    if any(marker in pkl.stem for marker in markers):
                        tf = label
                        break
                if tf not in TRAINING_TIMEFRAMES.get(pair, ["1d", "1h"]):
                    continue

                df = ensure_atr(seed_signal(df))

                # Final NaN check on the columns the backtest relies on.
                if df[price_cols + ['atr']].isna().any().any():
                    log(f"‚ö†Ô∏è Skipped {pkl.name}: Contains NaN after processing", "warn")
                    continue

                pair_data[tf] = df
                log(f"‚úÖ {pair} [{tf}]: {len(df)} rows", "success")
            except Exception as e:
                log(f"‚ùå Failed {pkl.name}: {e}", "error")
                if CIRCUIT_BREAKER.record_error("data_load", "low"):
                    raise
        if pair_data:
            combined[pair] = pair_data

    total_rows = sum(len(df) for tfs in combined.values() for df in tfs.values())
    log(f"‚úÖ Loaded {len(combined)} pairs, {total_rows} rows", "success")
    return combined

def split_data(data: Dict, train=0.45, gap=0.05, val=0.30):
    """Split every frame chronologically into train / validation / test parts.

    *data* is ``{pair: {timeframe: DataFrame}}``. Each frame is sorted by
    index; the first ``train`` share becomes the training set, the following
    ``gap`` share is discarded to separate the sets in time, the next ``val``
    share is the validation set and the remainder is the test set.

    Returns three dicts (train, validation, test) with the same nesting.
    """
    train_data, val_data, test_data = {}, {}, {}

    log(f"üìä Walk-Forward Split: Train={train*100:.0f}% | Gap={gap*100:.0f}% | Val={val*100:.0f}% | Test={100-train*100-gap*100-val*100:.0f}%", "info")

    for pair, tfs in data.items():
        pair_train, pair_val, pair_test = {}, {}, {}
        train_data[pair] = pair_train
        val_data[pair] = pair_val
        test_data[pair] = pair_test

        for tf, frame in tfs.items():
            ordered = frame.sort_index()  # ensure chronological order
            total = len(ordered)

            train_stop = int(total * train)
            val_start = int(total * (train + gap))
            val_stop = int(total * (train + gap + val))

            pair_train[tf] = ordered.iloc[:train_stop].copy()
            # Rows between train_stop and val_start form the unused gap.
            pair_val[tf] = ordered.iloc[val_start:val_stop].copy()
            pair_test[tf] = ordered.iloc[val_stop:].copy()

    return train_data, val_data, test_data

def cross_validate_chromosome(data, val_data, tf_map, chrom, n_folds=3):
    """
    Walk-forward cross-validation of a chromosome.

    Train and validation frames are concatenated per pair/timeframe and cut
    into ``n_folds + 1`` consecutive segments. Fold ``k`` trains on everything
    before segment ``k + 1`` and validates on that segment, so every fold has
    training history (expanding window). Previously the first fold had no
    training rows and was back-tested on empty data, which injected zero
    accuracy/PnL into the averages.

    A frame is left out of a fold when it would provide fewer than 50 training
    rows or fewer than 20 validation rows; a fold with no usable frame, or one
    whose backtest raises, is skipped.

    Args:
        data: ``{pair: {timeframe: DataFrame}}`` training data.
        val_data: Same structure; missing pairs/timeframes are tolerated.
        tf_map: Timeframes per pair, passed through to `backtest`.
        chrom: Chromosome to evaluate.
        n_folds: Number of walk-forward folds.

    Returns:
        ``(avg_train, avg_val, consistency)``: two dicts with 'accuracy',
        'total_pnl' and 'sharpe' (always 0), and a consistency score in
        [0, 1] computed as 1 minus the coefficient of variation of the
        validation accuracy. All zeros when fewer than two folds were
        evaluated.
    """
    all_train_acc, all_val_acc = [], []
    all_train_pnl, all_val_pnl = [], []

    # Combine train and validation frames so folds span the full history.
    combined_data = {}
    for pair, tfs in data.items():
        combined_data[pair] = {}
        for tf, train_df in tfs.items():
            val_df = (val_data or {}).get(pair, {}).get(tf)
            combined_data[pair][tf] = train_df if val_df is None else pd.concat([train_df, val_df])

    for fold in range(n_folds):
        fold_train_data, fold_val_data = {}, {}
        fold_has_data = False

        for pair, tfs in combined_data.items():
            fold_train_data[pair], fold_val_data[pair] = {}, {}

            for tf, df in tfs.items():
                n = len(df)
                fold_size = n // (n_folds + 1)

                # Segment 0 is training-only; fold k validates on segment k + 1.
                val_start = (fold + 1) * fold_size
                val_end = min(val_start + fold_size, n)

                if val_start < 50 or (val_end - val_start) < 20:
                    continue

                fold_train_data[pair][tf] = df.iloc[:val_start].copy()
                fold_val_data[pair][tf] = df.iloc[val_start:val_end].copy()
                fold_has_data = True

        # Back-testing empty data would only contribute meaningless zeros.
        if not fold_has_data:
            continue

        try:
            tm = backtest(fold_train_data, tf_map, chrom, sample_rate=0.4)
            vm = backtest(fold_val_data, tf_map, chrom, sample_rate=0.4)

            all_train_acc.append(tm['accuracy'])
            all_val_acc.append(vm['accuracy'])
            all_train_pnl.append(tm['total_pnl'])
            all_val_pnl.append(vm['total_pnl'])
        except Exception:
            # A failing fold is skipped; too few folds yields the neutral result.
            continue

    if len(all_train_acc) < 2:
        return {'accuracy': 0, 'total_pnl': 0, 'sharpe': 0}, \
               {'accuracy': 0, 'total_pnl': 0, 'sharpe': 0}, \
               0.0

    avg_train = {
        'accuracy': np.mean(all_train_acc),
        'total_pnl': np.mean(all_train_pnl),
        'sharpe': 0
    }
    avg_val = {
        'accuracy': np.mean(all_val_acc),
        'total_pnl': np.mean(all_val_pnl),
        'sharpe': 0
    }

    # Consistency: 1 - coefficient of variation of validation accuracy.
    val_acc_std = np.std(all_val_acc)
    val_acc_mean = np.mean(all_val_acc) + 1e-6

    cv = val_acc_std / val_acc_mean
    consistency = max(0.0, min(1.0, 1.0 - cv))

    return avg_train, avg_val, consistency

def create_chromosome(tf_map, config, learning, model):
    """Create a chromosome, either mutated from a past winner or random.

    Layout: ``[atr_sl, atr_tp, risk, confidence, *timeframe_weights]``.
    When *learning* knows good chromosomes for *model*, one of its top five
    is reused with 30% probability: each gene is then perturbed with 30%
    probability by Gaussian noise, the first four clipped to their config
    range and the weights floored at 0.01. Otherwise the first four genes are
    drawn uniformly from the config ranges and each pair in PAIRS gets
    Dirichlet-distributed timeframe weights.
    """
    best = learning.get_best_chromosomes(model, top_n=5)
    if best and random.random() < 0.3:
        base = random.choice(best).copy()
        # (noise sigma, config range key) for the four bounded leading genes
        bounded_genes = (
            (0.2, 'atr_sl_range'),
            (0.2, 'atr_tp_range'),
            (0.003, 'risk_range'),
            (0.05, 'confidence_range'),
        )
        for idx, gene in enumerate(base):
            if random.random() >= 0.3:
                continue
            if idx < len(bounded_genes):
                sigma, range_key = bounded_genes[idx]
                mutated = np.clip(gene + random.gauss(0, sigma), *config[range_key])
            else:
                mutated = max(0.01, gene + random.gauss(0, 0.1))
            base[idx] = float(mutated)
        return base

    range_keys = ('atr_sl_range', 'atr_tp_range', 'risk_range', 'confidence_range')
    chrom = [float(random.uniform(*config[key])) for key in range_keys]
    for p in PAIRS:
        weight_count = max(1, len(tf_map.get(p, [])))
        chrom.extend(np.random.dirichlet(np.ones(weight_count) * 2.0).tolist())
    return chrom

def decode_chromosome(chrom, tf_map):
    """Unpack a chromosome into trading parameters.

    Returns ``(atr_sl, atr_tp, risk, conf, tf_w)``. The two ATR multipliers
    are clipped to [1.0, MAX_ATR_SL] and [1.0, MAX_ATR_TP]; ``tf_w`` maps each
    pair in PAIRS to ``{timeframe: weight}`` with weights normalised to sum to
    one (uniform weights when the raw genes do not sum to a positive value).
    Every pair consumes at least one gene, even without timeframes in *tf_map*.
    """
    atr_sl = np.clip(chrom[0], 1.0, MAX_ATR_SL)
    atr_tp = np.clip(chrom[1], 1.0, MAX_ATR_TP)
    risk = chrom[2]
    conf = chrom[3]

    tf_w = {}
    cursor = 4  # position of the first timeframe-weight gene
    for p in PAIRS:
        timeframes = tf_map.get(p, [])
        gene_count = max(1, len(timeframes))
        genes = np.array(chrom[cursor:cursor + gene_count], dtype=float)
        total = genes.sum()
        if total > 0:
            weights = genes / (total + EPS)
        else:
            weights = np.ones(gene_count) / gene_count
        tf_w[p] = {tf: float(wt) for tf, wt in zip(timeframes, weights)}
        cursor += gene_count

    return atr_sl, atr_tp, risk, conf, tf_w

def backtest(data, tf_map, chrom, sample_rate=0.5):
    """
    Simulate trading the strategy encoded in *chrom* over *data*.

    Args:
        data: ``{pair: {timeframe: DataFrame}}``; frames are read for 'close'
            and, when present, 'hybrid_signal' and 'atr'.
        tf_map: Timeframes per pair; used to decode the chromosome and to look
            up exit prices.
        chrom: Chromosome as understood by `decode_chromosome`.
        sample_rate: When below 1.0 and more than 500 timestamps exist, only
            every ``int(1/sample_rate)``-th timestamp is simulated.

    Returns:
        Dict with trade counts, 'accuracy' (win rate in percent), 'total_pnl',
        risk metrics, the `AdvancedMetrics` object, the trade list and the
        equity curve. With fewer than 10 trades most metrics are scaled down
        by ``len(trades) / 10``.
    """
    atr_sl, atr_tp, risk, conf, tf_w = decode_chromosome(chrom, tf_map)
    equity, eq_curve, trades = BASE_CAPITAL, [BASE_CAPITAL], []
    position, positions = None, 0

    # Union of all timestamps across every pair and timeframe, in order.
    all_times = sorted(set().union(*[df.index for tfs in data.values() for df in tfs.values()]))
    if sample_rate<1.0 and len(all_times)>500:
        all_times = all_times[::int(1/sample_rate)]

    for t in all_times:
        # --- Exit handling: check the open position against its TP/SL ---
        if position:
            pair, price = position['pair'], 0
            # Use the close of the first timeframe that has a bar at t.
            for tf in tf_map.get(pair,[]):
                if tf in data.get(pair,{}) and t in data[pair][tf].index:
                    price = data[pair][tf].loc[t, 'close']
                    break
            if price>0:
                hit_tp = (position['dir']=='BUY' and price>=position['tp']) or (position['dir']=='SELL' and price<=position['tp'])
                hit_sl = (position['dir']=='BUY' and price<=position['sl']) or (position['dir']=='SELL' and price>=position['sl'])
                if hit_tp or hit_sl:
                    # The exit is filled at the TP/SL level itself, not at the observed close.
                    ep = position['tp'] if hit_tp else position['sl']
                    pnl = ((ep-position['entry'])*position['size'] if position['dir']=='BUY' else (position['entry']-ep)*position['size'])
                    # Trading costs. NOTE(review): the first term is not scaled by
                    # position size, unlike the second — confirm this is intended.
                    pnl -= ep*(0.0002+0.0001) + ep*position['size']*0.0005
                    equity += pnl
                    eq_curve.append(equity)
                    trades.append({'pnl':pnl, 'correct':hit_tp, 'entry':position['entry'], 'exit':ep})
                    position, positions = None, positions-1

        # --- Entry handling: at most one position is held at a time ---
        if position is None and positions<3:
            for pair in PAIRS:
                sig, price, atr = 0, 0, MIN_ATR
                # Weighted sum of the per-timeframe signals; price and ATR come
                # from the last timeframe that has a bar at t.
                for tf, w in tf_w.get(pair,{}).items():
                    if tf in data.get(pair,{}) and t in data[pair][tf].index:
                        row = data[pair][tf].loc[t]
                        sig += row.get('hybrid_signal',0)*w
                        price, atr = row['close'], max(row.get('atr',MIN_ATR), MIN_ATR)

                direction = 'BUY' if sig>0 else 'SELL' if sig<0 else None
                # Enter only when the signal magnitude exceeds the confidence threshold.
                if direction and abs(sig)>conf and price>0:
                    # Size the trade from the running win rate and average win/loss
                    # (defaults of 0.5 / 1.0 / 1.0 while no history exists).
                    wr = len([t for t in trades if t['correct']])/len(trades) if trades else 0.5
                    wins, losses = [t['pnl'] for t in trades if t['pnl']>0], [t['pnl'] for t in trades if t['pnl']<0]
                    size = calc_position_size(equity, risk, atr, atr_sl, wr, np.mean(wins) if wins else 1.0, np.mean(losses) if losses else 1.0, positions)

                    # Stop-loss and take-profit sit ATR multiples away from the entry.
                    sl = price-(atr*atr_sl) if direction=='BUY' else price+(atr*atr_sl)
                    tp = price+(atr*atr_tp) if direction=='BUY' else price-(atr*atr_tp)
                    position = {'pair':pair, 'dir':direction, 'entry':price, 'sl':sl, 'tp':tp, 'size':size}
                    positions += 1
                    break

    metrics = calc_metrics(trades, eq_curve)
    # Penalise small samples: scale metrics proportionally when under 10 trades
    # (max drawdown, average win and average loss are left untouched).
    if len(trades)<10:
        penalty = len(trades)/10
        metrics = AdvancedMetrics(metrics.sharpe*penalty, metrics.sortino*penalty, metrics.calmar*penalty,
                                  metrics.max_drawdown, metrics.win_rate*penalty, metrics.profit_factor*penalty,
                                  metrics.avg_win, metrics.avg_loss, metrics.expectancy*penalty)

    return {
        'total_trades': len(trades), 'winning_trades': sum(1 for t in trades if t['correct']),
        'accuracy': metrics.win_rate, 'total_pnl': sum(t['pnl'] for t in trades) if trades else 0,
        'sharpe': metrics.sharpe, 'sortino': metrics.sortino, 'max_drawdown': metrics.max_drawdown,
        'profit_factor': metrics.profit_factor, 'advanced_metrics': metrics, 'trades': trades, 'equity_curve': eq_curve
    }

def eval_chrom_parallel(args):
    """Score one chromosome; usable as a ``Pool.map`` worker.

    Args:
        args: 6-tuple ``(chrom, data, tf_map, sr, use_val, val_data)``.
            ``sr`` is forwarded to ``backtest`` as ``sample_rate``. When
            ``use_val`` is true and ``val_data`` is truthy the chromosome is
            also backtested on ``val_data`` and the two results are blended
            40% train / 60% validation.

    Returns:
        ``(fitness, chrom)``. If backtesting or scoring raises an
        ``Exception`` the chromosome is returned with fitness ``0.0`` so one
        bad candidate cannot abort a whole batch.
    """
    chrom, data, tf_map, sr, use_val, val_data = args
    try:
        tm = backtest(data, tf_map, chrom, sample_rate=sr)
        if use_val and val_data:
            vm = backtest(val_data, tf_map, chrom, sample_rate=sr)
            # BALANCED fitness function
            fitness = (
                (tm['total_pnl']*0.4 + vm['total_pnl']*0.6)*0.3 +
                ((tm['accuracy']*0.4 + vm['accuracy']*0.6)/100)*35 +
                (tm['sharpe']*0.4 + vm['sharpe']*0.6)*15 +
                (tm['sortino']*0.4 + vm['sortino']*0.6)*10 +
                -(tm['max_drawdown']*0.4 + vm['max_drawdown']*0.6)*0.5
            )
            # 1e-6 keeps the ratio finite when train accuracy is zero.
            val_train_ratio = vm['accuracy']/(tm['accuracy']+1e-6)
            # BALANCED penalty: validation accuracy below 75% of train accuracy.
            if val_train_ratio<0.75:
                fitness -= (0.75-val_train_ratio)*150
            # Penalise too few (<15) or too many (>100) training trades.
            nt = tm['total_trades']
            if nt<15: fitness -= (15-nt)*8
            elif nt>100: fitness -= (nt-100)*0.8
        else:
            fitness = tm['total_pnl']*0.3 + (tm['accuracy']/100)*30 + tm['sharpe']*15 + tm['sortino']*10 - tm['max_drawdown']*0.5
        return (fitness, chrom)
    except Exception:
        # Deliberate best-effort fallback. ``Exception`` (not a bare except)
        # lets SystemExit/KeyboardInterrupt still stop the worker.
        return (0.0, chrom)

def _ga_eval_batch(args, use_par):
    """Score a list of ``eval_chrom_parallel`` argument tuples.

    Uses a process pool when ``use_par`` is true and there is more than one
    item, otherwise evaluates sequentially. Returns the ``(fitness, chrom)``
    results in input order.
    """
    if use_par and len(args)>1:
        with Pool(processes=min(cpu_count(), len(args))) as pool:
            return pool.map(eval_chrom_parallel, args)
    return [eval_chrom_parallel(a) for a in args]


def run_ga(data, tf_map, model, config, val_data=None):
    """Evolve a trading chromosome for ``model`` with a genetic algorithm.

    Stages: seed the population from stored historical bests plus random
    chromosomes; evolve with elitism, tournament selection, crossover and
    clipped Gaussian mutation; every third generation cross-validate the
    leader when validation data is present; fine-tune the chosen chromosome
    with small perturbations; finally backtest it with ``sample_rate=1.0``.

    Args:
        data: Training data forwarded to ``backtest``.
        tf_map: Pair -> timeframes mapping forwarded to ``backtest`` and
            ``create_chromosome``.
        model: Model name used for logging and for the history lookup.
        config: Settings dict. Reads ``color``, ``pop_size``, ``generations``,
            ``elite_ratio``, ``multi_start``, ``mutation_rate``,
            ``atr_sl_range``, ``atr_tp_range``, ``risk_range`` and
            ``confidence_range``; optional ``crossover_rate`` (0.8),
            ``use_parallel`` (True), ``sample_rate`` (0.30) and
            ``adaptive_mutation`` (True). The local name is rebound when
            adaptive regularization is applied.
        val_data: Optional validation data. When non-empty, enables the
            anti-overfitting checks and validation-aware fitness.

    Returns:
        dict with ``chromosome``, ``metrics`` (full-sample train backtest),
        ``val_metrics`` (``None`` without ``val_data``) and
        ``antioverfit_report`` (``None`` when validation is off).
    """
    log(f"{config['color']} Training {model}...", "info")
    log(f"  üìä Pop: {config['pop_size']} | Gen: {config['generations']}", "info")

    ps, gens = config['pop_size'], config['generations']
    elite_r, multi = config['elite_ratio'], config['multi_start']
    base_mut, cross_r = config['mutation_rate'], config.get('crossover_rate', 0.8)
    use_par, use_val = config.get('use_parallel', True), val_data is not None and len(val_data)>0

    antioverfit_ai = BalancedAntiOverfitAI(min_val_ratio=0.75, patience=4)
    log(f"  üõ°Ô∏è Balanced Anti-Overfitting AI: Active (target ratio ‚â•0.75)", "info")

    # v11.0: Detailed initialization logging
    log(f"  üå± Init with {multi} multi-start...", "info")
    all_cand = []
    best_hist = LEARNING.get_best_chromosomes(model, top_n=min(8, ps//2))

    # Re-score stored historical chromosomes on the current data.
    if best_hist:
        sr = config.get('sample_rate', 0.30)
        args = [(c, data, tf_map, sr, use_val, val_data) for c in best_hist]
        all_cand.extend(_ga_eval_batch(args, use_par))
        print(f"    Evaluated {len(best_hist)} historical seeds")

    log(f"  üé≤ Generating random...", "info")
    # Oversample: evaluate pop_size*multi_start candidates, keep the best pop_size.
    target = ps*multi
    rand_chroms = [create_chromosome(tf_map, config, LEARNING, model) for _ in range(target-len(all_cand))]

    sr = config.get('sample_rate', 0.30)
    args = [(c, data, tf_map, sr, use_val, val_data) for c in rand_chroms]

    # v11.0: Progress tracking during initialization
    if use_par and len(args)>1:
        # Roughly ten batches so progress can be printed between them.
        batch_size = max(1, len(args)//10)
        for i in range(0, len(args), batch_size):
            batch = args[i:i+batch_size]
            with Pool(processes=min(cpu_count(), len(batch))) as pool:
                all_cand.extend(pool.map(eval_chrom_parallel, batch))
            print(f"    Candidates: {len(all_cand)}/{target} ({len(all_cand)/target*100:.0f}%)", end='\r')
    else:
        for a in args:
            all_cand.append(eval_chrom_parallel(a))
            print(f"    Candidates: {len(all_cand)}/{target} ({len(all_cand)/target*100:.0f}%)", end='\r')

    print()
    # Population entries are (fitness, chromosome) tuples, best first.
    pop = sorted(all_cand, reverse=True)[:ps]
    best_ever, stag = pop[0][0], 0
    log(f"  ‚úÖ Init ready | Best: {best_ever:.2f}", "success")
    log(f"  üß¨ Evolution...", "info")

    best_generalization_chrom = pop[0][1]
    best_generalization_fitness = -float('inf')

    for gen in range(gens):
        t0 = time.time()

        # Diversity: mean pairwise Euclidean distance over a sample of up to
        # 10 chromosomes, scaled by 5 and capped at 1.0.
        if len(pop)>=2:
            samp = random.sample([ind[1] for ind in pop], min(10, len(pop)))
            dists = [np.linalg.norm(np.array(samp[i])-np.array(samp[j])) for i in range(len(samp)) for j in range(i+1, len(samp))]
            div = min(1.0, np.mean(dists)/5.0) if dists else 1.0
        else:
            div = 1.0

        prog = gen/gens

        # Check overfitting every 3 generations with 3-fold CV
        if use_val and gen > 0 and gen % 3 == 0:
            best_chrom = pop[0][1]
            avg_train, avg_val, consistency = cross_validate_chromosome(data, val_data, tf_map, best_chrom, n_folds=3)

            gen_score_base = antioverfit_ai.calculate_generalization_score(avg_train, avg_val)
            gen_score = gen_score_base * consistency

            is_overfit, severity, recommendations = antioverfit_ai.detect_overfitting(avg_train, avg_val)

            if is_overfit:
                log(f"  üõ°Ô∏è Overfitting detected (severity={severity:.2f}, consistency={consistency:.2f})", "warn")
                for rec in recommendations:
                    log(f"    ‚Üí {rec}", "info")

                config = antioverfit_ai.apply_adaptive_regularization(config, severity)
                base_mut = config['mutation_rate']

            if gen_score > best_generalization_fitness:
                best_generalization_fitness = gen_score
                best_generalization_chrom = best_chrom.copy()
                log(f"  üõ°Ô∏è New best: GenScore={gen_score:.2f} (base={gen_score_base:.2f}, consistency={consistency:.2f})", "success")

        # Adaptive mutation: x1.5 when diversity is low, x0.7 late in the run.
        mut_r = base_mut * (1.5 if div<0.3 else 0.7 if prog>0.7 else 1.0) if config.get('adaptive_mutation', True) else base_mut

        elite_cnt = max(2, int(ps*elite_r))
        new_pop = pop[:elite_cnt].copy()
        offspring = []

        while len(offspring) < ps-elite_cnt:
            # Tournament size 3 when diverse, 5 otherwise.
            ts = 3 if div>0.5 else 5
            p1 = max(random.sample(pop, min(ts, len(pop))), key=lambda x:x[0])[1]
            p2 = max(random.sample(pop, min(ts, len(pop))), key=lambda x:x[0])[1]

            if random.random()<cross_r:
                if prog<0.5:
                    # Multi-point crossover in the first half of the run. Cap
                    # the number of cut points at what the chromosome offers
                    # so random.sample cannot raise on short chromosomes.
                    n_cuts = min(random.randint(2,4), len(p1)-1)
                    pts = sorted(random.sample(range(1, len(p1)), n_cuts))
                    child, cur, last = [], p1, 0
                    for pt in pts+[len(p1)]:
                        child.extend(cur[last:pt])
                        cur, last = p2 if cur==p1 else p1, pt
                else:
                    # Single-point crossover in the second half.
                    pt = random.randint(1, len(p1)-1)
                    child = p1[:pt]+p2[pt:]
            else:
                child = p1.copy()

            # Genes 0-3 are clipped to their configured ranges (ATR stop-loss,
            # ATR take-profit, risk, confidence); later genes are floored at 0.01.
            for i in range(len(child)):
                if random.random()<mut_r:
                    scale = 0.3 if prog<0.5 else 0.15
                    if i==0: child[i] = float(np.clip(child[i]+random.gauss(0,scale), *config['atr_sl_range']))
                    elif i==1: child[i] = float(np.clip(child[i]+random.gauss(0,scale), *config['atr_tp_range']))
                    elif i==2: child[i] = float(np.clip(child[i]+random.gauss(0,0.005 if prog<0.5 else 0.002), *config['risk_range']))
                    elif i==3: child[i] = float(np.clip(child[i]+random.gauss(0,0.1 if prog<0.5 else 0.05), *config['confidence_range']))
                    else: child[i] = float(max(0.01, child[i]+random.gauss(0,0.2 if prog<0.5 else 0.1)))
            offspring.append(child)

        sr = config.get('sample_rate', 0.30)
        args = [(c, data, tf_map, sr, use_val, val_data) for c in offspring]
        off_res = _ga_eval_batch(args, use_par)

        new_pop.extend(off_res)
        pop = sorted(new_pop, reverse=True)

        cur_best = pop[0][0]
        # Require a 1% relative improvement. For a negative best_ever the bar
        # has to move up (x0.99); x1.01 would lower it, so every generation
        # would count as an improvement and stagnation could never be seen.
        improve_bar = best_ever*1.01 if best_ever>=0 else best_ever*0.99
        if cur_best > improve_bar:
            best_ever, stag, icon = cur_best, 0, "üìà"
        else:
            stag, icon = stag+1, "üìä"

        # v11.0: Detailed generation logging
        gen_info = f" | GenScore={best_generalization_fitness:.2f}" if use_val and best_generalization_fitness > -float('inf') else ""
        log(f"  {icon} Gen {gen+1}/{gens}: Best={cur_best:.2f} | Avg={np.mean([i[0] for i in pop]):.2f} | Div={div:.2f} | Mut={mut_r:.3f}{gen_info} | T={time.time()-t0:.1f}s", "info")

        # Early stop: stagnant and collapsed population, either at a poor
        # fitness (warning) or at a good one (success).
        if stag>=5 and cur_best<50 and div<0.2:
            log(f"  ‚ö†Ô∏è Early stop at gen {gen+1}", "warn")
            break
        if stag>=6 and cur_best>100 and div<0.15:
            log(f"  üéØ Early stop at gen {gen+1}", "success")
            break

    # Prefer the chromosome with the best cross-validated score, if any was recorded.
    if best_generalization_fitness > -float('inf'):
        final_chrom = best_generalization_chrom
        log(f"  üõ°Ô∏è Using best generalization model (score={best_generalization_fitness:.2f})", "success")
    else:
        final_chrom = pop[0][1]

    log(f"  üîß Fine-tuning...", "info")
    # Six perturbed copies; each gene is mutated with probability 0.2.
    ref_chroms = []
    for _ in range(6):
        ref = final_chrom.copy()
        for i in range(len(ref)):
            if random.random()<0.2:
                if i==0: ref[i] = float(np.clip(ref[i]+random.gauss(0,0.03), *config['atr_sl_range']))
                elif i==1: ref[i] = float(np.clip(ref[i]+random.gauss(0,0.03), *config['atr_tp_range']))
                elif i==2: ref[i] = float(np.clip(ref[i]+random.gauss(0,0.0005), *config['risk_range']))
                elif i==3: ref[i] = float(np.clip(ref[i]+random.gauss(0,0.01), *config['confidence_range']))
                else: ref[i] = float(max(0.01, ref[i]+random.gauss(0,0.03)))
        ref_chroms.append(ref)

    sr = config.get('sample_rate', 0.30)
    args = [(c, data, tf_map, sr, use_val, val_data) for c in ref_chroms]
    ref_res = _ga_eval_batch(args, use_par)

    best_refined_chrom = final_chrom
    best_refined_gen_score = best_generalization_fitness
    # Without validation, a refinement must beat the population best and every
    # earlier accepted refinement, so the best one wins rather than the last.
    best_refined_fit = pop[0][0]

    for fit, ref in ref_res:
        if use_val:
            tm_ref = backtest(data, tf_map, ref, sample_rate=0.4)
            vm_ref = backtest(val_data, tf_map, ref, sample_rate=0.4)
            gen_score_ref = antioverfit_ai.calculate_generalization_score(tm_ref, vm_ref)

            if gen_score_ref > best_refined_gen_score:
                best_refined_gen_score = gen_score_ref
                best_refined_chrom = ref
        elif fit > best_refined_fit:
            best_refined_fit = fit
            best_refined_chrom = ref

    final_chrom = best_refined_chrom

    print()
    log(f"  üìä Final validation on full data...", "info")
    tm = backtest(data, tf_map, final_chrom, sample_rate=1.0)
    vm = None

    if val_data:
        vm = backtest(val_data, tf_map, final_chrom, sample_rate=1.0)
        log(f"  üìä Train: {tm['accuracy']:.1f}% | ${tm['total_pnl']:.2f}", "info")
        log(f"  üìä Val: {vm['accuracy']:.1f}% | ${vm['total_pnl']:.2f}", "info")

        is_overfit, severity, recommendations = antioverfit_ai.detect_overfitting(tm, vm)

        # 1e-6 keeps the ratio finite when train accuracy is zero.
        ovr = vm['accuracy']/(tm['accuracy']+1e-6)
        if severity > 0.4:
            log(f"  üõ°Ô∏è AI: Moderate overfitting (severity={severity:.2f}, ratio={ovr:.2f})", "warn")
        elif severity > 0.20:
            log(f"  üõ°Ô∏è AI: Mild overfitting (severity={severity:.2f}, ratio={ovr:.2f})", "info")
        elif ovr < 0.75:
            log(f"  üõ°Ô∏è AI: Minor concern (Val/Train={ovr:.2f})", "info")
        else:
            log(f"  üõ°Ô∏è AI: EXCELLENT generalization (Val/Train={ovr:.2f})", "success")

        ai_report = antioverfit_ai.get_report()
        log(f"  üõ°Ô∏è AI Report: GenScore={ai_report['generalization_score']:.2f} | {ai_report['trend']}", "info")

    log(f"  ‚úÖ {model}: Sharpe={tm['sharpe']:.2f} | Sortino={tm['sortino']:.2f} | MaxDD={tm['max_drawdown']:.1f}%", "success")

    return {
        'chromosome': final_chrom,
        'metrics': tm,
        'val_metrics': vm,
        'antioverfit_report': antioverfit_ai.get_report() if use_val else None
    }

def gen_signals(data, tf_map, chrom, model, ct, use_live=True):
    """Build one trading signal per pair from the latest bar of each timeframe.

    Args:
        data: ``{pair: {timeframe: frame}}``; only the last row of each frame
            is read (``hybrid_signal``, ``close``, ``atr``).
        tf_map: Forwarded to ``decode_chromosome``.
        chrom: Chromosome decoded into ATR stop/target multipliers, risk,
            confidence threshold and per-pair timeframe weights.
        model: Model name copied into every signal.
        ct: Datetime whose ``isoformat()`` becomes the signal timestamp.
        use_live: When true, ``fetch_price`` is tried first; the last
            historical close is the fallback.

    Returns:
        ``{pair: signal_dict}`` for every pair in ``PAIRS`` that has data and
        a positive price.
    """
    atr_sl, atr_tp, risk, conf, tf_w = decode_chromosome(chrom, tf_map)
    tf_rank = {'1d':4,'1h':2,'15m':1}
    out = {}

    for pair in PAIRS:
        frames = data.get(pair, {})
        if not frames:
            continue

        # Regime is detected on the highest-ranked timeframe available.
        primary_tf = max(frames.keys(), key=lambda name: tf_rank.get(name, 0))
        regime = detect_regime(frames[primary_tf])

        # Weighted sum of the latest hybrid signals; price and ATR come from
        # the last weighted timeframe that has rows.
        strength, fallback_price, atr = 0, 0, MIN_ATR
        for tf, weight in tf_w.get(pair, {}).items():
            if tf not in frames or len(frames[tf])<=0:
                continue
            last_bar = frames[tf].iloc[-1]
            strength += last_bar.get('hybrid_signal', 0)*weight
            fallback_price = last_bar['close']
            atr = max(last_bar.get('atr', MIN_ATR), MIN_ATR)

        live = fetch_price(pair) if use_live else None
        price = live if live and live>0 else fallback_price
        if price<=0:
            continue

        # Ranging markets need a stronger signal (1.5x threshold) to trade.
        if regime.trend=='ranging' and abs(strength)<conf*1.5:
            direction = 'HOLD'
        elif strength>0:
            direction = 'BUY'
        elif strength<0:
            direction = 'SELL'
        else:
            direction = 'HOLD'

        # Piecewise score: 30-50 below half the threshold, 50-70 up to the
        # threshold, 70-90 above it; finally clipped to [25, 95].
        magnitude = abs(strength)
        half_conf = conf*0.5
        if magnitude<half_conf:
            raw_score = 30+(magnitude/half_conf)*20
        elif magnitude<conf:
            raw_score = 50+((magnitude-half_conf)/half_conf)*20
        else:
            raw_score = 70+min((magnitude-conf)/conf*20, 20)
        confidence = int(np.clip(raw_score, 25, 95))

        if direction=="BUY":
            sl, tp = price-(atr*atr_sl), price+(atr*atr_tp)
        elif direction=="SELL":
            sl, tp = price+(atr*atr_sl), price-(atr*atr_tp)
        else:
            sl = tp = price

        stop_dist = abs(price-sl)
        rr = abs(tp-price)/stop_dist if stop_dist>0 else 0

        out[pair] = {
            'direction': direction,
            'last_price': float(price),
            'SL': float(sl),
            'TP': float(tp),
            'atr': float(atr),
            'score_1_100': confidence,
            'signal_strength': float(strength),
            'model': model,
            'timestamp': ct.isoformat(),
            'rr_ratio': float(rr),
            'regime': {'volatility': regime.volatility, 'trend': regime.trend, 'strength': regime.strength},
        }
    return out

class IterationCounter:
    """Run counter persisted to a pickle file.

    ``data`` holds ``total`` (number of recorded runs), ``start`` (ISO
    timestamp of first creation) and ``history`` (the most recent 1000 runs).
    """

    def __init__(self, file=OMEGA_ITERATION_FILE):
        self.file = file
        self.data = self._load()

    def _load(self):
        """Return the stored state, or a fresh state if the file is missing or unreadable."""
        if self.file.exists():
            # NOTE(security): pickle.load can execute arbitrary code; this file
            # must only ever be one this program wrote itself.
            try:
                with open(self.file, 'rb') as fh:
                    return pickle.load(fh)
            except Exception:
                # Best-effort: a corrupt or incompatible file restarts the count.
                pass
        return {'total': 0, 'start': datetime.now(timezone.utc).isoformat(), 'history': []}

    def increment(self, success=True):
        """Record one run, persist the state and return the new total."""
        self.data['total'] += 1
        self.data['history'].append({'iteration': self.data['total'], 'time': datetime.now(timezone.utc).isoformat(), 'success': success})
        if len(self.data['history'])>1000: self.data['history'] = self.data['history'][-1000:]
        # Context manager guarantees the file is flushed and closed.
        with open(self.file, 'wb') as fh:
            pickle.dump(self.data, fh, protocol=4)
        return self.data['total']

    def get_stats(self):
        """Return ``total``, elapsed ``days`` (at least 1) and runs ``per_day``."""
        sd = datetime.fromisoformat(self.data['start'])
        if sd.tzinfo is None:
            # A naive stored timestamp cannot be subtracted from an aware
            # "now"; treat it as UTC instead of raising TypeError.
            sd = sd.replace(tzinfo=timezone.utc)
        days = max(1, (datetime.now(timezone.utc)-sd).days)
        return {'total': self.data['total'], 'days': days, 'per_day': self.data['total']/days}

class MemorySystem:
    """Pickle-backed store of emitted signals.

    ``data`` holds ``signals`` (the most recent 1000 non-HOLD signals),
    ``trades`` and ``created_at``.
    """

    def __init__(self, file=OMEGA_MEMORY_FILE):
        self.file = file
        self.data = self._load()

    def _load(self):
        """Return the stored state, or a fresh state if the file is missing or unreadable."""
        if self.file.exists():
            # NOTE(security): pickle.load can execute arbitrary code; this file
            # must only ever be one this program wrote itself.
            try:
                with open(self.file, 'rb') as fh:
                    return pickle.load(fh)
            except Exception:
                # Best-effort: a corrupt or incompatible file starts empty.
                pass
        return {'signals': [], 'trades': [], 'created_at': datetime.now(timezone.utc).isoformat()}

    def store_signals(self, sigs, ts):
        """Append every non-HOLD signal from ``{model: {pair: signal}}`` and persist.

        ``ts`` is a datetime; its ``isoformat()`` is stored with each entry.
        """
        for model, s in sigs.items():
            for pair, sig in s.items():
                if sig['direction']!='HOLD':
                    self.data['signals'].append({
                        'timestamp': ts.isoformat(), 'model': model, 'pair': pair,
                        'direction': sig['direction'], 'entry': sig['last_price'],
                        'sl': sig['SL'], 'tp': sig['TP'], 'confidence': sig['score_1_100']
                    })
        if len(self.data['signals'])>1000: self.data['signals'] = self.data['signals'][-1000:]
        self._save()

    def _save(self):
        """Write the current state to disk."""
        # Context manager guarantees the file is flushed and closed.
        with open(self.file, 'wb') as fh:
            pickle.dump(self.data, fh, protocol=4)

    def close(self):
        """Persist the current state."""
        self._save()

class LearningSystem:
    """Pickle-backed memory of successful chromosomes and the PnL learning curve.

    ``data`` holds ``iterations``, ``successful_patterns`` (per-model lists
    under ``"<model>_success"``, capped at 50), ``learning_curve`` (the last
    100 total-PnL values) and ``adaptation_score`` (0-100).
    """

    def __init__(self, file=OMEGA_LEARNING_FILE):
        self.file = file
        self.data = self._load()

    def _load(self):
        """Return the stored state, or a fresh state if the file is missing or unreadable."""
        if self.file.exists():
            # NOTE(security): pickle.load can execute arbitrary code; this file
            # must only ever be one this program wrote itself.
            try:
                with open(self.file, 'rb') as fh:
                    return pickle.load(fh)
            except Exception:
                # Best-effort: a corrupt or incompatible file starts empty.
                pass
        return {'iterations': 0, 'successful_patterns': {}, 'learning_curve': [], 'adaptation_score': 0.0}

    def record_iteration(self, results):
        """Fold one competition round into the learning state and persist it.

        Args:
            results: ``{model: result}`` where each usable result has
                ``metrics`` and optionally ``val_metrics`` and ``chromosome``.
                Falsy results or results without ``metrics`` are ignored.
        """
        self.data['iterations'] += 1
        for model, res in results.items():
            if not res or 'metrics' not in res: continue
            m = res['metrics']
            vm = res.get('val_metrics', {})
            pnl, acc = m['total_pnl'], m['accuracy']
            val_acc = vm.get('accuracy', 0) if vm else 0
            # 1e-6 keeps the ratio finite when train accuracy is zero.
            val_train_ratio = val_acc / (acc + 1e-6)

            # BALANCED: Save if generalization is reasonable
            if (pnl>12 or acc>=42) and val_train_ratio >= 0.70:
                key = f"{model}_success"
                bucket = self.data['successful_patterns'].setdefault(key, [])
                bucket.append({
                    'chromosome': res.get('chromosome'), 'pnl': pnl, 'accuracy': acc,
                    'val_accuracy': val_acc, 'val_train_ratio': val_train_ratio,
                    'sharpe': m.get('sharpe', 0), 'time': datetime.now(timezone.utc).isoformat()
                })
                # Keep only the 50 highest-scoring patterns per model.
                if len(bucket)>50:
                    self.data['successful_patterns'][key] = sorted(
                        bucket,
                        key=lambda x: x['pnl']+x['accuracy']+x.get('sharpe',0)*10+x.get('val_train_ratio',0)*40,
                        reverse=True
                    )[:50]

        total_pnl = sum(r['metrics']['total_pnl'] for r in results.values() if r and 'metrics' in r)
        self.data['learning_curve'].append(total_pnl)
        if len(self.data['learning_curve'])>100: self.data['learning_curve'] = self.data['learning_curve'][-100:]

        # Adaptation score: recent (last 5) mean PnL relative to the overall
        # mean once 5 points exist, otherwise a simple function of this
        # round's PnL; always clamped to [0, 100].
        if len(self.data['learning_curve'])>=5:
            ra, oa = np.mean(self.data['learning_curve'][-5:]), np.mean(self.data['learning_curve'])
            self.data['adaptation_score'] = min(100, max(0, 50+(ra/(oa+EPS)-1)*100 if oa>0 else 30+ra))
        else:
            self.data['adaptation_score'] = min(100, max(0, 30+total_pnl/5))

        # Context manager guarantees the file is flushed and closed.
        with open(self.file, 'wb') as fh:
            pickle.dump(self.data, fh, protocol=4)

    def get_best_chromosomes(self, model, top_n=5):
        """Return up to ``top_n`` stored chromosomes for ``model``, best first.

        Only patterns with ``val_train_ratio >= 0.70`` and either ``pnl > 12``
        or ``accuracy >= 42`` qualify; entries without a chromosome are dropped
        after the top-``top_n`` cut.
        """
        patterns = self.data['successful_patterns'].get(f"{model}_success", [])
        quality = [p for p in patterns if p.get('val_train_ratio',0)>=0.70 and (p.get('pnl',0)>12 or p.get('accuracy',0)>=42)]
        sorted_p = sorted(quality, key=lambda x: x['pnl']+x.get('accuracy',0)/100*50+x.get('sharpe',0)*10+x.get('val_train_ratio',0)*40, reverse=True)
        return [p['chromosome'] for p in sorted_p[:top_n] if p.get('chromosome')]

    def get_report(self):
        """Return a summary dict: iterations, adaptation score, success count, trend, recent curve."""
        total_success = sum(len(p) for p in self.data['successful_patterns'].values())
        # "Improving" when the last 5 points average above all earlier points
        # (or above 0 when there are no earlier points).
        trend = "üìà Improving" if (
            len(self.data['learning_curve'])>=5 and
            np.mean(self.data['learning_curve'][-5:]) > np.mean(self.data['learning_curve'][:-5] or [0])
        ) else "üìâ Adjusting"
        return {
            'iterations': self.data['iterations'], 'adaptation_score': self.data['adaptation_score'],
            'total_successes': total_success, 'trend': trend, 'learning_curve': self.data['learning_curve'][-10:]
        }

# Module-level singletons shared by the functions in this file. Each
# constructor loads its pickle-backed state from disk when instantiated.
COUNTER = IterationCounter()
MEMORY = MemorySystem()
LEARNING = LearningSystem()

def send_email(sigs, it_stats, lr, is_wknd):
    """Send the HTML run report to ``GMAIL_USER`` over Gmail SMTP (SSL).

    Args:
        sigs: ``{model: {pair: signal_dict}}`` as produced by ``gen_signals``.
        it_stats: dict read for ``iteration`` and ``total_iterations``.
        lr: Learning report; reads ``adaptation_score`` and ``trend``.
        is_wknd: True for weekend learning mode (changes subject, badge and
            hides the active-signal count).

    Returns:
        None. Skips silently (with a log line) when no password is
        configured; any failure while building or sending is logged and
        recorded on ``CIRCUIT_BREAKER`` rather than raised.
    """
    if not GMAIL_APP_PASSWORD:
        log("‚ùå Email skipped: No password", "error")
        return

    if IN_GHA and not os.getenv("GMAIL_APP_PASSWORD"):
        log("‚ö†Ô∏è GitHub Actions: Email skipped (no env credentials)", "warn")
        return

    try:
        active = sum(1 for m in sigs.values() for s in m.values() if s['direction']!='HOLD')
        msg = MIMEMultipart('alternative')
        msg['Subject'] = f"{'üìö OMEGA Learning' if is_wknd else 'üî¥ OMEGA v11.2 ULTIMATE'} - Iteration #{it_stats['iteration']}"
        msg['From'] = msg['To'] = GMAIL_USER

        mode_badge = "üìö LEARNING MODE" if is_wknd else "üéØ LIVE TRADING"
        mode_color = "#6c757d" if is_wknd else "#c92a2a"

        # The header timestamp is labelled "UTC", so it is taken from an
        # aware UTC clock rather than the machine's local time.
        html = f"""<!DOCTYPE html><html><head><style>
body{{font-family:Arial,sans-serif;background:#f4f4f4;margin:0;padding:20px}}
.container{{max-width:800px;margin:0 auto;background:white;border-radius:10px;overflow:hidden;box-shadow:0 2px 10px rgba(0,0,0,0.1)}}
.header{{background:linear-gradient(135deg,{mode_color} 0%,#862e2e 100%);color:white;padding:30px;text-align:center}}
.header h1{{margin:0;font-size:28px}}
.badge{{display:inline-block;padding:5px 10px;background:#862e2e;color:white;border-radius:5px;font-size:12px;margin-top:10px}}
.mode-badge{{background:{mode_color};font-size:14px;padding:8px 15px;margin-top:15px}}
.stats{{display:flex;justify-content:space-around;padding:20px;background:#f8f9fa;border-bottom:2px solid #e9ecef}}
.stat{{text-align:center}}
.stat-value{{font-size:24px;font-weight:bold;color:#c92a2a}}
.stat-label{{font-size:12px;color:#6c757d;margin-top:5px}}
.model-section{{padding:20px;border-bottom:1px solid #eee}}
.model-header{{font-size:20px;font-weight:bold;margin-bottom:10px}}
.signal{{padding:15px;background:#f8f9fa;border-radius:5px;margin:10px 0;border-left:4px solid}}
.signal-buy{{border-left-color:#28a745}}
.signal-sell{{border-left-color:#dc3545}}
.signal-hold{{border-left-color:#6c757d}}
.signal-header{{font-weight:bold;font-size:16px;margin-bottom:8px}}
.signal-details{{color:#666;font-size:14px}}
.regime{{background:#e7f3ff;padding:5px 10px;border-radius:3px;display:inline-block;margin-top:5px;font-size:12px}}
.footer{{padding:20px;text-align:center;background:#f8f9fa;color:#666;font-size:12px}}
.metrics{{background:#fff3cd;padding:15px;margin:10px 0;border-radius:5px}}
</style></head><body><div class="container">
<div class="header"><h1>üî¥ OMEGA v11.2 ULTIMATE</h1>
<span class="badge">Best of Both Worlds</span>
<div class="mode-badge">{mode_badge}</div>
<p>Iteration #{it_stats['iteration']} | {datetime.now(timezone.utc):%Y-%m-%d %H:%M UTC}</p></div>
<div class="stats">
<div class="stat"><div class="stat-value">{it_stats['total_iterations']}</div><div class="stat-label">Total Runs</div></div>
<div class="stat"><div class="stat-value">{lr['adaptation_score']:.1f}/100</div><div class="stat-label">Learning Score</div></div>
<div class="stat"><div class="stat-value">{lr['trend']}</div><div class="stat-label">Trend</div></div>
<div class="stat"><div class="stat-value">{active if not is_wknd else 'N/A'}</div><div class="stat-label">Active Signals</div></div>
</div>"""

        if is_wknd:
            html += """<div class="metrics"><strong>üìö Weekend Learning Mode Active</strong><br>
System optimizing on historical data. Live trading resumes on weekdays.</div>"""

        # One section per model, one card per pair.
        for model, signals in sigs.items():
            config = COMPETITION_MODELS[model]
            html += f'<div class="model-section"><div class="model-header">{config["color"]} {model}</div>'
            for pair, sig in signals.items():
                # Lower-cased direction selects the signal-buy/sell/hold CSS class.
                dc = sig['direction'].lower()
                html += f'''<div class="signal signal-{dc}">
<div class="signal-header">{pair} <span class="badge badge-{dc}">{sig['direction']}</span></div>
<div class="signal-details">üí∞ {sig['last_price']:.5f} | üõ°Ô∏è SL: {sig['SL']:.5f} | üéØ TP: {sig['TP']:.5f} | üìä {sig['score_1_100']}/100 | üìà RR: {sig['rr_ratio']:.2f}</div>
<div class="signal-details">‚ö° Signal: {sig['signal_strength']:.6f}</div>
<div class="regime">üå°Ô∏è {sig['regime']['volatility']} | üìà {sig['regime']['trend']} | üí™ {sig['regime']['strength']:.0f}/100</div></div>'''
            html += '</div>'

        html += f'''<div class="footer"><div>üî¥ OMEGA v11.2 - ULTIMATE EDITION</div>
<div style="margin-top:10px;">üé≤ Dynamic Seeds | üîÑ Live Updates | üõ°Ô∏è Anti-Overfit | üìä Full Logging</div>
<div style="margin-top:5px;">{ENV_NAME} Environment</div></div></div></body></html>'''

        msg.attach(MIMEText(html, 'html'))
        with smtplib.SMTP_SSL('smtp.gmail.com', 465, timeout=30) as srv:
            srv.login(GMAIL_USER, GMAIL_APP_PASSWORD)
            srv.send_message(msg)
        log(f"‚úÖ Email sent to {GMAIL_USER}", "success")
    except Exception as e:
        # Reporting is non-critical: log and count the failure, never raise.
        log(f"‚ùå Email failed: {e}", "error")
        CIRCUIT_BREAKER.record_error("email", "low")

def push_git(files, msg):
    """Commit ``files`` and push them to ``origin main``.

    Args:
        files: Paths relative to the repository root; entries that do not
            exist there are skipped.
        msg: Commit message.

    Returns:
        True when running under GitHub Actions (push is skipped) or when a
        push succeeds; False when no PAT is configured, when all three push
        attempts fail, or when any step raises.

    Side effects:
        Changes the process working directory to the repository while
        running and to ``SAVE_FOLDER`` afterwards.
    """
    if IN_GHA:
        log("ü§ñ GHA: Git skipped", "info")
        return True
    if not FOREX_PAT:
        log("‚ö†Ô∏è No PAT - Git skipped", "warn")
        return False
    try:
        # The token is embedded in the clone URL; see the redaction below.
        repo_url = f"https://{GITHUB_USERNAME}:{FOREX_PAT}@github.com/{GITHUB_USERNAME}/{GITHUB_REPO}.git"
        # IN_GHA already returned above, so only the local choice remains.
        repo_path = SAVE_FOLDER if (SAVE_FOLDER/".git").exists() else BASE_FOLDER

        if not (repo_path/".git").exists():
            subprocess.run(["git", "clone", repo_url, str(repo_path)], capture_output=True, timeout=60, check=True)

        os.chdir(repo_path)
        for f in files:
            if (repo_path/f).exists():
                subprocess.run(["git", "add", str(f)], check=False)
        # Commit and rebase are best-effort (check=False): "nothing to commit"
        # and similar non-zero exits must not abort the push.
        subprocess.run(["git", "commit", "-m", msg], capture_output=True, check=False)
        subprocess.run(["git", "pull", "--rebase", "origin", "main"], capture_output=True, check=False)

        # Up to three push attempts, two seconds apart.
        for attempt in range(3):
            result = subprocess.run(["git", "push", "origin", "main"], capture_output=True, timeout=30)
            if result.returncode==0:
                log("‚úÖ GitHub push successful", "success")
                return True
            if attempt<2: time.sleep(2)
        return False
    except Exception as e:
        # subprocess exceptions include the full argv in their message, which
        # for the clone step contains the token-bearing URL. Redact it so the
        # PAT never reaches the logs.
        safe_err = str(e).replace(FOREX_PAT, "***")
        log(f"‚ùå Git error: {safe_err}", "error")
        CIRCUIT_BREAKER.record_error("git", "low")
        return False
    finally:
        try:
            os.chdir(SAVE_FOLDER)
        except OSError:
            # SAVE_FOLDER may not exist; staying in the current directory is fine.
            pass

def main():
    """Run one full OMEGA pipeline iteration.

    Steps, in order:
      1. Seed ``random`` and NumPy with a per-iteration seed.
      2. Abort early if the circuit breaker is already open.
      3. Refresh the pickled data, load it, and split it into
         train/validation/test sets.
      4. Run ``run_ga`` for every entry in ``COMPETITION_MODELS``; in weekday
         mode generate live signals, in weekend-learning mode backtest on the
         test split instead.
      5. Store signals, record the iteration in ``LEARNING``, and rank models
         by a generalization quality score.
      6. Write the JSON report to ``OMEGA_SIGNALS_FILE``, send the email
         (weekday mode only), and push state files via ``push_git``.

    Returns:
        None. Errors are logged and reported to ``CIRCUIT_BREAKER`` instead
        of propagating. ``COUNTER.increment`` and ``MEMORY.close`` always run
        in the ``finally`` clause, including on the early circuit-breaker
        return.
    """
    log("="*70, "omega")
    log("üî¥ OMEGA v11.2 - ULTIMATE EDITION", "omega")
    log("="*70, "omega")

    # Dynamic seed (from v11.1): iteration count plus the low digits of the
    # current time in milliseconds. Note `%` binds tighter than `+`, so only
    # the millisecond term is reduced modulo 10000.
    iteration_seed = COUNTER.data['total'] * 137 + int(time.time() * 1000) % 10000
    random.seed(iteration_seed)
    np.random.seed(iteration_seed)
    log(f"üé≤ Dynamic Seed: {iteration_seed} (Unique per iteration)", "rocket")

    success = False
    try:
        if CIRCUIT_BREAKER.is_open:
            log("üö® Circuit breaker open - halted", "error")
            return

        cur_iter = COUNTER.data['total']+1
        stats = COUNTER.get_stats()
        log(f"\nüìä Iteration #{cur_iter} | {ENV_NAME}", "info")
        log(f"Total: {stats['total']} | Days: {stats['days']} | Avg/Day: {stats['per_day']:.1f}", "info")

        # Weekend-learning mode requires both a weekend and the feature flag.
        is_wknd = is_weekend() and WEEKEND_LEARNING_MODE
        log(f"{'üìö Weekend Learning' if is_wknd else 'üéØ Weekday Trading'} Mode", "info" if is_wknd else "success")

        # v11.1: refresh the pickled data with live prices before loading.
        log("\nüîÑ Updating data with live prices...", "info")
        updated_count = update_pickle_data()

        log("\nüì¶ Loading updated data...", "info")
        data = load_data(PICKLE_FOLDER)
        if not data: raise ValueError("‚ùå No data")

        log("\nüìä Splitting data with walk-forward validation...", "info")
        train_data, val_data, test_data = split_data(data, train=0.45, gap=0.05, val=0.30)
        tf_map = {p: list(tfs.keys()) for p, tfs in data.items()}

        log("\nüèÜ Running Model Competition...", "chart")
        comp_results, sigs_by_model = {}, {}

        for model, config in COMPETITION_MODELS.items():
            try:
                result = run_ga(train_data, tf_map, model, config, val_data)
                comp_results[model] = result

                if not is_wknd:
                    sigs_by_model[model] = gen_signals(data, tf_map, result['chromosome'], model, datetime.now(timezone.utc), use_live=True)
                else:
                    # Weekend: evaluate on the held-out test split, emit no signals.
                    test_m = backtest(test_data, tf_map, result['chromosome'], sample_rate=1.0)
                    log(f"  üìö Test: {test_m['accuracy']:.1f}% | ${test_m['total_pnl']:.2f} | Sharpe={test_m['sharpe']:.2f}", "info")
                    result['test_metrics'] = test_m
                    sigs_by_model[model] = {}
            except Exception as e:
                # A single model failure is tolerated; escalate to the outer
                # SystemError handler only when the circuit breaker trips.
                log(f"‚ùå {model} failed: {e}", "error")
                if CIRCUIT_BREAKER.record_error("model_training", "high"):
                    raise SystemError("Circuit breaker: too many failures")

        if sigs_by_model: MEMORY.store_signals(sigs_by_model, datetime.now(timezone.utc))

        LEARNING.record_iteration(comp_results)
        lr = LEARNING.get_report()
        log(f"\nüß† Learning: {lr['trend']} | Score: {lr['adaptation_score']:.1f}/100", "brain")

        # v11.0: rank models by generalization quality.
        model_rankings = []
        for model, result in comp_results.items():
            if result and 'val_metrics' in result and result['val_metrics']:
                # The small epsilon guards against a zero training accuracy.
                val_train_ratio = result['val_metrics']['accuracy'] / (result['metrics']['accuracy'] + 1e-6)
                test_acc = result.get('test_metrics', {}).get('accuracy', 0)
                ai_report = result.get('antioverfit_report', {})
                gen_score = ai_report.get('generalization_score', 0) if isinstance(ai_report, dict) else 0

                # Weighted blend: capped val/train ratio (45), closeness of
                # that ratio to 0.85 (25), and the anti-overfit score (30).
                quality_score = (
                    min(val_train_ratio, 1.0) * 45 +
                    (1 - abs(val_train_ratio - 0.85)) * 25 +
                    gen_score * 30
                )

                model_rankings.append({
                    'model': model,
                    'quality_score': quality_score,
                    'val_train_ratio': val_train_ratio,
                    'val_acc': result['val_metrics']['accuracy'],
                    'test_acc': test_acc,
                    'gen_score': gen_score
                })

        model_rankings.sort(key=lambda x: x['quality_score'], reverse=True)

        log("\nüèÜ Model Rankings by Generalization Quality:", "chart")
        for i, rank in enumerate(model_rankings, 1):
            medal = "ü•á" if i == 1 else "ü•à" if i == 2 else "ü•â"
            log(f"  {medal} {rank['model']}: Quality={rank['quality_score']:.1f} | Val/Train={rank['val_train_ratio']:.2f} | Test={rank['test_acc']:.1f}% | GenScore={rank['gen_score']:.2f}", "info")

        if model_rankings:
            best_model = model_rankings[0]
            log(f"\nüéØ Recommended for Live Trading: {best_model['model']}", "success")
            log(f"   Reason: Best generalization quality (score={best_model['quality_score']:.1f})", "info")

            if best_model['val_train_ratio'] < 0.75:
                log(f"   ‚ö†Ô∏è WARNING: Val/Train ratio {best_model['val_train_ratio']:.2f} < 0.75 (possible overfitting)", "warn")

        log("\nüíæ Saving signals...", "info")

        # v11.0: per-model quality report; the top-ranked entry is flagged
        # as recommended.
        model_quality_report = {}
        for rank in model_rankings:
            model_quality_report[rank['model']] = {
                'quality_score': rank['quality_score'],
                'val_train_ratio': rank['val_train_ratio'],
                'validation_accuracy': rank['val_acc'],
                'test_accuracy': rank['test_acc'],
                'generalization_score': rank['gen_score'],
                'recommended': rank is model_rankings[0],
                'overfit_warning': rank['val_train_ratio'] < 0.75
            }

        output = {
            'timestamp': datetime.now(timezone.utc).isoformat(),
            'iteration': cur_iter,
            'system': 'OMEGA',
            'version': 'v11.2-ultimate',
            'mode': 'weekend_learning' if is_wknd else 'live_trading',
            'models': sigs_by_model,
            'environment': ENV_NAME,
            'learning': lr,
            'model_quality': model_quality_report,
            'circuit_breaker_status': 'open' if CIRCUIT_BREAKER.is_open else 'closed',
            'dynamic_seed': iteration_seed,
            'updated_files': updated_count,
            'features': {
                'from_v11_0': [
                    'Detailed logging throughout execution',
                    'Progress tracking during initialization',
                    'Generation-by-generation statistics',
                    'Comprehensive model quality reporting',
                    'Detailed anti-overfit analysis',
                    'Complete documentation in output'
                ],
                'from_v11_1': [
                    'Dynamic random seeds per iteration',
                    'Live data updates with backups',
                    'Enhanced data cleaning (NaN handling)',
                    'Better compression handling',
                    'Improved file filtering'
                ]
            },
            'fixes': {
                'fixed_random_seed': f'Dynamic: {iteration_seed}',
                'walk_forward_split': '45% train, 5% gap, 30% val, 20% test',
                'cross_validation_folds': 3,
                'consistency_calculation': 'Fixed coefficient of variation',
                'balanced_penalties': 'Max 60% severity',
                'min_val_train_ratio': 0.75,
                'larger_populations': '14-18 individuals',
                'removed_ensemble': 'Single run optimization',
                'live_data_updates': updated_count,
                'backup_system': 'Automatic .bak creation'
            },
            'anti_overfit': {
                'val_weight': '60%',
                'overfit_penalty': 'Balanced',
                'min_ratio': 0.75,
                'cross_validation': '3-fold walk-forward',
                'max_severity': 0.6
            }
        }

        # default=str stringifies any value json cannot serialize natively.
        with open(OMEGA_SIGNALS_FILE, 'w') as f:
            json.dump(output, f, indent=2, default=str)
        log(f"‚úÖ Saved to {OMEGA_SIGNALS_FILE.name}", "success")

        it_stats = {'iteration': cur_iter, 'total_iterations': stats['total']}

        if not is_wknd:
            # send_email logs its own success or failure and swallows errors,
            # so no unconditional "sent" message is logged here.
            send_email(sigs_by_model, it_stats, lr, is_wknd)
        else:
            log("üìö Weekend: Email skipped", "info")

        log("\nüîÑ Git operations...", "info")
        files = [
            f"outputs/{OMEGA_SIGNALS_FILE.name}",
            f"omega_state/{OMEGA_LEARNING_FILE.name}",
            f"omega_state/{OMEGA_ITERATION_FILE.name}",
            f"omega_state/{OMEGA_MEMORY_FILE.name}"
        ]
        # The message is labelled "UTC", so the timestamp must be taken in UTC
        # rather than the machine's local time.
        push_git(files, f"üî¥ OMEGA v11.2 ULTIMATE [{'Learning' if is_wknd else 'Live'}]: #{cur_iter} - {datetime.now(timezone.utc):%Y-%m-%d %H:%M UTC}")

        # Count every non-HOLD signal across all models (weekday mode only).
        active = sum(1 for m in sigs_by_model.values() for s in m.values() if s.get('direction')!='HOLD') if not is_wknd else 0

        log("\n"+"="*70, "success")
        log("‚úÖ OMEGA v11.2 - ULTIMATE EDITION COMPLETED", "success")
        log("="*70, "success")
        log(f"Mode: {'Learning' if is_wknd else 'Live'} | Env: {ENV_NAME}", "info")
        log(f"Iteration: #{cur_iter} | Models: {len(comp_results)}", "info")
        if not is_wknd:
            log(f"Live Signals: {active}", "info")
        else:
            log("Weekend Learning: Historical optimization complete", "info")
        log(f"Circuit Breaker: {'üî¥ OPEN' if CIRCUIT_BREAKER.is_open else '‚úÖ Closed'}", "info")

        # Summary banner of v11.0 + v11.1 features.
        log("\nüéâ ULTIMATE FEATURES:", "rocket")
        log("  FROM v11.0:", "info")
        log("    üé≤ Fixed Seeds ‚Üí Now Dynamic for exploration", "info")
        log("    üìä Full detailed logging maintained", "success")
        log("    üìà Progress tracking preserved", "success")
        log("    üèÜ Model quality rankings active", "success")
        log("  FROM v11.1:", "info")
        log("    üé≤ Dynamic Seeds per iteration", "success")
        log(f"    üîÑ Live Data Updates: {updated_count} files", "success")
        log("    üõ°Ô∏è Enhanced data validation", "success")
        log("    üíæ Backup system active", "success")
        log("\n  ANTI-OVERFIT FIXES (Both versions):", "info")
        log("    üõ°Ô∏è Walk-Forward: 5% temporal gap", "success")
        log("    üéØ 3-Fold Cross-Validation", "success")
        log("    üìä Balanced Penalties: Max 60%", "success")
        log("    ‚öñÔ∏è Min 75% val/train ratio", "success")
        log("    ‚úÖ All Critical Fixes Applied!", "success")

        success = True

    except KeyboardInterrupt:
        log("\n‚ö†Ô∏è Shutdown requested", "warn")
    except SystemError as e:
        # Raised by the model loop when the circuit breaker trips.
        log(f"\nüö® EMERGENCY SHUTDOWN: {e}", "error")
        CIRCUIT_BREAKER.is_open = True
    except Exception as e:
        log(f"\n‚ùå Fatal error: {e}", "error")
        logging.exception("Fatal error")
        if CIRCUIT_BREAKER.record_error("fatal", "critical"):
            log("üö® Circuit breaker activated", "error")
    finally:
        # Always record the iteration outcome and release the memory store,
        # including on the early circuit-breaker return above.
        COUNTER.increment(success=success)
        MEMORY.close()
        if CIRCUIT_BREAKER.is_open:
            log("\nüö® SYSTEM HALTED - Manual intervention required", "error")
        log("üî¥ Omega pipeline complete", "omega")

# Entry point: run one pipeline iteration when this module is executed
# directly (not when it is imported).
if __name__ == "__main__":
    main()