In [None]:
def create_time_splits(X, y, timestamps, train_size=0.6, val_size=0.2):
    """Cut X and y into consecutive train / validation / test segments.

    Rows are taken in their existing order: the first ``train_size`` fraction
    becomes the training segment, the next ``val_size`` fraction the
    validation segment, and whatever remains the test segment.

    Args:
        X: Feature table supporting ``.iloc`` slicing.
        y: Target aligned row-for-row with ``X``.
        timestamps: Accepted for interface compatibility; not read here, so
            the inputs are expected to be ordered already.
        train_size: Fraction of rows assigned to the training segment.
        val_size: Fraction of rows assigned to the validation segment.

    Returns:
        dict: Independent copies under the keys ``X_train``, ``y_train``,
        ``X_val``, ``y_val``, ``X_test`` and ``y_test``.
    """
    total_rows = len(X)
    first_cut = int(total_rows * train_size)
    second_cut = int(total_rows * (train_size + val_size))

    # One positional window per segment, in chronological order.
    windows = (
        ('train', slice(None, first_cut)),
        ('val', slice(first_cut, second_cut)),
        ('test', slice(second_cut, None)),
    )

    splits = {}
    for tag, window in windows:
        splits[f'X_{tag}'] = X.iloc[window].copy()
        splits[f'y_{tag}'] = y.iloc[window].copy()

    print(f"📊 Time-based splits:")
    print(f"  Train: {len(splits['X_train'])} samples")
    print(f"  Val:   {len(splits['X_val'])} samples")
    print(f"  Test:  {len(splits['X_test'])} samples")

    return splits

def train_lightgbm_model(X_train, y_train, X_val, y_val, task_type='binary'):
    """Fit a LightGBM booster with early stopping on the validation set.

    Hyperparameters and round counts are read from the global ``CFG``:
    ``CFG['fast_test']`` selects a smaller, faster configuration and
    ``CFG['seed']`` is passed as ``random_state``.

    Args:
        X_train: Training features.
        y_train: Training targets.
        X_val: Validation features, evaluated during training.
        y_val: Validation targets.
        task_type: ``'binary'`` trains a binary objective tracked with AUC;
            any other value trains a regression objective tracked with RMSE.

    Returns:
        tuple: ``(model, params)`` - the trained booster and the parameter
        dict it was trained with.
    """
    quick = CFG['fast_test']

    # The two tasks differ only in objective and tracked metric.
    if task_type == 'binary':
        objective, metric = 'binary', 'auc'
    else:
        objective, metric = 'regression', 'rmse'

    params = {
        'objective': objective,
        'metric': metric,
        'boosting_type': 'gbdt',
        'num_leaves': 31 if quick else 63,
        'learning_rate': 0.1 if quick else 0.05,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'min_data_in_leaf': 10 if quick else 20,
        'verbose': -1,
        'random_state': CFG['seed'],
    }

    # The validation Dataset references the training one.
    train_set = lgb.Dataset(X_train, label=y_train)
    valid_set = lgb.Dataset(X_val, label=y_val, reference=train_set)

    # Round budget, early-stopping patience and logging period.
    if quick:
        max_rounds, patience, log_every = 100, 20, 50
    else:
        max_rounds, patience, log_every = 1000, 50, 100

    booster = lgb.train(
        params,
        train_set,
        num_boost_round=max_rounds,
        valid_sets=[train_set, valid_set],
        valid_names=['train', 'val'],
        callbacks=[
            lgb.early_stopping(patience, verbose=False),
            lgb.log_evaluation(log_every),
        ],
    )

    return booster, params

def evaluate_model(model, X_test, y_test, task_type='binary', model_name='Model'):
    """Score a trained model on held-out data and print a one-line summary.

    Args:
        model: Trained model. Objects whose type name contains ``xgb`` are
            treated as native XGBoost boosters and are fed an
            ``xgb.DMatrix``; anything else is called as
            ``model.predict(X_test)``.
        X_test: Held-out features.
        y_test: Held-out targets.
        task_type: ``'binary'`` reports ROC AUC; any other value reports
            RMSE and R².
        model_name: Label used in the printed messages.

    Returns:
        dict | None: ``{'auc', 'predictions'}`` for binary tasks,
        ``{'rmse', 'r2', 'predictions'}`` otherwise. ``None`` if anything
        raised during prediction or scoring (the error is printed, not
        re-raised, so one failing model does not abort the run).
    """

    try:
        if 'xgb' in str(type(model)).lower():
            # Native XGBoost boosters only accept DMatrix input.
            dtest = xgb.DMatrix(X_test)
            y_pred = model.predict(dtest)
        else:
            # LightGBM or other sklearn-compatible model
            y_pred = model.predict(X_test)

        if task_type == 'binary':
            # Probabilities must lie in [0, 1]. Raw margins can leave that
            # interval on either side, so check both bounds: scores that are
            # negative but never exceed 1 are still raw margins and need the
            # sigmoid just as much as scores above 1.
            if y_pred.min() < 0 or y_pred.max() > 1:
                y_pred = 1 / (1 + np.exp(-y_pred))  # sigmoid

            auc = roc_auc_score(y_test, y_pred)
            print(f"📊 {model_name} - AUC: {auc:.4f}")
            return {'auc': auc, 'predictions': y_pred}
        else:
            mse = mean_squared_error(y_test, y_pred)
            rmse = np.sqrt(mse)
            r2 = r2_score(y_test, y_pred)
            print(f"📊 {model_name} - RMSE: {rmse:.6f}, R²: {r2:.4f}")
            return {'rmse': rmse, 'r2': r2, 'predictions': y_pred}

    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller
        # continue with the remaining models.
        print(f"❌ Error evaluating {model_name}: {e}")
        return None

# Create data splits
# Both calls slice the same feature matrix X with the default fractions, so the
# row boundaries of the binary and regression splits are identical; only the
# target differs.
print("🔄 Creating data splits...")
binary_splits = create_time_splits(X, y_binary, timestamps)
regression_splits = create_time_splits(X, y_regression, timestamps)

# Train LightGBM models
# Each call returns (model, params); the params are kept so they can be stored
# alongside the model later.
print("\n🚀 Training LightGBM Binary Classifier...")
lgbm_binary, lgbm_binary_params = train_lightgbm_model(
    binary_splits['X_train'], binary_splits['y_train'],
    binary_splits['X_val'], binary_splits['y_val'],
    task_type='binary'
)

print("\n🚀 Training LightGBM Regressor...")
lgbm_regressor, lgbm_reg_params = train_lightgbm_model(
    regression_splits['X_train'], regression_splits['y_train'],
    regression_splits['X_val'], regression_splits['y_val'],
    task_type='regression'
)

# Train XGBoost models (if available)
# Initialised to None so that code further down can test `is not None`
# regardless of whether the XGBoost branch ran or succeeded.
xgb_binary = None
xgb_regressor = None
xgb_binary_params = None
xgb_reg_params = None

# XGBoost is skipped entirely in fast_test mode.
if XGB_AVAILABLE and not CFG['fast_test']:
    print("\n🚀 Training XGBoost models...")
    
    # A single try covers both fits: if the regressor fails after the binary
    # model has trained, xgb_binary keeps its trained value while
    # xgb_regressor stays None.
    try:
        # XGBoost Binary
        xgb_binary_params = {
            'objective': 'binary:logistic',
            'eval_metric': 'auc',
            'max_depth': 5,
            'eta': 0.1,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'min_child_weight': 10,
            'random_state': CFG['seed'],
            'verbosity': 0
        }
        
        dtrain = xgb.DMatrix(binary_splits['X_train'], label=binary_splits['y_train'])
        dval = xgb.DMatrix(binary_splits['X_val'], label=binary_splits['y_val'])
        
        # Up to 200 rounds, with early stopping after 30 rounds and progress
        # logged every 50 rounds.
        xgb_binary = xgb.train(
            xgb_binary_params,
            dtrain,
            num_boost_round=200,
            evals=[(dtrain, 'train'), (dval, 'val')],
            early_stopping_rounds=30,
            verbose_eval=50
        )
        print("✅ XGBoost Binary trained")
        
        # XGBoost Regressor
        # Same tree settings as the binary model; only the objective and the
        # evaluation metric differ.
        xgb_reg_params = {
            'objective': 'reg:squarederror',
            'eval_metric': 'rmse',
            'max_depth': 5,
            'eta': 0.1,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'min_child_weight': 10,
            'random_state': CFG['seed'],
            'verbosity': 0
        }
        
        dtrain_reg = xgb.DMatrix(regression_splits['X_train'], label=regression_splits['y_train'])
        dval_reg = xgb.DMatrix(regression_splits['X_val'], label=regression_splits['y_val'])
        
        xgb_regressor = xgb.train(
            xgb_reg_params,
            dtrain_reg,
            num_boost_round=200,
            evals=[(dtrain_reg, 'train'), (dval_reg, 'val')],
            early_stopping_rounds=30,
            verbose_eval=50
        )
        print("✅ XGBoost Regressor trained")
        
    except Exception as e:
        # Best-effort: report the failure and carry on with LightGBM only.
        # The flag is cleared so later code that checks XGB_AVAILABLE sees
        # XGBoost as unavailable.
        print(f"⚠️  XGBoost training failed: {e}")
        XGB_AVAILABLE = False

print("\n🎯 Training completed!")

# 📊 Model Evaluation

Evaluate all trained models on the test set.

In [None]:
# Score every trained model on its held-out test split.
print("📊 Evaluating models on test set...")
print("=" * 50)

# --- Binary classification ---
print("\n🔍 Binary Classification Results:")
print("-" * 30)

lgbm_binary_results = evaluate_model(
    lgbm_binary,
    binary_splits['X_test'],
    binary_splits['y_test'],
    task_type='binary',
    model_name='LightGBM Binary',
)

# The XGBoost result stays None when that model was never trained.
xgb_binary_results = (
    evaluate_model(
        xgb_binary,
        binary_splits['X_test'],
        binary_splits['y_test'],
        task_type='binary',
        model_name='XGBoost Binary',
    )
    if xgb_binary is not None
    else None
)

# --- Regression ---
print("\n🔍 Regression Results:")
print("-" * 20)

lgbm_reg_results = evaluate_model(
    lgbm_regressor,
    regression_splits['X_test'],
    regression_splits['y_test'],
    task_type='regression',
    model_name='LightGBM Regression',
)

xgb_reg_results = (
    evaluate_model(
        xgb_regressor,
        regression_splits['X_test'],
        regression_splits['y_test'],
        task_type='regression',
        model_name='XGBoost Regression',
    )
    if xgb_regressor is not None
    else None
)

print("\n✅ Model evaluation complete!")

# 💾 Save Model Artifacts

Save the trained models and their metadata to the repository and, if it is mounted, to Google Drive.

In [None]:
def get_git_commit_hash(repo_path):
    """Return the first 10 characters of the HEAD commit hash of a checkout.

    This is a best-effort lookup used for metadata stamping: it never raises
    for an unusable repository and returns ``'unknown'`` instead.

    Args:
        repo_path: Directory in which ``git rev-parse HEAD`` is run. ``None``
            runs it in the current working directory.

    Returns:
        str: The abbreviated hash, or ``'unknown'`` when git cannot be
        launched (missing executable, bad directory) or exits non-zero
        (e.g. the directory is not a repository).
    """
    try:
        result = subprocess.run(
            ['git', 'rev-parse', 'HEAD'],
            cwd=repo_path,
            capture_output=True,
            text=True,
        )
    except (OSError, ValueError, TypeError, subprocess.SubprocessError):
        # Only failures to launch git are swallowed. KeyboardInterrupt and
        # SystemExit propagate so the user can still stop the run.
        return 'unknown'

    if result.returncode != 0:
        return 'unknown'
    return result.stdout.strip()[:10]  # Short hash

def calculate_dataset_hash(X, y):
    """Return a short fingerprint of a dataset for reproducibility metadata.

    The fingerprint is the first 10 hex characters of the MD5 of a string
    built from ``X.shape``, the sum of the first row of ``X`` (6 decimals)
    and ``y.sum()``. It is a coarse identity check, not a content hash:
    datasets that differ only in rows after the first, with an equal target
    sum, get the same fingerprint.

    Args:
        X: Feature table exposing ``.shape`` and ``.iloc``.
        y: Target exposing ``.sum()``.

    Returns:
        str: 10-character hex fingerprint, or ``'unknown'`` if it cannot be
        computed (e.g. empty or non-numeric input).
    """
    try:
        data_str = f"{X.shape}_{X.iloc[0].values.sum():.6f}_{y.sum()}"
        return hashlib.md5(data_str.encode()).hexdigest()[:10]
    except Exception:
        # Best-effort metadata: any data problem yields 'unknown', but
        # KeyboardInterrupt and SystemExit are no longer swallowed.
        return 'unknown'

def save_model_artifacts(models_info, base_path, symbol, cfg):
    """Write each trained model, a scaler and JSON metadata to disk.

    Artifacts are laid out as
    ``<base_path>/<symbol>/<YYYYmmdd_HHMMSS>/<model_id>/`` and each model
    directory contains ``model.pkl``, ``scaler.pkl`` and ``meta.json``.

    Besides its arguments, this function reads the module-level names ``X``,
    ``y_binary``, ``REPO_PATH`` and ``REPO_CLONED``.

    Args:
        models_info: Iterable of dicts with the keys ``'name'``, ``'type'``,
            ``'task'`` and ``'model'``, plus optional ``'params'`` and
            ``'results'``. Entries whose ``'model'`` is ``None`` are skipped.
        base_path: Root directory for the artifacts.
        symbol: Sub-directory name; also recorded in the metadata.
        cfg: Configuration recorded in the metadata under ``'config'``.

    Returns:
        tuple: ``(artifact_paths, timestamp)``. ``artifact_paths`` is a list
        with one dict per saved model (``model_id``, ``path``, ``files``,
        ``size_mb``). ``timestamp`` is the directory stamp generated by this
        call, so two calls produce two different stamps.
    """

    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    # Create directory structure
    timestamp_path = Path(base_path) / symbol / timestamp

    artifact_paths = []

    trained = [info for info in models_info if info['model'] is not None]
    if not trained:
        return artifact_paths, timestamp

    # Everything below is identical for every model, so it is computed once
    # rather than per iteration (the scaler fit and the git subprocess call
    # are the expensive parts).
    # NOTE(review): the scaler is fit on the full global X, while the models
    # trained in the code visible here use unscaled features - confirm that
    # consumers of scaler.pkl expect this.
    scaler = StandardScaler().fit(X)
    commit_hash = get_git_commit_hash(REPO_PATH) if REPO_CLONED else 'unknown'
    dataset_hash = calculate_dataset_hash(X, y_binary)
    feature_names = list(X.columns)
    class_distribution = y_binary.value_counts().to_dict()

    for model_info in trained:
        model_id = model_info['name'].lower().replace(' ', '_')
        model_path = timestamp_path / model_id
        model_path.mkdir(parents=True, exist_ok=True)

        # Save model
        joblib.dump(model_info['model'], model_path / 'model.pkl', compress=3)

        # Save the shared scaler next to every model.
        joblib.dump(scaler, model_path / 'scaler.pkl', compress=3)

        # Keep only the metrics. The per-sample 'predictions' array would be
        # written through default=str as a truncated array repr, which is
        # neither readable nor recoverable. 'results' may also be None when
        # evaluation failed.
        results = model_info.get('results') or {}
        performance = {
            key: value for key, value in results.items() if key != 'predictions'
        }

        # Create metadata
        metadata = {
            'model_id': model_id,
            'model_name': model_info['name'],
            'model_type': model_info['type'],
            'task_type': model_info['task'],
            'symbol': symbol,
            'timestamp': timestamp,
            'commit_hash': commit_hash,
            'dataset_hash': dataset_hash,
            'config': cfg,
            'performance': performance,
            'hyperparameters': model_info.get('params', {}),
            'feature_names': feature_names,
            'n_features': len(feature_names),
            'n_samples': len(X),
            'class_distribution': class_distribution,
            'artifacts': {
                'model': 'model.pkl',
                'scaler': 'scaler.pkl',
                'metadata': 'meta.json'
            }
        }

        # Save metadata. default=str stringifies any remaining value that
        # json cannot encode natively.
        meta_file = model_path / 'meta.json'
        with open(meta_file, 'w') as f:
            json.dump(metadata, f, indent=2, default=str)

        artifact_info = {
            'model_id': model_id,
            'path': str(model_path),
            'files': ['model.pkl', 'scaler.pkl', 'meta.json'],
            'size_mb': sum(f.stat().st_size for f in model_path.glob('*')) / (1024*1024)
        }
        artifact_paths.append(artifact_info)

        print(f"✅ Saved {model_info['name']} to {model_path}")

    return artifact_paths, timestamp

# Describe each model to persist: the LightGBM pair is always present.
models_to_save = [
    dict(
        name='LightGBM Binary',
        type='lightgbm',
        task='classification',
        model=lgbm_binary,
        params=lgbm_binary_params,
        results=lgbm_binary_results,
    ),
    dict(
        name='LightGBM Regression',
        type='lightgbm',
        task='regression',
        model=lgbm_regressor,
        params=lgbm_reg_params,
        results=lgbm_reg_results,
    ),
]

# XGBoost entries are added only for models that were actually trained.
models_to_save.extend(
    candidate
    for candidate in (
        dict(
            name='XGBoost Binary',
            type='xgboost',
            task='classification',
            model=xgb_binary,
            params=xgb_binary_params,
            results=xgb_binary_results,
        ),
        dict(
            name='XGBoost Regression',
            type='xgboost',
            task='regression',
            model=xgb_regressor,
            params=xgb_reg_params,
            results=xgb_reg_results,
        ),
    )
    if candidate['model'] is not None
)

# Write the artifacts under the repository path.
print("💾 Saving model artifacts to repository...")
repo_artifacts, training_timestamp = save_model_artifacts(
    models_to_save,
    MODEL_SAVE_REPO_PATH,
    SYMBOL,
    CFG,
)

# Repeat the save under the Drive path when Drive is mounted; a failure there
# is reported and does not stop the cell.
drive_artifacts = []
if DRIVE_MOUNTED:
    print("\n💾 Copying artifacts to Google Drive...")
    try:
        drive_artifacts, _ = save_model_artifacts(
            models_to_save,
            MODEL_SAVE_DRIVE_PATH,
            SYMBOL,
            CFG,
        )
        print("✅ Artifacts copied to Google Drive")
    except Exception as err:
        print(f"⚠️  Drive copy failed: {err}")

# Summary is based on the repository copy only.
print(f"\n📦 Total artifacts saved: {len(repo_artifacts)} models")
total_size = sum(artifact['size_mb'] for artifact in repo_artifacts)
print(f"📦 Total size: {total_size:.2f} MB")