In [17]:
import numpy as np
import pandas as pd
import joblib
import os
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# ‚úÖ **Paths to Saved Models**
MODEL_DIR = "../models/"
ARIMA_DIR = os.path.join(MODEL_DIR, "arima_sarima_models")
BOOSTING_DIR = os.path.join(MODEL_DIR, "boosting_models")
RESULTS_FILE = "../data/model_comparison.csv"
FEATURE_DIR = "../data/feature_engineering data"

# ‚úÖ **Function to Load and Validate Data**
def load_stock_data(stock):
    """Loads the feature-engineered dataset for a stock, ensuring 'Date' column exists."""
    file_path = os.path.join(FEATURE_DIR, f"{stock}_boosting_features.csv")

    if not os.path.exists(file_path):
        print(f"‚ö†Ô∏è Feature file missing for {stock}, skipping...")
        return None

    df = pd.read_csv(file_path, parse_dates=["Date"])  # Ensure 'Date' is parsed
    if "Date" not in df.columns:
        print(f"‚ùå ERROR: 'Date' column not found in {file_path}")
        print(f"üîç Available columns: {df.columns.tolist()}")
        return None

    df.set_index("Date", inplace=True)
    return df

# ‚úÖ **Function to Load & Predict with Models**
def load_predictions(stock):
    """Loads trained models and makes predictions for a stock."""
    
    df = load_stock_data(stock)
    if df is None:
        return None
    
    last_row = df.iloc[-1].drop("Date", errors="ignore")  # Ensure it doesn't crash if 'Date' is absent

    # ‚úÖ **Load Saved Models (Skipping if Missing)**
    models = {}
    model_paths = {
        "ARIMA": os.path.join(ARIMA_DIR, f"{stock}_ARIMA.pkl"),
        "SARIMA": os.path.join(ARIMA_DIR, f"{stock}_SARIMA.pkl"),
        "Gradient Boosting": os.path.join(BOOSTING_DIR, f"{stock}_GradientBoosting.pkl"),
        "XGBoost": os.path.join(BOOSTING_DIR, f"{stock}_XGBoost.pkl"),
        "LightGBM": os.path.join(BOOSTING_DIR, f"{stock}_LightGBM.pkl"),
    }

    for model_name, path in model_paths.items():
        if os.path.exists(path):
            models[model_name] = joblib.load(path)
        else:
            print(f"‚ö†Ô∏è {model_name} model for {stock} not found, skipping...")

    # ‚úÖ **Make Predictions (Only for Available Models)**
    predictions = {}

    # ‚úÖ **ARIMA/SARIMA Predictions (Use Date Index)**
    for model_name in ["ARIMA", "SARIMA"]:
        if model_name in models:
            try:
                last_index = df.index[-1]
                future_dates = pd.date_range(start=last_index, periods=2, freq="D")[1:]  # Predict next day
                pred_series = models[model_name].predict(start=future_dates[0], end=future_dates[0])
                predictions[model_name] = pred_series.iloc[0]
            except Exception as e:
                print(f"‚ùå Error in {model_name} prediction for {stock}: {e}")

    # ‚úÖ **Machine Learning Models Predictions**
    for model_name in ["Gradient Boosting", "XGBoost", "LightGBM"]:
        if model_name in models:
            try:
                predictions[model_name] = models[model_name].predict([last_row.values])[0]
            except Exception as e:
                print(f"‚ùå Error in {model_name} prediction for {stock}: {e}")

    return predictions

# ‚úÖ **Run Evaluation for All Stocks**
if __name__ == "__main__":
    stock_files = [f for f in os.listdir(FEATURE_DIR) if f.endswith("_boosting_features.csv")]
    stock_symbols = [f.split("_")[0] for f in stock_files]

    results = []

    print(f"\n‚úÖ Detected stocks for evaluation: {stock_symbols}")

    for stock in stock_symbols:
        print(f"\nüìä Evaluating models for {stock}...")

        predictions = load_predictions(stock)
        if predictions is None:
            continue  # Skip stock if missing data

        # Get actual last known stock price
        df = load_stock_data(stock)
        y_true = np.array(df[f"{stock}_Close"].iloc[-5:])  # Use last 5 days of actual prices

        # ‚úÖ **Evaluate Each Model**
        for model_name, y_pred in predictions.items():
            results.append(evaluate_model(y_true, np.full_like(y_true, y_pred), model_name, stock))

    # ‚úÖ **Convert Results to DataFrame**
    results_df = pd.DataFrame(results)

    # ‚úÖ **Find the Best Model Per Stock**
    best_models = results_df.loc[results_df.groupby("stock")["rmse"].idxmin()]

    # ‚úÖ **Display & Save Results**
    print("\nüìä Model Performance Comparison:")
    print(results_df)

    print("\nüèÜ **Best Model Per Stock:**")
    print(best_models)

    results_df.to_csv(RESULTS_FILE, index=False)
    best_models.to_csv("../data/best_models.csv", index=False)

    print("\n‚úÖ Results saved to 'model_comparison.csv' and 'best_models.csv'")



‚úÖ Detected stocks for evaluation: ['NVDA', 'MSFT', 'TSLA', 'AAPL', 'GOOGL', 'AMZN', 'ORCL', 'IBM', 'META', 'NFLX']

üìä Evaluating models for NVDA...




‚ùå Error in Gradient Boosting prediction for NVDA: X has 36 features, but GradientBoostingRegressor is expecting 35 features as input.
‚ùå Error in XGBoost prediction for NVDA: Feature shape mismatch, expected: 35, got 36
‚ùå Error in LightGBM prediction for NVDA: Number of features of the model must match the input. Model n_features_ is 35 and input n_features is 36

üìä --- ARIMA Evaluation for NVDA ---
‚úÖ RMSE: 1.2027
‚úÖ MAE: 1.1272
‚úÖ MAPE: 7.85%
‚úÖ R¬≤ Score: -7.2221
----------------------------------------

üìä --- SARIMA Evaluation for NVDA ---
‚úÖ RMSE: 7.1491
‚úÖ MAE: 7.1368
‚úÖ MAPE: 49.29%
‚úÖ R¬≤ Score: -289.4991
----------------------------------------

üìä Evaluating models for MSFT...




‚ùå Error in Gradient Boosting prediction for MSFT: X has 36 features, but GradientBoostingRegressor is expecting 35 features as input.
‚ùå Error in XGBoost prediction for MSFT: Feature shape mismatch, expected: 35, got 36
‚ùå Error in LightGBM prediction for MSFT: Number of features of the model must match the input. Model n_features_ is 35 and input n_features is 36

üìä --- ARIMA Evaluation for MSFT ---
‚úÖ RMSE: 13.5270
‚úÖ MAE: 13.3349
‚úÖ MAPE: 5.61%
‚úÖ R¬≤ Score: -34.4512
----------------------------------------

üìä --- SARIMA Evaluation for MSFT ---
‚úÖ RMSE: 65.3165
‚úÖ MAE: 65.2769
‚úÖ MAPE: 27.41%
‚úÖ R¬≤ Score: -825.5545
----------------------------------------

üìä Evaluating models for TSLA...




‚ùå Error in Gradient Boosting prediction for TSLA: X has 36 features, but GradientBoostingRegressor is expecting 35 features as input.
‚ùå Error in XGBoost prediction for TSLA: Feature shape mismatch, expected: 35, got 36
‚ùå Error in LightGBM prediction for TSLA: Number of features of the model must match the input. Model n_features_ is 35 and input n_features is 36

üìä --- ARIMA Evaluation for TSLA ---
‚úÖ RMSE: 83.7842
‚úÖ MAE: 83.5749
‚úÖ MAPE: 71.27%
‚úÖ R¬≤ Score: -199.4037
----------------------------------------

üìä --- SARIMA Evaluation for TSLA ---
‚úÖ RMSE: 188.0725
‚úÖ MAE: 187.9794
‚úÖ MAPE: 159.99%
‚úÖ R¬≤ Score: -1008.7933
----------------------------------------

üìä Evaluating models for AAPL...




‚ùå Error in Gradient Boosting prediction for AAPL: X has 36 features, but GradientBoostingRegressor is expecting 35 features as input.
‚ùå Error in XGBoost prediction for AAPL: Feature shape mismatch, expected: 35, got 36
‚ùå Error in LightGBM prediction for AAPL: Number of features of the model must match the input. Model n_features_ is 35 and input n_features is 36

üìä --- ARIMA Evaluation for AAPL ---
‚úÖ RMSE: 3.1970
‚úÖ MAE: 2.9250
‚úÖ MAPE: 2.24%
‚úÖ R¬≤ Score: -1.8362
----------------------------------------

üìä --- SARIMA Evaluation for AAPL ---
‚úÖ RMSE: 25.8028
‚úÖ MAE: 25.7329
‚úÖ MAPE: 19.90%
‚úÖ R¬≤ Score: -183.7458
----------------------------------------

üìä Evaluating models for GOOGL...




‚ùå Error in Gradient Boosting prediction for GOOGL: X has 36 features, but GradientBoostingRegressor is expecting 35 features as input.
‚ùå Error in XGBoost prediction for GOOGL: Feature shape mismatch, expected: 35, got 36
‚ùå Error in LightGBM prediction for GOOGL: Number of features of the model must match the input. Model n_features_ is 35 and input n_features is 36

üìä --- ARIMA Evaluation for GOOGL ---
‚úÖ RMSE: 30.2885
‚úÖ MAE: 30.2688
‚úÖ MAPE: 34.47%
‚úÖ R¬≤ Score: -767.5496
----------------------------------------

üìä --- SARIMA Evaluation for GOOGL ---
‚úÖ RMSE: 51.0311
‚úÖ MAE: 51.0194
‚úÖ MAPE: 58.09%
‚úÖ R¬≤ Score: -2180.6533
----------------------------------------

üìä Evaluating models for AMZN...




‚ùå Error in Gradient Boosting prediction for AMZN: X has 36 features, but GradientBoostingRegressor is expecting 35 features as input.
‚ùå Error in XGBoost prediction for AMZN: Feature shape mismatch, expected: 35, got 36
‚ùå Error in LightGBM prediction for AMZN: Number of features of the model must match the input. Model n_features_ is 35 and input n_features is 36

üìä --- ARIMA Evaluation for AMZN ---
‚úÖ RMSE: 79.2451
‚úÖ MAE: 79.2366
‚úÖ MAPE: 94.75%
‚úÖ R¬≤ Score: -4696.5656
----------------------------------------

üìä --- SARIMA Evaluation for AMZN ---
‚úÖ RMSE: 96.9258
‚úÖ MAE: 96.9189
‚úÖ MAPE: 115.89%
‚úÖ R¬≤ Score: -7026.6082
----------------------------------------

üìä Evaluating models for ORCL...




‚ùå Error in Gradient Boosting prediction for ORCL: X has 36 features, but GradientBoostingRegressor is expecting 35 features as input.
‚ùå Error in XGBoost prediction for ORCL: Feature shape mismatch, expected: 35, got 36
‚ùå Error in LightGBM prediction for ORCL: Number of features of the model must match the input. Model n_features_ is 35 and input n_features is 36

üìä --- ARIMA Evaluation for ORCL ---
‚úÖ RMSE: 2.2034
‚úÖ MAE: 2.1501
‚úÖ MAPE: 2.65%
‚úÖ R¬≤ Score: -19.9016
----------------------------------------

üìä --- SARIMA Evaluation for ORCL ---
‚úÖ RMSE: 11.4165
‚úÖ MAE: 11.4064
‚úÖ MAPE: 14.08%
‚úÖ R¬≤ Score: -560.1241
----------------------------------------

üìä Evaluating models for IBM...




‚ùå Error in Gradient Boosting prediction for IBM: X has 36 features, but GradientBoostingRegressor is expecting 35 features as input.
‚ùå Error in XGBoost prediction for IBM: Feature shape mismatch, expected: 35, got 36
‚ùå Error in LightGBM prediction for IBM: Number of features of the model must match the input. Model n_features_ is 35 and input n_features is 36

üìä --- ARIMA Evaluation for IBM ---
‚úÖ RMSE: 3.7389
‚úÖ MAE: 3.6523
‚úÖ MAPE: 2.58%
‚úÖ R¬≤ Score: -20.8515
----------------------------------------

üìä --- SARIMA Evaluation for IBM ---
‚úÖ RMSE: 1.8558
‚úÖ MAE: 1.6746
‚úÖ MAPE: 1.18%
‚úÖ R¬≤ Score: -4.3833
----------------------------------------

üìä Evaluating models for META...




‚ùå Error in Gradient Boosting prediction for META: X has 36 features, but GradientBoostingRegressor is expecting 35 features as input.
‚ùå Error in XGBoost prediction for META: Feature shape mismatch, expected: 35, got 36
‚ùå Error in LightGBM prediction for META: Number of features of the model must match the input. Model n_features_ is 35 and input n_features is 36

üìä --- ARIMA Evaluation for META ---
‚úÖ RMSE: 209.3922
‚úÖ MAE: 209.3839
‚úÖ MAPE: 177.17%
‚úÖ R¬≤ Score: -12712.3876
----------------------------------------

üìä --- SARIMA Evaluation for META ---
‚úÖ RMSE: 265.8602
‚úÖ MAE: 265.8537
‚úÖ MAPE: 224.95%
‚úÖ R¬≤ Score: -20493.9533
----------------------------------------

üìä Evaluating models for NFLX...
‚ùå Error in Gradient Boosting prediction for NFLX: X has 36 features, but GradientBoostingRegressor is expecting 35 features as input.
‚ùå Error in XGBoost prediction for NFLX: Feature shape mismatch, expected: 35, got 36
‚ùå Error in LightGBM prediction for NFLX: 



In [18]:
import pandas as pd
import os
import joblib
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# ‚úÖ **Paths**
MODEL_DIR = "../models/boosting_models/"  # Path where ML models are stored
FEATURE_DIR = "../data/feature_engineering data/"  # Feature dataset path
BEST_MODEL_FILE = "../data/best_models.csv"  # ARIMA & SARIMA best models
COMPARISON_FILE = "../data/model_comparison.csv"  # ARIMA & SARIMA comparison
FINAL_OUTPUT = "../data/full_model_comparison.csv"  # Merged comparison file

# ‚úÖ **Function to Evaluate Model Performance**
def evaluate_model(y_true, y_pred, stock, model_name):
    """Calculate RMSE, MAE, MAPE, and R¬≤ Score"""
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    r2 = r2_score(y_true, y_pred)
    
    return {"stock": stock, "model": model_name, "rmse": rmse, "mae": mae, "mape": mape, "r2": r2}

# ‚úÖ **Load ARIMA & SARIMA Results**
arima_sarima_results = pd.read_csv(COMPARISON_FILE) if os.path.exists(COMPARISON_FILE) else pd.DataFrame()

# ‚úÖ **Detect ML Model Files**
ml_models = [f for f in os.listdir(MODEL_DIR) if f.endswith(".pkl")]
stock_symbols = list(set([f.split("_")[0] for f in ml_models]))

# ‚úÖ **Store ML Results**
ml_results = []

# ‚úÖ **Evaluate ML Models for Each Stock**
for stock in stock_symbols:
    print(f"\nüìä Evaluating ML models for {stock}...")

    # ‚úÖ Load Feature Data
    feature_file = os.path.join(FEATURE_DIR, f"{stock}_boosting_features.csv")
    if not os.path.exists(feature_file):
        print(f"‚ùå Feature file missing for {stock}! Skipping...")
        continue

    df = pd.read_csv(feature_file)
    if "Date" in df.columns:
        df.drop(columns=["Date"], inplace=True)  # Drop Date column if exists

    X_test = df.iloc[-50:, :]  # Use last 50 rows as test data
    y_test = X_test[f"{stock}_Close"].values  # True values
    X_test = X_test.drop(columns=[f"{stock}_Close"])  # Features only

    # ‚úÖ **Load and Evaluate ML Models**
    for model_type in ["GradientBoosting", "XGBoost", "LightGBM"]:
        model_file = os.path.join(MODEL_DIR, f"{stock}_{model_type}.pkl")
        
        if os.path.exists(model_file):
            model = joblib.load(model_file)
            y_pred = model.predict(X_test)
            result = evaluate_model(y_test, y_pred, stock, model_type)
            ml_results.append(result)
        else:
            print(f"‚ùå Missing {model_type} model for {stock}")

# ‚úÖ **Convert ML Results to DataFrame**
ml_results_df = pd.DataFrame(ml_results)

# ‚úÖ **Merge ARIMA/SARIMA and ML Model Comparisons**
full_comparison_df = pd.concat([arima_sarima_results, ml_results_df])

# ‚úÖ **Find Best Model Per Stock Based on RMSE**
best_models_per_stock = full_comparison_df.loc[full_comparison_df.groupby("stock")["rmse"].idxmin()]

# ‚úÖ **Save Final Results**
full_comparison_df.to_csv(FINAL_OUTPUT, index=False)
best_models_per_stock.to_csv("../data/best_models_per_stock.csv", index=False)

print("\nüìä **Full Model Performance Comparison Saved!**")
print("\nüèÜ **Best Model Per Stock (Lowest RMSE):**")
print(best_models_per_stock)



üìä Evaluating ML models for IBM...


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- IBM_Adj Close
Feature names seen at fit time, yet now missing:
- IBM_Close


In [None]:
import pandas as pd
import os

# ‚úÖ **Define File Paths**
DATA_DIR = "../data"
BEST_MODELS_FILE = os.path.join(DATA_DIR, "best_models.csv")  # ARIMA/SARIMA
ML_MODELS_FILE = os.path.join(DATA_DIR, "model_performance_comparison.csv")  # ML models
COMPARISON_OUTPUT = os.path.join(DATA_DIR, "final_model_comparison.csv")
BEST_MODEL_OUTPUT = os.path.join(DATA_DIR, "final_best_models.csv")

# ‚úÖ **Step 1: Load ARIMA & SARIMA Results**
print("üì• Loading ARIMA & SARIMA results...")
arima_sarima_df = pd.read_csv(BEST_MODELS_FILE)

# ‚úÖ **Step 2: Load ML Model Results**
print("üì• Loading ML Model results...")
ml_df = pd.read_csv(ML_MODELS_FILE)

# ‚úÖ **Step 3: Merge All Models for Comparison**
print("üîÑ Merging ARIMA/SARIMA & ML model results...")
comparison_df = pd.concat([arima_sarima_df, ml_df])

# ‚úÖ **Step 4: Find the Best Model per Stock**
best_models_df = comparison_df.groupby("stock").apply(lambda x: x.nsmallest(1, "rmse")).reset_index(drop=True)

# ‚úÖ **Step 5: Save & Display Results**
comparison_df.to_csv(COMPARISON_OUTPUT, index=False)
best_models_df.to_csv(BEST_MODEL_OUTPUT, index=False)

print("\nüèÜ **Final Model Comparison Saved:**", COMPARISON_OUTPUT)
print("üèÜ **Best Model per Stock Saved:**", BEST_MODEL_OUTPUT)

# ‚úÖ **Step 6: Display Best Models**
print("\nüìä **Best Models for Each Stock**")
print(best_models_df)

# ‚úÖ **Optional: Display First Few Rows of the Full Comparison**
print("\nüìä **Full Model Comparison (First 10 rows)**")
print(comparison_df.head(10))


üì• Loading ARIMA & SARIMA results...
üì• Loading ML Model results...
üîÑ Merging ARIMA/SARIMA & ML model results...

üèÜ **Final Model Comparison Saved:** ../data/final_model_comparison.csv
üèÜ **Best Model per Stock Saved:** ../data/final_best_models.csv


  best_models_df = comparison_df.groupby("stock").apply(lambda x: x.nsmallest(1, "rmse")).reset_index(drop=True)


ModuleNotFoundError: No module named 'ace_tools'