### ARIMA (AutoRegressive Integrated Moving Average) requires:

Stationary Data (Using differencing if necessary)

Hyperparameter Selection (p, d, q)

ACF & PACF Plots to select model parameters

Forecasting Future Prices using trained ARIMA

In [27]:
import pandas as pd
import numpy as np
import logging
import warnings
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pmdarima import auto_arima
from statsmodels.tsa.stattools import adfuller
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from typing import Tuple, Dict

warnings.filterwarnings("ignore")


class TimeSeriesAnalyzer:
    """A comprehensive time series analysis system for financial data."""

    def __init__(self, file_path: str, stock: str):
        """
        Initialize the TimeSeriesAnalyzer.

        Args:
            file_path (str): Path to the CSV file containing stock data.
            stock (str): Stock symbol to analyze.
        """
        self.file_path = file_path
        self.stock = stock
        self.data = None
        self.best_order = None
        self.arima_model = None
        self.sarima_model = None
        self.train_data = None
        self.test_data = None

        # Configure logging
        self._setup_logging()

    def _setup_logging(self) -> None:
        """Configure logging settings."""
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s - %(levelname)s - %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
        )
        self.logger = logging.getLogger(__name__)

    def load_data(self, train_size: float = 0.8) -> pd.DataFrame:
        """Load and prepare time series data."""
        try:
            # Load data
            df = pd.read_csv(self.file_path, parse_dates=["Date"])
            df.set_index("Date", inplace=True)

            # Check if stock exists in dataset
            stock_col = f"{self.stock}_Close"
            if stock_col not in df.columns:
                raise ValueError(f"Column '{stock_col}' not found in the dataset!")

            # Set frequency and sort
            self.data = df[[stock_col]].asfreq("D")
            self.data.sort_index(inplace=True)

            # Handle missing values
            self.data = self.data.fillna(method="ffill").fillna(method="bfill")

            # Split data
            train_size = int(len(self.data) * train_size)
            self.train_data = self.data[:train_size]
            self.test_data = self.data[train_size:]

            self.logger.info(f"✅ Data Loaded for {self.stock}: {len(self.data)} records")
            return self.data

        except Exception as e:
            self.logger.error(f"❌ Error loading data: {str(e)}")
            raise

    def analyze_data(self) -> Dict:
        """Perform initial data analysis."""
        try:
            adf_result = adfuller(self.data[f"{self.stock}_Close"])
            analysis = {
                "total_samples": len(self.data),
                "date_range": f"{self.data.index.min()} to {self.data.index.max()}",
                "mean_price": self.data[f"{self.stock}_Close"].mean(),
                "std_price": self.data[f"{self.stock}_Close"].std(),
                "adf_statistic": adf_result[0],
                "adf_pvalue": adf_result[1],
                "is_stationary": adf_result[1] < 0.05,
            }
            return analysis

        except Exception as e:
            self.logger.error(f"❌ Error in data analysis: {str(e)}")
            raise

    def find_best_parameters(self) -> Tuple[int, int, int]:
        """Find optimal ARIMA parameters."""
        try:
            model = auto_arima(
                self.train_data[f"{self.stock}_Close"],
                seasonal=False,
                stepwise=True,
                suppress_warnings=True,
                error_action="ignore",
            )
            self.best_order = model.order
            self.logger.info(f"✅ Best ARIMA order: {self.best_order}")
            return self.best_order

        except Exception as e:
            self.logger.error(f"❌ Error finding best parameters: {str(e)}")
            raise

    def train_models(self, seasonal_order: Tuple[int, int, int, int] = (1, 1, 1, 12)) -> None:
        """Train ARIMA and SARIMA models."""
        try:
            if self.best_order is None:
                self.best_order = self.find_best_parameters()

            # Train ARIMA
            self.arima_model = ARIMA(
                self.train_data[f"{self.stock}_Close"], order=self.best_order
            ).fit()

            # Train SARIMA
            self.sarima_model = SARIMAX(
                self.train_data[f"{self.stock}_Close"],
                order=self.best_order,
                seasonal_order=seasonal_order,
            ).fit()

            self.logger.info("✅ Models trained successfully")

        except Exception as e:
            self.logger.error(f"❌ Error training models: {str(e)}")
            raise

    def evaluate_model(self, model, model_name: str) -> Dict:
        """Evaluate model performance."""
        try:
            # Generate predictions
            predictions = model.predict(
                start=self.test_data.index[0], end=self.test_data.index[-1]
            )

            # Ensure alignment
            actual_values = self.test_data[f"{self.stock}_Close"]
            predictions, actual_values = predictions.align(actual_values, join="inner")

            # Calculate metrics
            metrics = {
                "mae": mean_absolute_error(actual_values, predictions),
                "rmse": np.sqrt(mean_squared_error(actual_values, predictions)),
                "r2": r2_score(actual_values, predictions),
                "mape": np.mean(np.abs((actual_values - predictions) / actual_values)) * 100,
            }

            self.logger.info(f"✅ {model_name} Evaluation Done")
            return metrics

        except Exception as e:
            self.logger.error(f"❌ Error evaluating {model_name}: {str(e)}")
            raise


def main():
    """Main execution function."""
    try:
        # Load dataset
        file_path = "../data/processed/final_merged_stock_data.csv"
        df = pd.read_csv(file_path)

        # Identify stock symbols dynamically
        stock_symbols = [col.split("_")[0] for col in df.columns if "_Close" in col]

        print(f"✅ Detected stocks for ARIMA & SARIMA training: {stock_symbols}")

        for stock in stock_symbols:
            print(f"\n📊 Processing {stock}...")

            try:
                # Initialize analyzer
                analyzer = TimeSeriesAnalyzer(file_path, stock)

                # Load and analyze data
                analyzer.load_data()
                analysis = analyzer.analyze_data()

                print("\n📊 Initial Data Analysis:")
                for key, value in analysis.items():
                    print(f"{key}: {value}")

                # Train models
                analyzer.train_models()

                # Evaluate models
                arima_metrics = analyzer.evaluate_model(analyzer.arima_model, "ARIMA")
                sarima_metrics = analyzer.evaluate_model(analyzer.sarima_model, "SARIMA")

                # Print results
                print(f"\n🏆 {stock} Model Performance:")
                print(f"ARIMA RMSE: {arima_metrics['rmse']:.4f}")
                print(f"SARIMA RMSE: {sarima_metrics['rmse']:.4f}")

            except Exception as e:
                print(f"❌ Error processing {stock}: {e}")

        print("\n🏆 All stocks processed successfully!")

    except Exception as e:
        logging.error(f"❌ Error in main execution: {str(e)}")
        raise


if __name__ == "__main__":
    main()


2025-02-08 12:48:47,474 - INFO - ✅ Data Loaded for AAPL: 2920 records


✅ Detected stocks for ARIMA & SARIMA training: ['AAPL', 'AMZN', 'GOOGL', 'IBM', 'META', 'MSFT', 'NFLX', 'NVDA', 'ORCL', 'TSLA']

📊 Processing AAPL...

📊 Initial Data Analysis:
total_samples: 2920
date_range: 2015-01-02 00:00:00 to 2022-12-30 00:00:00
mean_price: 95.75009939639214
std_price: 32.60055192615524
adf_statistic: -1.0779745837131058
adf_pvalue: 0.7237749958529591
is_stationary: False


2025-02-08 12:48:49,282 - INFO - ✅ Best ARIMA order: (1, 1, 0)
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.55227D+00    |proj g|=  2.84121D-01

At iterate    5    f=  1.42083D+00    |proj g|=  6.97815D-02

At iterate   10    f=  1.40658D+00    |proj g|=  9.82765D-03

At iterate   15    f=  1.40515D+00    |proj g|=  2.16596D-04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    4     18     23      1     0     0   9.903D-06   1.405D+00
  F =   1.4051457153584666     

CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL            


2025-02-08 12:48:52,212 - INFO - ✅ Models trained successfully
2025-02-08 12:48:52,230 - INFO - ✅ ARIMA Evaluation Done
2025-02-08 12:48:52,251 - INFO - ✅ SARIMA Evaluation Done
2025-02-08 12:48:52,300 - INFO - ✅ Data Loaded for AMZN: 2920 records



🏆 AAPL Model Performance:
ARIMA RMSE: 28.8987
SARIMA RMSE: 19.1217

📊 Processing AMZN...

📊 Initial Data Analysis:
total_samples: 2920
date_range: 2015-01-02 00:00:00 to 2022-12-30 00:00:00
mean_price: 112.68077714735657
std_price: 28.592651975375738
adf_statistic: -1.4590056706533119
adf_pvalue: 0.5536844873317207
is_stationary: False


2025-02-08 12:48:53,733 - INFO - ✅ Best ARIMA order: (1, 1, 0)
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.71714D+00    |proj g|=  2.27225D-01

At iterate    5    f=  1.59084D+00    |proj g|=  5.19844D-02

At iterate   10    f=  1.57303D+00    |proj g|=  3.76821D-03

At iterate   15    f=  1.57225D+00    |proj g|=  1.97940D-03

At iterate   20    f=  1.57222D+00    |proj g|=  2.14335D-04

At iterate   25    f=  1.57222D+00    |proj g|=  6.65779D-05

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    4     26     33      1     0     0   

2025-02-08 12:48:56,989 - INFO - ✅ Models trained successfully
2025-02-08 12:48:57,007 - INFO - ✅ ARIMA Evaluation Done
2025-02-08 12:48:57,042 - INFO - ✅ SARIMA Evaluation Done
2025-02-08 12:48:57,098 - INFO - ✅ Data Loaded for GOOGL: 2920 records



🏆 AMZN Model Performance:
ARIMA RMSE: 35.1000
SARIMA RMSE: 44.1672

📊 Processing GOOGL...

📊 Initial Data Analysis:
total_samples: 2920
date_range: 2015-01-02 00:00:00 to 2022-12-30 00:00:00
mean_price: 81.84587502282919
std_price: 23.70813860032267
adf_statistic: -1.1321715937760801
adf_pvalue: 0.7020732582676272
is_stationary: False


2025-02-08 12:49:03,315 - INFO - ✅ Best ARIMA order: (5, 2, 0)


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            8     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.24343D+00    |proj g|=  3.29180D-01


 This problem is unconstrained.



At iterate    5    f=  1.15456D+00    |proj g|=  1.04014D-01

At iterate   10    f=  1.08823D+00    |proj g|=  3.04157D-02

At iterate   15    f=  1.07858D+00    |proj g|=  7.91227D-03

At iterate   20    f=  1.07817D+00    |proj g|=  1.69913D-03

At iterate   25    f=  1.07814D+00    |proj g|=  2.74628D-04

At iterate   30    f=  1.07813D+00    |proj g|=  8.74471D-04

At iterate   35    f=  1.07813D+00    |proj g|=  3.79750D-04



   evaluations in the last line search.  Termination
   may possibly be caused by a bad search direction.



           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    8     39     58      1     0     0   3.205D-04   1.078D+00
  F =   1.0781326051410554     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             


2025-02-08 12:49:18,065 - INFO - ✅ Models trained successfully
2025-02-08 12:49:18,084 - INFO - ✅ ARIMA Evaluation Done
2025-02-08 12:49:18,112 - INFO - ✅ SARIMA Evaluation Done
2025-02-08 12:49:18,168 - INFO - ✅ Data Loaded for IBM: 2920 records



🏆 GOOGL Model Performance:
ARIMA RMSE: 296.7784
SARIMA RMSE: 309.4768

📊 Processing IBM...

📊 Initial Data Analysis:
total_samples: 2920
date_range: 2015-01-02 00:00:00 to 2022-12-30 00:00:00
mean_price: 128.58919028193682
std_price: 6.613850409966291
adf_statistic: -5.252956278535726
adf_pvalue: 6.847874308303097e-06
is_stationary: True


2025-02-08 12:49:19,785 - INFO - ✅ Best ARIMA order: (1, 1, 0)
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.63002D+00    |proj g|=  2.52307D-01

At iterate    5    f=  1.49507D+00    |proj g|=  7.53310D-02

At iterate   10    f=  1.47374D+00    |proj g|=  3.32733D-03

At iterate   15    f=  1.47192D+00    |proj g|=  3.96696D-03

At iterate   20    f=  1.47174D+00    |proj g|=  4.70775D-04

At iterate   25    f=  1.47172D+00    |proj g|=  8.48177D-05

At iterate   30    f=  1.47172D+00    |proj g|=  4.98982D-05

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nac

2025-02-08 12:49:24,394 - INFO - ✅ Models trained successfully
2025-02-08 12:49:24,414 - INFO - ✅ ARIMA Evaluation Done
2025-02-08 12:49:24,438 - INFO - ✅ SARIMA Evaluation Done
2025-02-08 12:49:24,491 - INFO - ✅ Data Loaded for META: 2920 records



🏆 IBM Model Performance:
ARIMA RMSE: 8.8779
SARIMA RMSE: 9.3942

📊 Processing META...

📊 Initial Data Analysis:
total_samples: 2920
date_range: 2015-01-02 00:00:00 to 2022-12-30 00:00:00
mean_price: 223.1374034959995
std_price: 47.81226178195996
adf_statistic: -1.4016582209289736
adf_pvalue: 0.5814808888781947
is_stationary: False


2025-02-08 12:49:30,237 - INFO - ✅ Best ARIMA order: (3, 1, 1)
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            7     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.43376D+00    |proj g|=  1.10873D-01

At iterate    5    f=  2.31475D+00    |proj g|=  4.11046D-02

At iterate   10    f=  2.28678D+00    |proj g|=  1.32752D-02

At iterate   15    f=  2.28516D+00    |proj g|=  2.56796D-03

At iterate   20    f=  2.28481D+00    |proj g|=  2.48440D-03

At iterate   25    f=  2.28480D+00    |proj g|=  9.24445D-05

At iterate   30    f=  2.28479D+00    |proj g|=  1.49542D-03
  ys=-1.996E-05  -gs= 4.533E-05 BFGS update SKIPPED

At iterate   35    f=  2.28451D+00    |proj g|=  1.28939D-03

At iterate   40    f=  2.28450D+00    |proj g|=  6.29234D-05

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of ac

2025-02-08 12:49:39,179 - INFO - ✅ Models trained successfully
2025-02-08 12:49:39,196 - INFO - ✅ ARIMA Evaluation Done
2025-02-08 12:49:39,230 - INFO - ✅ SARIMA Evaluation Done
2025-02-08 12:49:39,306 - INFO - ✅ Data Loaded for MSFT: 2920 records



🏆 META Model Performance:
ARIMA RMSE: 123.9132
SARIMA RMSE: 155.1130

📊 Processing MSFT...

📊 Initial Data Analysis:
total_samples: 2920
date_range: 2015-01-02 00:00:00 to 2022-12-30 00:00:00
mean_price: 192.5405653078337
std_price: 50.495724407607916
adf_statistic: -0.907146304342869
adf_pvalue: 0.7856099279535994
is_stationary: False


2025-02-08 12:49:40,714 - INFO - ✅ Best ARIMA order: (1, 1, 0)
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.08538D+00    |proj g|=  1.51842D-01

At iterate    5    f=  1.95657D+00    |proj g|=  4.72513D-02

At iterate   10    f=  1.94387D+00    |proj g|=  8.90013D-03

At iterate   15    f=  1.94366D+00    |proj g|=  3.35636D-04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    4     16     18      1     0     0   3.708D-06   1.944D+00
  F =   1.9436590181114923     

CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL            


2025-02-08 12:49:42,437 - INFO - ✅ Models trained successfully
2025-02-08 12:49:42,452 - INFO - ✅ ARIMA Evaluation Done
2025-02-08 12:49:42,483 - INFO - ✅ SARIMA Evaluation Done
2025-02-08 12:49:42,538 - INFO - ✅ Data Loaded for NFLX: 2920 records



🏆 MSFT Model Performance:
ARIMA RMSE: 41.7838
SARIMA RMSE: 40.6850

📊 Processing NFLX...

📊 Initial Data Analysis:
total_samples: 2920
date_range: 2015-01-02 00:00:00 to 2022-12-30 00:00:00
mean_price: 367.3373817025837
std_price: 95.41266336423242
adf_statistic: -1.851168738407077
adf_pvalue: 0.3553697154288761
is_stationary: False


2025-02-08 12:49:56,105 - INFO - ✅ Best ARIMA order: (5, 1, 2)


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =           10     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  3.16151D+00    |proj g|=  6.57798D-02


 This problem is unconstrained.



At iterate    5    f=  3.02204D+00    |proj g|=  1.80882D-02

At iterate   10    f=  3.01716D+00    |proj g|=  3.10902D-02

At iterate   15    f=  3.01347D+00    |proj g|=  4.25317D-03

At iterate   20    f=  3.01288D+00    |proj g|=  1.10059D-02

At iterate   25    f=  3.01185D+00    |proj g|=  1.07211D-03

At iterate   30    f=  3.01184D+00    |proj g|=  1.56330D-04

At iterate   35    f=  3.01183D+00    |proj g|=  4.79764D-04

At iterate   40    f=  3.01183D+00    |proj g|=  7.43772D-05

At iterate   45    f=  3.01133D+00    |proj g|=  9.76648D-03

At iterate   50    f=  3.01113D+00    |proj g|=  1.28581D-04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tn

2025-02-08 12:50:19,539 - INFO - ✅ Models trained successfully
2025-02-08 12:50:19,558 - INFO - ✅ ARIMA Evaluation Done
2025-02-08 12:50:19,580 - INFO - ✅ SARIMA Evaluation Done
2025-02-08 12:50:19,657 - INFO - ✅ Data Loaded for NVDA: 2920 records



🏆 NFLX Model Performance:
ARIMA RMSE: 196.6917
SARIMA RMSE: 236.8004

📊 Processing NVDA...

📊 Initial Data Analysis:
total_samples: 2920
date_range: 2015-01-02 00:00:00 to 2022-12-30 00:00:00
mean_price: 9.737666139077225
std_price: 6.202162843817104
adf_statistic: -1.268413778147473
adf_pvalue: 0.643496353980515
is_stationary: False


2025-02-08 12:50:28,572 - INFO - ✅ Best ARIMA order: (5, 2, 1)


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            9     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f= -3.50888D-01    |proj g|=  1.49540D+00


 This problem is unconstrained.



At iterate    5    f= -3.98318D-01    |proj g|=  5.94338D-01

At iterate   10    f= -4.91316D-01    |proj g|=  7.98395D-02

At iterate   15    f= -5.33833D-01    |proj g|=  4.31553D-01

At iterate   20    f= -5.75420D-01    |proj g|=  3.06730D-01

At iterate   25    f= -5.92481D-01    |proj g|=  5.06132D-02

At iterate   30    f= -5.97243D-01    |proj g|=  1.09140D-01

At iterate   35    f= -5.98452D-01    |proj g|=  2.89077D-02

At iterate   40    f= -5.98557D-01    |proj g|=  1.05310D-02

At iterate   45    f= -5.98568D-01    |proj g|=  2.18263D-03

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    9     47     66   

2025-02-08 12:50:46,007 - INFO - ✅ Models trained successfully
2025-02-08 12:50:46,024 - INFO - ✅ ARIMA Evaluation Done
2025-02-08 12:50:46,060 - INFO - ✅ SARIMA Evaluation Done
2025-02-08 12:50:46,117 - INFO - ✅ Data Loaded for ORCL: 2920 records



🏆 NVDA Model Performance:
ARIMA RMSE: 6.6306
SARIMA RMSE: 7.3667

📊 Processing ORCL...

📊 Initial Data Analysis:
total_samples: 2920
date_range: 2015-01-02 00:00:00 to 2022-12-30 00:00:00
mean_price: 60.272771084120585
std_price: 11.708642201383306
adf_statistic: -0.9313911903780308
adf_pvalue: 0.7774591851479496
is_stationary: False


2025-02-08 12:50:52,835 - INFO - ✅ Best ARIMA order: (5, 2, 0)
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            8     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  9.14779D-01    |proj g|=  4.91902D-01

At iterate    5    f=  8.15483D-01    |proj g|=  9.31636D-02

At iterate   10    f=  7.41913D-01    |proj g|=  3.24767D-01

At iterate   15    f=  7.19744D-01    |proj g|=  2.23741D-02

At iterate   20    f=  7.18536D-01    |proj g|=  6.83354D-03

At iterate   25    f=  7.18453D-01    |proj g|=  7.12307D-04

At iterate   30    f=  7.18450D-01    |proj g|=  5.24924D-05

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nac

2025-02-08 12:51:01,098 - INFO - ✅ Models trained successfully
2025-02-08 12:51:01,115 - INFO - ✅ ARIMA Evaluation Done
2025-02-08 12:51:01,154 - INFO - ✅ SARIMA Evaluation Done
2025-02-08 12:51:01,217 - INFO - ✅ Data Loaded for TSLA: 2920 records



🏆 ORCL Model Performance:
ARIMA RMSE: 13.0921
SARIMA RMSE: 27.0810

📊 Processing TSLA...

📊 Initial Data Analysis:
total_samples: 2920
date_range: 2015-01-02 00:00:00 to 2022-12-30 00:00:00
mean_price: 95.21452031921945
std_price: 104.0368708192596
adf_statistic: -1.3122437470952293
adf_pvalue: 0.6235523349491868
is_stationary: False


2025-02-08 12:51:01,917 - INFO - ✅ Best ARIMA order: (0, 1, 0)
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            3     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.46538D+00    |proj g|=  1.10992D-01

At iterate    5    f=  2.39476D+00    |proj g|=  3.78431D-02

At iterate   10    f=  2.38942D+00    |proj g|=  4.34490D-04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    3     14     17      1     0     0   5.662D-08   2.389D+00
  F =   2.3894127972278687     

CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL            


2025-02-08 12:51:03,365 - INFO - ✅ Models trained successfully
2025-02-08 12:51:03,382 - INFO - ✅ ARIMA Evaluation Done
2025-02-08 12:51:03,400 - INFO - ✅ SARIMA Evaluation Done



🏆 TSLA Model Performance:
ARIMA RMSE: 87.6417
SARIMA RMSE: 72.9781

🏆 All stocks processed successfully!


In [28]:
import joblib
import os
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Define paths
data_path = "../data/processed/final_merged_stock_data.csv"
save_dir = "../models/arima_sarima_models"

# Ensure save directory exists
os.makedirs(save_dir, exist_ok=True)

# Load dataset
df = pd.read_csv(data_path, parse_dates=["Date"])
df.set_index("Date", inplace=True)

# Identify stock symbols dynamically
stock_symbols = [col.split("_")[0] for col in df.columns if "_Close" in col]

print(f"✅ Detected stocks for saving models: {stock_symbols}")

# Loop through stocks and save models
for stock in stock_symbols:
    print(f"📌 Saving models for {stock}...")

    try:
        stock_col = f"{stock}_Close"
        if stock_col not in df.columns:
            print(f"🚨 Skipping {stock}: No closing price data found!")
            continue
        
        # Use past 80% data for training
        train_size = int(len(df) * 0.8)
        train_data = df[[stock_col]].iloc[:train_size]

        # Load trained ARIMA model
        arima_model = ARIMA(train_data, order=(1,1,0)).fit()
        joblib.dump(arima_model, os.path.join(save_dir, f"{stock}_ARIMA.pkl"))

        # Load trained SARIMA model
        sarima_model = SARIMAX(train_data, order=(1,1,0), seasonal_order=(1,1,1,12)).fit()
        joblib.dump(sarima_model, os.path.join(save_dir, f"{stock}_SARIMA.pkl"))

        print(f"✅ Models saved successfully for {stock}!")

    except Exception as e:
        print(f"❌ Error saving models for {stock}: {e}")

print("\n🏆 All models saved successfully!")


✅ Detected stocks for saving models: ['AAPL', 'AMZN', 'GOOGL', 'IBM', 'META', 'MSFT', 'NFLX', 'NVDA', 'ORCL', 'TSLA']
📌 Saving models for AAPL...
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.55227D+00    |proj g|=  2.84121D-01


 This problem is unconstrained.



At iterate    5    f=  1.42083D+00    |proj g|=  6.97815D-02

At iterate   10    f=  1.40658D+00    |proj g|=  9.82765D-03

At iterate   15    f=  1.40515D+00    |proj g|=  2.16596D-04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    4     18     23      1     0     0   9.903D-06   1.405D+00
  F =   1.4051457153584666     

CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL            
✅ Models saved successfully for AAPL!
📌 Saving models for AMZN...
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         0 variables are exactly at the bounds

At 

 This problem is unconstrained.



At iterate    5    f=  1.59084D+00    |proj g|=  5.19844D-02

At iterate   10    f=  1.57303D+00    |proj g|=  3.76821D-03

At iterate   15    f=  1.57225D+00    |proj g|=  1.97940D-03

At iterate   20    f=  1.57222D+00    |proj g|=  2.14335D-04

At iterate   25    f=  1.57222D+00    |proj g|=  6.65779D-05

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    4     26     33      1     0     0   5.489D-05   1.572D+00
  F =   1.5722169812393598     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             
✅ Models saved successfully for AMZN!
📌 Saving models for GOOGL...
RUNNING THE L-BFGS-B CODE

           * * *

M

 This problem is unconstrained.



At iterate    5    f=  1.05736D+00    |proj g|=  8.45759D-02

At iterate   10    f=  1.00918D+00    |proj g|=  3.22754D-02

At iterate   15    f=  1.00687D+00    |proj g|=  3.97079D-03

At iterate   20    f=  1.00685D+00    |proj g|=  2.09691D-05

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    4     21     28      1     0     0   8.769D-06   1.007D+00
  F =   1.0068497201145117     

CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL            
✅ Models saved successfully for GOOGL!
📌 Saving models for IBM...
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =         

 This problem is unconstrained.



At iterate    5    f=  1.49507D+00    |proj g|=  7.53310D-02

At iterate   10    f=  1.47374D+00    |proj g|=  3.32733D-03

At iterate   15    f=  1.47192D+00    |proj g|=  3.96696D-03

At iterate   20    f=  1.47174D+00    |proj g|=  4.70775D-04

At iterate   25    f=  1.47172D+00    |proj g|=  8.48177D-05

At iterate   30    f=  1.47172D+00    |proj g|=  4.98982D-05

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    4     34     47      1     0     0   1.163D-04   1.472D+00
  F =   1.4717163120253351     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             
✅ Models saved successfully for IBM!
📌 Saving model

 This problem is unconstrained.



At iterate    5    f=  2.30536D+00    |proj g|=  4.36588D-02

At iterate   10    f=  2.28907D+00    |proj g|=  4.50695D-03

At iterate   15    f=  2.28869D+00    |proj g|=  5.96868D-04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    4     18     20      1     0     0   2.665D-06   2.289D+00
  F =   2.2886917377832838     

CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL            
✅ Models saved successfully for META!
📌 Saving models for MSFT...
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         0 variables are exactly at the bounds

At 

 This problem is unconstrained.



At iterate    5    f=  1.95657D+00    |proj g|=  4.72513D-02

At iterate   10    f=  1.94387D+00    |proj g|=  8.90013D-03

At iterate   15    f=  1.94366D+00    |proj g|=  3.35636D-04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    4     16     18      1     0     0   3.708D-06   1.944D+00
  F =   1.9436590181114923     

CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL            
✅ Models saved successfully for MSFT!
📌 Saving models for NFLX...


 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  3.16528D+00    |proj g|=  6.48681D-02

At iterate    5    f=  3.02712D+00    |proj g|=  2.00706D-02

At iterate   10    f=  3.02260D+00    |proj g|=  2.83448D-02

At iterate   15    f=  3.02025D+00    |proj g|=  1.39626D-03

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    4     19     23      1     0     0   6.839D-06   3.020D+00
  F =   3.0202434803319451     

CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL            
✅ Models

 This problem is unconstrained.



At iterate    5    f= -5.45833D-01    |proj g|=  7.14030D-01

At iterate   10    f= -5.89819D-01    |proj g|=  3.84776D-01

At iterate   15    f= -5.99531D-01    |proj g|=  1.02291D-01

At iterate   20    f= -6.02031D-01    |proj g|=  3.81742D-03

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    4     24     35      1     0     0   2.301D-04  -6.020D-01
  F = -0.60203246722752379     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             
✅ Models saved successfully for NVDA!
📌 Saving models for ORCL...


 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  8.04647D-01    |proj g|=  6.39120D-01

At iterate    5    f=  7.07423D-01    |proj g|=  8.31945D-02

At iterate   10    f=  6.46076D-01    |proj g|=  1.09712D-01

At iterate   15    f=  6.39376D-01    |proj g|=  7.75580D-03

At iterate   20    f=  6.39283D-01    |proj g|=  3.26064D-04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    4     22     26      1     0     0   7.369D-06   6.393D-01
  F =  0.63928281638491746     

CONVERG

 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            4     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.53241D+00    |proj g|=  1.12907D-01

At iterate    5    f=  2.40258D+00    |proj g|=  2.81815D-02

At iterate   10    f=  2.38993D+00    |proj g|=  8.16244D-03

At iterate   15    f=  2.38941D+00    |proj g|=  3.02455D-05

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    4     16     17      1     0     0   7.454D-06   2.389D+00
  F =   2.3894093951718851     

CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL            
✅ Models

### ARIMA(2,1,2) model has been trained on AAPL_Close prices 

### Gradient Boosting Requires:

Feature Engineering (Lagged Prices, Rolling Mean, Volatility)

Train-Test Split for Supervised Learning

Hyperparameter Tuning using Grid Search

Evaluation Metrics (RMSE, MAE)

In [None]:
import pandas as pd
import os
import numpy as np
import joblib
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
import lightgbm as lgb
import warnings

warnings.filterwarnings("ignore")

# ✅ **Prepare Data for ML**
def prepare_ml_data(df, stock):
    """Prepare features and target for Gradient Boosting."""
    col_name = f"{stock}_Close"

    # ✅ **Target Variable (Next Day's Closing Price)**
    df[f"{stock}_Target"] = df[col_name].shift(-1)

    # ✅ **Feature Selection (Drop Over-Dominant Features)**
    drop_cols = [f"{stock}_Adj Close"]  # Reduce importance of adjusted close price
    feature_cols = [col for col in df.columns if stock in col and col != f"{stock}_Target" and col not in drop_cols]

    df = df.dropna()  # Drop last row (NaN target)
    X = df[feature_cols]
    y = df[f"{stock}_Target"]

    return X, y

# ✅ **Hyperparameter Tuning & Training Functions**
def train_model(model_type, X_train, y_train):
    """Train & optimize models using RandomizedSearchCV."""

    param_dist = {
        "n_estimators": [100, 300, 500],
        "learning_rate": [0.01, 0.05, 0.1],
        "max_depth": [3, 5, 7],
        "subsample": [0.7, 0.9, 1.0],
    }

    if model_type == "GradientBoosting":
        model = GradientBoostingRegressor(random_state=42)
        param_dist.pop("colsample_bytree", None)  # REMOVE invalid parameter
    
    elif model_type == "XGBoost":
        model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)
        param_dist["colsample_bytree"] = [0.6, 0.8, 1.0]  # Only for XGBoost
    
    elif model_type == "LightGBM":
        model = lgb.LGBMRegressor(random_state=42)
        param_dist["colsample_bytree"] = [0.6, 0.8, 1.0]  # Only for LightGBM

    random_search = RandomizedSearchCV(
        model, param_distributions=param_dist, n_iter=20, cv=3, scoring="neg_mean_squared_error", n_jobs=-1
    )
    random_search.fit(X_train, y_train)
    
    return random_search.best_estimator_

# ✅ **Evaluate Model Performance**
def evaluate_model(model, X_test, y_test, model_name, stock):
    """Evaluate model performance and return metrics."""
    y_pred = model.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"\n✅ {model_name} Model Evaluation for {stock}:")
    print(f"📉 RMSE: {rmse:.4f}")
    print(f"📉 MAE: {mae:.4f}")
    print(f"📈 R² Score: {r2:.4f}")

    return {"stock": stock, "model": model_name, "rmse": rmse, "mae": mae, "r2": r2}

# ✅ **Main Training Pipeline**
if __name__ == "__main__":
    feature_dir = "../data/feature_engineering data"
    model_save_dir = "../models/boosting_models"

    os.makedirs(model_save_dir, exist_ok=True)

    stock_files = [f for f in os.listdir(feature_dir) if f.endswith("_boosting_features.csv")]
    stock_symbols = [f.split("_")[0] for f in stock_files]

    print(f"✅ Detected feature datasets for stocks: {stock_symbols}")

    results = []

    for stock, file in zip(stock_symbols, stock_files):
        print(f"\n📊 Training models for {stock}...")

        file_path = os.path.join(feature_dir, file)
        df = pd.read_csv(file_path)

        # ✅ **Prepare Data**
        X, y = prepare_ml_data(df, stock)

        # ✅ **Train-Test Split**
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # ✅ **Train Models**
        gb_model = train_model("GradientBoosting", X_train, y_train)
        xgb_model = train_model("XGBoost", X_train, y_train)
        lgb_model = train_model("LightGBM", X_train, y_train)

        # ✅ **Evaluate Models**
        gb_results = evaluate_model(gb_model, X_test, y_test, "Gradient Boosting", stock)
        xgb_results = evaluate_model(xgb_model, X_test, y_test, "XGBoost", stock)
        lgb_results = evaluate_model(lgb_model, X_test, y_test, "LightGBM", stock)

        results.extend([gb_results, xgb_results, lgb_results])

        # ✅ **Save Models**
        joblib.dump(gb_model, os.path.join(model_save_dir, f"{stock}_GradientBoosting.pkl"))
        joblib.dump(xgb_model, os.path.join(model_save_dir, f"{stock}_XGBoost.pkl"))
        joblib.dump(lgb_model, os.path.join(model_save_dir, f"{stock}_LightGBM.pkl"))

        print(f"✅ Models saved for {stock} in {model_save_dir}")

    # ✅ **Convert Results to DataFrame**
    results_df = pd.DataFrame(results)
    results_df.to_csv("../data/model_performance_comparison.csv", index=False)

    # ✅ **Find Best Model Per Stock**
    best_models = results_df.groupby("stock").apply(lambda x: x.nsmallest(1, "rmse")).reset_index(drop=True)

    print("\n🏆 **Best Models for Each Stock**")
    print(best_models)

    # ✅ **Save Best Model Results**
    best_models.to_csv("../data/best_models.csv", index=False)
    print("\n🏆 Best models saved to `best_models.csv`")


✅ Detected feature datasets for stocks: ['NVDA', 'MSFT', 'TSLA', 'AAPL', 'GOOGL', 'AMZN', 'ORCL', 'IBM', 'META', 'NFLX']

📊 Training models for NVDA...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001215 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5585
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001115 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5585
[LightGBM] [Info] Number of data points in the train set: 582, number of used features: 29
[LightGBM] [Info] Number of data points in the train set: 582, number of used features: 29
[LightGBM] [Info] Start training from score 16.235139
[LightGBM] [Info] Start training from score 15.781509
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000916 seconds

### we will select the best gradient model for prediction Lightbgm 



In [26]:

import joblib

# Save LightGBM model
joblib.dump(lgb_model, "lightgbm_model.pkl")

print("✅ LightGBM model saved as 'lightgbm_model.pkl'")


✅ LightGBM model saved as 'lightgbm_model.pkl'


In [29]:
import joblib

# Save LightGBM model
joblib.dump(xgb_model, "xbg_model.pkl")

print("✅ LightGBM model saved as 'lightgbm_model.pkl'")

✅ LightGBM model saved as 'lightgbm_model.pkl'
