In [10]:
import pandas as pd
import numpy as np
from data_collection import get_stock_data, preprocess_stock_data
from feature_engineering import add_technical_indicators, create_lagged_features
from model_implementation import SARIMAXModel, LinearRegressionModel, LightGBMModel
from model_evaluation import compare_models
import pandas_market_calendars as mcal

In [18]:
stock_data = get_stock_data(ticker, start_date, end_date)
processed_data = preprocess_stock_data(stock_data)
data_with_indicators = add_technical_indicators(processed_data)
final_data = create_lagged_features(data_with_indicators)

In [30]:
last_date=final_data['Date'].iloc[-1]

'2024-07-04'

In [47]:
nyse = mcal.get_calendar('NYSE')

nyse.valid_days(start_date=pd.to_datetime(last_date+pd.Timedelta(days=1)).strftime('%Y-%m-%d'), end_date=pd.to_datetime(last_date+pd.Timedelta(days=10)).strftime('%Y-%m-%d'))[0].strftime('%Y-%m-%d')

'2024-07-05'

In [50]:
def prepare_data_for_forecast(ticker, start_date, end_date):
    """
    Prepare data for forecasting.
    
    :param ticker: Stock symbol
    :param start_date: Start date for historical data
    :param end_date: End date for historical data
    :return: Prepared DataFrame
    """
    stock_data = get_stock_data(ticker, start_date, end_date)
    processed_data = preprocess_stock_data(stock_data)
    data_with_indicators = add_technical_indicators(processed_data)
    final_data = create_lagged_features(data_with_indicators)
    return final_data

def select_best_model(X, y):
    """
    Select the best model based on RMSE.
    
    :param X: Feature data
    :param y: Target data
    :return: Best model
    """
    models = {
        'SARIMAX': SARIMAXModel(),
        'Linear Regression': LinearRegressionModel(),
        'LightGBM': LightGBMModel()
    }
    results = compare_models(models, X, y)
    best_model_name = results['rmse'].idxmin()
    return models[best_model_name]

def forecast_next_day(model, data, feature_columns):
    """
    Forecast the next day's opening price.
    
    :param model: Trained model
    :param data: Historical data
    :param feature_columns: List of feature column names
    :return: Forecasted opening price
    """
    last_data_point = data[feature_columns].iloc[-1:]
    forecast = model.predict(last_data_point)
 #   print(last_data_point.index)
    return forecast[0] if isinstance(forecast, np.ndarray) else forecast

def run_forecast_pipeline(ticker, start_date, end_date):
    """
    Run the complete forecasting pipeline.
    
    :param ticker: Stock symbol
    :param start_date: Start date for historical data
    :param end_date: End date for historical data
    :return: Forecasted opening price for the next day
    """
    

    # Prepare data
    data = prepare_data_for_forecast(ticker, start_date, end_date)
    
    # Define feature columns
    feature_columns = ['Open','Close' ,'High', 'Low', 'Volume', 'Returns', 'Volatility', 
                       'SMA_20', 'RSI', 'MACD', 'ATR', 'OBV', 
                       'Close_Lag_1', 'Volume_Lag_1', 'Returns_Lag_1']
    
    data = data.dropna()
    # Prepare features and target
    X = data[feature_columns]
    y = data['Close'].shift(-1) # Shift target column by 1 day to prevent lookahead bias
    
    
    # Select and train the best model
    best_model = select_best_model( X.iloc[:-1], y.iloc[:-1])
    best_model.fit(X.iloc[:-1], y.iloc[:-1])
    
    # Make forecast
    forecast = forecast_next_day(best_model, data, feature_columns)
    last_date=pd.Timestamp(data['Date'].iloc[-1])
    nyse = mcal.get_calendar('NYSE')
    next_trading_day = nyse.valid_days(start_date=pd.to_datetime(last_date+pd.Timedelta(days=1)).strftime('%Y-%m-%d'), end_date=pd.to_datetime(last_date+pd.Timedelta(days=10)).strftime('%Y-%m-%d'))[0].strftime('%Y-%m-%d')
    return forecast,next_trading_day,best_model.__class__.__name__

In [52]:
if __name__ == "__main__":
    from datetime import datetime
    ticker = "AAPL"
    start_date = "2024-01-01"
    end_date = datetime.now().strftime("%Y-%m-%d")
    
    forecast,_,_ = run_forecast_pipeline(ticker, start_date, end_date)
    print(f"Forecasted closing  price for {ticker} for the next trading day: ${forecast:.2f}")

Evaluating SARIMAX...
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            5     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  6.59976D-01    |proj g|=  3.37789D-01

At iterate    5    f=  5.00042D-01    |proj g|=  4.82370D-03

At iterate   10    f=  4.96967D-01    |proj g|=  1.19386D-02

At iterate   15    f=  4.96459D-01    |proj g|=  1.66571D-03


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Too few observations to estimate starting parameters%s.'
 This problem is unconstrained.



At iterate   20    f=  4.96317D-01    |proj g|=  2.84018D-03

At iterate   25    f=  4.96263D-01    |proj g|=  7.06384D-04

At iterate   30    f=  4.96236D-01    |proj g|=  2.35249D-03

At iterate   35    f=  4.96219D-01    |proj g|=  8.17662D-05

At iterate   40    f=  4.96211D-01    |proj g|=  2.45950D-03

At iterate   45    f=  4.96205D-01    |proj g|=  2.15184D-03


  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Too few observations to estimate starting parameters%s.'
 This problem is unconstrained.



At iterate   50    f=  4.96200D-01    |proj g|=  1.07637D-03

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    5     50     64      1     0     0   1.076D-03   4.962D-01
  F =  0.49619960563431464     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            5     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.53980D+00    |proj g|=  2.53384D-02

At iterate    5    f=  1.47483D+00    |proj g|=  2.84064D-02

At iterate   10    f=  1.47090D+00    |proj g|=  1.51775D-02

At iter

  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Too few observations to estimate starting parameters%s.'
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            5     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.96112D+00    |proj g|=  1.26401D-01

At iterate    5    f=  1.92376D+00    |proj g|=  1.03486D-02

At iterate   10    f=  1.92163D+00    |proj g|=  1.34351D-02

At iterate   15    f=  1.92051D+00    |proj g|=  1.42808D-03

At iterate   20    f=  1.92034D+00    |proj g|=  5.05781D-04

At iterate   25    f=  1.92025D+00    |proj g|=  9.90205D-04

At iterate   30    f=  1.92025D+00    |proj g|=  2.69772D-04

At iterate   35    f=  1.92024D+00    |proj g|=  9.85757D-05

At iterate   40    f=  1.92024D+00    |proj g|=  1.17114D-04

At iterate   45    f=  1.92024D+00    |proj g|=  5.20563D-05

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = nu

  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting seasonal autoregressive'
  warn('Non-invertible starting seasonal moving average'
 This problem is unconstrained.



At iterate   10    f=  2.02391D+00    |proj g|=  3.71649D-03

At iterate   15    f=  2.02307D+00    |proj g|=  2.91778D-03

At iterate   20    f=  2.02297D+00    |proj g|=  2.61237D-04

At iterate   25    f=  2.02296D+00    |proj g|=  1.44580D-03

At iterate   30    f=  2.02275D+00    |proj g|=  9.81168D-03

At iterate   35    f=  2.02230D+00    |proj g|=  1.05827D-04

At iterate   40    f=  2.02228D+00    |proj g|=  2.31859D-03

At iterate   45    f=  2.02225D+00    |proj g|=  9.45225D-04


  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting seasonal moving average'
 This problem is unconstrained.



At iterate   50    f=  2.02225D+00    |proj g|=  3.96410D-05

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    5     50     62      1     0     0   3.964D-05   2.022D+00
  F =   2.0222457517421568     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT                 
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            5     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.26615D+00    |proj g|=  5.59392D-02

At iterate    5    f=  2.22989D+00    |proj g|=  3.63648D-02

At iterate   10    f=  2.22552D+00    |proj g|=  1.31094D-03

At iter

  return get_prediction_index(


Evaluating Linear Regression...
Evaluating LightGBM...
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 17, number of used features: 0
[LightGBM] [Info] Start training from score 171.441583
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 29, number of used features: 0
[LightGBM] [Info] Start training from score 170.505630
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000059 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 225
[LightGBM] [Info] Number of data points in the train set: 41, number of used features: 15
[LightGBM] [Info] Start training from score 172.322812
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000058 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 285
[LightGBM] [Info] Number of data points in the train set: 53

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
 This problem is unconstrained.



At iterate    5    f=  2.39422D+00    |proj g|=  1.08163D-02

At iterate   10    f=  2.28247D+00    |proj g|=  7.66665D-03

At iterate   15    f=  2.28201D+00    |proj g|=  1.34209D-03

At iterate   20    f=  2.28172D+00    |proj g|=  1.83821D-03

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    5     23     31      1     0     0   2.626D-06   2.282D+00
  F =   2.2817161100404388     

CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL            


  return get_prediction_index(


TypeError: unsupported format string passed to Series.__format__