In [230]:
# !pip install yfinance pandas numpy scikit-learn xgboost lightgbm tensorflow matplotlib optuna ta

In [275]:
import yfinance as yf
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import ta
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import StackingRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import mutual_info_regression
from sklearn.decomposition import PCA
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, GRU

from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
import optuna

In [232]:
def fetch_stock_data(symbol, start_date, end_date):
       
    stock = yf.Ticker(symbol)
    df = stock.history(start=start_date, end=end_date)
    # return df.reset_index()
    return df
    

def add_technical_indicators(df):
    df['SMA_20'] = ta.trend.sma_indicator(df['Close'], window=20)
    df['SMA_50'] = ta.trend.sma_indicator(df['Close'], window=50)
    df['EMA_20'] = ta.trend.ema_indicator(df['Close'], window=20)
    df['MACD'] = ta.trend.macd_diff(df['Close'])
    df['ADX'] = ta.trend.adx(df['High'], df['Low'], df['Close'])
    df['RSI'] = ta.momentum.rsi(df['Close'])
    df['Stoch_Osc'] = ta.momentum.stoch(df['High'], df['Low'], df['Close'])
    df['Williams_R'] = ta.momentum.williams_r(df['High'], df['Low'], df['Close'])
    df['BBlow'], df['BBmid'], df['BBupp'] = ta.volatility.bollinger_hband_indicator(df['Close']), ta.volatility.bollinger_mavg(df['Close']), ta.volatility.bollinger_lband_indicator(df['Close'])
    df['ATR'] = ta.volatility.average_true_range(df['High'], df['Low'], df['Close'])
    df['OBV'] = ta.volume.on_balance_volume(df['Close'], df['Volume'])
    df['CMF'] = ta.volume.chaikin_money_flow(df['High'], df['Low'], df['Close'], df['Volume'])

    df['Keltner_Channel_Upper'], df['Keltner_Channel_Lower'] = ta.volatility.keltner_channel_hband(df['High'], df['Low'], df['Close']), ta.volatility.keltner_channel_lband(df['High'], df['Low'], df['Close'])
    df['Mass_Index'] = ta.trend.mass_index(df['High'], df['Low'])
    df['TRIX'] = ta.trend.trix(df['Close'])
    df['Ultimate_Oscillator'] = ta.momentum.ultimate_oscillator(df['High'], df['Low'], df['Close'])
    return df


def add_derived_features(df):
    df['Price_Change'] = df['Close'].diff()
    df['Pct_Change'] = df['Close'].pct_change()
    df['VWAP'] = (df['Volume'] * (df['High'] + df['Low'] + df['Close']) / 3).cumsum() / df['Volume'].cumsum()
    df['High_Volume'] = (df['Volume'] > df['Volume'].rolling(window=20).mean() * 1.5).astype(int)
    df['Significant_Price_Move'] = ((df['Close'] - df['Open']).abs() > df['Close'].rolling(window=20).std()).astype(int)
    df['Volume_Spike_With_Price_Move'] = ((df['Volume'] > df['Volume'].rolling(window=20).mean() * 2) & (df['Significant_Price_Move'] == 1)).astype(int)
    df['Relative_Volume'] = df['Volume'] / df['Volume'].rolling(window=20).mean()
    df['Day_of_Week'] = df.index.dayofweek
    df['Is_Month_End'] = df.index.is_month_end.astype(int)
    df['Price_Momentum'] = df['Close'] - df['Close'].shift(10)
    df['Volume_Price_Trend'] = (df['Volume'] * (df['Close'] - df['Close'].shift(1))).cumsum()
    df['Acceleration'] = df['Price_Change'] - df['Price_Change'].shift(1)

    # Add polynomial features
    poly = PolynomialFeatures(degree=2, include_bias=False)
    poly_features = poly.fit_transform(df[['Close', 'Volume']])
    df['Close_Squared'] = poly_features[:, 2]
    df['Volume_Squared'] = poly_features[:, 3]
    df['Close_Volume_Interaction'] = poly_features[:, 4]

# Add lagged features
    for i in [1, 2, 3, 5, 10]:
        df[f'Close_Lag_{i}'] = df['Close'].shift(i)
        df[f'Volume_Lag_{i}'] = df['Volume'].shift(i)

# Add rolling window features
    for window in [5, 10, 20]:
        df[f'Close_Roll_Mean_{window}'] = df['Close'].rolling(window=window).mean()
        df[f'Close_Roll_Std_{window}'] = df['Close'].rolling(window=window).std()
        df[f'Volume_Roll_Mean_{window}'] = df['Volume'].rolling(window=window).mean()

    # Add Fourier features
    for period in [5, 10, 21]:
        df[f'Fourier_Cos_{period}'] = np.cos(2 * np.pi * df.index.dayofyear / period)
        df[f'Fourier_Sin_{period}'] = np.sin(2 * np.pi * df.index.dayofyear / period)
    
    return df

def prepare_data_for_model(df):
    df = df.sort_index()
    df['Target'] = df['Close'].shift(-1)
    df = df.dropna()
    
    features = [col for col in df.columns if col not in ['Target', 'Open', 'High', 'Low', 'Close', 'Volume']]
    X = df[features]
    y = df['Target']
    
    return X, y

def feature_selection(X, y, k=50):
    selector = SelectKBest(score_func=mutual_info_regression, k=k)
    X_selected = selector.fit_transform(X, y)
    selected_features = X.columns[selector.get_support()]
    return X[selected_features]



In [233]:
def apply_pca(X, n_components=0.95):
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X)
    return X_pca, pca



In [254]:
# def split_and_scale_data(X, y, test_size=0.2):
    
#     # Ensure data is sorted by date
#     # X = X.sort_index()
#     # y = y.sort_index()
#     # Use TimeSeriesSplit for more appropriate validation
#     tscv = TimeSeriesSplit(n_splits=5)
#     for train_index, test_index in tscv.split(X):
#         X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#         y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
#     # Use RobustScaler to handle outliers better
#     scaler = RobustScaler()
#     X_train_scaled = scaler.fit_transform(X_train)
#     X_test_scaled = scaler.transform(X_test)
    
#     return X_train_scaled, X_test_scaled, y_train, y_test, scaler

def split_and_scale_data(X, y, test_size=0.2):
    # Ensure X and y are pandas DataFrames/Series
    X = pd.DataFrame(X)
    y = pd.Series(y)
    
    # Ensure data is sorted by date
    X = X.sort_index()
    y = y.sort_index()
    
    # Use TimeSeriesSplit for more appropriate validation
    tscv = TimeSeriesSplit(n_splits=5)
    
    # We'll use the last split for our final train/test set
    train_index, test_index = list(tscv.split(X))[-1]
    
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Use RobustScaler to handle outliers better
    scaler = RobustScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    return X_train_scaled, X_test_scaled, y_train, y_test, scaler




In [235]:
def create_lstm_model(input_shape):
    model = Sequential([
        Bidirectional(LSTM(64, return_sequences=True, input_shape=input_shape)),
        Dropout(0.2),
        Bidirectional(LSTM(32)),
        Dropout(0.2),
        Dense(16, activation='relu'),
        Dense(1)
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='huber')  # Huber loss for robustness
    return model

def create_gru_model(input_shape):
    model = Sequential([
        GRU(64, return_sequences=True, input_shape=input_shape),
        Dropout(0.2),
        GRU(32),
        Dropout(0.2),
        Dense(16, activation='relu'),
        Dense(1)
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='huber')
    return model

In [236]:
def objective(trial):
    xgb_params = {
        'max_depth': trial.suggest_int('xgb_max_depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('xgb_learning_rate', 1e-3, 1.0),
        'n_estimators': trial.suggest_int('xgb_n_estimators', 50, 300),
        'min_child_weight': trial.suggest_int('xgb_min_child_weight', 1, 10),
        'subsample': trial.suggest_uniform('xgb_subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_uniform('xgb_colsample_bytree', 0.6, 1.0),
    }
    
    lgb_params = {
        'num_leaves': trial.suggest_int('lgb_num_leaves', 20, 100),
        'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-3, 1.0),
        'n_estimators': trial.suggest_int('lgb_n_estimators', 50, 300),
        'min_child_samples': trial.suggest_int('lgb_min_child_samples', 1, 100),
        'subsample': trial.suggest_uniform('lgb_subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.6, 1.0),
    }
    # Added hyperparameters for GRU
    gru_units = trial.suggest_int('gru_units', 32, 128)
    gru_layers = trial.suggest_int('gru_layers', 1, 3)
    
    xgb_model = xgb.XGBRegressor(**xgb_params)
    lgb_model = lgb.LGBMRegressor(**lgb_params)
    lstm_model = create_lstm_model((X_train_scaled.shape[1], 1))
    gru_model = create_gru_model((X_train_scaled.shape[1], 1))
    rf_model = RandomForestRegressor(n_estimators=100)
    
    stacked_model = StackingRegressor(
        estimators=[
            ('xgb', xgb_model),
            ('lgb', lgb_model),
            ('lstm', lstm_model),
            ('gru', gru_model),
            ('rf', rf_model)
        ],
        final_estimator=xgb.XGBRegressor(max_depth=3, learning_rate=0.1, n_estimators=100)
    )
    
    stacked_model.fit(X_train_scaled, y_train)
    y_pred = stacked_model.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    
    return mse

In [237]:
def train_model(X_train, y_train, X_test, y_test):
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=100)  # Increased number of trials
    
    best_params = study.best_params
    
    # [Create models with best parameters]
    
    stacked_model = StackingRegressor(
        estimators=[
            ('xgb', xgb_model),
            ('lgb', lgb_model),
            ('lstm', lstm_model),
            ('gru', gru_model),
            ('rf', rf_model)
        ],
        final_estimator=xgb.XGBRegressor(max_depth=3, learning_rate=0.1, n_estimators=100)
    )
    
    # Use early stopping and learning rate reduction
    early_stopping = EarlyStopping(patience=10, restore_best_weights=True)
    lr_reducer = ReduceLROnPlateau(factor=0.5, patience=5)
    
    stacked_model.fit(X_train, y_train, 
                      lstm__callbacks=[early_stopping, lr_reducer],
                      gru__callbacks=[early_stopping, lr_reducer])
    return stacked_model


In [238]:
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f"Mean Squared Error: {mse}")
    print(f"Mean Absolute Error: {mae}")
    print(f"R-squared Score: {r2}")
    
    return y_pred

In [239]:
def plot_actual_vs_predicted(y_test, y_pred):
    plt.figure(figsize=(12, 6))
    plt.plot(y_test.index, y_test.values, label='Actual')
    plt.plot(y_test.index, y_pred, label='Predicted')
    plt.title('Actual vs Predicted Stock Prices')
    plt.xlabel('Date')
    plt.ylabel('Price')
    plt.legend()
    plt.show()

In [240]:
def predict_next_day(model, scaler, last_data_point):
    last_data_point_scaled = scaler.transform(last_data_point.values.reshape(1, -1))
    next_day_prediction = model.predict(last_data_point_scaled)[0]
    return next_day_prediction

In [241]:
# if __name__ == "__main__":
symbol = "RELIANCE.NS"
end_date = datetime.now()
start_date = end_date - timedelta(days=1500)  # Increased historical data

df = fetch_stock_data(symbol, start_date, end_date)
df = df[[ 'Open', 'High', 'Low', 'Close', 'Volume']]

df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-05-19 00:00:00+05:30,1325.683498,1329.959932,1276.777905,1281.918579,21157927
2020-05-20 00:00:00+05:30,1282.919525,1316.448343,1267.451656,1304.483521,27660492
2020-05-21 00:00:00+05:30,1305.666277,1329.73238,1296.567574,1311.353027,19583990
2020-05-22 00:00:00+05:30,1320.952305,1326.593448,1297.932505,1302.527344,18914486
2020-05-26 00:00:00+05:30,1317.631133,1319.041449,1288.651667,1295.703247,16608317


In [242]:
df.index.dayofyear

Index([140, 141, 142, 143, 147, 148, 149, 150, 153, 154,
       ...
       164, 165, 166, 170, 171, 172, 173, 176, 177, 178],
      dtype='int32', name='Date', length=1018)

In [243]:
df = add_technical_indicators(df)
df = add_derived_features(df)


In [244]:
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,SMA_20,SMA_50,EMA_20,MACD,ADX,...,Volume_Roll_Mean_10,Close_Roll_Mean_20,Close_Roll_Std_20,Volume_Roll_Mean_20,Fourier_Cos_5,Fourier_Sin_5,Fourier_Cos_10,Fourier_Sin_10,Fourier_Cos_21,Fourier_Sin_21
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-05-19 00:00:00+05:30,1325.683498,1329.959932,1276.777905,1281.918579,21157927,,,,,0.0,...,,,,,1.0,-6.858022e-15,1.0,-3.429011e-15,-0.5,-0.8660254
2020-05-20 00:00:00+05:30,1282.919525,1316.448343,1267.451656,1304.483521,27660492,,,,,0.0,...,,,,,0.309017,0.9510565,0.809017,0.5877853,-0.222521,-0.9749279
2020-05-21 00:00:00+05:30,1305.666277,1329.73238,1296.567574,1311.353027,19583990,,,,,0.0,...,,,,,-0.809017,0.5877853,0.309017,0.9510565,0.07473,-0.9972038
2020-05-22 00:00:00+05:30,1320.952305,1326.593448,1297.932505,1302.527344,18914486,,,,,0.0,...,,,,,-0.809017,-0.5877853,-0.309017,0.9510565,0.365341,-0.9308737
2020-05-26 00:00:00+05:30,1317.631133,1319.041449,1288.651667,1295.703247,16608317,,,,,0.0,...,,,,,-0.809017,0.5877853,-0.309017,-0.9510565,1.0,-1.714506e-15


In [245]:
print(f"No of rows {df.shape[0]}, No of Columns {df.shape[1]}")

No of rows 1018, No of Columns 64


In [250]:
X, y = prepare_data_for_model(df)
X = feature_selection(X, y)
# X_pca, pca = apply_pca(X)

In [261]:
X.head()

Unnamed: 0_level_0,SMA_20,SMA_50,EMA_20,MACD,ADX,RSI,Stoch_Osc,Williams_R,BBlow,BBmid,...,Close_Roll_Std_5,Volume_Roll_Mean_5,Close_Roll_Mean_10,Close_Roll_Std_10,Volume_Roll_Mean_10,Close_Roll_Mean_20,Close_Roll_Std_20,Volume_Roll_Mean_20,Fourier_Cos_21,Fourier_Sin_21
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-07-28 00:00:00+05:30,1751.265771,1563.624546,1771.622142,20.215582,53.422859,81.992962,94.885424,-5.114576,1.0,1751.265771,...,67.709534,37412182.6,1830.109192,115.747193,35162319.7,1751.265771,120.839408,27886564.7,1.0,-2.449294e-15
2020-07-29 00:00:00+05:30,1767.988403,1576.295098,1785.319669,14.499729,52.526176,69.569951,74.513448,-25.486552,0.0,1767.988403,...,44.521072,38701376.8,1853.190662,106.127703,31812001.0,1767.988403,119.166122,28991547.1,0.955573,0.2947552
2020-07-30 00:00:00+05:30,1783.907446,1588.737266,1798.774169,10.021093,51.693541,70.299429,77.557398,-22.442602,0.0,1783.907446,...,30.741908,40153746.2,1877.441504,89.614782,32406787.0,1783.907446,117.958218,30093244.9,0.826239,0.5633201
2020-07-31 00:00:00+05:30,1796.66095,1600.279207,1807.314731,3.384438,50.497824,64.592906,67.140705,-32.859295,0.0,1796.66095,...,41.108892,35589815.0,1891.638452,76.906997,33668351.8,1796.66095,114.566987,31231339.75,0.62349,0.7818315
2020-08-03 00:00:00+05:30,1803.841644,1610.936086,1809.986789,-5.217867,48.3773,57.587237,52.644687,-47.355313,0.0,1803.841644,...,56.24041,32230870.6,1899.778394,63.930087,34174474.0,1803.841644,112.120424,31223819.7,-0.222521,0.9749279


In [264]:
X_train_scaled, X_test_scaled, y_train, y_test, scaler = split_and_scale_data(X, y)


In [276]:
model = train_model(X_train_scaled, y_train, X_test_scaled, y_test)

[I 2024-06-27 06:23:07,440] A new study created in memory with name: no-name-214d81b4-0896-41d2-8d31-2369fe07d1c5
  'learning_rate': trial.suggest_loguniform('xgb_learning_rate', 1e-3, 1.0),
  'subsample': trial.suggest_uniform('xgb_subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('xgb_colsample_bytree', 0.6, 1.0),
  'learning_rate': trial.suggest_loguniform('lgb_learning_rate', 1e-3, 1.0),
  'subsample': trial.suggest_uniform('lgb_subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('lgb_colsample_bytree', 0.6, 1.0),
  super().__init__(**kwargs)
[W 2024-06-27 06:23:07,512] Trial 0 failed with parameters: {'xgb_max_depth': 6, 'xgb_learning_rate': 0.0916000310155272, 'xgb_n_estimators': 133, 'xgb_min_child_weight': 5, 'xgb_subsample': 0.7719998314269622, 'xgb_colsample_bytree': 0.9652527226244636, 'lgb_num_leaves': 63, 'lgb_learning_rate': 0.22587236227784482, 'lgb_n_estimators': 231, 'lgb_min_child_samples': 57, 'lgb_subsample': 0.801679779949622, 'lgb_c

ValueError: The estimator Sequential should be a regressor.