<a href="https://colab.research.google.com/github/singhtejn/Stock_AI_ML/blob/main/Hybrid_model_stock_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Download TA-Lib
!wget http://prdownloads.sourceforge.net/ta-lib/ta-lib-0.4.0-src.tar.gz

!ls
!tar xvzf ta-lib-0.4.0-src.tar.gz
!ls

import os
os.chdir('ta-lib') # Can't use !cd in co-lab

!./configure --prefix=/usr
!make
!make install

# wait ~ 30s
os.chdir('../')
!ls

!pip install TA-Lib
import talib


In [32]:
# !pip install imbalanced-learn
import yfinance as yf
import talib
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Dropout, Flatten, Reshape
from sklearn.metrics import classification_report, accuracy_score
from imblearn.under_sampling import RandomUnderSampler

# List of stock tickers to fetch data for
tickers = ['TATAMOTORS.NS', 'M&M.NS', 'MARUTI.NS']  # Add more tickers as needed

# Function to fetch historical stock data
def fetch_stock_data(tickers, start, end):
    data = {}
    for ticker in tickers:
        df = yf.download(ticker, start=start, end=end)
        df['Ticker'] = ticker
        data[ticker] = df
    return pd.concat(data.values())

# Fetch data
start_date='2019-08-01'
end_date='2024-07-31'
data = fetch_stock_data(tickers, start_date, end_date)

# Calculate technical indicators using TA-Lib
def calculate_indicators(df):
    df['EMA20'] = talib.EMA(df['Close'], timeperiod=20)
    df['EMA50'] = talib.EMA(df['Close'], timeperiod=50)
    df['EMA100'] = talib.EMA(df['Close'], timeperiod=100)
    df['EMA200'] = talib.EMA(df['Close'], timeperiod=200)
    df['RSI'] = talib.RSI(df['Close'])
    df['MACD'], df['MACDSignal'], _ = talib.MACD(df['Close'])
    df['STOCHF'], _ = talib.STOCHF(df['High'], df['Low'], df['Close'])
    df['CCI'] = talib.CCI(df['High'], df['Low'], df['Close'])
    df['ADX'] = talib.ADX(df['High'], df['Low'], df['Close'])
    df['CMO'] = talib.CMO(df['Close'])
    df['MOM'] = talib.MOM(df['Close'])
    df['WILLR'] = talib.WILLR(df['High'], df['Low'], df['Close'])
    df['MFI'] = talib.MFI(df['High'], df['Low'], df['Close'], df['Volume'])
    df['ATR'] = talib.ATR(df['High'], df['Low'], df['Close'])
    df['BOLL'], _, _ = talib.BBANDS(df['Close'])
    df['AD'] = talib.AD(df['High'], df['Low'], df['Close'], df['Volume'])
    df['OBV'] = talib.OBV(df['Close'], df['Volume'])
    # VWAP is not directly available in TA-Lib, so calculate manually
    df['VWAP'] = (df['Volume'] * df['Close']).cumsum() / df['Volume'].cumsum()
    return df

# Function to create labels for a single ticker's DataFrame
def create_labels(df):
    df['Label'] = 'Hold'
    # Set the window for future price comparison
    days = 5

    for i in range(len(df) - days):
        current_price = df.iloc[i]['Close']
        future_price = df.iloc[i + days]['Close']
        price_change = (future_price - current_price) / current_price

        # Check if conditions for Buy or Sell are met
        if price_change >= 0.05:
            df.at[df.index[i], 'Label'] = 'Buy'
        elif price_change <= -0.05:
            df.at[df.index[i], 'Label'] = 'Sell'

    return df

# Process each ticker separately
def process_tickers(tickers):
    all_data = []
    for ticker in tickers:
        df = data[data['Ticker'] == ticker].copy()
        df = calculate_indicators(df)
        df = create_labels(df)
        df['Ticker'] = ticker  # Keep ticker information
        all_data.append(df)
    return pd.concat(all_data)

# Process data
data = process_tickers(tickers)

# Prepare features and labels
features = ['Open', 'High', 'Low', 'Close', 'Volume', 'EMA20', 'EMA50', 'EMA100', 'EMA200', 'RSI', 'MACD', 'MACDSignal', 'STOCHF', 'CCI', 'ADX', 'CMO', 'MOM', 'WILLR', 'MFI', 'ATR', 'BOLL', 'AD', 'OBV', 'VWAP']
X = data[features].fillna(0)
y = data['Label']

# Encode labels
y = np.array(y)  # Ensure y is a numpy array

# Label encoding for categorical labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Balance classes if necessary
# Perform undersampling
undersampler = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)

# Normalize features
scaler = MinMaxScaler()
X_train_resampled_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

# Convert labels to categorical (one-hot encoding)
from tensorflow.keras.utils import to_categorical
y_train_resampled_cat = to_categorical(y_train_resampled)
y_test_cat = to_categorical(y_test)

# Build and compile the model
model = Sequential([
    Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train_resampled_scaled.shape[1], 1)),
    MaxPooling1D(pool_size=2),
    Dropout(0.5),
    LSTM(50, return_sequences=False),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(3, activation='softmax')  # Assuming 3 classes: Hold, Buy, Sell
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Reshape data for CNN input
X_train_resampled_reshaped = np.expand_dims(X_train_resampled_scaled, axis=2)
X_test_reshaped = np.expand_dims(X_test_scaled, axis=2)

# Train the model
history = model.fit(X_train_resampled_reshaped, y_train_resampled_cat, epochs=50, batch_size=32, validation_split=0.2)

# Evaluate the model
y_pred = model.predict(X_test_reshaped)
y_pred_classes = np.argmax(y_pred, axis=1)
y_test_classes = np.argmax(y_test_cat, axis=1)

print('Neural Network Accuracy:', accuracy_score(y_test_classes, y_pred_classes))
print('Neural Network Classification Report:\n', classification_report(y_test_classes, y_pred_classes, target_names=['Hold', 'Buy', 'Sell']))

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 60ms/step - accuracy: 0.3246 - loss: 1.0998 - val_accuracy: 0.0000e+00 - val_loss: 1.4066
Epoch 2/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.3822 - loss: 1.0537 - val_accuracy: 0.0000e+00 - val_loss: 1.6961
Epoch 3/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.4111 - loss: 1.0345 - val_accuracy: 0.0000e+00 - val_loss: 1.7110
Epoch 4/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.4145 - loss: 1.0548 - val_accuracy: 0.0000e+00 - val_loss: 1.6430
Epoch 5/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.3972 - loss: 1.0314 - val_accuracy: 0.0000e+00 - val_loss: 1.6802
Epoch 6/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.4200 - loss: 1.0326 - val_accuracy: 0.0000e+00 - val_loss: 1.6884
Epoch 7/50
[1m19/19



[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step
Neural Network Accuracy: 0.43243243243243246
Neural Network Classification Report:
               precision    recall  f1-score   support

        Hold       0.20      0.82      0.32       114
         Buy       0.86      0.40      0.54       571
        Sell       0.00      0.00      0.00        55

    accuracy                           0.43       740
   macro avg       0.35      0.41      0.29       740
weighted avg       0.70      0.43      0.47       740



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [36]:
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Conv1D, MaxPooling1D, LSTM, GRU, Input, Attention
from tensorflow.keras.layers import Layer

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
import yfinance as yf
import pandas as pd
import joblib
import matplotlib.pyplot as plt

# Fetch historical OHLCV data for given stock tickers
def fetch_data(tickers, start, end):
    dfs = []
    for ticker in tickers:
        df = yf.download(ticker, start=start, end=end)  # Download data from Yahoo Finance
        df['Ticker'] = ticker  # Add ticker as a column
        df.reset_index(inplace=True)  # Ensure 'Date' is a column
        dfs.append(df)  # Append dataframe to the list
    return pd.concat(dfs, ignore_index=True)  # Concatenate all dataframes and reset index

# Calculate Exponential Moving Average (EMA)
def calculate_ema(df, column, period):
    return df[column].ewm(span=period, adjust=False).mean()

# Calculate MACD and MACD Signal
def calculate_macd(df):
    df['EMA12'] = calculate_ema(df, 'Close', 12)
    df['EMA26'] = calculate_ema(df, 'Close', 26)
    df['MACD'] = df['EMA12'] - df['EMA26']
    df['MACD Signal'] = calculate_ema(df, 'MACD', 9)
    return df

# Calculate Bollinger Bands
def calculate_bollinger_bands(df, column, window=20, num_sd=2):
    rolling_mean = df[column].rolling(window=window).mean()
    rolling_std = df[column].rolling(window=window).std()
    df['Bollinger High'] = rolling_mean + (rolling_std * num_sd)
    df['Bollinger Low'] = rolling_mean - (rolling_std * num_sd)
    return df

# Calculate VWAP (Volume Weighted Average Price)
def calculate_vwap(df):
    df['Cumulative_Price_Volume'] = (df['Close'] * df['Volume']).cumsum()
    df['Cumulative_Volume'] = df['Volume'].cumsum()
    df['VWAP'] = df['Cumulative_Price_Volume'] / df['Cumulative_Volume']
    return df

# Calculate Relative Strength Index (RSI)
def calculate_rsi(df, column, period=14):
    delta = df[column].diff(1)
    gain = (delta.where(delta > 0, 0)).rolling(window=period).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=period).mean()
    rs = gain / loss
    df['RSI'] = 100 - (100 / (1 + rs))
    return df

# Apply technical indicators to DataFrame ticker-wise
def apply_indicators(df):
    grouped = df.groupby('Ticker')  # Group data by ticker

    def apply_group(group):
        group = group.sort_values(by='Date').drop_duplicates(subset=['Date'])  # Ensure no duplicate dates
        group['EMA20'] = calculate_ema(group, 'Close', 20)
        group['EMA50'] = calculate_ema(group, 'Close', 50)
        group['EMA100'] = calculate_ema(group, 'Close', 100)
        group = calculate_macd(group)
        group = calculate_bollinger_bands(group, 'Close')
        group = calculate_vwap(group)
        group = calculate_rsi(group, 'Close')
        return group

    result_df = grouped.apply(apply_group).reset_index(drop=True)  # Apply to each group and reset index
    return result_df

# Create labels for the classification task

'''
def create_labels(df):
    df['Future Price'] = df.groupby('Ticker')['Close'].shift(-5)
    df['Price Change'] = (df['Future Price'] - df['Close']) / df['Close']
    df['Buy'] = df['Price Change'] >= 0.02
    df['Sell'] = df['Price Change'] <= -0.02
    df['Label'] = np.where(df['Buy'], 1, np.where(df['Sell'], 2, 0))  # Label encoding: Buy=1, Sell=2, Hold=0
    df.drop(columns=['Future Price', 'Price Change'], inplace=True)
    return df
'''
# labels using rsi
def create_labels(df):
    df['Buy'] = (df['RSI'] > 65) & (df['RSI'].diff() > 0)
    df['Sell'] = (df['RSI'] < 35) & (df['RSI'].diff() < 0)
    df['Label'] = np.where(df['Buy'], 1, np.where(df['Sell'], 2, 0))  # Label encoding: Buy=1, Sell=2, Hold=0
    return df


# Preprocess data: label creation and scaling
def preprocess_data(df):
    # Create labels
    df = create_labels(df)

    # Drop non-numeric columns for feature scaling
    df.drop(columns=['Date', 'Ticker'], inplace=True)

    # Drop any rows with NaN values
    df.dropna(inplace=True)

    # Normalize features
    scaler = MinMaxScaler()
    features = ['Open', 'High', 'Low', 'Close', 'Volume', 'EMA20', 'EMA50', 'EMA100', 'MACD', 'MACD Signal', 'Bollinger High', 'Bollinger Low', 'VWAP', 'RSI']
    df[features] = scaler.fit_transform(df[features])

    return df, scaler

# Custom Attention Layer
class CustomAttention(Layer):
    def __init__(self, **kwargs):
        super(CustomAttention, self).__init__(**kwargs)

    def call(self, inputs):
        query, value = inputs
        attention = tf.keras.layers.Attention()([query, value])
        return tf.reduce_sum(attention, axis=1)

def build_hybrid_model(input_shape):
    inputs = Input(shape=input_shape)

    # CNN Layers
    x = Conv1D(filters=64, kernel_size=3, activation='relu', padding='same')(inputs)
    x = MaxPooling1D(pool_size=2)(x)
    x = Conv1D(filters=128, kernel_size=3, activation='relu', padding='same')(x)
    x = MaxPooling1D(pool_size=2)(x)

    # RNN Layers
    x = LSTM(100, return_sequences=True)(x)
    x = LSTM(50, return_sequences=True)(x)

    # Attention Mechanism
    query = Dense(50)(x)
    value = Dense(50)(x)
    attention_output = CustomAttention()([query, value])

    # Dense Layers
    x = Dense(50, activation='relu')(attention_output)
    x = Dropout(0.5)(x)
    outputs = Dense(3, activation='softmax')(x)  # Output layer for classification

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model

def main():
    # Define tickers
    tickers = ['DRREDDY.NS', 'HINDALCO.NS', 'JSWSTEEL.NS', 'TATAMOTORS.NS', 'M&M.NS', 'ONGC.NS', 'COALINDIA.NS', 'TECHM.NS', 'UPL.NS', 'TATASTEEL.NS',
               'TATACONSUM.NS', 'ITC.NS', 'ADANIENT.NS', 'TITAN.NS', 'EICHERMOT.NS', 'ADANIPORTS.NS', 'RELIANCE.NS', 'BPCL.NS', 'LT.NS', 'NTPC.NS',
               'LTIM.NS', 'BAJAJ-AUTO.NS', 'SUNPHARMA.NS', 'INFY.NS', 'WIPRO.NS', 'INDUSINDBK.NS', 'HCLTECH.NS', 'SBIN.NS', 'KOTAKBANK.NS', 'HDFCLIFE.NS',
               'BAJAJFINSV.NS', 'SBILIFE.NS', 'BRITANNIA.NS', 'HDFCBANK.NS', 'CIPLA.NS', 'GRASIM.NS', 'NESTLEIND.NS', 'BHARTIARTL.NS', 'TCS.NS',
               'AXISBANK.NS', 'HEROMOTOCO.NS', 'HINDUNILVR.NS', 'ASIANPAINT.NS', 'POWERGRID.NS', 'ULTRACEMCO.NS', 'ICICIBANK.NS', 'APOLLOHOSP.NS',
               'MARUTI.NS', 'BAJFINANCE.NS', 'DIVISLAB.NS']

    # Download historical stock data
    start_date = '2019-08-01'
    end_date = '2024-07-31'
    df = fetch_data(tickers, start=start_date, end=end_date)

    # Apply technical indicators
    df = apply_indicators(df)

    # Preprocess data
    df, scaler = preprocess_data(df)

    # Define features and labels
    features = ['Open', 'High', 'Low', 'Close', 'Volume', 'EMA20', 'EMA50', 'EMA100', 'MACD', 'MACD Signal', 'Bollinger High', 'Bollinger Low', 'VWAP', 'RSI']
    X = df[features].values
    y = df['Label'].values

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

    # Reshape for CNN (samples, time steps, features)
    X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
    X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

    # Build and train the model
    input_shape = (X_train.shape[1], X_train.shape[2])
    model = build_hybrid_model(input_shape)
    model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1)

    # Evaluate the model
    y_pred = np.argmax(model.predict(X_test), axis=1)
    print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
    print(classification_report(y_test, y_pred))

    # Save the model and scaler
    model.save('hybrid_model.h5')
    joblib.dump(scaler, 'scaler.pkl')

if __name__ == "__main__":
    main()


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%*******

Epoch 1/10
[1m1366/1366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 18ms/step - accuracy: 0.7505 - loss: 0.7186 - val_accuracy: 0.8418 - val_loss: 0.3174
Epoch 2/10
[1m1366/1366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 17ms/step - accuracy: 0.8106 - loss: 0.3341 - val_accuracy: 0.8390 - val_loss: 0.2793
Epoch 3/10
[1m1366/1366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 18ms/step - accuracy: 0.8157 - loss: 0.3297 - val_accuracy: 0.8336 - val_loss: 0.3166
Epoch 4/10
[1m1366/1366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 19ms/step - accuracy: 0.8214 - loss: 0.3112 - val_accuracy: 0.8453 - val_loss: 0.2787
Epoch 5/10
[1m1366/1366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 18ms/step - accuracy: 0.8210 - loss: 0.3113 - val_accuracy: 0.8398 - val_loss: 0.2799
Epoch 6/10
[1m1366/1366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 19ms/step - accuracy: 0.8230 - loss: 0.3111 - val_accuracy: 0.8180 - val_loss: 0.3099
Epoc



Accuracy: 0.8110378912685338
              precision    recall  f1-score   support

           0       0.90      0.84      0.87      9036
           1       0.59      0.90      0.71      2007
           2       0.67      0.44      0.53      1097

    accuracy                           0.81     12140
   macro avg       0.72      0.72      0.70     12140
weighted avg       0.83      0.81      0.81     12140

