<a href="https://colab.research.google.com/github/pnperl/Equity/blob/main/EquityGpt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
# Install required libraries
!pip install --quiet yfinance numpy pandas scikit-learn tensorflow shap matplotlib nltk requests

# Import Libraries
import yfinance as yf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import shap
import requests
import json
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Dropout, LayerNormalization, MultiHeadAttention, Input, GlobalAveragePooling1D
from tensorflow.keras.models import Model
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import os

# Ensure necessary nltk data is downloaded
nltk.download('vader_lexicon')

# === User Input for Stock Selection ===
stock_symbol = input("Enter stock symbol (e.g., TSLA for Tesla): ").strip()
print(f"Fetching data for stock: {stock_symbol}")

# === Fetch Historical Stock Data ===
print(f"Stock symbol: {stock_symbol}")

df = yf.download(stock_symbol, start="2010-01-01", interval="1d", auto_adjust=True)
print(df.columns)
print(df.head())  # Print the first few rows
print(df.isnull().sum())
print(df.dtypes)

print("Stock data fetched successfully.")

if df.empty:
    print("DataFrame is empty. Check the stock symbol or date range.")


# === Technical Indicators Calculation ===
print(f"Stock symbol (before indicators): {stock_symbol}")
print("Calculating technical indicators...")
try:
    df[('50_MA', stock_symbol)] = np.nan #create the column.
    df[('200_MA', stock_symbol)] = np.nan #create the column.
    df[('Volatility', stock_symbol)] = np.nan #create the column.
    df[('RSI', stock_symbol)] = np.nan #create the column.
    df[('MACD', stock_symbol)] = np.nan #create the column.

    df[('50_MA', stock_symbol)] = df[('Close', stock_symbol)].rolling(window=50).mean()
    df[('200_MA', stock_symbol)] = df[('Close', stock_symbol)].rolling(window=200).mean()
    df[('Volatility', stock_symbol)] = df[('Close', stock_symbol)].rolling(window=50).std()
    df[('RSI', stock_symbol)] = 100 - (100 / (1 + (df[('Close', stock_symbol)].diff().rolling(14).mean() / df[('Close', stock_symbol)].diff().rolling(14).std())))
    df[('MACD', stock_symbol)] = df[('Close', stock_symbol)].ewm(span=12, adjust=False).mean() - df[('Close', stock_symbol)].ewm(span=26, adjust=False).mean()
    df = df.sort_index(axis=1)
    print("Technical indicators calculated.")
except KeyError as e:
    print(f"KeyError during technical indicator calculation: {e}")
    print(df.columns)  # Print the columns to see what's available

# === Sentiment Analysis ===
def get_sentiment_score(stock, date):
    try:
        url = f"https://newsapi.org/v2/everything?q={stock}&from={date}&to={date}&apiKey=afce3fd1c27d40d186f4f3a3b1639a5f"  # Replace with your NewsAPI key
        response = requests.get(url)
        news_data = json.loads(response.text)

        analyzer = SentimentIntensityAnalyzer()
        sentiment_scores = [analyzer.polarity_scores(article["title"])['compound'] for article in news_data.get("articles", [])]

        return np.mean(sentiment_scores) if sentiment_scores else 0
    except Exception as e:
        print(f"Error fetching sentiment: {e}")
        return 0

if "Sentiment" in df.columns:
    df = df.drop(columns=["Sentiment"])

sentiments = []
for index, row in df.iterrows():
    sentiment = get_sentiment_score(stock_symbol, index.strftime('%Y-%m-%d'))
    sentiments.append(sentiment)

df["Sentiment"] = sentiments
df.dropna(inplace=True)

# === Data Preprocessing ===
print("Preprocessing data...")
features = [('Close', stock_symbol), ('50_MA', stock_symbol), ('200_MA', stock_symbol), ('Volatility', stock_symbol), ('RSI', stock_symbol), ('MACD', stock_symbol), "Sentiment"]
scaler = MinMaxScaler(feature_range=(0, 1))
df_scaled = scaler.fit_transform(df[features])

print("Creating sequences...")
def create_sequences(data, time_steps=60):
    X, y = [], []
    for i in range(len(data) - time_steps):
        X.append(data[i:i+time_steps])
        y.append(data[i+time_steps][0])
    return np.array(X), np.array(y)

time_steps = 60
X, y = create_sequences(df_scaled, time_steps)
print("Sequences created.")

# Train-Test Split
print("Splitting data into training and testing sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
print(f"Training set size: {len(X_train)}, Testing set size: {len(X_test)}")

# === Transformer-Based Model ===
def build_transformer_model(input_shape):
    print("Building Transformer model...")
    inputs = Input(shape=input_shape)
    attn_output = MultiHeadAttention(num_heads=2, key_dim=input_shape[-1])(inputs, inputs)
    x = LayerNormalization(epsilon=1e-6)(attn_output + inputs)
    x = GlobalAveragePooling1D()(x)
    x = Dense(32, activation="relu")(x)
    x = Dropout(0.1)(x)
    outputs = Dense(1, activation="linear")(x)

    model = Model(inputs, outputs)
    model.compile(optimizer="adam", loss="mean_squared_error")
    print("Model built successfully.")
    return model

model = build_transformer_model((time_steps, len(features)))

# Train Model
print("Training model...")
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))
print("Model training completed.")

# === SHAP Explainability ===
print("Calculating SHAP values using KernelExplainer...")
explainer = shap.KernelExplainer(model.predict, X_train[:100])

X_test_2d = X_test[:10].reshape(10 * X_test.shape[1], X_test.shape[2])

shap_values = explainer.shap_values(X_test_2d)

shap_values_3d = np.reshape(shap_values, (10, X_test.shape[1], X_test.shape[2]))

X_test_original_shape = np.reshape(X_test_2d, (10, X_test.shape[1], X_test.shape[2]))

shap.summary_plot(shap_values_3d[0], X_test_original_shape[0], feature_names=features)

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Enter stock symbol (e.g., TSLA for Tesla): hdfcbank.ns
Fetching data for stock: hdfcbank.ns
Stock symbol: hdfcbank.ns


[*********************100%***********************]  1 of 1 completed


MultiIndex([( 'Close', 'HDFCBANK.NS'),
            (  'High', 'HDFCBANK.NS'),
            (   'Low', 'HDFCBANK.NS'),
            (  'Open', 'HDFCBANK.NS'),
            ('Volume', 'HDFCBANK.NS')],
           names=['Price', 'Ticker'])
Price            Close        High         Low        Open      Volume
Ticker     HDFCBANK.NS HDFCBANK.NS HDFCBANK.NS HDFCBANK.NS HDFCBANK.NS
Date                                                                  
2010-01-04  151.915115  153.945755  150.743932  151.407449     3050490
2010-01-05  152.048691  153.634015  151.852751  152.298067     8386600
2010-01-06  152.151123  153.188703  150.347585  152.654319     6639840
2010-01-07  152.547440  157.499351  151.549936  157.499351     6123980
2010-01-08  152.747833  153.491503  151.496496  152.476183     7085900
Price   Ticker     
Close   HDFCBANK.NS    0
High    HDFCBANK.NS    0
Low     HDFCBANK.NS    0
Open    HDFCBANK.NS    0
Volume  HDFCBANK.NS    0
dtype: int64
Price   Ticker     
Close   HDFCBANK.NS 

KeyError: "None of [Index([     ('Close', 'hdfcbank.ns'),      ('50_MA', 'hdfcbank.ns'),\n           ('200_MA', 'hdfcbank.ns'), ('Volatility', 'hdfcbank.ns'),\n              ('RSI', 'hdfcbank.ns'),       ('MACD', 'hdfcbank.ns'),\n                         'Sentiment'],\n      dtype='object')] are in the [columns]"