<a href="https://colab.research.google.com/github/racoope70/daytrading-with-ml/blob/main/multi_ticker_LightGBN_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
#Protocol Buffer Fix (for TensorFlow)
!pip install --upgrade protobuf
!pip install protobuf==3.20.3


Collecting protobuf
  Downloading protobuf-6.30.2-cp39-abi3-manylinux2014_x86_64.whl.metadata (593 bytes)
Downloading protobuf-6.30.2-cp39-abi3-manylinux2014_x86_64.whl (316 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.2/316.2 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 4.25.6
    Uninstalling protobuf-4.25.6:
      Successfully uninstalled protobuf-4.25.6
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.12.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 6.30.2 which is incompatible.
google-cloud-firestore 2.20.1 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have 

In [4]:
#Install TensorFlow (latest stable GPU-compatible version)
!pip install tensorflow

#Install Stable Baselines3 and Trading Libraries
!pip install stable-baselines3[extra] gymnasium gym-anytrading yfinance xgboost joblib

#Reinstall RAPIDS dependencies if needed (optional reset)
!pip install --upgrade --force-reinstall \
    dask==2024.11.2 \
    rapids-dask-dependency==24.12.0 \
    cudf-cu12==24.12.0 \
    cuml-cu12==24.12.0 \
    pylibraft-cu12==24.12.0 \
    pylibcudf-cu12==24.12.0 \
    numba==0.61.0

Collecting gym-anytrading
  Downloading gym_anytrading-2.0.0-py3-none-any.whl.metadata (292 bytes)
Collecting stable-baselines3[extra]
  Downloading stable_baselines3-2.6.0-py3-none-any.whl.metadata (4.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3[extra])
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3[extra])
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3[extra])
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3.0,>=2.3->stable-baselines3[extra])
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.

In [3]:
import torch
import cudf
import cuml
import dask
import pandas as pd
import numpy as np
import scipy
import lightgbm as lgb
import gymnasium as gym
import stable_baselines3

#=========================
#Version Checks
#=========================
print(" Library Versions")
print("--------------------")
print(" PyTorch:", torch.__version__)
print(" CUDA:", torch.version.cuda)
print(" cuDF:", cudf.__version__)
print(" cuML:", cuml.__version__)
print(" Dask:", dask.__version__)
print(" Pandas:", pd.__version__)
print(" NumPy:", np.__version__)
print(" SciPy:", scipy.__version__)
print(" LightGBM:", lgb.__version__)
print(" Gymnasium:", gym.__version__)
print(" Stable Baselines3:", stable_baselines3.__version__)

#=========================
# GPU Check (Torch + NVIDIA)
#=========================
print("\n GPU Availability")
print("--------------------")
print(" PyTorch GPU Available:", torch.cuda.is_available())
print(" GPU Count:", torch.cuda.device_count())
if torch.cuda.is_available():
    print(" GPU Name:", torch.cuda.get_device_name(0))

ImportError: Numba needs NumPy 1.24 or greater. Got NumPy 1.23.

In [1]:
#Clean install of TensorFlow compatible with Colab's GPU (CUDA 11.8 + cuDNN 8.x)
!pip uninstall -y tensorflow keras
!pip install tensorflow==2.12.0

#Restart runtime after this!
!pip install numpy==1.24.4 --force-reinstall

Found existing installation: tensorflow 2.12.0
Uninstalling tensorflow-2.12.0:
  Successfully uninstalled tensorflow-2.12.0
Found existing installation: keras 2.12.0
Uninstalling keras-2.12.0:
  Successfully uninstalled keras-2.12.0
Collecting tensorflow==2.12.0
  Using cached tensorflow-2.12.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting keras<2.13,>=2.12.0 (from tensorflow==2.12.0)
  Using cached keras-2.12.0-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting numpy<1.24,>=1.22 (from tensorflow==2.12.0)
  Using cached numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Using cached tensorflow-2.12.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (586.0 MB)
Using cached keras-2.12.0-py2.py3-none-any.whl (1.7 MB)
Using cached numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
Installing collected packages: numpy, keras, tensorflow
  Attempting uninstall: numpy
    F

Collecting numpy==1.24.4
  Downloading numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Downloading numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/17.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/17.3 MB[0m [31m9.0 MB/s[0m eta [36m0:00:02[0m[2K   [91m━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/17.3 MB[0m [31m51.3 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━[0m [32m9.9/17.3 MB[0m [31m95.9 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m17.3/17.3 MB[0m [31m208.6 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m17.3/17.3 MB[0m [31m208.6 MB/s[0m eta [36m0:00:01[0m[2K   [

In [4]:
#Core & System Utilities
import os
import gc
import sys
import time
import json
import pickle
import random
from datetime import datetime
from collections import defaultdict, deque

#Data Science Essentials
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import numba

#Financial Data
import yfinance as yf

#Machine Learning & Preprocessing
import joblib
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, f1_score, classification_report, confusion_matrix
)

#Deep Learning (TensorFlow/Keras)
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras import Input, backend as K
from tensorflow.keras import mixed_precision

#Visualization & Display
import IPython.display as display

#RAPIDS Libraries (for GPU-accelerated ML, optional)
import cupy as cp

#Reinforcement Learning (Stable Baselines3)
import stable_baselines3
from stable_baselines3 import A2C, DDPG, DQN, PPO, SAC, TD3
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.logger import configure
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.noise import NormalActionNoise
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

#Gym & Trading Environments
import gym
import gymnasium as gym
import gym_anytrading
from gym.spaces import Box
from gymnasium.spaces import Box as GymBox, Discrete
from gymnasium.wrappers import TimeLimit

#CUDA (Optional Paths - for manual GPU configuration)
os.environ['CUDA_HOME'] = '/usr/local/cuda-11.8'
os.environ['PATH'] += ':/usr/local/cuda-11.8/bin'
os.environ['LD_LIBRARY_PATH'] += ':/usr/local/cuda-11.8/lib64'

#GPU Check (Colab only)
!nvidia-smi


ImportError: Numba needs NumPy 1.24 or greater. Got NumPy 1.23.

In [2]:
# Required for TensorFlow compatibility (GPU + cuDNN)
!pip uninstall -y tensorflow keras -q
!pip install tensorflow==2.12.0 -q

# Fix protobuf compatibility
!pip install protobuf==3.20.3 -q

# Essential packages
!pip install numpy==1.24.4 pandas joblib yfinance scikit-learn matplotlib -q


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-text 2.18.1 requires tensorflow<2.19,>=2.18.0, but you have tensorflow 2.12.0 which is incompatible.
tf-keras 2.18.0 requires tensorflow<2.19,>=2.18, but you have tensorflow 2.12.0 which is incompatible.
tensorflow-decision-forests 1.11.0 requires tensorflow==2.18.0, but you have tensorflow 2.12.0 which is incompatible.[0m[31m
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.12.0 requires numpy<1.24,>=1.22, but you have numpy 1.24.4 which is incompatible.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.2.3 which is incompatible.
pymc 5.21.2 requires numpy>=1.25.0, but you have numpy 1.24.4 which is incompatible.
blosc2 3.3.0 requires 

In [11]:
import os
from google.colab import drive

if not os.path.exists('/content/drive/MyDrive'):
    drive.mount('/content/drive')
else:
    print("Google Drive is already mounted.")



Google Drive is already mounted.


In [None]:
#Downgrade NumPy to a compatible version
!pip install numpy==1.24.4 --force-reinstall

#Reinstall LightGBM after fixing NumPy
!pip install lightgbm --force-reinstall --no-cache-dir


In [6]:
!pip install -U scikit-learn==1.3.2 --quiet


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m45.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [1]:
#Imports
import os, gc, joblib, yfinance as yf
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier, plot_importance
from lightgbm import early_stopping, log_evaluation
#Config
save_dir = '/content/drive/MyDrive/LightGBM_Models'
os.makedirs(save_dir, exist_ok=True)

#Toggle test mode
test_mode = False
TICKERS = ['AAPL'] if test_mode else [
    'AAPL', 'TSLA', 'MSFT', 'GOOGL', 'AMZN', 'NVDA', 'META', 'BRK-B', 'JPM', 'JNJ',
    'XOM', 'V', 'PG', 'UNH', 'MA', 'HD', 'LLY', 'MRK', 'PEP', 'KO',
    'BAC', 'ABBV', 'AVGO', 'PFE', 'COST', 'CSCO', 'TMO', 'ABT', 'ACN', 'WMT',
    'MCD', 'ADBE', 'DHR', 'CRM', 'NKE', 'INTC', 'QCOM', 'NEE', 'AMD', 'TXN',
    'AMGN', 'UPS', 'LIN', 'PM', 'UNP', 'BMY', 'LOW', 'RTX', 'CVX', 'IBM',
    'GE', 'SBUX', 'ORCL'
]

#Feature Engineering
def compute_technical_indicators(df):
    df['SMA_50'] = df['Close'].rolling(window=50).mean()
    df['EMA_20'] = df['Close'].ewm(span=20, adjust=False).mean()
    delta = df['Close'].diff()
    gain = delta.where(delta > 0, 0).rolling(window=14).mean()
    loss = -delta.where(delta < 0, 0).rolling(window=14).mean()
    rs = gain / (loss + 1e-6)
    df['RSI'] = 100 - (100 / (1 + rs))
    df['MACD'] = df['Close'].ewm(span=12, adjust=False).mean() - df['Close'].ewm(span=26, adjust=False).mean()
    df['Signal_Line'] = df['MACD'].ewm(span=9, adjust=False).mean()
    df['ATR'] = df['High'].rolling(window=14).max() - df['Low'].rolling(window=14).min()
    df['OBV'] = (np.sign(df['Close'].diff()) * df['Volume']).fillna(0).cumsum()
    typical_price = (df['High'] + df['Low'] + df['Close']) / 3
    df['CCI'] = (typical_price - typical_price.rolling(20).mean()) / (0.015 * typical_price.rolling(20).std())
    df.dropna(inplace=True)
    return df

# Label: 1 = Buy (next close up), 0 = Hold/Sell
def generate_labels(df):
    df['Target'] = (df['Close'].shift(-1) > df['Close']).astype(int)
    df.dropna(inplace=True)
    return df

# Train LightGBM for one stock
def train_lightgbm_for_stock(ticker):
    print(f"\nTraining LightGBM for {ticker}")
    model_path = f"{save_dir}/lgb_{ticker}_model.txt"
    if os.path.exists(model_path) and not test_mode:
        print(f"Skipping {ticker}, model already exists.")
        return

    df = yf.download(ticker, period="720d", interval="1h", progress=False, auto_adjust=False)
    if df.empty:
        print(f"No data for {ticker}, skipping.")
        return

    df = compute_technical_indicators(df)
    df = generate_labels(df)

    features = ['Close', 'SMA_50', 'EMA_20', 'RSI', 'MACD', 'Signal_Line', 'ATR', 'OBV', 'CCI']
    X = df[features]
    y = df['Target']

    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, shuffle=False)

    #Sklearn-style LightGBM classifier
    model = LGBMClassifier(
        objective='binary',
        boosting_type='gbdt',
        learning_rate=0.05,
        n_estimators=500,
        random_state=42
    )

    model.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],
        callbacks=[early_stopping(stopping_rounds=20), log_evaluation(period=50)]
    )

    #Save model and scaler
    model.booster_.save_model(model_path)
    joblib.dump(scaler, f"{save_dir}/{ticker}_scaler.pkl")
    print(f"Model & scaler saved for {ticker}")

    #Predict & evaluate
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy for {ticker}: {acc:.4f}")

    #Plot top features (AAPL only in test mode)
    if test_mode and ticker == 'AAPL':
        print("\nAAPL Feature Importances:")
        plot_importance(model, max_num_features=10, importance_type='gain')
        plt.title("Top 10 AAPL Feature Importances")
        plt.tight_layout()
        plt.show()

#Run training loop
for ticker in TICKERS:
    train_lightgbm_for_stock(ticker)
    gc.collect()



Training LightGBM for AAPL
Skipping AAPL, model already exists.

Training LightGBM for TSLA
Skipping TSLA, model already exists.

Training LightGBM for MSFT
Skipping MSFT, model already exists.

Training LightGBM for GOOGL
Skipping GOOGL, model already exists.

Training LightGBM for AMZN
Skipping AMZN, model already exists.

Training LightGBM for NVDA
Skipping NVDA, model already exists.

Training LightGBM for META
Skipping META, model already exists.

Training LightGBM for BRK-B
Skipping BRK-B, model already exists.

Training LightGBM for JPM
Skipping JPM, model already exists.

Training LightGBM for JNJ
Skipping JNJ, model already exists.

Training LightGBM for XOM
Skipping XOM, model already exists.

Training LightGBM for V
Skipping V, model already exists.

Training LightGBM for PG
Skipping PG, model already exists.

Training LightGBM for UNH
Skipping UNH, model already exists.

Training LightGBM for MA
Skipping MA, model already exists.

Training LightGBM for HD
Skipping HD, mode

In [6]:
def evaluate_lightgbm_model(ticker, model_path, scaler_path, sequence_length=60, initial_cash=100000, return_history=False):
    import yfinance as yf
    import pandas as pd
    import numpy as np
    import joblib
    import lightgbm as lgb

    #Load trained model and scaler
    model = lgb.Booster(model_file=model_path)
    scaler = joblib.load(scaler_path)

    #Get recent stock data
    df = yf.download(ticker, period="360d", interval="1h", progress=False)
    df = compute_technical_indicators(df)

    features = ['Close', 'SMA_50', 'EMA_20', 'RSI', 'MACD', 'Signal_Line', 'ATR', 'OBV', 'CCI']
    df_feat = df[features].copy()

    #Ensure feature match for scaler
    if hasattr(scaler, "feature_names_in_"):
        df_feat = pd.DataFrame(df_feat, columns=scaler.feature_names_in_)
    scaled = scaler.transform(df_feat)

    #Generate signals
    y_pred = (model.predict(scaled) > 0.5).astype(int)
    real_prices = df_feat['Close'].values

    #Simulate trading strategy
    portfolio_lgbm = [initial_cash]
    portfolio_hold = [initial_cash]
    shares = initial_cash / real_prices[0]

    for i in range(1, len(real_prices)):
        action = y_pred[i-1]  # Use previous signal
        cash = portfolio_lgbm[-1]
        if action:
            cash = shares * real_prices[i]
        portfolio_lgbm.append(cash)
        portfolio_hold.append(shares * real_prices[i])

    results = {
        "Final Portfolio (LGBM)": portfolio_lgbm[-1],
        "Final Portfolio (Hold)": portfolio_hold[-1],
        "Return % (LGBM)": (portfolio_lgbm[-1] - initial_cash) / initial_cash * 100,
        "Return % (Hold)": (portfolio_hold[-1] - initial_cash) / initial_cash * 100
    }

    if return_history:
        results["Portfolio_LGBM_History"] = portfolio_lgbm
        results["Portfolio_Hold_History"] = portfolio_hold
        results["Dates"] = df.index[-len(portfolio_lgbm):].to_list()

    return results


In [8]:
results = []
for ticker in TICKERS:
    try:
        model_path = f"/content/drive/MyDrive/LightGBM_Models/lgb_{ticker}_model.txt"
        scaler_path = f"/content/drive/MyDrive/LightGBM_Models/{ticker}_scaler.pkl"
        if not os.path.exists(model_path) or not os.path.exists(scaler_path):
            continue
        result = evaluate_lightgbm_model(ticker, model_path, scaler_path)
        result["Symbol"] = ticker
        results.append(result)
    except Exception as e:
        print(f"Skipping {ticker}: {e}")


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-

In [9]:
results_df = pd.DataFrame(results)

top5 = results_df.sort_values(by="Return % (LGBM)", ascending=False).head(5)
bottom5 = results_df.sort_values(by="Return % (LGBM)", ascending=True).head(5)

print("Top 5 LGBM Stocks:")
display(top5)

print("Bottom 5 LGBM Stocks:")
display(bottom5)


Top 5 LGBM Stocks:


Unnamed: 0,Final Portfolio (LGBM),Final Portfolio (Hold),Return % (LGBM),Return % (Hold),Symbol
5,[224903.9951205108],[224903.9951205108],[124.9039951205108],[124.9039951205108],NVDA
50,[200827.57350737],[200827.57350737],[100.82757350736999],[100.82757350736999],GE
22,[182717.9424579327],[182717.9424579327],[82.7179424579327],[82.7179424579327],AVGO
43,[173595.5039182379],[173595.5039182379],[73.5955039182379],[73.5955039182379],PM
29,[168755.5590461578],[168755.5590461578],[68.75555904615781],[68.75555904615781],WMT


Bottom 5 LGBM Stocks:


Unnamed: 0,Final Portfolio (LGBM),Final Portfolio (Hold),Return % (LGBM),Return % (Hold),Symbol
35,[46304.62029855221],[50711.610119453835],[-53.695379701447784],[-49.28838988054616],INTC
34,[51368.664157006046],[51059.90924044138],[-48.631335842993956],[-48.94009075955862],NKE
31,[58729.1862446084],[58729.1862446084],[-41.2708137553916],[-41.2708137553916],ADBE
41,[67788.8105073313],[67788.8105073313],[-32.211189492668694],[-32.211189492668694],UPS
17,[78341.33811329814],[78341.33811329814],[-21.658661886701854],[-21.658661886701854],MRK
