<a href="https://colab.research.google.com/github/racoope70/daytrading-with-ml/blob/main/Multi_Stock_Feature_Engineering_Trading.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#  Protocol Buffer Fix (for TensorFlow)
!pip install --upgrade protobuf
!pip install protobuf==3.20.3

Collecting protobuf
  Downloading protobuf-6.30.2-cp39-abi3-manylinux2014_x86_64.whl.metadata (593 bytes)
Downloading protobuf-6.30.2-cp39-abi3-manylinux2014_x86_64.whl (316 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.2/316.2 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 5.29.4
    Uninstalling protobuf-5.29.4:
      Successfully uninstalled protobuf-5.29.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-cloud-aiplatform 1.87.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0,>=3.20.2, but you have protobuf 6.30.2 which is incompatible.
grpcio-status 1.71.0 requires protobuf<6.0dev,>=5.26.1, but you have protobuf 6.30.2 which is incompatible.
tensorflow 2.18.0 requires protobu

In [2]:
#  Install TensorFlow (latest stable GPU-compatible version)
!pip install tensorflow

#  Install Stable Baselines3 and Trading Libraries
!pip install stable-baselines3[extra] gymnasium gym-anytrading yfinance xgboost joblib

#  Reinstall RAPIDS dependencies if needed (optional reset)
!pip install --upgrade --force-reinstall \
    dask==2024.11.2 \
    rapids-dask-dependency==24.12.0 \
    cudf-cu12==24.12.0 \
    cuml-cu12==24.12.0 \
    pylibraft-cu12==24.12.0 \
    pylibcudf-cu12==24.12.0 \
    numba==0.61.0

Collecting gym-anytrading
  Downloading gym_anytrading-2.0.0-py3-none-any.whl.metadata (292 bytes)
Collecting stable-baselines3[extra]
  Downloading stable_baselines3-2.6.0-py3-none-any.whl.metadata (4.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3[extra])
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3[extra])
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3.0,>=2.3->stable-baselines3[extra])
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3.0,>=2.3->stable-baselines3[extra])
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.

In [1]:
import torch
import cudf
import cuml
import dask
import pandas as pd
import numpy as np
import scipy
import lightgbm as lgb
import gymnasium as gym
import stable_baselines3

# =========================
#  Version Checks
# =========================
print(" Library Versions")
print("--------------------")
print(" PyTorch:", torch.__version__)
print(" CUDA:", torch.version.cuda)
print(" cuDF:", cudf.__version__)
print(" cuML:", cuml.__version__)
print(" Dask:", dask.__version__)
print(" Pandas:", pd.__version__)
print(" NumPy:", np.__version__)
print(" SciPy:", scipy.__version__)
print(" LightGBM:", lgb.__version__)
print(" Gymnasium:", gym.__version__)
print(" Stable Baselines3:", stable_baselines3.__version__)

# =========================
#  GPU Check (Torch + NVIDIA)
# =========================
print("\n GPU Availability")
print("--------------------")
print(" PyTorch GPU Available:", torch.cuda.is_available())
print(" GPU Count:", torch.cuda.device_count())
if torch.cuda.is_available():
    print(" GPU Name:", torch.cuda.get_device_name(0))


 Library Versions
--------------------
 PyTorch: 2.6.0+cu124
 CUDA: 12.4
 cuDF: 24.12.00
 cuML: 24.12.00
 Dask: 2024.11.2
 Pandas: 2.2.3
 NumPy: 2.1.3
 SciPy: 1.15.2
 LightGBM: 4.5.0
 Gymnasium: 1.1.1
 Stable Baselines3: 2.6.0

 GPU Availability
--------------------
 PyTorch GPU Available: True
 GPU Count: 1
 GPU Name: Tesla T4


In [2]:
#  Core Libraries
import gc
import json
import os
import pickle
import sys
import time
from collections import defaultdict, deque
from datetime import datetime

#  Data Science Essentials
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import numba
import IPython.display as display

#  Machine Learning & Data Processing
import joblib
import lightgbm as lgb
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
)
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.preprocessing import MinMaxScaler

#  Deep Learning (TensorFlow/Keras)
import tensorflow as tf
from tensorflow.keras import mixed_precision
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.layers import Dense, Dropout, LSTM
from tensorflow.keras.models import Sequential, load_model

#  RAPIDS Libraries (cuDF & cuML for GPU acceleration)
import cupy as cp

#  Reinforcement Learning (Stable Baselines3)
import stable_baselines3
from stable_baselines3 import A2C, DDPG, DQN, PPO, SAC, TD3
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.logger import configure
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.noise import NormalActionNoise
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

#  Gym & Trading Environments
import gym
import gymnasium as gym
import gym_anytrading
from gym.spaces import Box
from gymnasium.spaces import Box as GymBox, Discrete
from gymnasium.wrappers import TimeLimit
from gym_anytrading.envs import StocksEnv

#  Financial & Stock Data Libraries
import yfinance as yf

#  PyTorch Essentials
import torch
import torch.nn as nn
import torch.optim as optim


In [3]:
#  Set CUDA Paths (Ensuring GPU Utilization)
os.environ['CUDA_HOME'] = '/usr/local/cuda-11.8'
os.environ['PATH'] += ':/usr/local/cuda-11.8/bin'
os.environ['LD_LIBRARY_PATH'] += ':/usr/local/cuda-11.8/lib64'

In [4]:
!nvidia-smi

Sun Apr  6 09:00:08 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   37C    P8              9W /   70W |       2MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [8]:
#  Install RAPIDS for cuML and cuDF support
!pip install cudf-cu12 cuml-cu12 --extra-index-url=https://pypi.ngc.nvidia.com

#  Import Libraries
import cudf
import cuml
from cuml.model_selection import train_test_split
from cuml.ensemble import RandomForestClassifier
from cuml.metrics import accuracy_score

import joblib
import numpy as np
import pandas as pd
from google.colab import drive
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

#  Mount Google Drive
drive.mount('/content/drive')

#  Load Multi-Stock Labeled Dataset
file_path = '/content/drive/My Drive/multi_stock_labeled_trading_dataset.csv'
df = pd.read_csv(file_path)

#  Feature Engineering
df['SMA_20'] = df['Close'].rolling(window=20).mean()
df['STD_20'] = df['Close'].rolling(window=20).std()
df['Upper_Band'] = df['SMA_20'] + 2 * df['STD_20']
df['Lower_Band'] = df['SMA_20'] - 2 * df['STD_20']
df['Lowest_Low'] = df['Low'].rolling(window=14).min()
df['Highest_High'] = df['High'].rolling(window=14).max()
df['Stoch'] = ((df['Close'] - df['Lowest_Low']) / (df['Highest_High'] - df['Lowest_Low'])) * 100
df['ROC'] = df['Close'].pct_change(periods=10)
df['OBV'] = (np.sign(df['Close'].diff()) * df['Volume']).cumsum()
typical_price = (df['High'] + df['Low'] + df['Close']) / 3
df['CCI'] = (typical_price - typical_price.rolling(window=20).mean()) / (0.015 * typical_price.rolling(window=20).std())
df['PROC'] = ((df['Close'] - df['Close'].shift(12)) / df['Close'].shift(12)) * 100
df['Rolling_Mean_50'] = df['Close'].rolling(window=50).mean()
df['Expanding_Mean'] = df['Close'].expanding(min_periods=1).mean()

#  Remove Leakage Columns
leakage_columns = ['Buy_Signal', 'Sell_Signal', 'Sell_Signal_Debug', 'Multi_Class_Target', 'MACD_Crossover']
df.drop(columns=[col for col in leakage_columns if col in df.columns], inplace=True)
df.dropna(inplace=True)

#  Prepare Features and Target
X = df.drop(columns=['Target', 'Datetime'], errors='ignore')
y = df['Target']

#  Encode Categorical Columns
for col in X.select_dtypes(include=['object']).columns:
    X[col] = LabelEncoder().fit_transform(X[col].astype(str))

#  Convert to cuDF (GPU)
X_cu = cudf.DataFrame.from_pandas(X.fillna(0))
y_cu = cudf.Series(y.values)

#  Train/Test Split (GPU)
X_train, X_test, y_train, y_test = train_test_split(X_cu, y_cu, test_size=0.3, random_state=42)

#  Train cuML Random Forest
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

#  Predict and Evaluate
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)

print(f"Accuracy with cuML Random Forest: {acc:.4f}")
print("Classification Report:")
print(classification_report(y_test.to_pandas(), y_pred.to_pandas()))
print("Confusion Matrix:")
print(confusion_matrix(y_test.to_pandas(), y_pred.to_pandas()))

#  Save Model
model_path = '/content/drive/My Drive/trading_model_gpu_optimized.pkl'
joblib.dump(model, model_path)
print(f"Optimized GPU model saved to: {model_path}")

#  Save Enhanced Dataset
enhanced_path = '/content/drive/My Drive/multistockfeature_engineered_dataset.csv'
df.to_csv(enhanced_path, index=False)
print(f"Feature-engineered dataset saved to: {enhanced_path}")


Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  return func(**kwargs)
  ret = func(*args, **kwargs)


Accuracy with cuML Random Forest: 0.9978
Classification Report:
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00      9406
           0       1.00      1.00      1.00     59394
           1       1.00      0.98      0.99      9989

    accuracy                           1.00     78789
   macro avg       1.00      0.99      1.00     78789
weighted avg       1.00      1.00      1.00     78789

Confusion Matrix:
[[ 9406     0     0]
 [   16 59378     0]
 [    0   157  9832]]
Optimized GPU model saved to: /content/drive/My Drive/trading_model_gpu_optimized.pkl
Feature-engineered dataset saved to: /content/drive/My Drive/multistockfeature_engineered_dataset.csv
