<a href="https://colab.research.google.com/github/racoope70/daytrading-with-ml/blob/main/Feature_Engineering_Trading.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install and Import Libraries
!pip install cudf-cu12 cuml-cu12 --extra-index-url=https://pypi.ngc.nvidia.com
import cudf
import cuml
from cuml.model_selection import train_test_split
from cuml.ensemble import RandomForestClassifier
from cuml.metrics import accuracy_score
import joblib
import numpy as np
import pandas as pd
from google.colab import drive
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting cuml-cu12
  Downloading cuml_cu12-25.2.0.tar.gz (2.5 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
INFO: pip is looking at multiple versions of cuml-cu12 to determine which version is compatible with other requirements. This could take a while.
  Downloading cuml_cu12-24.12.0.tar.gz (2.5 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting cuvs-cu12==24.12.* (from cuml-cu12)
  Downloading cuvs_cu12-24.12.0.tar.gz (1.0 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting dask-cuda==24.12.* (from cuml-cu12)
  Download

In [2]:

drive.mount('/content/drive')

# Load Dataset
file_path = '/content/drive/My Drive/tsla_labeled_trading_dataset.csv'
df = pd.read_csv(file_path)


Mounted at /content/drive


In [3]:
# Step 2: Comprehensive Feature Engineering
df['SMA_20'] = df['Close'].rolling(window=20).mean()
df['STD_20'] = df['Close'].rolling(window=20).std()
df['Upper_Band'] = df['SMA_20'] + 2 * df['STD_20']
df['Lower_Band'] = df['SMA_20'] - 2 * df['STD_20']
df['Lowest_Low'] = df['Low'].rolling(window=14).min()
df['Highest_High'] = df['High'].rolling(window=14).max()
df['Stoch'] = ((df['Close'] - df['Lowest_Low']) / (df['Highest_High'] - df['Lowest_Low'])) * 100
df['ROC'] = df['Close'].pct_change(periods=10)
df['OBV'] = (np.sign(df['Close'].diff()) * df['Volume']).cumsum()
typical_price = (df['High'] + df['Low'] + df['Close']) / 3
df['CCI'] = (typical_price - typical_price.rolling(window=20).mean()) / (0.015 * typical_price.rolling(window=20).std())
df['PROC'] = ((df['Close'] - df['Close'].shift(12)) / df['Close'].shift(12)) * 100
df['Rolling_Mean_50'] = df['Close'].rolling(window=50).mean()
df['Expanding_Mean'] = df['Close'].expanding(min_periods=1).mean()

In [4]:
# ✅ Drop Leakage Columns
leakage_columns = ['Buy_Signal', 'Sell_Signal', 'Sell_Signal_Debug', 'Multi_Class_Target', 'MACD_Crossover']
df = df.drop(columns=[col for col in leakage_columns if col in df.columns])
df.dropna(inplace=True)

In [5]:
# Step 3: Target & Feature Preparation
X = df.drop(columns=['Target', 'Datetime'], errors='ignore')
y = df['Target']

# Encode Categorical Columns
for col in X.select_dtypes(include=['object']).columns:
    X[col] = LabelEncoder().fit_transform(X[col].astype(str))

# Step 4: Time-Based Train/Test Split
train_size = int(len(X) * 0.7)
X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

# Step 5: Train Random Forest (GPU-Accelerated)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(cudf.DataFrame.from_pandas(X_train), cudf.Series(y_train.values))


  return func(**kwargs)
  ret = func(*args, **kwargs)


In [6]:
# Save Feature-Engineered Dataset
output_path = '/content/drive/My Drive/teslafeature_engineered_dataset.csv'
df.to_csv(output_path, index=False)
print(f' Enhanced Feature-Engineered dataset saved to: {output_path}')


 Enhanced Feature-Engineered dataset saved to: /content/drive/My Drive/teslafeature_engineered_dataset.csv


In [7]:

# Step 2: Drop Unnecessary Columns and Prepare Data
drop_columns = ['Datetime'] if 'Datetime' in df.columns else []
X = df.drop(columns=drop_columns + ['Target']) if 'Target' in df.columns else df.drop(columns=drop_columns)
y = df['Target'] if 'Target' in df.columns else None

if y is None:
    raise ValueError(" 'Target' column not found in the dataset.")

# Step 3: Encode Categorical Columns (Label Encoding)
categorical_columns = X.select_dtypes(include=['object']).columns
for col in categorical_columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

# Step 4: Convert Pandas DataFrame to cuDF for GPU
X_cu = cudf.DataFrame.from_pandas(X)
y_cu = cudf.Series(y.values)

# Handle Missing Values for cuML
X_cu = X_cu.fillna(0)  # Replace NaN with 0 (or use median if needed)
y_cu = y_cu.dropna()    # Drop any null target values

# Step 5: Train-Test Split (GPU-Accelerated)
X_train, X_test, y_train, y_test = train_test_split(
    X_cu, y_cu, test_size=0.3, random_state=42
)

In [8]:
# Step 6: Train Random Forest Classifier (cuML - GPU Accelerated)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 7: Make Predictions
y_pred = model.predict(X_test)

# Step 8: Evaluate Model Performance
accuracy = accuracy_score(y_test, y_pred)
print(f" Accuracy with cuML Random Forest: {accuracy:.4f}")

# Print Classification Report and Confusion Matrix
print(f' Classification Report:\n{classification_report(y_test.to_pandas(), y_pred.to_pandas())}')
print(f' Confusion Matrix:\n{confusion_matrix(y_test.to_pandas(), y_pred.to_pandas())}')

# Step 9: Save the GPU-Optimized Model
model_path = '/content/drive/My Drive/trading_model_gpu_optimized.pkl'
joblib.dump(model, model_path)
print(f" Optimized GPU model saved as '{model_path}'")

  return func(**kwargs)
  ret = func(*args, **kwargs)


 Accuracy with cuML Random Forest: 0.9910
 Classification Report:
              precision    recall  f1-score   support

          -1       1.00      0.98      0.99       427
           0       0.98      1.00      0.99       607
           1       1.00      0.99      0.99       414

    accuracy                           0.99      1448
   macro avg       0.99      0.99      0.99      1448
weighted avg       0.99      0.99      0.99      1448

 Confusion Matrix:
[[420   7   0]
 [  0 606   1]
 [  0   5 409]]
 Optimized GPU model saved as '/content/drive/My Drive/trading_model_gpu_optimized.pkl'
