In [1]:
from alpaca.data.historical import StockHistoricalDataClient
from alpaca.data.requests import StockBarsRequest
from alpaca.data.timeframe import TimeFrame
import pandas as pd
import numpy as np
from datetime import datetime
import plotly.graph_objects as go

In [6]:
import torch
print(torch.__version__)
print(torch.version.cuda)
print(torch.backends.cudnn.version())
print("CUDA available:", torch.cuda.is_available())
print("GPU:", torch.cuda.get_device_name(0))

2.6.0+cu126
12.6
90501
CUDA available: True
GPU: NVIDIA GeForce RTX 4070 SUPER


In [2]:
import os
from dotenv import load_dotenv
from pathlib import Path

# Path to the .env file inside 01 - Documentation
env_path = Path("01 - Documentation") / "keys.env"

# Load the .env file from that path
load_dotenv(dotenv_path=env_path)

# Now you can access your keys
API_KEY = os.getenv("ALPACA_API_KEY")
SECRET_KEY = os.getenv("ALPACA_SECRET_KEY")

print(API_KEY[:4] + "****")  # optional: verify it's working


PKD5****


In [7]:
# Create data client
client = StockHistoricalDataClient(API_KEY, SECRET_KEY)

# Define request
request_params = StockBarsRequest(
    symbol_or_symbols=["AAPL"],
    timeframe=TimeFrame.Day,
    start=datetime(2023, 1, 1),
    end=datetime(2023, 12, 31)
)

# Fetch data
bars = client.get_stock_bars(request_params)
stockData = bars.df
stockData = stockData.reset_index()


print(stockData.head())
print(stockData.columns)
#print(df.index)
print(stockData.shape)


  symbol                 timestamp     open      high     low   close  \
0   AAPL 2023-01-03 05:00:00+00:00  130.280  130.9000  124.17  125.07   
1   AAPL 2023-01-04 05:00:00+00:00  126.890  128.6557  125.08  126.36   
2   AAPL 2023-01-05 05:00:00+00:00  127.130  127.7700  124.76  125.02   
3   AAPL 2023-01-06 05:00:00+00:00  126.010  130.2900  124.89  129.62   
4   AAPL 2023-01-09 05:00:00+00:00  130.465  133.4100  129.89  130.15   

        volume  trade_count        vwap  
0  124289279.0    1021067.0  125.660032  
1   95426133.0     770045.0  126.643011  
2   88344592.0     665463.0  126.006961  
3   96468673.0     709888.0  128.362627  
4   76653608.0     645367.0  131.545593  
Index(['symbol', 'timestamp', 'open', 'high', 'low', 'close', 'volume',
       'trade_count', 'vwap'],
      dtype='object')
(250, 9)


In [None]:
fig = go.Figure(data=[go.Candlestick(
    x=stockData['timestamp'],
    open=stockData['open'],
    high=stockData['high'],
    low=stockData['low'],
    close=stockData['close'],
    name='AAPL'
)])

fig.update_layout(
    title='AAPL Stock Price (2023)',
    xaxis_title='Date',
    yaxis_title='Price (USD)',
    xaxis_rangeslider_visible=False
)

fig.show()


In [27]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA

In [None]:
# print(stockData.shape)
stockData['target'] = (stockData['close'].shift(-2) > stockData['close']).astype(int)

features = stockData.drop(columns=['symbol', 'timestamp', 'target'])
target = stockData['target']

print(features.head())

# Standard split (shuffle = False for time series)
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, shuffle=False
)
   
# our features have very different magnitudes (volume in millions, price in dollars)
# so we will scale them to have mean 0 and variance std = 1
# ensures equal weighting of all features
# this is important for PCA and Lasso regression
# these are z-scores, how many standard deviations away from the mean
# for each feature

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("X_train_scaled shape:", X_train_scaled.shape)
print("X Scaled")
print(X_train_scaled[0:5])

# PCA to reduce the dimensionality of our data
# PCA is a linear dimensionality reduction technique

pca = PCA(n_components='mle', svd_solver='full')
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

print("Explained variance by components:", pca.explained_variance_ratio_)

print("PCA components shape:", X_train_pca.shape)
print("X PCA")
print(X_train_pca[0:5])


      open      high     low   close       volume  trade_count        vwap
0  130.280  130.9000  124.17  125.07  124289279.0    1021067.0  125.660032
1  126.890  128.6557  125.08  126.36   95426133.0     770045.0  126.643011
2  127.130  127.7700  124.76  125.02   88344592.0     665463.0  126.006961
3  126.010  130.2900  124.89  129.62   96468673.0     709888.0  128.362627
4  130.465  133.4100  129.89  130.15   76653608.0     645367.0  131.545593
X_train_scaled shape: (200, 7)
X Scaled
[[-2.22320049 -2.31356947 -2.50363438 -2.5731427   3.09473072  2.9353302
  -2.52737734]
 [-2.41920823 -2.44501333 -2.45097373 -2.49763033  1.61218889  1.20997085
  -2.47003634]
 [-2.40533158 -2.49688689 -2.46949176 -2.57606954  1.24844889  0.49114329
  -2.50713962]
 [-2.4700893  -2.34929587 -2.46196881 -2.3068006   1.66573846  0.79649138
  -2.36972441]
 [-2.2125039  -2.16656412 -2.17262457 -2.27577614  0.64794703  0.35301666
  -2.18404956]]
Explained variance by components: [7.42393820e-01 2.35789185e-01 

In [37]:
# Train models
models = {
    "Linear": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.01)
}

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    preds = model.predict(X_test_scaled)
    mse_train = mean_squared_error(y_train, model.predict(X_train_scaled))
    mse_test = mean_squared_error(y_test, preds)
    print(f"{name} Test MSE: {mse_test:.4f}")
    print(f"{name} Train MSE: {mse_train:.4f}")



Linear Test MSE: 0.2482
Linear Train MSE: 0.2345
Ridge Test MSE: 0.2470
Ridge Train MSE: 0.2352
Lasso Test MSE: 0.2397
Lasso Train MSE: 0.2369


In [40]:
fig = go.Figure()

# Plot actual binary labels
fig.add_trace(go.Scatter(
    x=X_test.index,
    y=y_test,
    mode='markers',
    name='Actual Movement (0=Down, 1=Up)',
    marker=dict(color='black', size=6)
))

# Plot predicted probabilities
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    probs = model.predict(X_test_scaled)  # this is still regression, not class probs
    fig.add_trace(go.Scatter(
        x=X_test.index,
        y=probs,
        mode='lines',
        name=f'{name} Prediction'
    ))

fig.update_layout(
    title='Model Prediction Probabilities vs Actual Movement',
    xaxis_title='Index',
    yaxis_title='Probability / Class',
    template='plotly_white'
)

fig.show()
