## plan:
- learn the basics of gold, silver and CAD prices
- try a simple linear regression just to say that we tried it
- experiment with LSTMs
- account for inflation and other economic factors that may be relevant
- scrape news headlines and use them for sentiment analysis

In [None]:
# !pip install yfinance

In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import hashlib
import json
from datetime import datetime, timedelta

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import layers, models

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

2025-11-30 01:29:11.946956: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-11-30 01:29:11.975030: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-11-30 01:29:12.621019: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
end_date = datetime.now()
start_date = end_date - timedelta(days=60)

print("Start date:", start_date)
print("End date:", end_date)

Start date: 2025-10-01 01:29:14.184120
End date: 2025-11-30 01:29:14.184120


In [3]:
# my_end = "2025-11-20" # so I can use the days after that for testing
# end_date = datetime.strptime(my_end, "%Y-%m-%d")
# start_date = "2025-10-01"

gold = yf.download("GC=F", start = start_date, end = end_date, interval = "30m", auto_adjust = False)
silver = yf.download("SI=F", start = start_date, end = end_date, interval = "30m",auto_adjust = False)
cad = yf.download("CADUSD=X", start = start_date, end = end_date, interval = "30m", auto_adjust = False)

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [4]:
gold = gold[["Close"]].rename(columns={"Close": "Gold"})
silver = silver[["Close"]].rename(columns={"Close": "Silver"})
cad = cad[["Close"]].rename(columns={"Close": "CAD"})

In [5]:
all_prices = gold.join([silver, cad], how="outer")
all_prices.to_csv("prices_with_null.csv")

In [6]:
all_prices = all_prices.ffill().bfill() # forward-fill + backward-fill to deal with missing values
all_prices.to_csv("prices.csv")

In [7]:
all_prices.head()

Price,Gold,Silver,CAD
Ticker,GC=F,SI=F,CADUSD=X
Datetime,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
2025-10-01 00:00:00+00:00,3893.300049,47.224998,0.718066
2025-10-01 00:30:00+00:00,3893.300049,47.224998,0.718045
2025-10-01 01:00:00+00:00,3893.300049,47.224998,0.718174
2025-10-01 01:30:00+00:00,3893.300049,47.224998,0.718231
2025-10-01 02:00:00+00:00,3893.300049,47.224998,0.71804


In [8]:
all_prices.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2052 entries, 2025-10-01 00:00:00+00:00 to 2025-11-28 23:00:00+00:00
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   (Gold, GC=F)     2052 non-null   float64
 1   (Silver, SI=F)   2052 non-null   float64
 2   (CAD, CADUSD=X)  2052 non-null   float64
dtypes: float64(3)
memory usage: 64.1 KB


In [9]:
all_prices.describe()

Price,Gold,Silver,CAD
Ticker,GC=F,SI=F,CADUSD=X
count,2052.0,2052.0,2052.0
mean,4083.31331,49.640253,0.713278
std,109.358684,2.088606,0.002605
min,3851.800049,45.665001,0.707269
25%,4004.0,47.970001,0.711805
50%,4072.100098,48.887499,0.713267
75%,4153.949951,51.029999,0.714937
max,4394.299805,57.080002,0.719746


In [10]:
# flatten the multi-index columns
all_prices.columns = [' '.join(col).strip() for col in all_prices.columns.values]

all_prices.head()

Unnamed: 0_level_0,Gold GC=F,Silver SI=F,CAD CADUSD=X
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-10-01 00:00:00+00:00,3893.300049,47.224998,0.718066
2025-10-01 00:30:00+00:00,3893.300049,47.224998,0.718045
2025-10-01 01:00:00+00:00,3893.300049,47.224998,0.718174
2025-10-01 01:30:00+00:00,3893.300049,47.224998,0.718231
2025-10-01 02:00:00+00:00,3893.300049,47.224998,0.71804


In [11]:
all_prices = all_prices.rename(columns={
    'Gold GC=F': 'Gold',
    'Silver SI=F': 'Silver',
    'CAD CADUSD=X': 'CAD'
})

In [12]:
all_prices.head()

Unnamed: 0_level_0,Gold,Silver,CAD
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2025-10-01 00:00:00+00:00,3893.300049,47.224998,0.718066
2025-10-01 00:30:00+00:00,3893.300049,47.224998,0.718045
2025-10-01 01:00:00+00:00,3893.300049,47.224998,0.718174
2025-10-01 01:30:00+00:00,3893.300049,47.224998,0.718231
2025-10-01 02:00:00+00:00,3893.300049,47.224998,0.71804


## CNN + LSTM

## Transformer

In [13]:
scaler = MinMaxScaler()

def scale_data(df):
    scaled = scaler.fit_transform(df)
    return scaled

In [14]:
def make_dataset(data, window, horizon):
    X, y = [], []
    for i in range(len(data) - window - horizon):
        X.append(data[i:i+window])                    # (window, 3)
        y.append(data[i+window:i+window+horizon])     # (horizon, 3)
    return np.array(X), np.array(y)

In [15]:
def build_cnn_lstm(window=64, features=3, horizon=20):

    inp = layers.Input(shape=(window, features))

    # CNN extracts local feature patterns
    x = layers.Conv1D(64, kernel_size=3, activation='relu')(inp)
    x = layers.Conv1D(64, kernel_size=3, activation='relu')(x)

    # LSTM captures temporal dynamics
    x = layers.LSTM(128)(x)

    # Dense head for multi-output multi-step forecasting
    x = layers.Dense(horizon * features)(x)

    # reshape to (horizon, features)
    out = layers.Reshape((horizon, features))(x)

    model = models.Model(inp, out)
    model.compile(optimizer='adam', loss='mse')

    return model

In [16]:
scaled = scale_data(all_prices)

window = 64
horizon = 20   # forecast next 20 x 30-min steps

X, y = make_dataset(scaled, window, horizon)

# Train/Validation split
split = int(len(X) * 0.8)
X_train, X_val = X[:split], X[split:]
y_train, y_val = y[:split], y[split:]

print("Train shape:", X_train.shape, y_train.shape)
print("Val   shape:", X_val.shape, y_val.shape)

Train shape: (1574, 64, 3) (1574, 20, 3)
Val   shape: (394, 64, 3) (394, 20, 3)


In [17]:
model = build_cnn_lstm(window=window, features=3, horizon=horizon)
model.summary()

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=40,
    batch_size=32
)

I0000 00:00:1764459068.906078  157480 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 10065 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4080 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9


Epoch 1/40


2025-11-30 01:31:09.960933: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91400


[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - loss: 0.0408 - val_loss: 0.0148
Epoch 2/40
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0069 - val_loss: 0.0103
Epoch 3/40
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0057 - val_loss: 0.0074
Epoch 4/40
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0045 - val_loss: 0.0058
Epoch 5/40
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0041 - val_loss: 0.0069
Epoch 6/40
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.0040 - val_loss: 0.0065
Epoch 7/40
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0040 - val_loss: 0.0063
Epoch 8/40
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.0038 - val_loss: 0.0058
Epoch 9/40
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [

In [21]:
def predict_interval(model, df, start_date, end_date,
                     window=64, horizon=20):

    # 1. Slice range safely using partial dates
    df_range = df.loc[start_date:end_date]

    if len(df_range) == 0:
        raise ValueError(f"No data between {start_date} and {end_date}. "
                         "Try a date fully inside the dataset range.")

    needed_steps = len(df_range)

    # 2. Convert partial date to nearest timestamp index
    try:
        start_idx = df.index.get_loc(start_date)
    except KeyError:
        # nearest timestamp
        start_idx = df.index.get_indexer([start_date], method='nearest')[0]

    # 3. Scale full dataset
    df_scaled = scaler.transform(df)

    # 4. Ensure we have enough data before start_date
    if start_idx < window:
        raise ValueError(
            f"Not enough past data before {start_date} "
            f"to build a window of length {window}."
        )

    # 5. Get the initial input sequence
    current_seq = df_scaled[start_idx-window:start_idx]

    preds_scaled = []

    # 6. Rolling multi-step prediction
    while len(preds_scaled) < needed_steps:
        pred = model.predict(current_seq[np.newaxis])[0]  # (horizon, 3)
        preds_scaled.append(pred)
        current_seq = np.concatenate([current_seq[horizon:], pred], axis=0)

    preds_scaled = np.concatenate(preds_scaled, axis=0)[:needed_steps]

    # 7. Inverse scale
    preds_inv = scaler.inverse_transform(preds_scaled)

    return preds_inv, df_range.index

In [25]:
pred, pred_index = predict_interval(
    model,
    all_prices,
    start_date="2025-11-01",
    end_date="2024-11-10",
    window=window,
    horizon=horizon
)

forecast_df = pd.DataFrame(
    pred,
    columns=["Gold_pred","Silver_pred","CAD_pred"],
    index=pred_index
)

forecast_df.head()

ValueError: No data between 2025-11-10 and 2024-11-01. Try a date fully inside the dataset range.

In [None]:
plt.figure(figsize=(14,5))
plt.plot(df.loc["2024-03-20":"2024-03-27"].index, df.loc["2024-03-20":"2024-03-27"]["Gold"], label="Actual Gold")
plt.plot(forecast_df.index, forecast_df["Gold_pred"], label="Pred Gold")
plt.legend()
plt.show()