## plan:
- learn the basics of gold, silver and CAD prices
- try a simple linear regression just to say that we tried it
- experiment with LSTMs
- account for inflation and other economic factors that may be relevant
- scrape news headlines and use them for sentiment analysis

In [None]:
# !pip install yfinance

In [2]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import hashlib
import json
from datetime import datetime, timedelta

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import layers, models

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [4]:
end_date = datetime.now()
start_date = end_date - timedelta(days=60)

print("Start date:", start_date)
print("End date:", end_date)

Start date: 2025-10-01 00:25:28.485569
End date: 2025-11-30 00:25:28.485569


In [5]:
# my_end = "2025-11-20" # so I can use the days after that for testing
# end_date = datetime.strptime(my_end, "%Y-%m-%d")
# start_date = "2025-10-01"

gold = yf.download("GC=F", start = start_date, end = end_date, interval = "30m", auto_adjust = False)
silver = yf.download("SI=F", start = start_date, end = end_date, interval = "30m",auto_adjust = False)
cad = yf.download("CADUSD=X", start = start_date, end = end_date, interval = "30m", auto_adjust = False)

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [6]:
gold = gold[["Close"]].rename(columns={"Close": "Gold"})
silver = silver[["Close"]].rename(columns={"Close": "Silver"})
cad = cad[["Close"]].rename(columns={"Close": "CAD"})

In [7]:
all_prices = gold.join([silver, cad], how="outer")
all_prices.to_csv("prices_with_null.csv")

In [8]:
all_prices = all_prices.ffill().bfill() # forward-fill + backward-fill to deal with missing values
all_prices.to_csv("prices.csv")

In [9]:
all_prices.head()

Price,Gold,Silver,CAD
Ticker,GC=F,SI=F,CADUSD=X
Datetime,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
2025-09-30 23:00:00+00:00,3888.600098,47.264999,0.718396
2025-09-30 23:30:00+00:00,3888.600098,47.264999,0.7182
2025-10-01 00:00:00+00:00,3888.600098,47.264999,0.718066
2025-10-01 00:30:00+00:00,3888.600098,47.264999,0.718045
2025-10-01 01:00:00+00:00,3888.600098,47.264999,0.718174


In [10]:
all_prices.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2054 entries, 2025-09-30 23:00:00+00:00 to 2025-11-28 23:00:00+00:00
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   (Gold, GC=F)     2054 non-null   float64
 1   (Silver, SI=F)   2054 non-null   float64
 2   (CAD, CADUSD=X)  2054 non-null   float64
dtypes: float64(3)
memory usage: 64.2 KB


In [11]:
all_prices.describe()

Price,Gold,Silver,CAD
Ticker,GC=F,SI=F,CADUSD=X
count,2054.0,2054.0,2054.0
mean,4083.102781,49.638165,0.713283
std,109.510812,2.088647,0.002609
min,3851.800049,45.665001,0.707269
25%,4004.0,47.970001,0.711808
50%,4072.050049,48.8825,0.713267
75%,4153.774902,51.028749,0.714944
max,4394.299805,57.080002,0.719746


## feature engineering

## LSTM

In [16]:
import numpy as np

def rmse(y_true, y_pred):
    return np.sqrt(np.mean((y_true - y_pred)**2))

def mae(y_true, y_pred):
    return np.mean(np.abs(y_true - y_pred))

def mape(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [12]:
def create_sequences(data, window):
    X = []
    y = []
    for i in range(len(data) - window):
        X.append(data[i : i + window])
        y.append(data[i + window])
    return np.array(X), np.array(y)

In [13]:
def baseline_predict(X):
    return X[:, -1]

In [14]:
def train_test_split(X, y, test_ratio=0.2):
    split = int(len(X) * (1 - test_ratio))
    return X[:split], X[split:], y[:split], y[split:]

In [15]:
def build_model(window, features=3):
    model = models.Sequential([
        layers.Input((window, features)),
        layers.LSTM(64, return_sequences=False),
        layers.Dense(32, activation='relu'),
        layers.Dense(features) 
    ])
    
    model.compile(
        optimizer='adam',
        loss='mse'
    )
    return model

In [17]:
values = all_prices.values 

scaler = MinMaxScaler()
scaled = scaler.fit_transform(values)

window_sizes = [10, 20, 30, 40, 60, 90, 120]

results = []


for window in window_sizes:
    print(f"\n Window = {window}")

    X, y = create_sequences(scaled, window)

    X_train, X_test, y_train, y_test = train_test_split(X, y)

    model = build_model(window)
    
    model.fit(
        X_train, y_train,
        epochs=20, batch_size=32,
        validation_split=0.1,
        verbose=0
    )
    
    y_pred = model.predict(X_test)

    y_base = baseline_predict(X_test)

    y_test_inv = scaler.inverse_transform(y_test)
    y_pred_inv = scaler.inverse_transform(y_pred)
    y_base_inv = scaler.inverse_transform(y_base)

    metrics = {}

    for i, asset in enumerate(["Gold", "Silver", "CAD"]):
        rmse_m = rmse(y_test_inv[:, i], y_pred_inv[:, i])
        mae_m  = mae (y_test_inv[:, i], y_pred_inv[:, i])
        mape_m = mape(y_test_inv[:, i], y_pred_inv[:, i])

        rmse_b = rmse(y_test_inv[:, i], y_base_inv[:, i])
        mae_b  = mae (y_test_inv[:, i], y_base_inv[:, i])
        mape_b = mape(y_test_inv[:, i], y_base_inv[:, i])

        metrics[asset] = {
            "RMSE": rmse_m, "MAE": mae_m, "MAPE": mape_m,
            "Base_RMSE": rmse_b, "Base_MAE": mae_b, "Base_MAPE": mape_b,
        }

    results.append((window, metrics))

    for asset in metrics:
        print(f"{asset}:")
        print("  Model      → RMSE: {:.4f} | MAE: {:.4f} | MAPE: {:.2f}%".format(
            metrics[asset]["RMSE"], metrics[asset]["MAE"], metrics[asset]["MAPE"]))
        print("  Baseline   → RMSE: {:.4f} | MAE: {:.4f} | MAPE: {:.2f}%".format(
            metrics[asset]["Base_RMSE"], metrics[asset]["Base_MAE"], metrics[asset]["Base_MAPE"]))
    print()


=== Window = 10 ===


I0000 00:00:1764456049.347698  113632 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 10065 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4080 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9
2025-11-30 00:40:50.482425: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91400


[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
Gold:
  Model      → RMSE: 11.1630 | MAE: 7.9792 | MAPE: 0.19%
  Baseline   → RMSE: 10.0473 | MAE: 5.7122 | MAPE: 0.14%
Silver:
  Model      → RMSE: 0.3952 | MAE: 0.2930 | MAPE: 0.57%
  Baseline   → RMSE: 0.2145 | MAE: 0.1182 | MAPE: 0.23%
CAD:
  Model      → RMSE: 0.0004 | MAE: 0.0003 | MAPE: 0.04%
  Baseline   → RMSE: 0.0003 | MAE: 0.0002 | MAPE: 0.03%


=== Window = 20 ===
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
Gold:
  Model      → RMSE: 11.7884 | MAE: 8.4966 | MAPE: 0.21%
  Baseline   → RMSE: 10.0676 | MAE: 5.7219 | MAPE: 0.14%
Silver:
  Model      → RMSE: 0.4293 | MAE: 0.3004 | MAPE: 0.57%
  Baseline   → RMSE: 0.2150 | MAE: 0.1185 | MAPE: 0.23%
CAD:
  Model      → RMSE: 0.0005 | MAE: 0.0003 | MAPE: 0.04%
  Baseline   → RMSE: 0.0003 | MAE: 0.0002 | MAPE: 0.03%


=== Window = 30 ===
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
Gold:
  Model      →