In [3]:
from datetime import datetime, timedelta
import yfinance as yf
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
import keras_tuner as kt

import joblib

import hashlib
import json

In [4]:
# Path of the plain-text run log; resolved against the current working
# directory each time log() is called.
log_file = "data_log.txt"


def log(msg):
    """Append one timestamped line to ``log_file``.

    The line has the form ``[YYYY-MM-DD HH:MM:SS] <msg>`` using the local
    wall-clock time at the moment of the call. The file is opened in append
    mode on every call, so earlier entries are preserved.

    Args:
        msg: Value to record; it is interpolated into the line with an
            f-string.
    """
    stamp = datetime.now().strftime("[%Y-%m-%d %H:%M:%S]")
    line = f"{stamp} {msg}\n"
    with open(log_file, "a") as handle:
        handle.write(line)


# Bar size requested from Yahoo Finance.
interval = "30m"

# Column label used in this notebook -> Yahoo Finance ticker symbol.
tickers = {"Gold": "GC=F", "Silver": "SI=F", "CAD": "CADUSD=X"}

# The window starts 60 days before "now" and ends on a pinned calendar date,
# so the two bounds have different types (datetime vs. ISO date string).
# NOTE(review): the 60-day look-back presumably mirrors Yahoo's intraday
# history limit -- confirm before deriving start_date from end_date instead.
start_date = datetime.now() - timedelta(days=60)
end_date = "2025-12-03"

log(f"Interval {interval}")
log(f"Downloading data from {start_date} to {end_date}")

# start_date moves with the clock while end_date does not, so the window
# eventually becomes empty or inverted. Fail loudly rather than silently
# downloading nothing.
if start_date >= datetime.fromisoformat(end_date):
    log(f"Invalid download window: start {start_date} is not before end {end_date}")
    raise ValueError(
        f"start_date ({start_date}) is not before end_date ({end_date}); "
        "update the pinned end_date."
    )

# One single-column frame per asset, keyed by the asset label.
dfs = {}
for name, ticker in tickers.items():
    log(f"Downloading {name} ({ticker})...")
    df = yf.download(ticker, start=start_date, end=end_date, interval=interval, auto_adjust=False)

    # An empty result would later turn into an all-NaN column in the merged
    # frame; stop here so the problem is visible at its source.
    if df is None or df.empty:
        log(f"{name} ({ticker}) download returned no rows")
        raise ValueError(f"No data returned for {name} ({ticker}) between {start_date} and {end_date}")

    # Keep only the close price and label it with the asset name.
    # NOTE(review): some yfinance releases return MultiIndex columns
    # (field, ticker) even for one ticker; rename() then relabels the
    # level-0 entry only -- confirm the column layout expected downstream.
    df = df[["Close"]].rename(columns={"Close": name})
    dfs[name] = df
    log(f"{name} data shape: {df.shape}")


# Merge & clean
# Outer-join the Silver and CAD frames onto Gold by index so that a timestamp
# present for any asset is kept in the merged frame.
_other_frames = [dfs["Silver"], dfs["CAD"]]
all_prices = dfs["Gold"].join(_other_frames, how="outer")

# Carry the last known value forward, then back-fill whatever is still
# missing (i.e. gaps before an asset's first observation).
# NOTE(review): the back-fill copies later values into earlier rows -- confirm
# this look-ahead is acceptable for the modelling done downstream.
all_prices = all_prices.ffill()
all_prices = all_prices.bfill()

log(f"Merged dataframe shape: {all_prices.shape}")

# Heading, shape, then a blank separator line.
print("Loaded Data Shape", all_prices.shape, "", sep="\n")


# ===========================================================
# MAX TIMESTAMP PER ASSET
# ===========================================================
# Report the last bar actually downloaded for each asset. This is read from
# the raw per-asset frames in `dfs`: `all_prices` has already been
# forward/back-filled, so (unless a column is entirely empty) every column
# there is non-null through the last merged row and would always report the
# same timestamp, hiding an asset whose data stops early.
print(f"Max Timestamp per Asset for Interval: {interval}")
log("Logging maximum timestamps per asset:")

for asset in ["Gold", "Silver", "CAD"]:
    # Drop rows without a close price, then take the latest remaining index.
    max_ts = dfs[asset].dropna().index.max()
    print(f"{asset}: {max_ts}")
    log(f"{asset} max timestamp: {max_ts}")

print()


# ===========================================================
# TRAIN/TEST SPLIT + HASH TRAIN INDICES
# ===========================================================
# Positional split in index order: the first (1 - test_ratio) share of the
# rows is the training set, the remainder is the test set.
test_ratio = 0.2
split_idx = int(len(all_prices) * (1 - test_ratio))

_index = all_prices.index
train_indices = list(_index[:split_idx])
test_indices = list(_index[split_idx:])

log(f"Train size: {len(train_indices)}, Test size: {len(test_indices)}")

# Stringify the training index labels once; the same list feeds both the
# hash and the JSON file below.
_train_labels = [str(ts) for ts in train_indices]

# SHA256 over the compact JSON encoding of the label list. Note that the file
# written below is indented, so this is a hash of the labels, not of the
# file's bytes.
train_idx_str = json.dumps(_train_labels).encode()
train_hash = hashlib.sha256(train_idx_str).hexdigest()

# Save train indices to a raw JSON file
with open("train_indices.json", "w") as f:
    json.dump(_train_labels, f, indent=2)

log(f"SHA256 Hash of train indices: {train_hash}")

# Heading, digest, then a blank separator line.
print("SHA256 hash of train indices:", train_hash, "", sep="\n")

log("=== END RUN ===")

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

Loaded Data Shape
(2007, 3)

Max Timestamp per Asset for Interval: 30m
Gold: 2025-12-02 21:30:00+00:00
Silver: 2025-12-02 21:30:00+00:00
CAD: 2025-12-02 21:30:00+00:00

SHA256 hash of train indices:
f54223762806886622699cb167e262fd629156cb1a2ecff43bb4cdf469a1c86b




