In [38]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [39]:
data_file_path = "/Users/aishwaryaiyer/Documents/GitHub/Digital-Asset-Prediction/data/processed/combined_dataset_v1.csv"


In [40]:
def load_data(file_path):

    """ 
    I am sorting it by first symbol and then date bc I think that it ensures that each asset history is maintained, that way learning is not disrupted
    """

    df = pd.read_csv(file_path, parse_dates=['date'])
    df.sort_values(by=['symbol', 'date'], inplace=True)
    return df



In [41]:

# Define dataset class
class TimeSeriesDataset(Dataset):
    def __init__(self, data, target_col, window_size):
        self.data = data
        self.target_col = target_col
        self.window_size = window_size
    
    def __len__(self):
        return len(self.data) - self.window_size
    
    def __getitem__(self, index):
        x = self.data[index:index+self.window_size].values
        y = self.data.iloc[index+self.window_size][self.target_col]
        return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

In [42]:
# Define transformer model
class transformer(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=2):
        super(transformer, self).__init__()
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=input_dim, nhead=2), num_layers=num_layers
        )
        self.decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model=input_dim, nhead=2), num_layers=num_layers
        )
        self.fc = nn.Linear(input_dim, output_dim)
    
    def forward(self, x):
        memory = self.encoder(x)
        x = self.decoder(x, memory)
        x = self.fc(x[:, -1, :])  # Use last time step for prediction
        return x


In [43]:
def train_model(model, dataloader, epochs=10, lr=0.001):
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for epoch in range(epochs):
        for x, y in dataloader:
            optimizer.zero_grad()
            y_pred = model(x)
            loss = criterion(y_pred, y.unsqueeze(1))
            loss.backward()
            optimizer.step()
        print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")


In [44]:
df = load_data(data_file_path)
df['symbol'] = df['symbol'].astype('category').cat.codesv


# Normalize and prepare dataset
features = ['symbol','open', 'high', 'low', 'close', 'volume', 'market_cap', 'daily_return', 'sp500', 'treasury_spread', 'fear_greed', 'gold_price_usd']
df[features] = (df[features] - df[features].mean()) / df[features].std()
dataset = TimeSeriesDataset(df[features], target_col='close', window_size=14)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

# Initialize and train model
input_dim = len(features)
model = transfomer(input_dim, hidden_dim=64, output_dim=1)
train_model(model, dataloader)




Epoch 1, Loss: 0.0194
Epoch 2, Loss: 1.7930
Epoch 3, Loss: 0.0134
Epoch 4, Loss: 0.0023
Epoch 5, Loss: 0.0041
Epoch 6, Loss: 0.0140
Epoch 7, Loss: 0.0093
Epoch 8, Loss: 0.0029
Epoch 9, Loss: 0.0068
Epoch 10, Loss: 0.0018


In [48]:


def evaluate_model(model, dataloader, plot_samples=3):
    """
    Evaluate the model on test data and provide performance metrics and visualizations
    
    Args:
        model: Trained PyTorch model
        dataloader: DataLoader containing test data
        plot_samples: Number of sample predictions to plot
    """
    model.eval()  # Set model to evaluation mode
    predictions = []
    actuals = []
    
    with torch.no_grad():
        for x, y in dataloader:
            y_pred = model(x)
            predictions.extend(y_pred.squeeze().tolist())
            actuals.extend(y.tolist())
    
    # Convert to numpy arrays for easier calculations
    predictions = np.array(predictions)
    actuals = np.array(actuals)
    
    # Calculate metrics
    mse = mean_squared_error(actuals, predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(actuals, predictions)
    r2 = r2_score(actuals, predictions)
    
    print(f"Evaluation Metrics:")
    print(f"MSE: {mse:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"R² Score: {r2:.4f}")
    
  
    return {'mse': mse, 'rmse': rmse, 'mae': mae, 'r2': r2}



In [50]:
metrics = evaluate_model(model, test_dataloader)

Evaluation Metrics:
MSE: 0.0928
RMSE: 0.3047
MAE: 0.1339
R² Score: 0.2255
