In [37]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tkinter as tk
from tkinter import filedialog
import pandas_ta as ta
from sklearn.model_selection import TimeSeriesSplit
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, cohen_kappa_score
import seaborn as sns

root = tk.Tk()
root.wm_attributes('-topmost', 1)
root.withdraw()

# model, data prep, and model run

# normalize data and assign movement direction values
def prep(dataset):
    scaler = StandardScaler()

    dataset['RSI (14D)'] = ta.rsi(dataset['Close'], length=14)
    dataset['20 Day CCI'] = ta.cci(high=dataset['High'], low=dataset['Low'], 
                                    close=dataset['Close'], length=20)
    dataset['Williams %R'] = ta.willr(high=dataset['High'], low=dataset['Low'], 
                                        close=dataset['Close'], length=14)
    dataset['EMA (5D)'] = dataset['Close'].ewm(span=5, adjust=False).mean()

    features = ['Open', 'High', 'Low', 'Close', 'Volume', 'RSI (14D)', 
                '20 Day CCI', 'Williams %R', 'Mortgage_rate', 'Unemp_rate',
                'disposable_income', 'Personal_consumption_expenditure', 
                'personal_savings', 'Open_VIX', 'Close_VIX', 
                'Adj Close_VIX',	'CORESTICKM159SFRBATL',	'DFF',	'GDP']
    
    dataset[features] = dataset[features].astype(float)
    dataset[features] = scaler.fit_transform(dataset[features])

    dataset['MA50'] = dataset['Close'].rolling(window=50).mean()

    dataset['Target'] = np.where(dataset['Close'].shift(-1) > dataset['Close'], 1, 0)
    prepared_data = dataset.dropna().tail(503)

    return prepared_data, scaler

# create LSTM model class
class LSTM_Model(nn.Module):
    def __init__(self, input_layer, hidden_layer, output_layer):
        super(LSTM_Model, self).__init__()
        self.hidden_layer = hidden_layer
        self.lstm = nn.LSTM(input_layer, hidden_layer, batch_first=True)
        self.linear_layer = nn.Linear(hidden_layer, output_layer)
        self.hidden_cell = (torch.zeros(1, 1, self.hidden_layer),
                            torch.zeros(1, 1, self.hidden_layer))

    # Define the forward pass of the LSTM_Model
    def forward(self, input_tensor):
        self.hidden_cell = (torch.zeros(1, input_tensor.size(0), self.hidden_layer), 
                            torch.zeros(1, input_tensor.size(0), self.hidden_layer))
        
        # Pass the input through the LSTM layer
        out, self.hidden_cell = self.lstm(input_tensor, self.hidden_cell)
        
        # Get the output of the last time step
        lstm_out_last = out[:, -1, :]
        
        # Pass the output through the linear layer
        linear_out = self.linear_layer(lstm_out_last)
        
        return linear_out

# create sequences for input data and corresponding labels
def create_sequence(input_data, sequence_length):
  sequences = []
  for i in range(len(input_data) - sequence_length):
    sequence = input_data[i : i + sequence_length, :-1]
    label = input_data[i + sequence_length, -1]
    sequences.append((sequence, label))
  return sequences

# train the model with data provided
def trainer(model, train_data, loss_func, opt, epochs):
  for epoch in range(epochs):
    for sequence, labels, in train_data:
      opt.zero_grad()
      model.hidden_cell = (torch.zeros(1, 1, model.hidden_layer),
                           torch.zeros(1, 1, model.hidden_layer))
      
      sequence = torch.tensor(sequence).float()
      labels = torch.tensor(labels).float().view(-1, 1)

      # Initialize the hidden state at the start of each sequence
      model.hidden_cell = (torch.zeros(1, sequence.size(0), 
                                       model.hidden_layer),
                            torch.zeros(1, sequence.size(0), 
                                        model.hidden_layer))

      y = model(sequence)
      loss = loss_func(y, labels)
      loss.backward()
      opt.step()

    # print progress as the model runs
    if epoch % 25 == 1:
      print(f'Epoch {epoch} loss: {loss.item():.4f}')

# make predictions using trained model
def predictor(model, test_data):
    model.eval()
    with torch.no_grad():
        predictions = []
        for sequence, _ in test_data:
            sequence = torch.tensor(sequence).float()

            # Initialize the hidden state at the start of each sequence
            model.hidden_cell = (torch.zeros(1, sequence.size(0), 
                                             model.hidden_layer),
                                 torch.zeros(1, sequence.size(0), 
                                             model.hidden_layer))
            
            y = model(sequence)

            batch_predictions = torch.round(torch.sigmoid(y))

            predictions.extend(batch_predictions.squeeze().tolist())

    return predictions

# load and prep data

# get dataset
file_path = filedialog.askopenfilename(parent=root, title="Select A File")
ticker = pd.read_csv(file_path)
ticker, scaler = prep(ticker)

# create sequences
sequence_length = 10
sequences = create_sequence(ticker[['Open', 'High', 'Low', 'Close', 'RSI (14D)', 
                '20 Day CCI', 'Williams %R', 'Mortgage_rate', 'Unemp_rate',
                'disposable_income', 'Personal_consumption_expenditure', 
                'personal_savings', 'CORESTICKM159SFRBATL',	
                'DFF',	'GDP', 'MA50', 'Target']].values, sequence_length)

# cross-validation
tscv = TimeSeriesSplit(n_splits=5)
accuracies = []
precisions = []
recalls = []
f1s = []
kappas = []

for fold, (train_index, test_index) in enumerate(tscv.split(sequences)):
    print(f'Fold {fold+1}')
    
    train_sequences = [sequences[i] for i in train_index]
    test_sequences = [sequences[i] for i in test_index]
    
    train_data = torch.utils.data.DataLoader(train_sequences, shuffle=True, batch_size=5)
    test_data = torch.utils.data.DataLoader(test_sequences, shuffle=True, batch_size=5)
    
    # initialize model
    model = LSTM_Model(input_layer=16, hidden_layer=25, output_layer=1)
    loss_func = nn.BCEWithLogitsLoss()
    opt = optim.Adam(model.parameters(), lr=0.0001)
    
    # train model
    epochs = 150
    trainer(model, train_data, loss_func, opt, epochs)
    
    # make predictions
    test_labels = [label for _, label in test_sequences]
    predictions = predictor(model, test_data)
    
    # calculate statistics
    accuracy = accuracy_score(test_labels, predictions)
    precision = precision_score(test_labels, predictions)
    recall = recall_score(test_labels, predictions)
    f1 = f1_score(test_labels, predictions)
    kappa = cohen_kappa_score(test_labels, predictions)
    
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)
    kappas.append(kappa)
    
    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1: {f1:.4f}')
    print(f'Kappa: {kappa:.4f}')

# average scores across all folds
avg_accuracy = np.mean(accuracies)
avg_precision = np.mean(precisions)
avg_recall = np.mean(recalls)
avg_f1 = np.mean(f1s)
avg_kappa = np.mean(kappas)

print(f'\nCross-Validation Results:')
print(f'Average Accuracy: {avg_accuracy:.4f}')
print(f'Average Precision: {avg_precision:.4f}')
print(f'Average Recall: {avg_recall:.4f}')
print(f'Average F1: {avg_f1:.4f}')
print(f'Average Kappa: {avg_kappa:.4f}')


Fold 1
Epoch 1 loss: 0.6705


  sequence = torch.tensor(sequence).float()
  labels = torch.tensor(labels).float().view(-1, 1)


Epoch 26 loss: 0.7021
Epoch 51 loss: 0.7295
Epoch 76 loss: 0.6981
Epoch 101 loss: 0.6626
Epoch 126 loss: 0.7053


  sequence = torch.tensor(sequence).float()
  sequence = torch.tensor(sequence).float()
  labels = torch.tensor(labels).float().view(-1, 1)


Accuracy: 0.6296
Precision: 0.6800
Recall: 0.8947
F1: 0.7727
Kappa: -0.1345
Fold 2
Epoch 1 loss: 0.6949
Epoch 26 loss: 0.7319
Epoch 51 loss: 0.5270
Epoch 76 loss: 0.4820
Epoch 101 loss: 0.5657
Epoch 126 loss: 0.4233


  sequence = torch.tensor(sequence).float()
  sequence = torch.tensor(sequence).float()
  labels = torch.tensor(labels).float().view(-1, 1)


Accuracy: 0.5556
Precision: 0.5769
Recall: 0.9375
F1: 0.7143
Kappa: -0.0728
Fold 3
Epoch 1 loss: 0.7092
Epoch 26 loss: 0.6911
Epoch 51 loss: 0.6547
Epoch 76 loss: 0.7178
Epoch 101 loss: 0.4886
Epoch 126 loss: 0.4055


  sequence = torch.tensor(sequence).float()
  sequence = torch.tensor(sequence).float()
  labels = torch.tensor(labels).float().view(-1, 1)


Accuracy: 0.6667
Precision: 0.6800
Recall: 0.9444
F1: 0.7907
Kappa: 0.0690
Fold 4
Epoch 1 loss: 0.7199
Epoch 26 loss: 0.6600
Epoch 51 loss: 0.3807
Epoch 76 loss: 0.4663
Epoch 101 loss: 0.4809
Epoch 126 loss: 0.5554


  sequence = torch.tensor(sequence).float()
  sequence = torch.tensor(sequence).float()
  labels = torch.tensor(labels).float().view(-1, 1)


Accuracy: 0.6296
Precision: 0.6296
Recall: 1.0000
F1: 0.7727
Kappa: 0.0000
Fold 5
Epoch 1 loss: 0.6957
Epoch 26 loss: 0.5655
Epoch 51 loss: 0.8070
Epoch 76 loss: 0.8042
Epoch 101 loss: 0.4483
Epoch 126 loss: 0.8037
Accuracy: 0.6296
Precision: 0.6842
Recall: 0.7647
F1: 0.7222
Kappa: 0.1718

Cross-Validation Results:
Average Accuracy: 0.6222
Average Precision: 0.6502
Average Recall: 0.9083
Average F1: 0.7545
Average Kappa: 0.0067


  sequence = torch.tensor(sequence).float()


In [38]:
import numpy as np
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, cohen_kappa_score
from sklearn.utils import shuffle as sk_shuffle

# Function to calculate model performance
def evaluate_model(model, test_data):
    model.eval()
    with torch.no_grad():
        predictions = []
        true_labels = []
        for sequence, label in test_data:
            sequence = torch.tensor(sequence).float()
            model.hidden_cell = (torch.zeros(1, sequence.size(0), model.hidden_layer),
                                 torch.zeros(1, sequence.size(0), model.hidden_layer))
            y = model(sequence)
            batch_predictions = torch.round(torch.sigmoid(y))
            predictions.extend(batch_predictions.squeeze().tolist())
            true_labels.extend(label.tolist())
        
    accuracy = accuracy_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions)
    recall = recall_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions)
    kappa = cohen_kappa_score(true_labels, predictions)
    
    return accuracy, precision, recall, f1, kappa

# Calculate baseline performance
baseline_accuracy, baseline_precision, baseline_recall, baseline_f1, baseline_kappa = evaluate_model(model, test_data)

# Function to compute permutation importance for a specific feature
def permutation_importance(model, test_data, feature_index, original_data):
    permuted_data = original_data.copy()
    permuted_data[:, feature_index] = np.random.permutation(permuted_data[:, feature_index])
    permuted_sequences = create_sequence(permuted_data, sequence_length)
    permuted_test_data = torch.utils.data.DataLoader(permuted_sequences, shuffle=True, batch_size=32)
    
    permuted_accuracy, permuted_precision, permuted_recall, permuted_f1, permuted_kappa = evaluate_model(model, permuted_test_data)
    
    accuracy_drop = baseline_accuracy - permuted_accuracy
    precision_drop = baseline_precision - permuted_precision
    recall_drop = baseline_recall - permuted_recall
    f1_drop = baseline_f1 - permuted_f1
    kappa_drop = baseline_kappa - permuted_kappa
    
    return accuracy_drop, precision_drop, recall_drop, f1_drop, kappa_drop

# Get feature indices from the original dataset
feature_names = ['Open', 'High', 'Low', 'Close', 'RSI (14D)', 
                 '20 Day CCI', 'Williams %R', 'Mortgage_rate', 'Unemp_rate',
                 'disposable_income', 'Personal_consumption_expenditure', 
                 'personal_savings', 'CORESTICKM159SFRBATL',	
                 'DFF', 'GDP', 'MA50']
feature_importance = {}

# Calculate permutation importance for each feature
for i, feature in enumerate(feature_names):
    print(f'Calculating importance for feature: {feature}')
    accuracy_drop, precision_drop, recall_drop, f1_drop, kappa_drop = permutation_importance(model, test_data, i, ticker[['Open', 'High', 'Low', 'Close', 'RSI (14D)', 
                '20 Day CCI', 'Williams %R', 'Mortgage_rate', 'Unemp_rate',
                'disposable_income', 'Personal_consumption_expenditure', 
                'personal_savings', 'CORESTICKM159SFRBATL',	
                'DFF', 'GDP', 'MA50', 'Target']].values)
    
    feature_importance[feature] = {
        'accuracy_drop': accuracy_drop,
        'precision_drop': precision_drop,
        'recall_drop': recall_drop,
        'f1_drop': f1_drop,
        'kappa_drop': kappa_drop
    }

# Print results
for feature, importance in feature_importance.items():
    print(f'Feature: {feature}')
    print(f'  Accuracy Drop: {importance["accuracy_drop"]:.4f}')
    print(f'  Precision Drop: {importance["precision_drop"]:.4f}')
    print(f'  Recall Drop: {importance["recall_drop"]:.4f}')
    print(f'  F1 Drop: {importance["f1_drop"]:.4f}')
    print(f'  Kappa Drop: {importance["kappa_drop"]:.4f}')


  sequence = torch.tensor(sequence).float()
  sequence = torch.tensor(sequence).float()
  sequence = torch.tensor(sequence).float()
  sequence = torch.tensor(sequence).float()
  sequence = torch.tensor(sequence).float()
  sequence = torch.tensor(sequence).float()
  sequence = torch.tensor(sequence).float()
  sequence = torch.tensor(sequence).float()
  sequence = torch.tensor(sequence).float()
  sequence = torch.tensor(sequence).float()
  sequence = torch.tensor(sequence).float()
  sequence = torch.tensor(sequence).float()
  sequence = torch.tensor(sequence).float()
  sequence = torch.tensor(sequence).float()
  sequence = torch.tensor(sequence).float()


Calculating importance for feature: Open
Calculating importance for feature: High
Calculating importance for feature: Low
Calculating importance for feature: Close
Calculating importance for feature: RSI (14D)
Calculating importance for feature: 20 Day CCI
Calculating importance for feature: Williams %R
Calculating importance for feature: Mortgage_rate
Calculating importance for feature: Unemp_rate
Calculating importance for feature: disposable_income
Calculating importance for feature: Personal_consumption_expenditure
Calculating importance for feature: personal_savings
Calculating importance for feature: CORESTICKM159SFRBATL
Calculating importance for feature: DFF
Calculating importance for feature: GDP
Calculating importance for feature: MA50
Feature: Open
  Accuracy Drop: -0.0492
  Precision Drop: -0.0111
  Recall Drop: -0.0994
  F1 Drop: -0.0483
  Kappa Drop: -0.0838
Feature: High
  Accuracy Drop: -0.0431
  Precision Drop: 0.0054
  Recall Drop: -0.1382
  F1 Drop: -0.0528
  Kappa D

  sequence = torch.tensor(sequence).float()
  sequence = torch.tensor(sequence).float()


In [39]:
fall = (ticker.Target == 0).sum()
rise = (ticker.Target == 1).sum()

print(f'Number of days rising: {rise}')
print(f'Number of days falling: {fall}')

print(f'Rise % is: {(rise / (rise + fall)) * 100:.2f}%')
print(f'Fall % is: {(fall / (rise + fall)) * 100:.2f}%')

Number of days rising: 108
Number of days falling: 67
Rise % is: 61.71%
Fall % is: 38.29%


In [157]:
results = pd.DataFrame(columns=['Dataset', 'Average Accuracy', 'Average Precision', 
                                'Average Recall', 'Average F1'])

datatset_name = file_path.split('/')[-1].split('.')[0]

metrics = {'Dataset': datatset_name ,'Average Accuracy': avg_accuracy, 
           'Average Precision': avg_precision, 'Average Recall': avg_recall, 
           'Average F1': avg_f1}

export_results = results._append(metrics, ignore_index=True)

export_results.set_index('Dataset', inplace=True)

export_results.to_csv(r"C:\Users\samto\Desktop\IRP DATA-20240724T195545Z-001\IRP DATA\\" + datatset_name + '_results.csv')

export_results

Unnamed: 0_level_0,Average Accuracy,Average Precision,Average Recall,Average F1
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
nyse_daily,0.514634,0.567364,0.533256,0.542568
