In [73]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score
import pandas as pd
import tkinter as tk
from tkinter import filedialog
import pandas_ta as ta

# this is a change
root = tk.Tk()
root.wm_attributes('-topmost', 1)
root.withdraw()

# model, data prep, and model run

# normalize data and assign movement direction values
def prep(dataset):
  scaler = StandardScaler()

  dataset['RSI (14D)'] = ta.rsi(dataset['Close'], length=14)
  dataset['20 Day CCI'] = ta.cci(high=dataset['High'], low=dataset['Low'], 
                                 close=dataset['Close'], length=20)
  dataset['Williams %R'] = ta.willr(high=dataset['High'], low=dataset['Low'], 
                                    close=dataset['Close'], length=14)
  dataset['EMA (5D)'] = dataset['Close'].ewm(span=5, adjust=False).mean()

  bollinger = ta.bbands(dataset['Close'], length=20, std=2)
  dataset['BB_upper'] = bollinger['BBU_20_2.0']
  dataset['BB_middle'] = bollinger['BBM_20_2.0']
  dataset['BB_lower'] = bollinger['BBL_20_2.0']

  features = ['Open', 'High', 'Low', 'Close', 'Volume', 'RSI (14D)', 
              '20 Day CCI', 'Williams %R','BB_middle']
  
  dataset[features] = dataset[features].astype(float)
  dataset[features] = scaler.fit_transform(dataset[features])

  dataset['MA10'] = dataset['Close'].rolling(window=10).mean()
  # dataset['MA50'] = dataset['Close'].rolling(window=50).mean()
  
  dataset['Target'] = np.where(dataset['Close'].shift(-1) > dataset['Close'], 1, 0)
  prepared_data = dataset.dropna()

  #print(prepared_data)

  return prepared_data, scaler

# create LSTM model class
class LSTM_Model(nn.Module):
  def __init__(self, input_layer, hidden_layer, output_layer):
    super(LSTM_Model, self).__init__()
    self.hidden_layer = hidden_layer
    self.lstm = nn.LSTM(input_layer, hidden_layer, batch_first=True)
    self.linear_layer = nn.Linear(hidden_layer, output_layer)
    self.hidden_cell = (torch.zeros(1,1,self.hidden_layer).cuda(),
                        torch.zeros(1,1,self.hidden_layer).cuda())

  # Define the forward pass of the LSTM_Model
  def forward(self, input_tensor):

    self.hidden_cell = (torch.zeros(1, input_tensor.size(0), self.hidden_layer).cuda(), 
                        torch.zeros(1, input_tensor.size(0), self.hidden_layer).cuda())
    
    out, self.hidden_cell = self.lstm(input_tensor, self.hidden_cell)
    lstm_out_last = out[:, -1, :]
    predicted_values = self.linear_layer(lstm_out_last)
    return predicted_values

# create sequences for input data and corresponding labels
def create_sequence(input_data, sequence_length):
  sequences = []
  for i in range(len(input_data) - sequence_length):
    sequence = input_data[i : i + sequence_length, :-1]
    label = input_data[i + sequence_length, -1]
    sequences.append((sequence, label))
  return sequences

# train the model with data provided
def trainer(model, train_data, loss_func, opt, epochs):
  for epoch in range(epochs):
    for sequence, labels, in train_data:
      opt.zero_grad()
      model.hidden_cell = (torch.zeros(1, 1, model.hidden_layer),
                           torch.zeros(1, 1, model.hidden_layer))
      
      sequence = sequence.clone().detach().float().cuda()
      labels = labels.clone().detach().float().view(-1, 1).cuda()

      # Initialize the hidden state at the start of each sequence
      model.hidden_cell = (torch.zeros(1, sequence.size(0), 
                                       model.hidden_layer),
                            torch.zeros(1, sequence.size(0), 
                                        model.hidden_layer))

      y = model(sequence)
      loss = loss_func(y, labels)
      loss.backward()
      opt.step()

    # print progress as the model runs
    if epoch % 25 == 1:
      print(f'Epoch {epoch} loss: {loss.item()}')

# make predictions using trained model
def predictor(model, test_data):
    model.eval()
    with torch.no_grad():
        predictions = []
        for sequence, _ in test_data:
            sequence = torch.tensor(sequence).float().cuda()

            # Initialize the hidden state at the start of each sequence
            model.hidden_cell = (torch.zeros(1, sequence.size(0), 
                                             model.hidden_layer),
                                 torch.zeros(1, sequence.size(0), 
                                             model.hidden_layer))
            
            y = model(sequence)
            predictions.append(torch.round(torch.sigmoid(y)).item())
    return predictions


# load and prep data

# get dataset
file_path = filedialog.askopenfilename(parent=root,  title="Select A File")

ticker = pd.read_csv(file_path)
ticker, scaler = prep(ticker)

# create sequences
sequence_length = 10
sequences = create_sequence(ticker[['Open', 'High', 'Low', 'Close', 'Volume', 'RSI (14D)', '20 Day CCI', 
                                    'Williams %R','BB_middle', 'Target']].values, sequence_length)

# split test/train and create dataloader
train_size = int(len(sequences) * 0.6) # set train size
train_sequences = sequences[ : train_size]
test_sequences = sequences[train_size : ]

train_data = torch.utils.data.DataLoader(train_sequences, shuffle=True, batch_size=1)
test_data = torch.utils.data.DataLoader(test_sequences, shuffle=True, batch_size=1)

# initialise model
model = LSTM_Model(input_layer=9, hidden_layer=150, output_layer=1)
model.to('cuda') # move model to the GPU
loss_func = nn.BCEWithLogitsLoss() # is this the best one?
opt = optim.Adam(model.parameters(), lr=0.001) # and this?

# train
epochs = 150 # is this optimal?
trainer(model, train_data, loss_func, opt, epochs)

# run model and predict values
test_labels = [label for _, label in test_sequences]
predictions = predictor(model, test_data)

# calcluate statistics
accuracy = accuracy_score(test_labels, predictions)
cm = confusion_matrix(test_labels, predictions)


print(f'Confusion Matrix:\n{cm}')
print(f'Accuracy: {accuracy}') 



Epoch 1 loss: 0.5599114894866943
Epoch 26 loss: 0.5266392230987549
Epoch 51 loss: 0.1687854528427124
Epoch 76 loss: 0.24095119535923004


In [72]:
import copy

baseline_predictions = predictor(model, test_data)
baseline_accuracy = accuracy_score(test_labels, baseline_predictions)
print(f'Baseline Accuracy: {baseline_accuracy}')


def permutation_importance(model, test_data, test_labels, sequence_length, features, scaler):
    baseline_predictions = predictor(model, test_data)
    baseline_accuracy = accuracy_score(test_labels, baseline_predictions)
    feature_importances = {}
    
    for feature_idx, feature in enumerate(features):
        shuffled_test_data = copy.deepcopy(ticker)
        
        # Shuffle the specific feature column
        shuffled_test_data[feature] = np.random.permutation(shuffled_test_data[feature].values)
        
        # Create sequences for shuffled data
        shuffled_sequences = create_sequence(shuffled_test_data[features + ['Target']].values, sequence_length)
        
        # Split shuffled data into test sequences
        shuffled_test_data = torch.utils.data.DataLoader(shuffled_sequences[train_size:], shuffle=False, batch_size=1)
        
        # Get predictions with shuffled data
        shuffled_predictions = predictor(model, shuffled_test_data)
        
        # Calculate the accuracy with shuffled data
        shuffled_accuracy = accuracy_score(test_labels, shuffled_predictions)
        
        # Calculate importance as the drop in accuracy
        feature_importance = baseline_accuracy - shuffled_accuracy
        feature_importances[feature] = feature_importance
        print(f'Feature: {feature}, Importance: {feature_importance}')
    
    return feature_importances

features = ['Open', 'High', 'Low', 'Close', 'Volume', 'RSI (14D)', 
              '20 Day CCI', 'Williams %R','BB_middle']

feature_importances = permutation_importance(model, test_data, test_labels, sequence_length, features, scaler)

# Sort and print the feature importances
sorted_importances = sorted(feature_importances.items(), key=lambda x: x[1], reverse=True)
print("Feature Importances (sorted):")
for feature, importance in sorted_importances:
    print(f'{feature}: {importance}')

  sequence = torch.tensor(sequence).float().cuda()


Baseline Accuracy: 0.4772727272727273


  sequence = torch.tensor(sequence).float().cuda()
  sequence = torch.tensor(sequence).float().cuda()


Feature: Open, Importance: -0.006198347107438051


  sequence = torch.tensor(sequence).float().cuda()


Feature: High, Importance: -0.0041322314049587194


  sequence = torch.tensor(sequence).float().cuda()


Feature: Low, Importance: -0.022727272727272707


  sequence = torch.tensor(sequence).float().cuda()


Feature: Close, Importance: -0.018595041322314043


  sequence = torch.tensor(sequence).float().cuda()


Feature: Volume, Importance: -0.024793388429752095


  sequence = torch.tensor(sequence).float().cuda()


Feature: RSI (14D), Importance: 0.002066115702479332


  sequence = torch.tensor(sequence).float().cuda()


Feature: 20 Day CCI, Importance: -0.02066115702479343


  sequence = torch.tensor(sequence).float().cuda()


Feature: Williams %R, Importance: -0.01446280991735538


  sequence = torch.tensor(sequence).float().cuda()


Feature: BB_middle, Importance: -0.01446280991735538
Feature Importances (sorted):
RSI (14D): 0.002066115702479332
High: -0.0041322314049587194
Open: -0.006198347107438051
Williams %R: -0.01446280991735538
BB_middle: -0.01446280991735538
Close: -0.018595041322314043
20 Day CCI: -0.02066115702479343
Low: -0.022727272727272707
Volume: -0.024793388429752095
