# Basic Setup

In [98]:
import pandas as pd
import plotly.express as px
import numpy as np

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dropout, Flatten, Dense, BatchNormalization
from tensorflow.keras import Sequential

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, recall_score, precision_score
from sklearn.utils.class_weight import compute_class_weight

df = pd.read_csv("SPY.csv")
df['day_of_week'] = pd.to_datetime(df['Date']).dt.dayofweek
df['day'] = pd.to_datetime(df['Date']).dt.day
df['month'] = pd.to_datetime(df['Date']).dt.month
df['year'] = pd.to_datetime(df['Date']).dt.year
df = df.reset_index()
df

Unnamed: 0,index,Date,Close/Last,Volume,Open,High,Low,day_of_week,day,month,year
0,0,12/09/2024,604.6800,34742740,607.69,607.860,604.080,0,9,12,2024
1,1,12/06/2024,607.8100,31241550,607.44,609.070,607.020,4,6,12,2024
2,2,12/05/2024,606.6600,28762180,607.66,608.480,606.305,3,5,12,2024
3,3,12/04/2024,607.6600,42787560,605.63,607.910,604.950,2,4,12,2024
4,4,12/03/2024,603.9100,26906630,603.39,604.160,602.341,1,3,12,2024
...,...,...,...,...,...,...,...,...,...,...,...
2511,2511,12/16/2014,197.9100,258867900,198.58,202.395,197.860,1,16,12,2014
2512,2512,12/15/2014,199.5100,188920400,201.98,202.530,198.780,0,15,12,2014
2513,2513,12/12/2014,200.8900,201983200,202.64,203.819,200.850,4,12,12,2014
2514,2514,12/11/2014,204.1900,158542800,203.88,206.190,203.710,3,11,12,2014


In [None]:
def days_before(df, days):
    all_prev_data = []  # List to store all the previous data
    
    for day in range(1, days + 1):
        def prior(row):
            # Ensure we do not go out of bounds (row.name + day must be within the DataFrame index)
            if row.name + day < len(df):
                prior_row = df.iloc[row.name + day]
                return pd.Series({
                    f"prev{day}Close/Last": prior_row["Close/Last"],
                    f"prev{day}Volume": prior_row["Volume"],
                    f"prev{day}Open": prior_row["Open"],
                    f"prev{day}High": prior_row["High"],
                    f"prev{day}Low": prior_row["Low"]
                })
            else:
                # Return NaN for out-of-bounds cases
                return pd.Series({
                    f"prev{day}Close/Last": None,
                    f"prev{day}Volume": None,
                    f"prev{day}Open": None,
                    f"prev{day}High": None,
                    f"prev{day}Low": None
                })

        # Apply the prior function to each row and create new columns
        prev_data = df.apply(prior, axis=1)
        all_prev_data.append(prev_data)  # Append the data for this day

    # Concatenate all the new columns at once
    df = pd.concat([df] + all_prev_data, axis=1)

    # Drop the last 'days' rows to remove rows with NaN values
    df = df.dropna(axis=0, how='any')

    return df

# Call the function
df = days_before(df, 20)




In [97]:
df

Unnamed: 0,index,Date,Close/Last,Volume,Open,High,Low,day_of_week,day,month,...,prev19Close/Last,prev19Volume,prev19Open,prev19High,prev19Low,prev20Close/Last,prev20Volume,prev20Open,prev20High,prev20Low
0,0,12/09/2024,604.6800,34742740,607.69,607.86,604.080,0,9,12,...,598.76,37586770.0,599.81,600.170,597.0000,598.1900,46444890.0,596.17,599.640,596.1650
1,1,12/06/2024,607.8100,31241550,607.44,609.07,607.020,4,6,12,...,598.19,46444890.0,596.17,599.640,596.1650,595.6100,47233210.0,593.08,596.650,592.9999
2,2,12/05/2024,606.6600,28762180,607.66,608.48,606.305,3,5,12,...,595.61,47233210.0,593.08,596.650,592.9999,591.0400,68181970.0,589.20,591.930,585.3900
3,3,12/04/2024,607.6600,42787560,605.63,607.91,604.950,2,4,12,...,591.04,68181970.0,589.20,591.930,585.3900,576.7000,39478320.0,570.74,576.740,570.5200
4,4,12/03/2024,603.9100,26906630,603.39,604.16,602.341,1,3,12,...,576.70,39478320.0,570.74,576.740,570.5200,569.8100,38216980.0,571.18,572.500,567.8900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2491,2491,01/15/2015,199.0199,175383200,201.63,202.01,198.880,3,15,1,...,201.79,252458700.0,198.44,202.340,198.2900,197.9100,258867900.0,198.58,202.395,197.8600
2492,2492,01/14/2015,200.8600,192551700,199.65,201.10,198.570,2,14,1,...,197.91,258867900.0,198.58,202.395,197.8600,199.5100,188920400.0,201.98,202.530,198.7800
2493,2493,01/13/2015,202.0800,214341100,204.12,205.48,200.510,1,13,1,...,199.51,188920400.0,201.98,202.530,198.7800,200.8900,201983200.0,202.64,203.819,200.8500
2494,2494,01/12/2015,202.6500,144043100,204.41,204.60,201.920,0,12,1,...,200.89,201983200.0,202.64,203.819,200.8500,204.1900,158542800.0,203.88,206.190,203.7100


In [71]:

def is_up(row):
    try:
        # Ensure the next row exists before accessing
        if row["Close/Last"] + 1 < df.iloc[row["index"] + 1]["Close/Last"]:
            # Compare "Close" values
            return 1
        else:
            return 0
    except Exception as e:
        print(f"Error: {e}")
        return 0
    
# def is_down(row):
#     try:
#         # Ensure the next row exists before accessing
#         if row["Close"] -1 > df.iloc[row["index"] + 1]["Close"]:
#             # Compare "Close" values
#             return 1
#         else:
#             return 0
#     except Exception as e:
#         print(f"Error: {e}")
#         return 0
        
df["NextDayUp1"] = df.apply(is_up, axis=1)
# df["NextDayDown1"] = df.apply(is_down, axis=1)
df


y = df[df["year"]!=2024]['NextDayUp1']
X = df[df["year"] != 2024].drop(['Date', "NextDayUp1", "index"], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=41)

# fit scaler on training data
norm = MinMaxScaler().fit(X_train)

# transform data
X_train = norm.transform(X_train)
X_test = norm.transform(X_test)

hold_X = df[df["year"] == 2024].drop(['Date', "NextDayUp1", "index"], axis=1)
hold_y = df[df["year"] == 2024]['NextDayUp1']
hold_X = norm.transform(hold_X)
X

Error: single positional indexer is out-of-bounds


  df["NextDayUp1"] = df.apply(is_up, axis=1)


Unnamed: 0,Close/Last,Volume,Open,High,Low,day_of_week,day,month,year,prev1Close/Last,...,prev19Close/Last,prev19Volume,prev19Open,prev19High,prev19Low,prev20Close/Last,prev20Volume,prev20Open,prev20High,prev20Low
237,475.3100,122283100,476.49,477.030,473.30,4,29,12,2023,472.65,...,491.27,61322750.0,487.730,491.415,487.1700,490.8900,58618390.0,490.560,491.620,490.1100
238,476.6900,77158120,476.88,477.550,476.26,3,28,12,2023,475.31,...,487.41,76641610.0,487.590,489.120,486.5400,491.2700,61322750.0,487.730,491.415,487.1700
239,476.5100,68000310,475.44,476.660,474.89,2,27,12,2023,476.69,...,488.03,72524990.0,487.575,488.305,485.3900,487.4100,76641610.0,487.590,489.120,486.5400
240,475.6500,55386950,474.07,476.580,473.99,1,26,12,2023,476.51,...,485.39,81765040.0,487.810,488.770,484.8819,488.0300,72524990.0,487.575,488.305,485.3900
241,473.6500,67160420,473.86,475.380,471.70,4,22,12,2023,475.65,...,484.86,49945300.0,484.010,485.105,482.8900,485.3900,81765040.0,487.810,488.770,484.8819
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2511,197.9100,258867900,198.58,202.395,197.86,1,16,12,2014,201.79,...,200.86,192551700.0,199.650,201.100,198.5700,199.0199,175383200.0,201.630,202.010,198.8800
2512,199.5100,188920400,201.98,202.530,198.78,0,15,12,2014,197.91,...,202.08,214341100.0,204.120,205.480,200.5100,200.8600,192551700.0,199.650,201.100,198.5700
2513,200.8900,201983200,202.64,203.819,200.85,4,12,12,2014,199.51,...,202.65,144043100.0,204.410,204.600,201.9200,202.0800,214341100.0,204.120,205.480,200.5100
2514,204.1900,158542800,203.88,206.190,203.71,3,11,12,2014,200.89,...,204.25,157293400.0,206.400,206.420,203.5100,202.6500,144043100.0,204.410,204.600,201.9200


In [72]:
# def next(row):
#     try:
#         return df.iloc[row["index"] + 1]["Close/Last"]

#     except Exception as e:
#         print(f"Error: {e}")
#         return 0
    

        
# df["Next"] = df.apply(next, axis=1)
# # df["NextDayDown1"] = df.apply(is_down, axis=1)
# df


# y = df[df["year"]!=2024]['Next']
# X = df[df["year"]!=2024].drop(['Date', "Next", "index"], axis=1)

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=41)

# # fit scaler on training data
# norm = MinMaxScaler().fit(X_train)

# # transform data
# X_train = norm.transform(X_train)
# X_test = norm.transform(X_test)

# hold_X = df[df["year"] == 2024].drop(['Date', "Next", "index"], axis=1)
# hold_y = df[df["year"] == 2024]['Next']
# hold_X = norm.transform(hold_X)
# X

# Modeling

In [73]:
num_features = len(X_train[0])

# model.add(Dropout(.5))

model = Sequential([
  Dense(500, activation='relu', input_dim=num_features),
  Dropout(.4),
  Dense(64, activation='relu'),
  Dropout(.4),
  Dense(32, activation='relu'),
  Dense(1)
])

model.summary()

def rmse(y_true, y_pred):
    return tf.sqrt(tf.reduce_mean(tf.square(y_pred - y_true)))

class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weights_dict = dict(enumerate(class_weights))

opt = keras.optimizers.Adam(learning_rate=0.0003)
model.compile(loss='mse', optimizer=opt, metrics=['mse'])

early_stop = keras.callbacks.EarlyStopping(monitor='val_mse', patience=300, mode='min')

history = model.fit(X_train, y_train, epochs=1000, validation_split=.35, batch_size=40, callbacks=[early_stop], shuffle=False, class_weight=class_weights_dict)
hist = pd.DataFrame(history.history)

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   


 dense_28 (Dense)            (None, 500)               55000     
                                                                 
 dropout_14 (Dropout)        (None, 500)               0         
                                                                 
 dense_29 (Dense)            (None, 64)                32064     
                                                                 
 dropout_15 (Dropout)        (None, 64)                0         
                                                                 
 dense_30 (Dense)            (None, 32)                2080      
                                                                 
 dense_31 (Dense)            (None, 1)                 33        
                                                                 
Total params: 89,177
Trainable params: 89,177
Non-trainable params: 0
_________________________________________________________________
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6

In [74]:
predictions = (model.predict(X_test) > 0.5).astype(np.int64)

pred = pd.DataFrame(predictions,columns=['predictions'])





In [75]:
accuracy = accuracy_score(y_test, pred["predictions"])
recall = recall_score(y_test, pred["predictions"])
precision = precision_score(y_test, pred["predictions"])
print(accuracy)
print(recall)
print(precision)


0.6798245614035088
0.5480769230769231
0.43291139240506327


In [76]:
predictions = (model.predict(hold_X) > 0.5).astype(np.int64)

pred = pd.DataFrame(predictions,columns=['predictions'])
accuracy = accuracy_score(hold_y, pred["predictions"])
recall = recall_score(hold_y, pred["predictions"])
precision = precision_score(hold_y, pred["predictions"])
print(accuracy)
print(recall)
print(precision)

0.6371308016877637
0.323943661971831
0.3770491803278688
