# Part one: Introduction

### Imports

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pandas_ta as ta
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow import keras


In [None]:
# -.- is to remove the scientific notation of the log_returns in numpy arrays
pd.set_option('display.float_format', str)
np.set_printoptions(suppress=True)

### Loading dataset

In [None]:
df_nflx_intraday = pd.read_csv('data_nflx_intraday.csv', index_col=0, parse_dates=True) # will be our working df
df = df_nflx_intraday.copy() # working df
df.columns = df.columns.str.lower()

### Adding buy signals and their respective technical indicators

In [None]:
# adding some indicators
#super trend : calculates the trend, length is the ATR length default is 7 but i took 10 since i wanna trade on the 10 minute horizon
# the signal is 1 if the trend is going up aka buy signal
# the value of supertrend is a price which ill use later as a feature
supertrend = df.ta.supertrend(length = 10)
supertrend.rename(columns={'SUPERTd_10_3.0': 'supertrend_signal', 'SUPERT_10_3.0' : 'supertrend'}, inplace=True)
supertrend = supertrend[['supertrend_signal','supertrend']] # <- first df to append to main df
df = pd.concat([df, supertrend], axis=1)

In [None]:
#VWAP
df.ta.vwap(append=True)
df.rename(columns={'VWAP_D': 'vwap'}, inplace=True)

In [None]:
df.head()

In [None]:
df['vwap_signal'] = (df['vwap'] < df['close']).astype(int)

In [None]:
# stoch indicator and signal
stoch = df.ta.stoch(k=10, d=2)
stoch.rename(columns={'STOCHk_10_2_3': 'stoch_k', 'STOCHd_10_2_3' : 'stoch_d'}, inplace=True)
stoch['stoch_signal'] = ((stoch['stoch_k'] <= 20) & (stoch['stoch_k'] > stoch['stoch_d'])).astype(int)
df = pd.concat([df, stoch], axis=1)

In [None]:
# adx
adx = df.ta.adx(length=10)
adx.rename(columns={'ADX_10': 'adx', 'DMP_10' : 'adx_direction_pos', 'DMN_10' : 'adx_direction_neg'}, inplace=True)
adx['adx_signal'] = ((adx['adx'] > 25) & (adx['adx_direction_pos'] > adx['adx_direction_neg'])).astype(int)
df = pd.concat([df, adx], axis=1)

In [None]:
#obv
# with the obv we want to check for divergence ( obv is increasing and price is decreasing, this could be a potential buy signal)
obv = df.ta.obv()
obv = pd.DataFrame(obv)
obv.rename(columns={'OBV': 'obv'}, inplace=True)
df = pd.concat([df, obv], axis=1)
df['obv_signal'] = ((df['obv'] > df['obv'].rolling(window=5).mean()) & (df['close'] < df['close'].rolling(window=5).mean())).astype(int)


In [None]:
# rsi
df['rsi'] = df.ta.rsi()
df['rsi_signal'] = (df['rsi'] < 30).astype(int)


In [None]:
# golden cross
df['gc_signal'] = (ta.sma(df['close'], length=5) > ta.sma(df['close'], length=15)).astype(int)


In [None]:
# donchian
donchian = df.ta.donchian(lower_length=15, upper_length=15)
donchian.rename(columns={'DCU_15_15': 'donchian_upper'}, inplace=True)
df['donchian_signal'] = (donchian['donchian_upper'] > df['close']).astype(int)

In [None]:
# macd
macd = df.ta.macd(fast=5, slow=15, signal=3)

macd['macd_signal'] = (
    (macd['MACD_5_15_3'] < 0) & (macd['MACDs_5_15_3'] < 0) &
    (macd['MACD_5_15_3'] > macd['MACDs_5_15_3']) &
    (macd['MACD_5_15_3'].shift(1) <= macd['MACDs_5_15_3'].shift(1)) &
    (macd['MACDh_5_15_3'] > 0)).astype(int)

macd.rename(columns={'MACDh_5_15_3': 'macd'}, inplace=True)
df = pd.concat([df, macd[['macd', 'macd_signal']]], axis=1)

In [None]:
df

In [None]:
print(adx.columns)

In [None]:
help(ta.macd)

### Adding some signals using visual patterns with candlestick trends

In [None]:
# https://www.investopedia.com/articles/active-trading/062315/using-bullish-candlestick-patterns-buy-stocks.asp
# candles df
candles = df.ta.cdl_pattern(name=['engulfing', 'hammer', 'invertedhammer', 'piercing', 'morningstar', '3whitesoldiers'])
candles.rename(columns={
    'CDL_ENGULFING': 'engulfing',
    'CDL_HAMMER': 'hammer',
    'CDL_INVERTEDHAMMER': 'invertedhammer',
    'CDL_PIERCING': 'piercing',
    'CDL_MORNINGSTAR': 'morningstar',
    'CDL_3WHITESOLDIERS': '3whitesoldiers'
}, inplace=True)

In [None]:
candles

In [None]:
df = pd.concat([df, candles], axis=1)



In [None]:
df[1:2]

### Did't need a lot of data processing as we didn't have tons of NaN data, now we will choose if we want regression or classiication
Classification: using the features to predict y: (Bool price increase or no) -> logistic regression and random forest classifier

Regression: using the features to predict y: (Continuous value price change) -> linear regression and random forest regressor

In [None]:
df['price_increased'] = np.where(df['close'].diff() > 0, 1, 0)

In [None]:
df.head(5)

### Classification model one: Logistic Regression

In [None]:
# imports
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Copying the df to main_df so we can copy back from it for future models

In [None]:
df.dropna(inplace=True)
df_main = df.copy()

In [None]:
df.info()

In [None]:
# dividing X and y
X = df.drop(columns=['price_increased'])
y = df[['price_increased']]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)

y_pred_logistic = logistic_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_logistic)
print("Accuracy:", accuracy)

Our accuracy is 0.5483133218982276 which is not that good, the main issue i can think of is that our data is shuffled with the train_test_split function and our data is sequencial as it has DateTimeIndex as its index. One solution could be to not shuffle:

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False)

logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)

y_pred_logistic = logistic_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_logistic)
print("Accuracy:", accuracy)

Now our accuracy is 0.6921097770154374 but still not that great! What we could do next is standardize our data.

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False)

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)

predictions = logistic_model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

And now our accuracy is even better, 0.7764436821040595! Lets take 3 random values and see what the model predicts.

In [None]:

X_try =  df[100:103].drop(columns=['price_increased'])
X_try = scaler.fit_transform(X_try)
y_try = df[100:103][['price_increased']]

In [None]:
logistic_model.predict(X_try)

In [None]:
y_try

### Classification model two: Random Forest Classifier

In [None]:
df = df_main.copy()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score
scaler = StandardScaler()  
# dividing X and y
X = df.drop(columns=['price_increased'])
y = df[['price_increased']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

random_forest_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest_classifier.fit(X_train, y_train)


In [None]:
predictions = random_forest_classifier.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test,predictions)
recall = recall_score(y_test, predictions)

print("Random Forest Classifier Accuracy:", accuracy)
print("Random Forest Classifier Precision:", precision)
print("Random Forest Classifier Recall:", recall)

A benefit of using random forests is that we can use the Feature importances function to get an an idea of which features are more important in predicting y.

In [None]:
feature_importances = random_forest_classifier.feature_importances_
columns = df.drop(columns=['price_increased']).columns
feature_importance_df = pd.DataFrame({'Feature': columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

print(feature_importance_df)

In [None]:
len(columns)

We can see that the macd, rsi, stoch, adx, volume, obv and vwap are the most important features in predicting wether the price will increase or decrease.

Next we will look into two regression models. Here we try to predict a continuous value ( best case scenario the future price).
Instead of the price we will be calculating the change in price because its more static.
First model will be a classic linear regression.

### Regression model one: linear regression.

The main differences are that we try to predict another y with our features and that the scoring is a little bit different. Instead of the accuracy we will try to have a low as possible RMSE (this is the error rate of our models predictions vs the actual values)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
scaler = StandardScaler()  

In [None]:
df = df_main.copy()
df['log_return'] = ta.log_return(df['close'])
df.dropna(inplace=True)
# dividing X and y
X = df.drop(columns=['log_return'])
y = df[['log_return']]



In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

predictions = linear_model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print("Mean Squared Error:", mse)

In [None]:
correlation_matrix = df.corr()
correlation_matrix["log_return"].sort_values(ascending=False)

# here we can see the correlation between the close price and the other features
# this could help us in deciding which columns to keep
# value between -1 and 1

Our MSE is 3.374455314810467e-07 and once again we can see which features are the most correlated to our y prediction.

### Regression model two: Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
scaler = StandardScaler()  
df = df_main.copy()
df['log_return'] = ta.log_return(df['close'])
df.dropna(inplace=True)
# dividing X and y
X = df.drop(columns=['log_return'])
y = df[['log_return']]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
random_forest_model = RandomForestRegressor(n_estimators=100, random_state=42)
random_forest_model.fit(X_train, y_train)

predictions = random_forest_model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print("Random Forest Mean Squared Error:", mse)

In [None]:
feature_importances = random_forest_model.feature_importances_
columns = df.drop(columns=['log_return']).columns
feature_importance_df = pd.DataFrame({'Feature': columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

print(feature_importance_df)

# Part two: Neural Networks: regression and classification

## Classification Neural Network

In [None]:
df = df_main.copy()

In [None]:
df['price_increased'] = np.where(df['close'].diff() > 0, 1, 0)
df.dropna(inplace=True)

In [None]:
# dividing X and y
X = df.drop(columns=['price_increased'])
y = df[['price_increased']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False)

In [None]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
num_features = X_train.shape[1]

In [None]:
# ** PAGE 296 **
model = keras.models.Sequential([
 keras.layers.Flatten(input_shape=(num_features,)),
 keras.layers.Dense(300, activation="relu"),
 keras.layers.Dense(100, activation="relu"),
 keras.layers.Dense(1, activation="sigmoid")
])

In [None]:
model.compile(loss="binary_crossentropy",
                optimizer="sgd",
                metrics=["accuracy"])
model.summary()

In [None]:
history = model.fit(X_train, y_train, epochs=10,
                     validation_split=0.1, batch_size=32 )

In [None]:
pd.DataFrame(history.history).plot(figsize=(16, 9))
plt.show()

Conclusion: this is really good since both the losses are falling and both the accuracies are increasing. And the fact that the curves are really close to each other this means we dont have overfitting happening. 
If we are not happy with the result we can tune the 
    hyperparameters ( number of layers, number of neurons per layer, type of activation for each hidden layer, the epochs, batch siz (it can be set in the fit() method using the
batch_size argument, which defaults to 32)) 

In [None]:
# Evaluating the model
model.evaluate(X_test, y_test)

In [None]:
# Using the model to make predictions
X_new = X_test[200:210]
predictions = model.predict(X_new)
predictions.round(2)

In [None]:
y_test[200:210]

## Regression Neural Network

### Sequantial style

In [None]:
# predicting the close price using neural networks with 31 features

# copying the df so we have a fresh df to work with
df = df_main.copy()
df.dropna(inplace=True)
num_features = df.shape[1]
# dividing X and y
X = df.drop(columns=['close'])
y = df[['close']]

In [None]:
print("Number of columns:", num_features) # 31 features
scaler = MinMaxScaler()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
num_features = X_train.shape[1]

In [None]:
#** PAGE 296 **
model = keras.models.Sequential([
 keras.layers.Dense(30, activation="relu", input_shape=(num_features,)),
 keras.layers.Dense(1, activation="linear")
])

In [None]:
model.compile(loss="mean_squared_error", optimizer="sgd")
model.summary()

In [None]:
history = model.fit(X_train, y_train, epochs=10, validation_split=0.1, batch_size=20, verbose=1)

In [None]:
print('Mean squared error rate on regressive model predicting close price is:', model.evaluate(X_test, y_test))

In [None]:
# making predictions
X_new = X_test[200:210]
predictions = model.predict(X_new)
predictions


In [None]:
y_test[200:210]

### Functional style (more adjustable)

In [None]:
num_features = X_train.shape[1]
input = keras.layers.Input(shape=num_features)
hidden = keras.layers.Dense(30, activation="relu")(input)
concat = keras.layers.Concatenate()([input, hidden])
output = keras.layers.Dense(1, activation="linear")(concat)
model = keras.models.Model(inputs=[input], outputs=[output])

In [None]:
model.compile(loss="mean_squared_error", optimizer="sgd")
history = model.fit(X_train, y_train, epochs=20,
           validation_split=0.1, batch_size=32 )

In [None]:
model.evaluate(X_test, y_test)

In [None]:
# making predictions
X_new = X_test[100:110]
predictions = model.predict(X_new)
predictions


In [None]:
y_test[100:110]

The predictions are really flawed and I can clearly see that the loss starts off small but then suddenly increases, the model doesn't work as supposed. In a bit I will handle parameter tuning and optimization in the hopes that that will fix my issue.

## LSTM regression (using sequences)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
df = df_main.copy()
scaler = StandardScaler()
scaler = scaler.fit(df)
df_scaled = scaler.transform(df)

In [None]:
col_index_to_predict = df.columns.get_loc('close')
features_count = df.shape[1]
val_future = 1 # pred next i days
sequence_length = 3 # use prev j days

In [None]:
sequences = []
close_arr = []

for i in range(len(df_scaled) - sequence_length):
    sequences.append(df_scaled[i:i + sequence_length, :])
    close_arr.append(df_scaled[i + sequence_length, col_index_to_predict])

sequences = np.array(sequences)
close_arr = np.array(close_arr)

print(sequences.shape)
print(close_arr.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(sequences, close_arr, test_size=0.2, random_state=42, shuffle=False)

In [None]:
model = Sequential()
model.add(LSTM(64, activation='relu', input_shape=(sequence_length, features_count), return_sequences=True))
# in the code above return_sequence true because we want the first ltsm to return another sequence
# for the ltsm thats about to come, if the next layer isnt an lstm then set return to false
model.add(LSTM(64, activation='relu', return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='linear'))
model.compile(optimizer='adam', loss='mse')
model.summary()

In [None]:
history = model.fit(X_train, y_train, epochs=10, batch_size=20, validation_split=0.1, verbose=1)

In [None]:
print('Mean squared error rate on the sequential LSTM predicting close price is:', model.evaluate(X_test,y_test))

In [None]:
random_pred = model.predict(X_test[12:13])

In [None]:
preds = np.tile(random_pred, (1, features_count))
random_pred_scaled_back = scaler.inverse_transform(preds)[:, col_index_to_predict]

real_y = y_test[12:13]
reals = np.tile(real_y, (1, features_count))
real_y_scaled_back = scaler.inverse_transform(reals)[:, col_index_to_predict]

In [None]:
print('Predicted next price: ' ,random_pred_scaled_back, 'Actual next price: ', real_y_scaled_back)

In [None]:
# getting the weights of the input layer
input_weights = model.layers[0].get_weights()[0]

# dictionary to store each weight to its column
weights_by_column = dict(zip(X.columns.tolist(), input_weights.T))

feature_weights = {feature: np.abs(weight).sum() for feature, weight in weights_by_column.items()}
top_10_features = sorted(feature_weights, key=lambda x: feature_weights[x], reverse=True)[:10]
for feature in top_10_features:
    print(f"Feature: {feature}, Weight: {feature_weights[feature]}")


## LSTM classification (using sequences)

In [1669]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
df = df_main.copy()
X = data 
y = data['price_increased']  
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [1670]:
col_index_to_predict = df.columns.get_loc('price_increased')
num_features = df.shape[1]
val_future = 1 # pred next i days
sequence_length = 5 # use prev j days

In [1671]:
sequences = []
price_increased_arr = []

for i in range(len(X_scaled) - sequence_length):
    sequences.append(X_scaled[i:i + sequence_length])
    price_increased_arr.append(y.iloc[i + sequence_length - 1])

sequences = np.array(sequences)
price_increased_arr = np.array(price_increased_arr)

sequences = np.array(sequences)
price_increased_arr = np.array(price_increased_arr)

print(sequences.shape)
print(price_increased_arr.shape)

(17482, 5, 31)
(17482,)


In [1672]:
X_train, X_test, y_train, y_test = train_test_split(sequences, price_increased_arr, test_size=0.2, random_state=42, shuffle=False)

In [1673]:
model = Sequential()
model.add(LSTM(64, activation='tanh', input_shape=(sequence_length, num_features), return_sequences=True))
model.add(LSTM(32, activation='tanh', return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='sgd', metrics='accuracy')
model.summary()

Model: "sequential_476"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_64 (LSTM)              (None, 5, 64)             24576     
                                                                 
 lstm_65 (LSTM)              (None, 32)                12416     
                                                                 
 dropout_15 (Dropout)        (None, 32)                0         
                                                                 
 dense_1991 (Dense)          (None, 1)                 33        
                                                                 
Total params: 37,025
Trainable params: 37,025
Non-trainable params: 0
_________________________________________________________________


In [1674]:
history = model.fit(X_train, y_train, epochs=20, batch_size=20, validation_split=0.1, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [1666]:
print('Accuracy on the sequential LSTM predicting if price will increase (y/n): ', (model.evaluate(X_test,y_test))[1])
print('Loss on the test set of the model: ', (model.evaluate(X_test,y_test))[0])

Accuracy on the sequential LSTM predicting if price will increase (y/n):  0.9997140169143677
Loss on the test set of the model:  0.08104927092790604


In [1698]:
# pick random 5 rows
random_x = df[1990:1995]
# scaling the rows
random_x_scaled = scaler.transform(random_x)
random_x_scaled = random_x_scaled.reshape(1, 5, num_features)
# reshape so we can feed it to numpy
# (5, 31) -> (1, 5, 31)

prediction = model.predict(random_x_scaled)
# >0.5 = 1 = True else False so that 0.9952133 will be 1 etc
prediction = (predictions > 0.5).astype(int)

# Print the predictions
print("Predicted Labels:", prediction)
df[1990:1996]

Predicted Labels: [[1]]


Unnamed: 0_level_0,open,high,low,close,volume,supertrend_signal,supertrend,vwap,vwap_signal,stoch_k,...,donchian_signal,macd,macd_signal,engulfing,hammer,invertedhammer,piercing,morningstar,3whitesoldiers,price_increased
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-07-31 09:25:00,426.0,426.49,425.61,426.49,34.0,-1,427.44555899434135,426.5935427589915,0,51.06728678705997,...,1,0.0537991689700452,1,0.0,0.0,0.0,0.0,0.0,0.0,1
2023-07-31 09:26:00,426.4,426.5,425.91,425.91,15.0,-1,427.44555899434135,426.5933431977994,0,38.07641633728719,...,1,0.0012007782475225,0,0.0,0.0,0.0,0.0,0.0,0.0,0
2023-07-31 09:27:00,426.5,427.34,425.91,426.55,238.0,-1,427.44555899434135,426.5933862102044,0,50.73965218892885,...,1,0.0553056269856001,0,0.0,0.0,0.0,0.0,0.0,0.0,1
2023-07-31 09:28:00,426.61,426.85,426.41,426.85,103.0,-1,427.44555899434135,426.5936928012562,1,50.78458505617963,...,1,0.0915643588350221,0,0.0,0.0,0.0,0.0,0.0,0.0,1
2023-07-31 09:29:00,426.8,427.31,425.923,426.94,790.0,-1,427.44555899434135,426.59642840229367,1,67.8468894268523,...,1,0.0905260897340092,0,0.0,0.0,0.0,0.0,0.0,0.0,1
2023-07-31 09:30:00,426.51,427.11,426.3,427.005,71732.0,-1,427.44555899434135,426.7331120724046,1,76.39691714836324,...,1,0.0743573384886868,0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [1699]:
# getting the weights of the input layer
input_weights = model.layers[0].get_weights()[0]

# dictionary to store each weight to its column
weights_by_column = dict(zip(X.columns.tolist(), input_weights.T))

feature_weights = {feature: np.abs(weight).sum() for feature, weight in weights_by_column.items()}
top_10_features = sorted(feature_weights, key=lambda x: feature_weights[x], reverse=True)[:10]
for feature in top_10_features:
    print(f"Feature: {feature}, Weight: {feature_weights[feature]}")


Feature: rsi_signal, Weight: 2.657886266708374
Feature: volume, Weight: 2.5759963989257812
Feature: engulfing, Weight: 2.5741710662841797
Feature: invertedhammer, Weight: 2.4467780590057373
Feature: rsi, Weight: 2.4382731914520264
Feature: macd, Weight: 2.4347383975982666
Feature: stoch_signal, Weight: 2.4232380390167236
Feature: stoch_k, Weight: 2.3380470275878906
Feature: donchian_signal, Weight: 2.321899652481079
Feature: adx, Weight: 2.315385103225708


## FINE TUNING NEURAL NETWORK

In [1805]:
from keras.wrappers.scikit_learn import KerasRegressor
from keras.layers import AlphaDropout

# this object has fit(), score() and predict()
# the score will be opposite of MSE -> higher is better
def build_model(n_hidden=1, n_neurons=30, learning_rate=3e-3, input_shape=[30], optimizer='sgd'): 

    model = keras.models.Sequential()
    options = {'input_shape': input_shape,
                'activation':'selu',
                'kernel_initializer':'lecun_normal',
                'kernel_regularizer' : keras.regularizers.l1(0.01)}
    for layer in range(n_hidden):
        model.add(Dense(n_neurons, **options))
    options = {} 
    model.add(AlphaDropout(rate=0.2))
    model.add(Dense(1, activation='linear', **options, kernel_initializer='lecun_normal', kernel_regularizer = keras.regularizers.l1(0.01))) 
    model.compile(loss="mse", optimizer=optimizer) 
    if optimizer == 'adam':
        optimizer = keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)
    elif optimizer == 'sgd':
        optimizer = keras.optimizers.SGD(learning_rate, clipnorm=1) 
    model.compile(loss="mse", optimizer=optimizer) 
    return model
    

In [1849]:
df = df_main.copy()
df.dropna(inplace=True)

# normalizing
scaler = StandardScaler()
scaler = scaler.fit(df)
df_scaled = scaler.transform(df)

# dividing X and y
X = df.drop(columns=['close'])
y = df[['close']]

X = np.array(X)
y = np.array(y)

print(X.shape)
print(y.shape)

num_features = X.shape[1]
print(num_features)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, shuffle=False)

(17487, 30)
(17487, 1)
30


In [1854]:
X_train

array([[427.7  , 427.82 , 427.55 , ...,   0.   ,   0.   ,   1.   ],
       [427.7  , 427.7  , 427.53 , ...,   0.   ,   0.   ,   0.   ],
       [427.4  , 427.49 , 427.   , ...,   0.   ,   0.   ,   0.   ],
       ...,
       [412.935, 412.935, 412.935, ...,   0.   ,   0.   ,   0.   ],
       [412.935, 412.935, 412.935, ...,   0.   ,   0.   ,   0.   ],
       [412.935, 412.935, 412.935, ...,   0.   ,   0.   ,   0.   ]])

In [1851]:
from scipy.stats import reciprocal
from sklearn.model_selection import RandomizedSearchCV


In [1852]:
param_grid = {
 'n_hidden': [1, 2, 3, 4],
 'n_neurons': np.arange(1, 100),
 'learning_rate': reciprocal(3e-4, 3e-2),
 'optimizer': ['adam', 'sgd'],  # Add optimizer as a hyperparameter
}
grid = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=10, cv=3, verbose=3)

In [1853]:
grid.fit(X_train, y_train, epochs=30,
 validation_split=0.1,
 callbacks=[keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)])




Fitting 3 folds for each of 10 candidates, totalling 30 fits
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
[CV 1/3] END learning_rate=0.021629088102836282, n_hidden=4, n_neurons=88, optimizer=adam;, score=-25456.240 total time=  13.6s
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
[CV 2/3] END learning_rate=0.021629088102836282, n_hidden=4, n_neurons=88, optimizer=adam;, score=-12126.986 

In [1905]:
results = grid.cv_results_
# Convert the 'params' key from results to a DataFrame for better visualization
params_df = pd.DataFrame(results['params'])

# Print the DataFrame to see all hyperparameters used in each combination
print(params_df)

          learning_rate  n_hidden  n_neurons optimizer
0  0.021629088102836282         4         88      adam
1 0.0007475736846458284         1         88       sgd
2 0.0004076564878056142         3         59      adam
3 0.0014732302002701152         2         92      adam
4 0.0003650225911813686         3         53      adam
5 0.0011018896957727365         3         70       sgd
6 0.0009677735703570196         4          6      adam
7 0.0005518963375383896         3         52       sgd
8 0.0029282007084415595         2         55       sgd
9   0.01986178583868863         4         84       sgd


In [1906]:
results_df = pd.DataFrame(grid.cv_results_)

# Sort the results by the mean_test_score (lower is better for loss functions)
sorted_results_df = results_df.sort_values(by='mean_test_score', ascending=True)

# Print the top 10 sets of hyperparameters and their corresponding error
top_10_results = sorted_results_df.head(10)
for index, row in top_10_results.iterrows():
    params = row['params']
    mean_test_score = row['mean_test_score']
    std_test_score = row['std_test_score']
    error = -mean_test_score  # Assuming lower is better for your scoring metric

    print(f'Hyperparameters: {params}')
    print(f'Mean Test Score (Error): {error:.2f} +/- {std_test_score:.2f}')
    print('---')

Hyperparameters: {'learning_rate': 0.0007475736846458284, 'n_hidden': 1, 'n_neurons': 88, 'optimizer': 'sgd'}
Mean Test Score (Error): 1342225.02 +/- 1518257.40
---
Hyperparameters: {'learning_rate': 0.0029282007084415595, 'n_hidden': 2, 'n_neurons': 55, 'optimizer': 'sgd'}
Mean Test Score (Error): 459893.76 +/- 339916.78
---
Hyperparameters: {'learning_rate': 0.0005518963375383896, 'n_hidden': 3, 'n_neurons': 52, 'optimizer': 'sgd'}
Mean Test Score (Error): 360923.75 +/- 71582.65
---
Hyperparameters: {'learning_rate': 0.0011018896957727365, 'n_hidden': 3, 'n_neurons': 70, 'optimizer': 'sgd'}
Mean Test Score (Error): 284306.71 +/- 168087.32
---
Hyperparameters: {'learning_rate': 0.0009677735703570196, 'n_hidden': 4, 'n_neurons': 6, 'optimizer': 'adam'}
Mean Test Score (Error): 221143.13 +/- 245721.07
---
Hyperparameters: {'learning_rate': 0.0003650225911813686, 'n_hidden': 3, 'n_neurons': 53, 'optimizer': 'adam'}
Mean Test Score (Error): 169506.79 +/- 85510.95
---
Hyperparameters: {'le

In [1907]:
best_model = grid.best_estimator_
print(best_model)

<keras.wrappers.scikit_learn.KerasRegressor object at 0x00000280BC322D00>


In [1908]:
best_hyperparameters = grid.best_params_
print(best_hyperparameters)

{'learning_rate': 0.0014732302002701152, 'n_hidden': 2, 'n_neurons': 92, 'optimizer': 'adam'}


In [1909]:
best_score = grid.best_score_
print(best_score)


-24072.825236002605


In [1910]:
# keras regressor doesnt have an .evaluate so we use the score
# its opposite of the rmse so the higher the score the better

y_pred = best_model.predict(X_test[-10:])

# Get the corresponding real values for the same subset
y_true = y_test[-10:]
dates = df.index[-10:]
# Convert the NumPy arrays to lists
# Print values side by side
for dt, pred, real in zip(dates, y_pred, y_true):
    error = np.abs(pred - real[0])  # Calculate the absolute error
    print(f'Date: {dt} | Prediction price: {pred:.2f} | Real price: {real[0]:.2f}  | Error: {error:.2f}')


Date: 2023-08-24 19:50:00 | Prediction price: 488.29 | Real price: 406.00  | Error: 82.29
Date: 2023-08-24 19:51:00 | Prediction price: 484.35 | Real price: 406.21  | Error: 78.14
Date: 2023-08-24 19:52:00 | Prediction price: 482.14 | Real price: 406.11  | Error: 76.03
Date: 2023-08-24 19:53:00 | Prediction price: 482.46 | Real price: 406.26  | Error: 76.20
Date: 2023-08-24 19:54:00 | Prediction price: 490.97 | Real price: 406.30  | Error: 84.67
Date: 2023-08-24 19:55:00 | Prediction price: 488.49 | Real price: 406.49  | Error: 82.00
Date: 2023-08-24 19:56:00 | Prediction price: 497.69 | Real price: 406.55  | Error: 91.15
Date: 2023-08-24 19:57:00 | Prediction price: 492.28 | Real price: 406.44  | Error: 85.84
Date: 2023-08-24 19:58:00 | Prediction price: 497.66 | Real price: 406.43  | Error: 91.23
Date: 2023-08-24 19:59:00 | Prediction price: 489.61 | Real price: 406.48  | Error: 83.13


Keras regressor with sequences

In [None]:
sequence_length = 30  # we take every half an hour as a sequence
num_features = X.shape[1]  # amount of features

# Create sequences for training
sequences = []
close_arr = []

for i in range(len(X) - sequence_length):
    sequences.append(X.iloc[i:i+sequence_length].values)
    close = y.iloc[i+sequence_length].values
    close_arr.append(close)
   

# Convert lists to NumPy arrays
sequences = np.array(sequences)
close_arr = np.array(close_arr)

# Reshape sequences for scaling
sequences = sequences.reshape(-1, num_features)

# Normalize data
scaler = MinMaxScaler()
sequences = scaler.fit_transform(sequences)  # Fit and transform sequences

# Reshape sequences back to 3D
sequences = sequences.reshape(-1, sequence_length, num_features)

# Split data into training and test sets
split_ratio = 0.9
split_index = int(len(sequences) * split_ratio)

X_train, X_test = sequences[:split_index], sequences[split_index:]
y_train, y_test = close_arr[:split_index], close_arr[split_index:]



param_grid = {
 'n_hidden': [1, 2, 3, 4],
 #'n_hidden': np.arange(1, 20),
 'n_neurons': np.arange(1, 100),
 'learning_rate': reciprocal(3e-4, 3e-2),
}

model = KerasRegressor(build_model, batch_size=20)

grid = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=10, cv=3, verbose=3)


grid.fit(X_train, y_train, epochs=20,
 validation_split=0.1,
 callbacks=[keras.callbacks.EarlyStopping(patience=10)])

In [None]:
y_train

In [None]:

grid.best_score_
grid.best_params_

In [None]:
y_pred = grid.predict(X_test[200:201])
y_pred

In [None]:
y[200:201]

In [None]:
input_A = keras.layers.Input(shape=(num_features_A,))
input_B = keras.layers.Input(shape=(num_features_B,))
hidden1 = keras.layers.Dense(30, activation="relu")(input_B)
hidden2 = keras.layers.Dense(30, activation="relu")(hidden1)
concat = keras.layers.concatenate([input_A, hidden2])
output = keras.layers.Dense(1, activation="linear")(concat)

aux_output = keras.layers.Dense(1)(hidden2)
model = keras.models.Model(inputs=[input_A, input_B],
outputs=[output, aux_output])
X_train_A, X_train_B = X_train[:, :4], X_train[:, 4:]
X_test_A, X_test_B = X_test[:, :4], X_test[:, 4:]
X_new_A, X_new_B = X_test_A[:10], X_test_B[:10]
model.compile(loss=["mse", "mse"], loss_weights=[0.9, 0.1], optimizer="sgd")
history = model.fit([X_train_A, X_train_B], [y_train, y_train], epochs=20,
  validation_split=0.1, batch_size=32)
mse_test = model.evaluate((X_test_A, X_test_B), y_test)

In [None]:
num_columns = df.shape[1]
print("Number of columns:", num_columns)
#df['price_increased'] = np.where(df['close'].diff() > 0, 1, 0)
#df['log_return'] = np.log(df['close'] / df['close'].shift(1))
df['log_return'] = ta.log_return(df['close'])
df.fillna(0, inplace=True)

In [None]:
df.info()

In [None]:
df.dropna(inplace=True)

In [None]:
# dividing X and y
X = df.drop(columns=['close'])
y = df[['close']]

In [None]:
df.info()

In [None]:
# Define the proportion of data to use for testing
test_size = 0.2

# Calculate the index where the split should occur
split_index = int(len(X) * (1 - test_size))

# Split the data into training and testing sets
X_train, X_test = X[:split_index], X[split_index:]
y_train, y_test = y[:split_index], y[split_index:]

In [None]:
X.info()

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)


In [None]:
num_features = X_train.shape[1]

In [None]:
from tensorflow.keras.regularizers import l1

# for regression
input = keras.layers.Input(shape=(num_features,))
hidden = keras.layers.Dense(30, activation="relu", kernel_regularizer=l1(0.01))(input)
concat = keras.layers.Concatenate()([input, hidden])
output = keras.layers.Dense(1, activation="linear", kernel_regularizer=l1(0.01))(concat)
model = keras.models.Model(inputs=[input], outputs=[output])

model.compile(loss="mean_squared_error", optimizer="sgd")
history = model.fit(X_train, y_train, epochs=20,
           validation_split=0.1, batch_size=32 )

""" # for classification
input = keras.layers.Input(shape=(num_features,))
hidden = keras.layers.Dense(30, activation="relu")(input)
concat = keras.layers.Concatenate()([input, hidden])
output = keras.layers.Dense(1, activation="sigmoid")(concat)
model = keras.models.Model(inputs=[input], outputs=[output])

model.compile(loss="binary_crossentropy", optimizer="sgd", metrics="accuracy")
history = model.fit(X_train, y_train, epochs=20,
           validation_split=0.1, batch_size=32 ) """


In [None]:
# Evaluating the model
model.evaluate(X_test, y_test)

In [None]:
# making predictions
X_new = X_test[270:280]
predictions = model.predict(X_new)
predictions


In [None]:
model.predict(X_test[270:280])

In [None]:
X

In [None]:
y_test[270:280]

In [None]:
# Assuming you have X_train and column_names defined

# Get the weights of the input layer
column_names = X.columns.tolist()
input_weights = model.layers[1].get_weights()[0]  # Only the weight matrix

# Create a dictionary to map weights to column names
weights_by_column = dict(zip(column_names, input_weights))

# Print the weights associated with each feature
for feature, weight in weights_by_column.items():
    print(f"Feature: {feature}, Weight: {weight}")

In [None]:
import numpy as np

# Assuming you have X_train and column_names defined
# Assuming you have model defined

# Get the weights of the input layer
input_weights = model.layers[1].get_weights()[0]  # Only the weight matrix

# Create a dictionary to map weights to column names
weights_by_column = dict(zip(column_names, input_weights.T))  # Transpose weights

# Calculate the absolute sum of weights for each feature
feature_weights = {feature: np.abs(weight).sum() for feature, weight in weights_by_column.items()}

# Number of top features to select
top_n = 30

# Get the top 'n' features based on weights
top_features = sorted(feature_weights, key=lambda x: feature_weights[x], reverse=True)[:top_n]

# Print the top features and their weights
for feature in top_features:
    print(f"Feature: {feature}, Weight: {feature_weights[feature]}")


In [None]:
y_test

Parameter tuning

In [1776]:
def build_model(n_hidden=1, n_neurons=30, learning_rate=3e-3, input_shape=[8]):
    model = keras.models.Sequential()
    options = {"input_shape": input_shape}
    for layer in range(n_hidden):
        model.add(keras.layers.Dense(n_neurons, activation="relu", **options))
        options = {}
    model.add(keras.layers.Dense(1, **options))
    optimizer = keras.optimizers.SGD(learning_rate)
    model.compile(loss="mse", optimizer=optimizer)
    return model

def build_model(n_hidden=1, n_neurons=30, learning_rate=3e-3, input_shape=(8,)):
    model = keras.models.Sequential()
    model.add(keras.layers.InputLayer(input_shape=input_shape))  # Add input layer
    for layer in range(n_hidden):
        model.add(keras.layers.Dense(n_neurons, activation="relu"))
    model.add(keras.layers.Dense(1))
    optimizer = keras.optimizers.SGD(learning_rate)
    model.compile(loss="mse", optimizer=optimizer)
    return model


In [1777]:
keras_reg = keras.wrappers.scikit_learn.KerasRegressor(build_model(input_shape=(num_features,)))

  keras_reg = keras.wrappers.scikit_learn.KerasRegressor(build_model(input_shape=(num_features,)))


In [None]:
keras_reg.fit(X_train, y_train, epochs=100,
        validation_split=0.1,
        callbacks=[keras.callbacks.EarlyStopping(patience=10)])
mse_test = keras_reg.score(X_test, y_test)
y_pred = keras_reg.predict(X_new)


In [None]:
X_train.shape

In [None]:
y_train.shape

In [None]:
def build_model(n_hidden=1, n_neurons=30, learning_rate=3e-3, input_shape=(8,)):
    model = keras.models.Sequential()
    model.add(keras.layers.InputLayer(input_shape=input_shape))  # Add input layer
    for layer in range(n_hidden):
        model.add(keras.layers.Dense(n_neurons, activation="relu"))
    model.add(keras.layers.Dense(1))
    optimizer = keras.optimizers.SGD(learning_rate)
    model.compile(loss="mse", optimizer=optimizer)
    return model

keras_reg = keras.wrappers.scikit_learn.KerasRegressor(build_model)

keras_reg.fit(X_train, y_train, epochs=100,
              validation_split=0.1,
              callbacks=[keras.callbacks.EarlyStopping(patience=10)])
mse_test = keras_reg.score(X_test, y_test)
y_pred = keras_reg.predict(X_new)