In [32]:
import yfinance as yf
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib

In [3]:
data = yf.download("AAPL", start="2010-01-01", end="2024-01-01")

[*********************100%***********************]  1 of 1 completed


In [7]:
data

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,MA50,MA200,Momentum5,Momentum20,Lag1,Lag5
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2010-01-04,7.622500,7.660714,7.585000,7.643214,6.454506,493729600,,,,,,
2010-01-05,7.664286,7.699643,7.616071,7.656429,6.465664,601904800,,,,,7.643214,
2010-01-06,7.656429,7.686786,7.526786,7.534643,6.362820,552160000,,,,,7.656429,
2010-01-07,7.562500,7.571429,7.466071,7.520714,6.351058,477131200,,,,,7.534643,
2010-01-08,7.510714,7.571429,7.466429,7.570714,6.393281,447610800,,,,,7.520714,
...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-22,195.179993,195.410004,192.970001,193.600006,192.868149,37122800,185.398801,178.649100,-3.970001,3.630005,194.679993,197.570007
2023-12-26,193.610001,193.889999,192.830002,193.050003,192.320221,28919300,185.682801,178.871851,-2.839996,3.260010,193.600006,195.889999
2023-12-27,192.490005,193.500000,191.089996,193.149994,192.419830,48087700,185.971400,179.085250,-3.790009,2.750000,193.050003,196.940002
2023-12-28,194.139999,194.660004,193.169998,193.580002,192.848206,34049900,186.300001,179.290201,-1.250000,4.210007,193.149994,194.830002


Pre-process data

In [5]:
# handle missing data using interpolation
data = data.interpolate(method='linear', axis=0).ffill().bfill() # draws straight line between known values and fills in the missing values



Feature Engineering

In [6]:
# create moving averages
data['MA50'] = data['Close'].rolling(window=50).mean()
data['MA200'] = data['Close'].rolling(window=200).mean()

# create momentum indicators
data['Momentum5'] = data['Close'].diff(periods=5)
data['Momentum20'] = data['Close'].diff(periods=20)

# create lagged returns
data['Lag1'] = data['Close'].shift(1)
data['Lag5'] = data['Close'].shift(5)

In [8]:
print(data.isnull().sum())

data = data.dropna()

Open            0
High            0
Low             0
Close           0
Adj Close       0
Volume          0
MA50           49
MA200         199
Momentum5       5
Momentum20     20
Lag1            1
Lag5            5
dtype: int64


In [12]:
data

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,MA50,MA200,Momentum5,Momentum20,Lag1,Lag5
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2010-10-18,11.373929,11.392857,11.224643,11.357143,9.590826,1093010800,9.659357,8.767198,0.808573,1.241786,11.240714,10.548571
2010-10-19,10.835714,11.206071,10.715000,11.053214,9.334167,1232784000,9.693457,8.784248,0.391071,0.918571,11.357143,10.662143
2010-10-20,11.035714,11.223214,10.959643,11.090357,9.365534,721624400,9.729971,8.801418,0.371071,0.813571,11.053214,10.719286
2010-10-21,11.155714,11.240714,10.957143,11.054286,9.335072,551460000,9.772350,8.819016,0.257500,0.735715,11.090357,10.796786
2010-10-22,11.038214,11.072857,10.939286,10.981071,9.273244,372778000,9.812121,8.836318,-0.259643,0.541072,11.054286,11.240714
...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-22,195.179993,195.410004,192.970001,193.600006,192.868149,37122800,185.398801,178.649100,-3.970001,3.630005,194.679993,197.570007
2023-12-26,193.610001,193.889999,192.830002,193.050003,192.320221,28919300,185.682801,178.871851,-2.839996,3.260010,193.600006,195.889999
2023-12-27,192.490005,193.500000,191.089996,193.149994,192.419830,48087700,185.971400,179.085250,-3.790009,2.750000,193.050003,196.940002
2023-12-28,194.139999,194.660004,193.169998,193.580002,192.848206,34049900,186.300001,179.290201,-1.250000,4.210007,193.149994,194.830002


Splitting data into Test and Train sets

In [9]:
X = data[['Open', 'High', 'Low', 'Volume', 'MA50', 'MA200', 'Momentum5', 'Momentum20', 'Lag1', 'Lag5']]
y = data['Close']

In [10]:
# split into train and test splits
train_size = int(len(data) * 0.8)
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

In [25]:
y_train_categorical = (y_train > y_train.shift(1)).astype(int)  # 1 if price goes up, 0 otherwise

y_test_categorical = (y_test > y_test.shift(1)).astype(int)

Naive Bayes

In [22]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X_train, y_train_categorical)

In [23]:
y_pred = model.predict(X_test)

In [26]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy = accuracy_score(y_test_categorical, y_pred)
precision = precision_score(y_test_categorical, y_pred)
recall = recall_score(y_test_categorical, y_pred)
f1 = f1_score(y_test_categorical, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")

Accuracy: 0.5218045112781955
Precision: 0.5218045112781955
Recall: 1.0
F1-score: 0.6857707509881423


Linear Regression

In [27]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train, y_train)

In [31]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared: {r2}")

Mean Squared Error (MSE): 2.85753342273026e-23
Root Mean Squared Error (RMSE): 5.345590166417792e-12
Mean Absolute Error (MAE): 5.286480693467823e-12
R-squared: 1.0


In [35]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dense(64, activation='relu'))
model.add(Dense(1))

model.compile(optimizer='adam', loss='mse')

model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - loss: 6310308872192.0000 - val_loss: 16260444160.0000
Epoch 2/100
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 36693377024.0000 - val_loss: 64290684.0000
Epoch 3/100
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 84572216.0000 - val_loss: 1781224.6250
Epoch 4/100
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 13483614.0000 - val_loss: 69072.7031
Epoch 5/100
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 630707.7500 - val_loss: 214314.1875
Epoch 6/100
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 338397.9375 - val_loss: 4191.8838
Epoch 7/100
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 63916.2227 - val_loss: 21026.0254
Epoch 8/100
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2m

<keras.src.callbacks.history.History at 0x79d383a57730>

In [36]:
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared: {r2}")

[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
Mean Squared Error (MSE): 14951.672378488829
Root Mean Squared Error (RMSE): 122.27703127934056
Mean Absolute Error (MAE): 121.198113956308
R-squared: -43.793482488350655
