In [5]:
from ast import In
import os
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
import numpy as np
import pandas as pd
import pickle
from sklearn.utils import resample
import torch
import yfinance
from zipfile import ZipFile
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Activation
import matplotlib.pyplot as plt

In [6]:
class NopTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X

class DatetimeTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def fromDate(self, date):
        return (pd.to_datetime(date) - pd.to_datetime('1950-01-01')).days

    def transform(self, X):
        if isinstance(X, pd.DataFrame) or isinstance(X, pd.Series):
            X = X.to_numpy()
        if len(X.shape) != 2:
            X = X.reshape(X.shape[0], 1)
        return np.array([[self.fromDate(j) for j in i] for i in X])

def writeToCSV(data, filename):
    with open(filename, 'w') as outfile:
        for i in range(len(data[0]-1)):
            outfile.write('0,')
        outfile.write('0\n')
        for slice_2d in data:
            for x in slice_2d[:-1]:
                outfile.write(str(x) + ',')
            outfile.write(str(slice_2d[-1]) + '\n')

In [34]:
loadRawFromCSV = os.path.exists('rawData.pkl')

rawData = []

if loadRawFromCSV:
    with open('rawData.pkl', 'rb') as file:
        rawData = pickle.load(file)
else:
    kStocks = 50

    stocks = []

    with ZipFile('stocks.zip') as zip:
        for stock in zip.namelist():
            if stock.startswith('stocks/'):
                stocks.append(stock.split('/')[1].split('.')[0])

    stocks = resample(stocks, n_samples=kStocks, random_state=0)

    with ZipFile('stocks.zip') as zip:
        for stock in stocks:
            zip.extract(f'stocks/{stock}.csv')

    namedStocks = [(stock, pd.read_csv('stocks/' + stock + '.csv')) for stock in stocks]

    rawData = []

    i = 0

    for stock, stockData in namedStocks:
        i += 1
        print(f'Loading stock industry {i} of {kStocks}')
        try:
            sector = yfinance.Ticker(stock).info['sector']
            sectorFeature = np.empty((len(stockData), 1), dtype=object)
            sectorFeature.fill(sector)
            rawData.append(np.c_[stockData, sectorFeature])
        except:
            print(f"{stock} doesn't have a sector!")
            continue

    with open('rawData.pkl', 'wb') as file:
        pickle.dump(rawData, file)

# loadPreprocessedFromCSV = os.path.exists('preprocessedData.pkl')
loadPreprocessedFromCSV = False


preprocessedData = []

if loadPreprocessedFromCSV:
    with open('preprocessedData.pkl', 'rb') as file:
        preprocessedData = pickle.load(file)
else:
    DateIdx, OpenIdx,HighIdx,LowIdx,CloseIdx,Adj_CloseIdx,VolumeIdx,IndustryIdx = (0, 1, 2, 3, 4, 5, 6, 7)

    pipeline = ColumnTransformer([
        ('date pipeline', DatetimeTransformer(), DateIdx),
        ('numerical pipeline', Pipeline([
            ('nop', NopTransformer())
            # ('scale features', StandardScaler())
        ]), [OpenIdx, HighIdx, LowIdx, CloseIdx, Adj_CloseIdx, VolumeIdx]),
        ('industry pipeline', OneHotEncoder(), [IndustryIdx])
    ])

    # print(rawData)
    combinedStocks = []
    for rawStock in rawData:
        combinedStocks.extend(rawStock[-500:])

    preprocessedData = pipeline.fit_transform(combinedStocks)

    with open('preprocessedData.pkl', 'wb') as file:
        pickle.dump(preprocessedData, file)

217575

In [35]:
len(preprocessedData)

18077

In [36]:
concatenatedStocks = preprocessedData

# for preprocessedData in preprocessedData:
#     concatenatedStocks.extend(preprocessedData)

    # concatonated stocks index -stopping point

# concatenatedStocks = concatenatedStocks[-500:]
concatenatedStocks = np.array(concatenatedStocks)
target = np.array(concatenatedStocks)[:, CloseIdx]
concatenatedStocks = concatenatedStocks[:-1]
target = target[1:]

In [37]:
X = []
y = []

n = concatenatedStocks.shape[0]
window_size = 50
for i in range(n-window_size):
    X.append(concatenatedStocks[i:i+window_size])
    y.append(target[i:i+window_size])

X = np.array(X).astype(np.float32)
y = np.array(y).astype(np.float32)


In [38]:
X.shape

(18026, 50, 15)

In [39]:
tf.random.set_seed(7)
x_train, x_test,y_train,y_test = train_test_split(X,y,test_size= .2)

# x_train = np.array(x_train).astype(np.float32)
# x_test = np.array(x_test).astype(np.float32)
# y_train = np.array(y_train).astype(np.float32)
# y_test = np.array(y_test).astype(np.float32)

In [None]:
# x_train.reshape(x_train.shape[0],7,1)

In [None]:
model = Sequential()
model.add(LSTM(8, input_shape=(x_train.shape[1],x_train.shape[2]), return_sequences=False))
model.add(Dense(2,kernel_initializer='normal',activation='linear'))
model.add(Dense(1,kernel_initializer='normal',activation='linear'))
model.compile(loss='mse',optimizer ='adam',metrics=['accuracy'])

In [None]:
model.fit(x_train,y_train,epochs=2000,batch_size=5,validation_split=0.05,verbose=0)

with open('model.pkl', 'wb') as file:
    torch.save(model, file)

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

y_pred = model.predict(x_test)
with open('x_test.pkl', 'wb') as file:
    pickle.dump(x_test, file)

with open('y_pred.pkl', 'wb') as file:
    pickle.dump(y_pred, file)

def print_metrics(y_test, y_pred):
    mse_test = mean_squared_error(y_test, y_pred) 

    rmse = np.sqrt(mse_test)
    r2 = r2_score(y_test, y_pred)

    print("%-12s %f" % ('MSE:', mse_test))
    print("%-12s %f" % ('RMSE:', rmse))
    print("%-12s %f" % ('R2:', r2))
    print(f'MAPE: {mean_absolute_percentage_error(y_test, y_pred)}')

    def plotGraph(y_test,y_pred):
        if max(y_test) >= max(y_pred):
            my_range = int(max(y_test))
        else:
            my_range = int(max(y_pred))
        plt.scatter(y_test, y_pred, color='red')
        plt.plot(range(my_range), range(my_range), 'o')
        plt.title('Testing vs Predicted')
        plt.show()
        return

    plotGraph(y_test, y_pred)

print_metrics(y_test, y_pred)

In [None]:
def test_model(name, model):
    model.fit(x_test, y_test)
    print(f'\nModel: {name}')
    print_metrics(y_test, model.predict(x_test))

test_model('Linear Regression', LinearRegression())
test_model('Random Forest', RandomForestRegressor())
test_model('K-Nearest Neighbors', KNeighborsRegressor())
test_model('Support Vector Machine (rbf)', SVR(kernel='rbf'))
# # test_model('Support Vector Machine (linear)', SVR(kernel='linear'))
# # test_model('Support Vector Machine (poly)', SVR(kernel='poly'))
# # test_model('Support Vector Machine (sigmoid)', SVR(kernel='sigmoid'))
test_model('Decision Tree', DecisionTreeRegressor())

In [3]:
# tf.test.is_gpu_available()
tf.config.list_physical_devices('GPU')

[]