# Project 4

Stock price Predictor

Import packages

In [573]:
import pandas as pd 
import numpy as np 
import tensorflow as tf 
from pathlib import Path
import os
import sys
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input, Flatten, Conv2D, MaxPooling2D, concatenate, Conv1D, MaxPooling1D
from tensorflow.keras.utils import plot_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import roc_curve, auc, mean_squared_error
import matplotlib.pyplot as plt 
from collections.abc import Sequence
from sklearn import preprocessing
%matplotlib inline
import csv
import glob
from IPython.display import Image
import seaborn as sns


### Define Helper Methods

In [574]:
def plot_losses(history, base_path, iteration:int):
    # Plot training & validation loss over epochs
    plt.plot(history.history["loss"], label="Training Loss")
    plt.plot(history.history["val_loss"], label="Validation Loss")
    plt.ylim(bottom=0.0, top=10.0)
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title("Training vs. Validation Loss")
    plt.legend()
    plt.savefig(
        os.path.join(base_path, f"training-validiation-loss--epoch---Model {iteration}")
    )
    plt.close()


def print_schema(dataframe: pd.DataFrame):
    print('~~~~~~dataframe schema~~~~~~')
    print(f"Dataframe shape: {dataframe.shape} | Dataframe length: {len(dataframe)}")
    print('Column labels: ')
    print(dataframe.columns)
    print('Dataframe head: ')
    print(f"{dataframe.head()}")
def print_column(dataframe: pd.DataFrame, columns: str | list[str]):
    if isinstance(columns, list):
        for i, label in enumerate(columns):
            print(f"column {i}")
            print(dataframe[label])
    else:
        print(dataframe[columns])
# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)


# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    df[name] = df[name].fillna(default_value)


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column. 
    target_type = df[target].dtypes
    target_type = target_type[0] if isinstance(target_type, Sequence) else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    #if target_type in (np.int64, np.int32):
        ## Classification
        #dummies = pd.get_dummies(df[target])
        #return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    #else#:
        ## Regression
    return df[result].values.astype(np.float32), df[target].values.astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)


# Regression chart.
def chart_regression(path, pred,y,sort=True):
    t = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        t.sort_values(by=['y'],inplace=True)
    b = plt.plot(t['pred'].tolist(),label='prediction')
    a = plt.plot(t['y'].tolist(),label='expected')

    plt.ylabel('output')
    plt.legend()
    plt.savefig(path)
    plt.close()

# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low


### Global Control Flow Flags

These flags are used for control flow and debugging. 

In [581]:
DEBUG = True
ITERATION = 57
WINDOW_SIZE = 7
INPUT_IS_TWO_DIMENSIONAL = False
EXPLORE = False
FCN = False
CNN = True
RNN = False

### Output Files

Define paths for output files like charts, tests, and metrics

In [576]:
base_path = os.path.join(os.getcwd(), "output")
iteration_path = os.path.join(base_path, f"iteration-{ITERATION}")
try:
    os.mkdir(base_path)
except FileExistsError as e:
    print(f"{base_path} already exists")
except OSError as e:
    print(f"Error creating directory: {base_path}")
try:
    os.mkdir(iteration_path)
except FileExistsError as e:
    print(f"{iteration_path} already exists. Exiting to preserve previous work.")
    sys.exit(0)
except OSError:
    print("An error occurred while creating the folder. ")



c:\Users\timef\Documents\Workspaces\Python\csc180\stock-price-predictor\output already exists
c:\Users\timef\Documents\Workspaces\Python\csc180\stock-price-predictor\output\iteration-57 already exists. Exiting to preserve previous work.


SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


### Read Dataset

In [None]:
df = pd.read_csv("./data/JPM.csv", dtype={'Volume':np.float32})
print("hit")

hit


### Drop Unneccessary Columns

In [None]:
df.drop(['Date', 'Adj Close'], axis=1, inplace=True)

In [None]:
if DEBUG:
    print(df.columns)
    print(df.dtypes)
    print(df.shape)

Index(['Open', 'High', 'Low', 'Close', 'Volume'], dtype='object')
Open      float64
High      float64
Low       float64
Close     float64
Volume    float32
dtype: object
(9745, 5)


### Perform Data Cleaning
First, seperate the "y" output feature from the "y" input feature
Second, fill in missing values with the median. 
Third, encode the input feature columns. 


In [None]:
df_copy = df.copy(deep=True)
missing_median(df_copy, 'Close')
for column in df.columns:
    missing_median(df, column)
    if column == 'Close':
        continue
    else:
        encode_numeric_zscore(df, column)

### Reshape Input

Reshape the input dataframe so the model ingests the last 7 days as 1 record. 

In [None]:
if FCN or CNN:
    x = []
    y = []
    window_size = WINDOW_SIZE
    for i in range(len(df) - window_size):
        window = df.iloc[i:i+window_size][['Open', 'High', 'Low', 'Volume', 'Close']].values
        x.append(window.flatten())
        y.append(df_copy.iloc[i+window_size]['Close'])

    X = np.array(x)
    Y = np.array(y)

### Reshaping for CNN
Reshape the input to be a be an 'image' with 7 rows, 5 columns, 1 channel. (2D array)

In [578]:
if CNN and INPUT_IS_TWO_DIMENSIONAL:
    X = X.reshape((X.shape[0], WINDOW_SIZE,5,1))
    print(X.shape)
    print(Y.shape)
if CNN and not INPUT_IS_TWO_DIMENSIONAL:
    X = X.reshape((X.shape[0], WINDOW_SIZE, 5))
    print(X.shape)
    print(Y.shape)

(9738, 7, 5, 1)
(9738,)


In [None]:
if DEBUG:
    print(df.shape)
    for column in df.columns:
        print(f"Number of NAN values in {column}: {df[column].isna().sum()}")

(9745, 5)
Number of NAN values in Open: 0
Number of NAN values in High: 0
Number of NAN values in Low: 0
Number of NAN values in Close: 0
Number of NAN values in Volume: 0


In [579]:
# y = df['Close']
# don't use a random shuffling. This completely breaks the chronological order of the data. 
split_at_index = int(len(X) * 0.7) # we want the training set to have 70% of the rows in the raw df
x_train, x_test = X[:split_at_index], X[split_at_index:]
y_train, y_test = Y[:split_at_index], Y[split_at_index:]  
#x_train, x_test, y_train, y_test = train_test_split(X,Y, random_state=42, test_size=0.3)

In [None]:
if DEBUG:
    print(X)
    print('-----------')
    print(Y)
    print("--------------")
    print(f"X-Train Shape: {x_train.shape}")
    print(f"Y-Train Shape: {y_train.shape}")
    print(f"X-Test Shape: {x_test.shape}")
    print(f"Y-Test Shape: {y_test.shape}")
    

[[-1.16962972e+00 -1.17092224e+00 -1.16332275e+00 ... -1.16174374e+00
  -6.72499359e-01  5.07407400e+00]
 [-1.16884723e+00 -1.17169789e+00 -1.16253322e+00 ... -1.16253322e+00
  -6.74830973e-01  5.11111100e+00]
 [-1.16571735e+00 -1.16937098e+00 -1.15937519e+00 ... -1.16727027e+00
  -6.73069298e-01  5.07407400e+00]
 ...
 [ 3.18802549e+00  3.15195649e+00  3.14656018e+00 ...  3.09326832e+00
   3.60764503e-01  1.06699997e+02]
 [ 3.02112437e+00  3.06064937e+00  3.04381330e+00 ...  3.21818445e+00
   5.24282932e-01  1.09019997e+02]
 [ 3.04478649e+00  3.01332007e+00  3.01013314e+00 ...  3.24078016e+00
   7.55183548e-02  1.08980003e+02]]
-----------
[  5.111111   5.074074   5.018518 ... 109.019997 108.980003 108.379997]
--------------
X-Train Shape: (6816, 35)
Y-Train Shape: (6816,)
X-Test Shape: (2922, 35)
Y-Test Shape: (2922,)


### Create Model

In [None]:
checkpoint_path = os.path.join(iteration_path, "FCN-best-weights.keras")
#os.mkdir(checkpoint_path) # this is wrong

### Create a FCN

In [None]:
if FCN:
    checkpointer = ModelCheckpoint(filepath=checkpoint_path, save_best_only=True, verbose=0)
    for i in range(5):

        visible = Input(shape=(x_train.shape[1],))
        hidden1 = Dense(256, activation='relu')(visible)
        hidden2=Dense(256, activation='relu')(hidden1)
        hidden3=Dense(128, activation='relu')(hidden2)
        hidden4 = Dense(64, activation='relu')(hidden3)
        hidden5 = Dense(32, activation='relu')(hidden4)
        hidden6=Dense(16, activation='relu')(hidden5)
        hidden7=Dense(8, activation='relu')(hidden6)
        hidden8 = Dense(4, activation='relu')(hidden7)

        output = Dense(1)(hidden8)
        model = Model(inputs=visible, outputs=output)
        model.compile(loss='mean_squared_error', optimizer='adam')
        monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=10, verbose=2, mode='min', restore_best_weights=True)
        history = model.fit(x_train, y_train, validation_data=[x_test, y_test], epochs=1000, callbacks=[checkpointer, monitor])
        plot_losses(history, iteration_path, i)

Epoch 1/1000
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 730.5724 - val_loss: 3656.7437
Epoch 2/1000
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 706.5835 - val_loss: 3585.7217
Epoch 3/1000
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 675.2579 - val_loss: 3494.8086
Epoch 4/1000
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 637.6918 - val_loss: 3386.8943
Epoch 5/1000
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 595.7300 - val_loss: 3265.2954
Epoch 6/1000
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 551.3072 - val_loss: 3134.8552
Epoch 7/1000
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 506.1922 - val_loss: 2997.0129
Epoch 8/1000
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 462.0011 - val_

### Evaluate the Model

In [None]:
if FCN:
    metrics_path = "FCN-metrics.txt"
    # Write Metrics to file
    def redirect(out): # redirect model summary to metrics
        with open(os.path.join(iteration_path, metrics_path), 'a') as file:
            print(out, file=file)

    model.load_weights(checkpoint_path)
    prediction = model.predict(x_test)
    score = np.sqrt(mean_squared_error(y_test, prediction))
    if DEBUG:
        print("Score (RMSE): {}".format(score))

    with open(os.path.join(iteration_path, metrics_path), "x") as file:
        file.write(f"Score (RMSE): {score}\n")
    model.summary(print_fn=redirect)


[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Score (RMSE): 1.0972008127032644


In [None]:
if FCN:
    # Make Plots
    chart_regression(os.path.join(iteration_path, "FCN-Lift-Chart"), prediction.flatten(), y_test, sort=True)

### Create a CNN

Create the model

In [None]:
if CNN and INPUT_IS_TWO_DIMENSIONAL: # 2D image of 7 * 5 = 35 pixels, each pixel with 1 channel.
    checkpointer = ModelCheckpoint(filepath=os.path.join(iteration_path, "CNN-best-weights.keras"), save_best_only=True, verbose=0)
    # H_out = floor((H_in + 2*pad_h - kernel_h) / stride_h) + 1
    # W_out = floor((W_in + 2*pad_w - kernel_w) / stride_w) + 1
    # C_out = number of filters

    # H_out = (7 - 4) / 1 + 1 = 4
    # W_out = (5 - 4) / 1 + 1 = 2
    # Channels_out = 64

    for i in range(5):
        visible = Input(shape=(WINDOW_SIZE,5,1)) 
        con1 = Conv2D(64, kernel_size=(1,3), activation='relu')(visible)# -> (4,2,64)
        pool1 = MaxPooling2D(pool_size=(1,2), padding='same')(con1) # 4 - 2 / 1 + 1, 2 - 2 / 
        con2 = Conv2D(128, kernel_size=(1,3), activation='relu')(pool1)
        pool2 = MaxPooling2D(pool_size=(1,1), padding='same')(con2)
        flatten1 = Flatten()(pool2)

        # interpretation layer
        dense1 = Dense(128, activation='relu')(flatten1)
        output = Dense(1)(dense1)
        model = Model(inputs=visible, outputs=output)
        model.compile(loss='mean_squared_error', optimizer='adam')
        monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, mode='min', restore_best_weights=True)
        history = model.fit( x_train, y_train, validation_data=[x_test, y_test], callbacks=[checkpointer, monitor], epochs=500)
        plot_losses(history, iteration_path, i)
if CNN and not INPUT_IS_TWO_DIMENSIONAL:# 1D image of 7 pixels, each pixel with 5 channels
    checkpointer = ModelCheckpoint(filepath=os.path.join(iteration_path, "CNN-best-weights.keras"), save_best_only=True, verbose=0)
    # H_out = floor((H_in + 2*pad_h - kernel_h) / stride_h) + 1
    # W_out = floor((W_in + 2*pad_w - kernel_w) / stride_w) + 1
    # C_out = number of filters

    # H_out = (7 - 4) / 1 + 1 = 4
    # W_out = (5 - 4) / 1 + 1 = 2
    # Channels_out = 64

    for i in range(5):
        visible = Input(shape=(WINDOW_SIZE,5)) 
        con1 = Conv1D(64, kernel_size=1, activation='relu', padding='same')(visible)# -> (4,2,64)
        con2 = Conv1D(64, kernel_size=3, activation='relu', padding='same')(con1)

        pool1 = MaxPooling1D(pool_size=2 )(con2) # 4 - 2 / 1 + 1, 2 - 2 / 
        # con3 = Conv1D(128, kernel_size=1, activation='relu', padding='same')(pool1)
        # con4 = Conv1D(128, kernel_size=3, activation='relu', padding='same')(con3)
        # pool2 = MaxPooling1D(pool_size=2 )(con4) # 4 - 2 / 1 + 1, 2 - 2 / 
        # con5 = Conv1D(128, kernel_size=1, activation='relu', padding='same')(pool2)
        # con6 = Conv1D(128, kernel_size=3, activation='relu', padding='same')(con5)
        flatten1 = Flatten()(pool1)

        # interpretation layer
        dense1 = Dense(32, activation='relu')(flatten1)
        output = Dense(1)(dense1)
        model = Model(inputs=visible, outputs=output)
        model.compile(loss='mean_squared_error', optimizer='adam')
        monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, mode='min', restore_best_weights=True)
        history = model.fit( x_train, y_train, validation_data=[x_test, y_test], callbacks=[checkpointer, monitor], epochs=500)
        plot_losses(history, iteration_path, i)

Epoch 1/500
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 15.8452 - val_loss: 3.0522
Epoch 2/500
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 1.1225 - val_loss: 2.8874
Epoch 3/500
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 1.0247 - val_loss: 3.1190
Epoch 4/500
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 1.0030 - val_loss: 3.8360
Epoch 5/500
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.9562 - val_loss: 2.2816
Epoch 6/500
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.9913 - val_loss: 6.5113
Epoch 7/500
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.8370 - val_loss: 2.1505
Epoch 8/500
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.7820 - val_loss: 5.5796
Epoch 9/500
[1m213/213[0m [3

## Evaluate Model


In [583]:
if CNN:
    metrics_path = "CNN-metrics.txt"
    def redirect(out): # redirect model summary to metrics
        with open(os.path.join(iteration_path, metrics_path), 'a') as file:
            print(out, file=file)
    # make prediction and evaluate
    model.load_weights(os.path.join(iteration_path, "CNN-best-weights.keras"))
    prediction = model.predict(x_test)
    score = np.sqrt(mean_squared_error(y_test, prediction))
    if DEBUG:
        print("Score (RMSE): {}".format(score))
    # Write Metrics to file
    with open(os.path.join(iteration_path, metrics_path), "x") as file:
        file.write(f"Score (RMSE): {score}\nWindow Size: {WINDOW_SIZE}")
    model.summary(print_fn=redirect)
    chart_regression(os.path.join(iteration_path, "CNN-Lift-Chart"), prediction.flatten(), y_test, sort=True)

[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Score (RMSE): 1.2457892000217048


# Create a RNN

Read Data In

In [584]:
import pandas as pd 
import numpy as np 
import tensorflow as tf 
from pathlib import Path
import os
import sys
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input, Flatten, Conv2D, MaxPooling2D, concatenate, Conv1D, MaxPooling1D
from tensorflow.keras.utils import plot_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import roc_curve, auc, mean_squared_error
import matplotlib.pyplot as plt 
from collections.abc import Sequence
from sklearn import preprocessing
%matplotlib inline
import csv
import glob
from IPython.display import Image
import seaborn as sns

In [585]:
def plot_losses(history, base_path, iteration:int):
    # Plot training & validation loss over epochs
    plt.plot(history.history["loss"], label="Training Loss")
    plt.plot(history.history["val_loss"], label="Validation Loss")
    plt.ylim(bottom=0.0, top=400.0)
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title("Training vs. Validation Loss")
    plt.legend()
    plt.savefig(
        os.path.join(base_path, f"training-validiation-loss--epoch---Model {iteration}")
    )
    plt.close()


def print_schema(dataframe: pd.DataFrame):
    print('~~~~~~dataframe schema~~~~~~')
    print(f"Dataframe shape: {dataframe.shape} | Dataframe length: {len(dataframe)}")
    print('Column labels: ')
    print(dataframe.columns)
    print('Dataframe head: ')
    print(f"{dataframe.head()}")
def print_column(dataframe: pd.DataFrame, columns: str | list[str]):
    if isinstance(columns, list):
        for i, label in enumerate(columns):
            print(f"column {i}")
            print(dataframe[label])
    else:
        print(dataframe[columns])
# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)


# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    df[name] = df[name].fillna(default_value)


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column. 
    target_type = df[target].dtypes
    target_type = target_type[0] if isinstance(target_type, Sequence) else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    #if target_type in (np.int64, np.int32):
        ## Classification
        #dummies = pd.get_dummies(df[target])
        #return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    #else#:
        ## Regression
    return df[result].values.astype(np.float32), df[target].values.astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)


# Regression chart.
def chart_regression(path, pred,y,sort=True):
    t = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        t.sort_values(by=['y'],inplace=True)
    b = plt.plot(t['pred'].tolist(),label='prediction')
    a = plt.plot(t['y'].tolist(),label='expected')

    plt.ylabel('output')
    plt.legend()
    plt.savefig(path)
    plt.close()

# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low


In [586]:
DEBUG = True
ITERATION = 58
WINDOW_SIZE = 7
INPUT_IS_TWO_DIMENSIONAL = False
EXPLORE = False
FCN = False
CNN = True
RNN = False

In [587]:
base_path = os.path.join(os.getcwd(), "output")
iteration_path = os.path.join(base_path, f"iteration-{ITERATION}")
try:
    os.mkdir(base_path)
except FileExistsError as e:
    print(f"{base_path} already exists")
except OSError as e:
    print(f"Error creating directory: {base_path}")
try:
    os.mkdir(iteration_path)
except FileExistsError as e:
    print(f"{iteration_path} already exists. Exiting to preserve previous work.")
    sys.exit(0)
except OSError:
    print("An error occurred while creating the folder. ")

c:\Users\timef\Documents\Workspaces\Python\csc180\stock-price-predictor\output already exists


In [None]:

df = pd.read_csv("./data/JPM.csv", dtype={'Volume':np.float32})
print(df.shape)

(9556, 7)


### Clean Dataset

In [589]:
df.drop(['Date', 'Adj Close'], axis=1, inplace=True)

In [590]:
print(df.shape)

(9556, 5)


In [591]:
df_copy = df.copy(deep=True)
missing_median(df_copy, 'Close')
for column in df.columns:
    missing_median(df, column)
    encode_numeric_zscore(df, column)

In [592]:
split_at_index = int(len(df) * 0.7)
df_train = df[:split_at_index]
df_test = df[split_at_index:]

df_copy_train = df_copy[:split_at_index]
df_copy_test = df_copy[split_at_index:]
close_train = df_copy_train['Close']
close_test = df_copy_test['Close']

In [593]:
print(df_train.shape)
print(df_test.shape)
print(close_test.shape)


(6689, 5)
(2867, 5)
(2867,)


In [594]:
def window_it(window_size, dataframe_train, dataframe_test):
    x= []
    y = []
    for i in range(len(dataframe_train) - window_size):
        window = dataframe_train.iloc[i:i+window_size][['Open', 'High', 'Low', 'Volume', 'Close']].values
        x.append(window)
        y.append(dataframe_test.iloc[i+window_size])
        if DEBUG and i == 1:
            print(f"window shape:  {window.shape}\n {window}\nx shape:\n {x[:1]}\ny: \n{y[:1]}")
    return np.array(x), np.array(y)
    

In [595]:
x_train, y_train = window_it(WINDOW_SIZE, df_train, close_train)
x_test, y_test = window_it(WINDOW_SIZE, df_test, close_test)

window shape:  (7, 5)
 [[-0.54816568 -0.5489015  -0.54722446 -0.50387412 -0.54809776]
 [-0.54894298 -0.54967237 -0.5480084  -0.7057299  -0.54887493]
 [-0.54874865 -0.54943146 -0.54776341 -0.76122087 -0.54863205]
 [-0.54845718 -0.54914238 -0.54746945 -0.79860157 -0.54834063]
 [-0.54782561 -0.54851607 -0.54683249 -0.87001157 -0.54770917]
 [-0.54729121 -0.54798609 -0.54629351 -0.9024297  -0.54717486]
 [-0.5468054  -0.5475043  -0.54580355 -0.87484533 -0.54668913]]
x shape:
 [array([[-0.54763128, -0.54832334, -0.54663649,  0.33957607, -0.54751487],
       [-0.54816568, -0.5489015 , -0.54722446, -0.50387412, -0.54809776],
       [-0.54894298, -0.54967237, -0.5480084 , -0.7057299 , -0.54887493],
       [-0.54874865, -0.54943146, -0.54776341, -0.76122087, -0.54863205],
       [-0.54845718, -0.54914238, -0.54746945, -0.79860157, -0.54834063],
       [-0.54782561, -0.54851607, -0.54683249, -0.87001157, -0.54770917],
       [-0.54729121, -0.54798609, -0.54629351, -0.9024297 , -0.54717486]])]
y: 


In [596]:
if DEBUG:
    print(x_train.shape)
    print(x_test.shape)
    print(y_train.shape)
    print(y_test.shape)

(6682, 7, 5)
(2860, 7, 5)
(6682,)
(2860,)


### Create the Model

In [597]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.optimizers import Adam
checkpointer = ModelCheckpoint(filepath=os.path.join(iteration_path, "RNN-best-weights.keras"), save_best_only=True)

for i in range(3):
    print(f"training model {i}")
    optimizer = Adam(learning_rate=0.005)
    visible = Input(shape=(WINDOW_SIZE,  5))
    lstm1 = LSTM(70, dropout=0.1, recurrent_dropout=0.1, activation='tanh')(visible)


    #lstm5 = LSTM(32, dropout=0.1)(lstm1)
    dense1= Dense(16, activation='relu')(lstm1)
    #dense2 = Dense(8, activation='relu')(lstm1)
    output = Dense(1)(dense1)
    model = Model(inputs=visible, outputs=output)
    model.compile(loss="mean_squared_error", optimizer = 'adam')
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
    history = model.fit(x_train, y_train, validation_data = [x_test, y_test], callbacks=[checkpointer, monitor], epochs=200, verbose=2)
    plot_losses(history, iteration_path, i)
    



training model 0
Epoch 1/200
209/209 - 3s - 13ms/step - loss: 4.2340 - val_loss: 9232.6084
Epoch 2/200
209/209 - 1s - 3ms/step - loss: 2.0540 - val_loss: 9174.3809
Epoch 3/200
209/209 - 1s - 3ms/step - loss: 1.4191 - val_loss: 9276.0127
Epoch 4/200
209/209 - 1s - 3ms/step - loss: 1.1299 - val_loss: 9318.5459
Epoch 5/200
209/209 - 1s - 3ms/step - loss: 1.0362 - val_loss: 9298.4648
Epoch 6/200
209/209 - 1s - 3ms/step - loss: 0.9282 - val_loss: 9284.9209
Epoch 7/200
209/209 - 1s - 3ms/step - loss: 0.9473 - val_loss: 9260.4688
Epoch 7: early stopping
training model 1
Epoch 1/200
209/209 - 3s - 14ms/step - loss: 4.6963 - val_loss: 9192.0449
Epoch 2/200
209/209 - 1s - 4ms/step - loss: 2.1506 - val_loss: 9114.3877
Epoch 3/200
209/209 - 1s - 4ms/step - loss: 1.5129 - val_loss: 9229.0156
Epoch 4/200
209/209 - 1s - 4ms/step - loss: 1.1771 - val_loss: 9212.8828
Epoch 5/200
209/209 - 1s - 4ms/step - loss: 1.0794 - val_loss: 9193.8857
Epoch 6/200
209/209 - 1s - 4ms/step - loss: 1.0277 - val_loss: 9

### Evaluate and Write Results

In [598]:
metrics_path = "RNN-metrics.txt"
def redirect(out): # redirect model summary to metrics
    with open(os.path.join(iteration_path, metrics_path), 'a') as file:
        print(out, file=file)
# make prediction and evaluate
model.load_weights(os.path.join(iteration_path, "RNN-best-weights.keras"))
prediction = model.predict(x_test)
score = np.sqrt(mean_squared_error(y_test, prediction))
if DEBUG:
    print("Score (RMSE): {}".format(score))
# Write Metrics to file
with open(os.path.join(iteration_path, metrics_path), "x") as file:
    file.write(f"Score (RMSE): {score}\nWindow Size: {WINDOW_SIZE}")
model.summary(print_fn=redirect)
chart_regression(os.path.join(iteration_path, "RNN-Lift-Chart"), prediction.flatten(), y_test, sort=True)

[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
Score (RMSE): 95.04725966643242
