# Project 4

Stock price Predictor

Import packages

In [46]:
import pandas as pd 
import numpy as np 
import tensorflow as tf 
from pathlib import Path
import os
import sys
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input, Flatten, Conv2D, MaxPooling2D, concatenate
from tensorflow.keras.utils import plot_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import roc_curve, auc, mean_squared_error
import matplotlib.pyplot as plt 
from collections.abc import Sequence
from sklearn import preprocessing
%matplotlib inline
import csv
import glob
from IPython.display import Image
import seaborn as sns


### Define Helper Methods

In [47]:
def plot_losses(history, base_path, iteration:int):
    # Plot training & validation loss over epochs
    plt.plot(history.history["loss"], label="Training Loss")
    plt.plot(history.history["val_loss"], label="Validation Loss")
    plt.ylim(bottom=0.0, top=10.0)
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title("Training vs. Validation Loss")
    plt.legend()
    plt.savefig(
        os.path.join(base_path, f"training-validiation-loss--epoch---Model {iteration}")
    )
    plt.close()


def print_schema(dataframe: pd.DataFrame):
    print('~~~~~~dataframe schema~~~~~~')
    print(f"Dataframe shape: {dataframe.shape} | Dataframe length: {len(dataframe)}")
    print('Column labels: ')
    print(dataframe.columns)
    print('Dataframe head: ')
    print(f"{dataframe.head()}")
def print_column(dataframe: pd.DataFrame, columns: str | list[str]):
    if isinstance(columns, list):
        for i, label in enumerate(columns):
            print(f"column {i}")
            print(dataframe[label])
    else:
        print(dataframe[columns])
# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)


# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    df[name] = df[name].fillna(default_value)


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column. 
    target_type = df[target].dtypes
    target_type = target_type[0] if isinstance(target_type, Sequence) else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    #if target_type in (np.int64, np.int32):
        ## Classification
        #dummies = pd.get_dummies(df[target])
        #return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    #else#:
        ## Regression
    return df[result].values.astype(np.float32), df[target].values.astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)


# Regression chart.
def chart_regression(path, pred,y,sort=True):
    t = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        t.sort_values(by=['y'],inplace=True)
    b = plt.plot(t['pred'].tolist(),label='prediction')
    a = plt.plot(t['y'].tolist(),label='expected')

    plt.ylabel('output')
    plt.legend()
    plt.savefig(path)
    plt.close()

# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low


### Global Control Flow Flags

These flags are used for control flow and debugging. 

In [68]:
DEBUG = True
ITERATION = 23
WINDOW_SIZE = 7
INPUT_IS_TWO_DIMENSIONAL = False
EXPLORE = False
FCN = False
CNN = True
RNN = False

### Output Files

Define paths for output files like charts, tests, and metrics

In [69]:
base_path = os.path.join(os.getcwd(), "output")
iteration_path = os.path.join(base_path, f"iteration-{ITERATION}")
try:
    os.mkdir(base_path)
except FileExistsError as e:
    print(f"{base_path} already exists")
except OSError as e:
    print(f"Error creating directory: {base_path}")
try:
    os.mkdir(iteration_path)
except FileExistsError as e:
    print(f"{iteration_path} already exists. Exiting to preserve previous work.")
    sys.exit(0)
except OSError:
    print("An error occurred while creating the folder. ")



c:\Users\timef\Documents\Workspaces\Python\csc180\stock-price-predictor\output already exists


### Read Dataset

In [50]:
df = pd.read_csv("./data/JPM.csv", dtype={'Volume':np.float32})
print("hit")

hit


### Drop Unneccessary Columns

In [51]:
df.drop(['Date', 'Adj Close'], axis=1, inplace=True)

In [52]:
if DEBUG:
    print(df.columns)
    print(df.dtypes)
    print(df.shape)

Index(['Open', 'High', 'Low', 'Close', 'Volume'], dtype='object')
Open      float64
High      float64
Low       float64
Close     float64
Volume    float32
dtype: object
(9745, 5)


In [53]:
import seaborn as sns
if EXPLORE:
    print(df.describe())
    for column in df.columns:
        sns.boxplot(df[column])
        plt.show()

In [54]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
if EXPLORE:
    for column in df.columns:
    
        fig, axes = plt.subplots(1, 2, figsize=(10, 4))
        sns.histplot(df[column], bins=50, ax=axes[0])
        axes[0].set_title(f"Raw {column}")
        sns.histplot(np.log1p(df[column]), bins=50, ax=axes[1])
        axes[1].set_title(f"Log({column} + 1)")
        plt.show()
    


### Perform Data Cleaning
First, seperate the "y" output feature from the "y" input feature
Second, fill in missing values with the median. 
Third, encode the input feature columns. 


In [55]:
df_copy = df.copy(deep=True)
missing_median(df_copy, 'Close')
for column in df.columns:
    missing_median(df, column)
    if column == 'Close':
        continue
    else:
        encode_numeric_zscore(df, column)

### Reshape Input

Reshape the input dataframe so the model ingests the last 7 days as 1 record. 

In [56]:
if FCN or CNN:
    x = []
    y = []
    window_size = WINDOW_SIZE
    for i in range(len(df) - window_size):
        window = df.iloc[i:i+window_size][['Open', 'High', 'Low', 'Volume', 'Close']].values
        x.append(window.flatten())
        y.append(df_copy.iloc[i+window_size]['Close'])

    X = np.array(x)
    Y = np.array(y)

### Reshaping for CNN
Reshape the input to be a be an 'image' with 7 rows, 5 columns, 1 channel. (2D array)

In [57]:
if CNN and INPUT_IS_TWO_DIMENSIONAL:
    X = X.reshape((X.shape[0], WINDOW_SIZE,5,1))
    print(X.shape)
    print(Y.shape)
if CNN and not INPUT_IS_TWO_DIMENSIONAL:
    X = X.reshape((X.shape[0], 1,WINDOW_SIZE, 5))
    print(X.shape)
    print(Y.shape)

(9738, 1, 7, 5)
(9738,)


In [58]:
if DEBUG:
    print(df.shape)
    for column in df.columns:
        print(f"Number of NAN values in {column}: {df[column].isna().sum()}")

(9745, 5)
Number of NAN values in Open: 0
Number of NAN values in High: 0
Number of NAN values in Low: 0
Number of NAN values in Close: 0
Number of NAN values in Volume: 0


In [59]:
# y = df['Close']
# don't use a random shuffling. This completely breaks the chronological order of the data. 
split_at_index = int(len(X) * 0.7) # we want the training set to have 70% of the rows in the raw df
x_train, x_test = X[:split_at_index], X[split_at_index:]
y_train, y_test = Y[:split_at_index], Y[split_at_index:]  
#x_train, x_test, y_train, y_test = train_test_split(X,Y, random_state=42, test_size=0.3)

In [60]:
if DEBUG:
    print(X)
    print('-----------')
    print(Y)
    print("--------------")
    print(f"X-Train Shape: {x_train.shape}")
    print(f"Y-Train Shape: {y_train.shape}")
    print(f"X-Test Shape: {x_test.shape}")
    print(f"Y-Test Shape: {y_test.shape}")
    

[[[[-1.16962972e+00 -1.17092224e+00 -1.16332275e+00 -6.73052013e-01
     5.03703700e+00]
   [-1.16884723e+00 -1.17169789e+00 -1.16253322e+00 -6.72965646e-01
     5.07407400e+00]
   [-1.16571735e+00 -1.16937098e+00 -1.15937519e+00 -6.74312830e-01
     5.14814800e+00]
   ...
   [-1.16571735e+00 -1.16704410e+00 -1.15937519e+00 -6.71048582e-01
     5.22222200e+00]
   [-1.16258747e+00 -1.16859537e+00 -1.16016468e+00 -6.71445787e-01
     5.09259300e+00]
   [-1.16649980e+00 -1.17169789e+00 -1.16174374e+00 -6.72499359e-01
     5.07407400e+00]]]


 [[[-1.16884723e+00 -1.17169789e+00 -1.16253322e+00 -6.72965646e-01
     5.07407400e+00]
   [-1.16571735e+00 -1.16937098e+00 -1.15937519e+00 -6.74312830e-01
     5.14814800e+00]
   [-1.16415241e+00 -1.17014663e+00 -1.16016468e+00 -6.75556362e-01
     5.11111100e+00]
   ...
   [-1.16258747e+00 -1.16859537e+00 -1.16016468e+00 -6.71445787e-01
     5.09259300e+00]
   [-1.16649980e+00 -1.17169789e+00 -1.16174374e+00 -6.72499359e-01
     5.07407400e+00]
   

### Create Model

In [61]:
checkpoint_path = os.path.join(iteration_path, "best-weights.keras")
#os.mkdir(checkpoint_path) # this is wrong

### Create a FCN

In [62]:
if FCN:
    checkpointer = ModelCheckpoint(filepath=checkpoint_path, save_best_only=True, verbose=0)
    for i in range(5):

        visible = Input(shape=(x_train.shape[1],))
        hidden1 = Dense(256, activation='relu')(visible)
        hidden2=Dense(256, activation='relu')(hidden1)
        hidden3=Dense(128, activation='relu')(hidden2)
        hidden4 = Dense(64, activation='relu')(hidden3)
        hidden5 = Dense(32, activation='relu')(hidden4)
        hidden6=Dense(16, activation='relu')(hidden5)
        hidden7=Dense(8, activation='relu')(hidden6)
        hidden8 = Dense(4, activation='relu')(hidden7)

        output = Dense(1)(hidden8)
        model = Model(inputs=visible, outputs=output)
        model.compile(loss='mean_squared_error', optimizer='adam')
        monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=10, verbose=2, mode='min', restore_best_weights=True)
        history = model.fit(x_train, y_train, validation_data=[x_test, y_test], epochs=1000, callbacks=[checkpointer, monitor])
        plot_losses(history, iteration_path, i)

### Evaluate the Model

In [63]:
if FCN:
    metrics_path = "FCN-metrics.txt"
    # Write Metrics to file
    def redirect(out): # redirect model summary to metrics
        with open(os.path.join(iteration_path, metrics_path), 'a') as file:
            print(out, file=file)

    model.load_weights(checkpoint_path)
    prediction = model.predict(x_test)
    score = np.sqrt(mean_squared_error(y_test, prediction))
    if DEBUG:
        print("Score (RMSE): {}".format(score))

    with open(os.path.join(iteration_path, metrics_path), "x") as file:
        file.write(f"Score (RMSE): {score}\n")
    model.summary(print_fn=redirect)


In [64]:
if FCN:
    # Make Plots
    chart_regression(os.path.join(iteration_path, "FCN-Lift-Chart"), prediction.flatten(), y_test, sort=True)

### Create a CNN

Create the model

In [None]:
if CNN and INPUT_IS_TWO_DIMENSIONAL: # 2D image of 7 * 5 = 35 pixels, each pixel with 1 channel.
    checkpointer = ModelCheckpoint(filepath=os.path.join(iteration_path, "CNN-best-weights.keras"), save_best_only=True, verbose=0)
    # H_out = floor((H_in + 2*pad_h - kernel_h) / stride_h) + 1
    # W_out = floor((W_in + 2*pad_w - kernel_w) / stride_w) + 1
    # C_out = number of filters

    # H_out = (7 - 4) / 1 + 1 = 4
    # W_out = (5 - 4) / 1 + 1 = 2
    # Channels_out = 64

    for i in range(5):
        visible = Input(shape=(1,WINDOW_SIZE,5)) 
        con1 = Conv2D(64, kernel_size=(1,3), activation='relu')(visible)# -> (4,2,64)
        pool1 = MaxPooling2D(pool_size=(1,2), padding='same')(con1) # 4 - 2 / 1 + 1, 2 - 2 / 
        con2 = Conv2D(128, kernel_size=(1,3), activation='relu')(pool1)
        pool2 = MaxPooling2D(pool_size=(1,1), padding='same')(con2)
        flatten1 = Flatten()(pool2)

        # interpretation layer
        dense1 = Dense(128, activation='relu')(flatten1)
        output = Dense(1)(dense1)
        model = Model(inputs=visible, outputs=output)
        model.compile(loss='mean_squared_error', optimizer='adam')
        monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, mode='min', restore_best_weights=True)
        history = model.fit( x_train, y_train, validation_data=[x_test, y_test], callbacks=[checkpointer, monitor], epochs=500)
        plot_losses(history, iteration_path, i)
if CNN and not INPUT_IS_TWO_DIMENSIONAL:# 1D image of 7 pixels, each pixel with 5 channels
    checkpointer = ModelCheckpoint(filepath=os.path.join(iteration_path, "CNN-best-weights.keras"), save_best_only=True, verbose=0)
    # H_out = floor((H_in + 2*pad_h - kernel_h) / stride_h) + 1
    # W_out = floor((W_in + 2*pad_w - kernel_w) / stride_w) + 1
    # C_out = number of filters

    # H_out = (7 - 4) / 1 + 1 = 4
    # W_out = (5 - 4) / 1 + 1 = 2
    # Channels_out = 64

    for i in range(5):
        visible = Input(shape=(1,WINDOW_SIZE,5)) 
        con1 = Conv2D(64, kernel_size=(1,3), activation='relu')(visible)# -> (4,2,64)
        pool1 = MaxPooling2D(pool_size=(1,2), padding='same')(con1) # 4 - 2 / 1 + 1, 2 - 2 / 
        con2 = Conv2D(128, kernel_size=(1,3), activation='relu')(pool1)
        pool2 = MaxPooling2D(pool_size=(1,1), padding='same')(con2)
        flatten1 = Flatten()(pool2)

        # interpretation layer
        dense1 = Dense(128, activation='relu')(flatten1)
        output = Dense(1)(dense1)
        model = Model(inputs=visible, outputs=output)
        model.compile(loss='mean_squared_error', optimizer='adam')
        monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, mode='min', restore_best_weights=True)
        history = model.fit( x_train, y_train, validation_data=[x_test, y_test], callbacks=[checkpointer, monitor], epochs=500)
        plot_losses(history, iteration_path, i)

Epoch 1/500
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 12.5579 - val_loss: 2.7102
Epoch 2/500
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.9035 - val_loss: 2.5395
Epoch 3/500
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.7957 - val_loss: 2.1769
Epoch 4/500
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.7257 - val_loss: 1.7175
Epoch 5/500
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.6334 - val_loss: 2.0149
Epoch 6/500
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.5855 - val_loss: 1.6887
Epoch 7/500
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.6419 - val_loss: 1.4756
Epoch 8/500
[1m213/213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.5030 - val_loss: 1.6893
Epoch 9/500
[1m213/213[0m [3

## Evaluate Model


In [73]:
if CNN:
    metrics_path = "CNN-metrics.txt"
    def redirect(out): # redirect model summary to metrics
        with open(os.path.join(iteration_path, metrics_path), 'a') as file:
            print(out, file=file)
    # make prediction and evaluate
    model.load_weights(os.path.join(iteration_path, "CNN-best-weights.keras"))
    prediction = model.predict(x_test)
    score = np.sqrt(mean_squared_error(y_test, prediction))
    if DEBUG:
        print("Score (RMSE): {}".format(score))
    # Write Metrics to file
    with open(os.path.join(iteration_path, metrics_path), "x") as file:
        file.write(f"Score (RMSE): {score}\n")
    model.summary(print_fn=redirect)
    chart_regression(os.path.join(iteration_path, "CNN-Lift-Chart"), prediction.flatten(), y_test, sort=True)

[1m92/92[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
Score (RMSE): 1.0919655104728037


### Create a RNN