# Project 4

Stock Price Predictor

Import packages

In [None]:
import pandas as pd 
import numpy as np 
import tensorflow as tf 
from pathlib import Path
import os
import sys
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input, Flatten, Conv2D, MaxPooling2D, concatenate
from tensorflow.keras.utils import plot_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import roc_curve, auc, mean_squared_error
import matplotlib.pyplot as plt 
from collections.abc import Sequence
from sklearn import preprocessing
%matplotlib inline
import csv
import glob
from IPython.display import Image
import seaborn as sns


### Define Helper Methods

In [None]:
def plot_losses(history, base_path, iteration:int):
    """Save a chart of training and validation loss per epoch.

    Args:
        history: Object whose ``history`` mapping holds the ``"loss"`` and
            ``"val_loss"`` series (presumably the value returned by Keras
            ``Model.fit`` -- confirm against the caller).
        base_path: Directory the figure is written into.
        iteration: Number embedded in the output file name.
    """
    curves = (("loss", "Training Loss"), ("val_loss", "Validation Loss"))
    for key, caption in curves:
        plt.plot(history.history[key], label=caption)

    # The y-axis is pinned to [0, 1].
    plt.ylim(bottom=0.0, top=1.0)
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title("Training vs. Validation Loss")
    plt.legend()

    target = os.path.join(base_path, f"training-validiation-loss--epoch---Model {iteration}")
    plt.savefig(target)
    # Close the figure so later plots start from a clean canvas.
    plt.close()


def print_schema(dataframe: pd.DataFrame):
    """Print a short overview of ``dataframe``: shape, length, columns, head."""
    report = (
        '~~~~~~dataframe schema~~~~~~',
        f"Dataframe shape: {dataframe.shape} | Dataframe length: {len(dataframe)}",
        'Column labels: ',
        dataframe.columns,
        'Dataframe head: ',
        f"{dataframe.head()}",
    )
    for entry in report:
        print(entry)
def print_column(dataframe: pd.DataFrame, columns: str | list[str]):
    if isinstance(columns, list):
        for i, label in enumerate(columns):
            print(f"column {i}")
            print(dataframe[label])
    else:
        print(dataframe[columns])
# One-hot encode a text column (i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    """Replace column ``name`` with one indicator column per distinct value.

    The new columns are named ``"<name>-<value>"`` and are appended to
    ``df``; the original column is then dropped. ``df`` is modified in place.
    """
    indicators = pd.get_dummies(df[name])
    for value, flags in indicators.items():
        df[f"{name}-{value}"] = flags
    df.drop(columns=name, inplace=True)


# Replace text values with integer codes (one code per distinct value).
def encode_text_index(df, name):
    """Label-encode column ``name`` in place and return the encoder's classes.

    The returned ``classes_`` array is what the fitted ``LabelEncoder``
    exposes for the values it saw in the column.
    """
    encoder = preprocessing.LabelEncoder()
    codes = encoder.fit_transform(df[name])
    df[name] = codes
    return encoder.classes_


# Standardize a numeric column to z-scores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    """Rewrite column ``name`` in place as ``(value - mean) / sd``.

    ``mean`` and ``sd`` default to the column's own mean and standard
    deviation when they are not supplied.
    """
    column = df[name]
    center = column.mean() if mean is None else mean
    spread = column.std() if sd is None else sd
    df[name] = (column - center) / spread


# Fill every missing value in the specified column with that column's median
def missing_median(df, name):
    """Replace missing entries of ``df[name]`` with the column median, in place."""
    column = df[name]
    df[name] = column.fillna(column.median())


# Fill every missing value in the specified column with a caller-supplied default
def missing_default(df, name, default_value):
    """Replace missing entries of ``df[name]`` with ``default_value``, in place."""
    filled = df[name].fillna(default_value)
    df[name] = filled


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    """Split ``df`` into a float32 feature matrix and a float32 target vector.

    Args:
        df: DataFrame holding the feature columns plus the target column.
        target: Label of the column to predict.

    Returns:
        ``(x, y)``: ``x`` has one column for every column of ``df`` other
        than ``target`` (original column order); ``y`` holds the target
        values. Both are NumPy arrays cast to ``float32``.

    The target is always returned as raw values (regression style); no
    one-hot encoding is applied, whatever the target's dtype.
    """
    feature_columns = [column for column in df.columns if column != target]
    features = df[feature_columns].values.astype(np.float32)
    labels = df[target].values.astype(np.float32)
    return features, labels

# Nicely formatted time string
def hms_string(sec_elapsed):
    """Format a duration given in seconds as ``H:MM:SS.ss``."""
    seconds_per_hour = 60 * 60
    hours = int(sec_elapsed / seconds_per_hour)
    minutes = int((sec_elapsed % seconds_per_hour) / 60)
    seconds = sec_elapsed % 60
    return f"{hours}:{minutes:>02}:{seconds:>05.2f}"


# Regression chart.
def chart_regression(path, pred,y,sort=True):
    """Plot predicted against expected values and save the figure to ``path``.

    Args:
        path: Destination handed to ``plt.savefig``.
        pred: Predicted values, one per sample.
        y: Expected values; flattened before plotting.
        sort: When true, samples are ordered by expected value first.
    """
    frame = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        frame = frame.sort_values(by=['y'])

    # Prediction is drawn first, then the expected curve.
    for column, caption in (('pred', 'prediction'), ('y', 'expected')):
        plt.plot(frame[column].tolist(), label=caption)

    plt.ylabel('output')
    plt.legend()
    plt.savefig(path)
    plt.close()

# Drop every row whose value in the given column lies sd or more standard
# deviations away from that column's mean
def remove_outliers(df, name, sd):
    """Remove outlier rows from ``df`` in place, judged on column ``name``."""
    column = df[name]
    distance = np.abs(column - column.mean())
    limit = sd * column.std()
    outlier_labels = df.index[distance >= limit]
    df.drop(outlier_labels, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    """Linearly rescale column ``name`` in place.

    Values are mapped so that ``data_low`` lands on ``normalized_low`` and
    ``data_high`` lands on ``normalized_high``.

    Args:
        df: DataFrame to modify.
        name: Label of the column to rescale.
        normalized_low: Lower end of the output range.
        normalized_high: Upper end of the output range.
        data_low: Input value mapped to ``normalized_low``; defaults to the
            column minimum.
        data_high: Input value mapped to ``normalized_high``; defaults to
            the column maximum.
    """
    column = df[name]
    # Each bound is defaulted independently so a caller may supply just one.
    if data_low is None:
        data_low = column.min()
    if data_high is None:
        data_high = column.max()

    df[name] = ((column - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low


### Global Control Flow Flags

These flags are used for control flow and debugging. 

In [None]:
# Gates the diagnostic print cells and the RMSE printout.
DEBUG = True
# Run number; names the per-run output folder ("iteration-<ITERATION>").
ITERATION = 8
# Gates the describe()/box-plot data-exploration cell.
EXPLORE = True

### Output Files

Define paths for output files like charts, tests, and metrics

In [None]:
# All artefacts go under <cwd>/output, in one sub-folder per iteration.
base_path = os.path.join(os.getcwd(), "output")
iteration_path = os.path.join(base_path, f"iteration-{ITERATION}")
# Shared output root: an existing folder is fine, so only report it.
try:
    os.mkdir(base_path)
except FileExistsError as e:
    print(f"{base_path} already exists")
except OSError as e:
    # Any other failure is reported but not re-raised; execution continues.
    print(f"Error creating directory: {base_path}")
# Per-iteration folder: if it already exists, stop here so the results of an
# earlier run with the same ITERATION value are not touched.
try:
    os.mkdir(iteration_path)
except FileExistsError as e:
    print(f"{iteration_path} already exists. Exiting to preserve previous work.")
    sys.exit(0)
except OSError:
    # Reported only; later cells that write into iteration_path may then fail.
    print("An error occurred while creating the folder. ")



### Read Dataset

In [None]:
# Load the dataset from ./data/JPM.csv; the Volume column is read as float32.
df = pd.read_csv("./data/JPM.csv", dtype={'Volume':np.float32})
print("hit")  # progress marker: reached only once the CSV has been read

### Drop Unnecessary Columns

In [None]:
# Remove the 'Date' and 'Adj Close' columns from df in place.
df.drop(columns=['Date', 'Adj Close'], inplace=True)

In [None]:
if DEBUG:
    # Column labels, per-column dtypes and overall shape, in that order.
    for summary in (df.columns, df.dtypes, df.shape):
        print(summary)

In [None]:
import seaborn as sns
# Optional data exploration, switched by the EXPLORE flag.
if EXPLORE:
    # Summary statistics for every column.
    print(df.describe())
    # One box plot per column, each shown as its own figure.
    for column in df.columns:
        sns.boxplot(df[column])
        plt.show()

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Side-by-side histograms for every column: raw values on the left,
# log1p-transformed values on the right.
# Gated on EXPLORE, like the box-plot cell, so that all exploratory plotting
# can be switched off with the one flag.
if EXPLORE:
    for column in df.columns:
        fig, axes = plt.subplots(1, 2, figsize=(10, 4))
        sns.histplot(df[column], bins=50, ax=axes[0])
        axes[0].set_title(f"Raw {column}")
        # np.log1p(x) computes log(1 + x), matching the panel title.
        sns.histplot(np.log1p(df[column]), bins=50, ax=axes[1])
        axes[1].set_title(f"Log({column} + 1)")
        plt.show()
    


### Perform Data Cleaning

In [None]:
# Fill gaps in every column with that column's median, then z-score every
# column except 'Close', which keeps its original scale.
for column in df.columns:
    missing_median(df, column)
    if column != "Close":
        encode_numeric_zscore(df, column)


In [None]:
if DEBUG:
    print(df)
    # Count the missing values of every column in one pass, then report them.
    nan_counts = df.isna().sum()
    for column in df.columns:
        print(f"Number of NAN values in {column}: {nan_counts[column]}")

In [None]:
# y = df['Close']
# 'Close' is the prediction target; every other remaining column is a feature.
x,y = to_xy(df, 'Close')
# Hold out 30% of the rows for testing; random_state pins the split.
# NOTE(review): train_test_split shuffles rows by default, so the split is not
# chronological -- confirm this is intended for price data.
x_train, x_test, y_train, y_test = train_test_split(x,y, random_state=42, test_size=0.3)

In [None]:
if DEBUG:
    # Features, targets and the training-matrix shape, separated by rules.
    for item in (x, '-----------', y, "--------------", x_train.shape):
        print(item)
    

### Create Model

In [None]:
# File inside the iteration folder that the checkpoint callback writes to and
# that the evaluation cell later loads weights from.
checkpoint_path = os.path.join(iteration_path, "best-weights.keras")
# Do not create this path with mkdir: it names a file, not a directory.
#os.mkdir(checkpoint_path) # this is wrong

### Create an FCN (Fully Connected Network)

In [None]:
# Persist only the best model seen during training to checkpoint_path.
checkpointer = ModelCheckpoint(filepath=checkpoint_path, save_best_only=True, verbose=0)

# Fully connected regression network: inputs -> 64 -> 64 -> 32 -> 16 -> 1.
visible = Input(shape=(x_train.shape[1],))
hidden1 = Dense(64, activation='relu')(visible)
hidden2 = Dense(64, activation='relu')(hidden1)
hidden3 = Dense(32, activation='relu')(hidden2)
hidden4 = Dense(16, activation='relu')(hidden3)

# Single linear output unit for the predicted 'Close' value.
output = Dense(1)(hidden4)
model = Model(inputs=visible, outputs=output)
model.compile(loss='mean_squared_error', optimizer='adam')

# Stop when val_loss has not improved by at least 1e-3 for 10 epochs and roll
# back to the best weights seen.
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=10, verbose=2, mode='min', restore_best_weights=True)

# validation_data is given in the documented (x_val, y_val) tuple form.
# NOTE(review): the test split doubles as the validation set, so early stopping
# and checkpoint selection both see the data later used for the RMSE score --
# consider carving out a separate validation split.
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=100, callbacks=[checkpointer, monitor])

### Evaluate the Model

In [None]:
# Write Metrics to file
metrics_path = os.path.join(iteration_path, "metrics.txt")


def redirect(out): # redirect model summary to metrics
    """Append ``out`` (text of the model summary) to the metrics file."""
    # UTF-8 is set explicitly: the summary text can contain non-ASCII table
    # characters that a platform-default encoding may be unable to write.
    with open(metrics_path, 'a', encoding="utf-8") as file:
        print(out, file=file)

# Score the checkpointed weights rather than whatever the last epoch left.
model.load_weights(checkpoint_path)
prediction = model.predict(x_test)
score = np.sqrt(mean_squared_error(y_test, prediction))
if DEBUG:
    print("Score (RMSE): {}".format(score))

# Mode "x" creates the file and fails if it already exists, so metrics from an
# earlier run are never overwritten.
with open(metrics_path, "x", encoding="utf-8") as file:
    file.write(f"Score (RMSE): {score}\n")
# The architecture summary is appended after the score line.
model.summary(print_fn=redirect)


In [None]:
# Make Plots
# Chart of predicted vs. expected values on the test split, ordered by the
# expected value (sort=True), saved into the iteration folder.
chart_regression(os.path.join(iteration_path, "Lift-Chart-2"), prediction.flatten(), y_test, sort=True)

### Create a CNN

### Create an RNN