In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found,
# so the available datasets are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Stock Indicator Functions

Helper functions that streamline computing indicators over lots of data

Initially coded and graphed [here](https://www.kaggle.com/colinflueck/apple-stock-indicators/notebook?scriptVersionId=46331290)


In [None]:
# All indicators are typically done with close, but I'm doing open for everything

def SMA(df, dataColumn, window):
    """Add a simple-moving-average column to ``df`` in place.

    Args:
        df: DataFrame that holds the source column and receives the result.
        dataColumn: name of the column to average.
        window: number of rows in the rolling window.

    The result is stored under ``"<window>SMA_<dataColumn>"``; rows that do
    not yet have a full window are NaN.  Nothing is returned.
    """
    rolling_mean = df[dataColumn].rolling(window=window).mean()
    df[f"{window}SMA_{dataColumn}"] = rolling_mean
    
def EMA(df, dataColumn, window):
    """Add an exponential-moving-average column to ``df`` in place.

    Args:
        df: DataFrame that holds the source column and receives the result.
        dataColumn: name of the column to smooth.
        window: span passed to pandas' ``ewm``.

    The result is stored under ``"<window>EMA_<dataColumn>"``.  Nothing is
    returned.
    """
    # adjust=False selects the recursive EMA form (chosen by the author to
    # match the values published on stock websites).
    smoothed = df[dataColumn].ewm(
        span=window,
        min_periods=0,
        adjust=False,
        ignore_na=False,
    ).mean()
    df[f"{window}EMA_{dataColumn}"] = smoothed

# Moving Average Convergence Divergence (12, 26 are most common EMA windows)
def MACD(df, dataColumn, window1, window2, signal_window, isSignal):
    """Add the MACD line (and optionally its signal line) to ``df`` in place.

    Args:
        df: DataFrame that holds the source column and receives the results.
        dataColumn: name of the column the EMAs are computed from.
        window1: span of the first EMA.
        window2: span of the second EMA, subtracted from the first.
        signal_window: span of the EMA applied to the MACD line.
        isSignal: when truthy, the signal line is written as well.

    The MACD line is stored under ``"<window1>-<window2>MACD_<dataColumn>"``.
    Nothing is returned.
    """
    def _ema(series, span):
        # Same EMA settings as the rest of the notebook.
        return series.ewm(span=span, min_periods=0, adjust=False, ignore_na=False).mean()

    source = df[dataColumn]
    macd_line = _ema(source, window1) - _ema(source, window2)
    df[f"{window1}-{window2}MACD_{dataColumn}"] = macd_line

    if not isSignal:
        return

    # The column label ends in "SMA" although the signal is an EMA of the
    # MACD line; the label is kept as-is so existing column names do not change.
    df[f"MACD_{signal_window}SMA"] = _ema(macd_line, signal_window)

        
# Stochastic Oscillator (14 is most common lookback window)
def SO(df, dataColumn, window, signal_window, isSignal):
    """Add a stochastic oscillator (and optionally its signal line) to ``df``.

    The oscillator is
    ``(value - rolling min) / (rolling max - rolling min) * 100``
    with the min and max taken over ``window`` rows of ``dataColumn``.

    Args:
        df: DataFrame that holds the source column and receives the results.
        dataColumn: name of the column the oscillator is computed from.
        window: lookback window for the rolling min / max.
        signal_window: window of the simple moving average used as signal line
            (3 is the most common choice).
        isSignal: when truthy, the signal line is written as well.

    Columns written: ``"<window>SO_<dataColumn>"`` and, if requested,
    ``"SO_<signal_window>SMA"``.  Nothing is returned.
    """
    values = df[dataColumn]
    lookback = values.rolling(window=window)
    lowest = lookback.min()
    highest = lookback.max()

    oscillator = (values - lowest) / (highest - lowest) * 100
    df[f"{window}SO_{dataColumn}"] = oscillator

    if isSignal:
        df[f"SO_{signal_window}SMA"] = oscillator.rolling(window=signal_window).mean()
        
# Relative Strength Index (14 is most common lookback window)
def RSI(df, dataColumn, window, isEMA, isSMA):
    """Add EMA- and/or SMA-based Relative Strength Index columns to ``df``.

    Row-to-row changes of ``dataColumn`` are split into gains and losses,
    each side is averaged over ``window`` and the index is
    ``100 - 100 / (1 + average gain / average loss)``.

    Args:
        df: DataFrame that holds the source column and receives the results.
        dataColumn: name of the column the changes are computed from.
        window: span (EMA variant) / rolling window (SMA variant).
        isEMA: when truthy, write ``"<window>RSI_EMA_<dataColumn>"``.
        isSMA: when truthy, write ``"<window>RSI_SMA_<dataColumn>"``.

    Nothing is returned.  The first row has no previous value to diff
    against, so it is NaN in every column written.
    """
    # Drop the first diff entry, which is always NaN.
    change = df[dataColumn].diff().iloc[1:]

    # Gains keep the rises (falls become 0); losses keep the size of the falls.
    gains = change.mask(change < 0, 0)
    losses = change.mask(change > 0, 0).abs()

    def _index_from(avg_gain, avg_loss):
        relative_strength = avg_gain / avg_loss
        return 100.0 - (100.0 / (1.0 + relative_strength))

    if isEMA:
        ewm_options = dict(span=window, min_periods=0, adjust=False, ignore_na=False)
        df[f"{window}RSI_EMA_{dataColumn}"] = _index_from(
            gains.ewm(**ewm_options).mean(),
            losses.ewm(**ewm_options).mean(),
        )

    if isSMA:
        df[f"{window}RSI_SMA_{dataColumn}"] = _index_from(
            gains.rolling(window=window).mean(),
            losses.rolling(window=window).mean(),
        )

# Bollinger Bands
def BB(df, dataColumn, window):
    """Add upper / middle / lower Bollinger Band columns to ``df`` in place.

    The middle band is the rolling mean of ``dataColumn`` over ``window``
    rows; the upper and lower bands sit two rolling standard deviations
    above and below it.

    Columns written, in this order: ``"<window>upper_<dataColumn>"``,
    ``"<window>middle_<dataColumn>"``, ``"<window>lower_<dataColumn>"``.
    Nothing is returned.
    """
    lookback = df[dataColumn].rolling(window=window)
    middle_band = lookback.mean()
    deviation = lookback.std()

    df[f"{window}upper_{dataColumn}"] = middle_band + (2 * deviation)
    df[f"{window}middle_{dataColumn}"] = middle_band
    df[f"{window}lower_{dataColumn}"] = middle_band - (2 * deviation)
    
    
# On Balance Volume (same up/down idea as RSI: the sign of the row-to-row change)
def OBV(df, dataColumn, volColumn, window=None):
    """Add an On Balance Volume column (and optionally its EMA) to ``df``.

    OBV is a running total of volume: a row's volume is added when the price
    rose compared with the previous row, subtracted when it fell, and ignored
    when it did not change.  The first row has no previous price, so the
    total is seeded with that row's own volume.

    Args:
        df: DataFrame that holds the source columns and receives the results.
        dataColumn: name of the price column used to decide the direction.
        volColumn: name of the volume column that is accumulated.
        window: optional span; when given (and non-zero) an EMA of the OBV
            column is written too, using the same ``ewm`` settings as ``EMA``.

    Columns written: ``"OBV_<dataColumn>"`` and, if ``window`` is given,
    ``"<window>EMA_OBV_<dataColumn>"``.  Nothing is returned.
    """
    price = df[dataColumn]
    volume = df[volColumn]

    # +1 / 0 / -1 per row.  A missing change (the first row, or a gap in the
    # prices) counts as "unchanged" so it cannot poison the running total.
    direction = np.sign(price.diff()).fillna(0.0)
    signed_volume = direction * volume

    # Seed the running total with the first row's volume.
    if len(signed_volume) > 0:
        signed_volume.iloc[0] = volume.iloc[0]

    obv_column = "OBV_" + dataColumn
    df[obv_column] = signed_volume.cumsum()

    if window:
        df[str(window) + "EMA" + "_" + obv_column] = df[obv_column].ewm(
            span=window, min_periods=0, adjust=False, ignore_na=False
        ).mean()
        


## Data Compiling Function

Streamlines data compiling.  Takes a list of tickers and returns a single dataframe with all the data plus one-hot encoded ticker columns
* Calls functions to calculate stock indicator
* Could be used for any data (just change url and columns)

Will update this to use sparse vectors for one-hot encodings once I figure it out.
I will also test out additional indicators


In [None]:
def getStockData(tickers, data_dir="../input/nasdaq-and-nyse-stocks-histories/fh_20181104/full_history/"):
    """Load price history for each ticker, add indicators and a label, and stack them.

    For every ticker the CSV ``<data_dir>/<ticker>.csv`` is read, its row
    order is reversed, the ``adjclose`` / ``high`` / ``low`` columns are
    dropped, the first 10% of rows are cut, and the indicator functions of
    this notebook (SMA, EMA, MACD, SO, RSI, BB) are applied to the ``open``
    column.  ``log_label`` is 1 when a row's close is higher than the previous
    row's close, else 0.

    Args:
        tickers: iterable of ticker symbols (CSV file names without ".csv").
        data_dir: directory holding the per-ticker CSV files.

    Returns:
        One DataFrame with all tickers stacked (fresh 0..n-1 index), a
        ``ticker`` column and one one-hot column per ticker.  An empty
        DataFrame is returned when ``tickers`` is empty.
    """
    frames = []
    for ticker in tickers:
        data = pd.read_csv(os.path.join(data_dir, ticker + ".csv"))
        # Reverse the row order (presumably the files are stored newest-first,
        # so this puts the oldest rows on top - TODO confirm against the data).
        data = data.sort_index(axis=0, ascending=False)
        data = data.drop(['adjclose', 'high', 'low'], axis=1)

        data["date"] = pd.to_datetime(data["date"])

        # Cuts the first 10% of the historical data off (IPO throws model off, typically flatter, ...)
        # 200 more rows are lost later because of the 200-row SMA once NaNs are dropped.
        # (?? What is the optimal amount of data to cut off?)
        # .copy() gives an independent frame so the indicator functions can add
        # columns without writing to a slice of the original.
        data = data.iloc[int(len(data) * .1):].copy()

        # --------- Stock Indicators ---------- #

        # Simple Moving Average (df, column for calculations, window)
        SMA(data, 'open', 50)
        SMA(data, 'open', 200)

        # Exponential Moving Average
        EMA(data, 'open', 12)
        EMA(data, 'open', 26)

        # Moving Average Convergence Divergence (plus signal) (window1_EMA - window2_EMA)
        MACD(data, 'open', 12, 26, 9, True)

        # Stochastic Oscillator (plus signal)
        SO(data, 'open', 14, 3, True)

        # Relative Strength Index (EMA, SMA)
        RSI(data, 'open', 14, True, True)

        # Bollinger Bands
        BB(data, 'open', 20)

        # --------- Label ---------- #

        # Option 1 (in use): is today going to close higher than yesterday?
        data['close-1'] = data['close'].shift(1)  # Yesterday's close
        data['log_label'] = np.where(data['close'] > (1.0 * data['close-1']), 1, 0)

        # Author's notes on alternatives that were tried:
        #  * Option 2 - "is tomorrow going to close higher than today?"
        #    (close.shift(-1) > close): model performs much worse; could be
        #    retried with close price for the indicators.
        #  * Labelling from pct_change(1) with a small positive threshold raised
        #    accuracy, but roughly half of the predicted 1s were wrong.

        data["ticker"] = ticker
        frames.append(data)

    if not frames:
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop
    # (DataFrame.append was removed in pandas 2.0 and copied all rows each time).
    df = pd.concat(frames, ignore_index=True)

    # one-hot encoding, haven't figured out how to do sparse vectors yet
    one_hot = pd.get_dummies(df["ticker"])
    df = pd.concat([df, one_hot], axis=1, join='outer')
    return df


# Eventually could be used to auto-generate a list of tickers for the getStockData function
def getTickers():
    """Placeholder for ticker discovery; currently always returns an empty string.

    Idea noted by the author: build the list from the directory names printed
    in the first cell, or from
    /kaggle/input/nasdaq-and-nyse-stocks-histories/all_symbols.txt
    """
    not_implemented_yet = ""
    return not_implemented_yet

In [None]:
# Tickers to load.
# Apple, Amazon, Google, Microsoft, Paypal (acting weird) = PYPL, Facebook, Dell = DMVT, Sony = SNE, IBM, HP = HPQ
# Smaller lists used while experimenting:
#   ["AAPL"]
#   ["AMZN", "GOOG"]
#   ["AAPL", "AMZN", "GOOG"]
ticker_list = ["AAPL", "AMZN", "GOOG", "MSFT", "FB", 'SNE', 'IBM', 'HPQ']

df = (
    getStockData(ticker_list)       # load + indicators + label for every ticker
    .dropna()                       # drop NA from SMAs
    .sort_values(by=['date'])       # ensures a (relatively) even mix of companies in each data split
)

df.tail()

In [None]:
# ticker = 'PYPL'
# df[ticker][df[ticker] == 1].count()

In [None]:
# instead of log label on higher close, calculate return
# pandas pct_change(lookback_period), like close / close.shift(1), but cleaner and accurate

#df["return"] = df["close"].pct_change(1)
#df.head(20)

In [None]:
# Shuffles data in random order.  (this caused a drop of .1 and .2 on val and test scores respectively.  Perhaps date might be a valuable feature.)

# I also realize that the data was always in order (hence the negligible benefit of including date values), but it might actually help

# Also, date introduces look-ahead bias if it's shuffled

#shuffle after splitting data?

# how to shuffle:
#    df = df.sample(frac=1)


### 11/11 Update
I just found out my data split has been putting different companies into each bin (mostly Apple in train, mostly Google in val...) because the data is ordered by date within each company, with the companies stacked one after another.

* Added in sort by date, to get even mix when splitting data.  Then adding other tech companies barely drops accuracy, not as much as before.
* Added way to seperate predictions by company (should add function to show tables for each one depending on ticker list).
* fixed accuracy score  (for some reason it worked, but had a different arg for the accuracy score than was passed in the function)

In [None]:
# Split the date-sorted data at 70% / 20% / 10% into train / validation / test.
n = len(df)
train_end = int(n * 0.7)
val_end = int(n * 0.9)

train_df = df[0:train_end]
val_df = df[train_end:val_end]
test_df = df[val_end:]

# Shuffle the rows within each split.  A fixed seed keeps the row order (and
# everything displayed from it) identical across Restart & Run All.
SPLIT_SEED = 0
train_df = train_df.sample(frac=1, random_state=SPLIT_SEED)
val_df = val_df.sample(frac=1, random_state=SPLIT_SEED)
test_df = test_df.sample(frac=1, random_state=SPLIT_SEED)

# Labels for each data split
train_y = train_df["log_label"]
val_y = val_df["log_label"]
test_y = test_df["log_label"]

# Dates are kept separately because 'date' is removed from the features below
train_date = train_df['date']
val_date = val_df['date']
test_date = test_df['date']

# Features for each data split: every column except the label, the raw
# open/close prices, the date, the ticker string and the volume.
NON_FEATURE_COLUMNS = ['log_label', 'close', 'open', 'date', 'ticker', 'volume']
train_X = train_df.drop(NON_FEATURE_COLUMNS, axis=1)
val_X = val_df.drop(NON_FEATURE_COLUMNS, axis=1)
test_X = test_df.drop(NON_FEATURE_COLUMNS, axis=1)

train_X.describe()

In [None]:
# Feature scaling (currently disabled): the scaler object is created, but every
# call that would apply it is commented out, so the features are used unscaled.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# NOTE(review): if this is re-enabled, calling fit_transform on the validation and
# test sets would re-fit the scaler on each of them; consider fitting on train only
# and using scaler.transform for the other two - confirm before enabling.
#train_X = scaler.fit_transform(train_X)
#val_X = scaler.fit_transform(val_X)
#test_X = scaler.fit_transform(test_X)


# Option 2 for scaling, performs worse but no chance of lookahead bias
#train_X = train_X * .01
#val_X = val_X * .01
#test_X = test_X * .01


In [None]:
from sklearn import linear_model

# Initializes the logistic-regression model and fits it to the training data.
# max_iter is raised to 5000 (presumably so the solver converges on the unscaled features).
model = linear_model.LogisticRegression(random_state=0, max_iter=5000)
# Alternative tried: explicit L2 penalty with stronger regularisation
#model = linear_model.LogisticRegression(random_state=0, penalty = 'l2', C=.01)
model.fit(train_X, train_y)
# Class predictions (0 / 1) for the validation and test features
y_val_pred = model.predict(val_X)
y_test_pred = model.predict(test_X)

import math
from sklearn.metrics import accuracy_score, mean_squared_error

def eval_metrics(y_actual, y_predict, dataset):
    """Print accuracy, MSE and RMSE of ``y_predict`` against ``y_actual``.

    Args:
        y_actual: true labels.
        y_predict: predicted labels.
        dataset: name of the split, used in the report header.

    Nothing is returned; the report goes to standard output.
    """
    print("\nEvaluation metrics for " + dataset + ":\n")

    accuracy = accuracy_score(y_actual, y_predict)
    print("Accuracy score is: %.2f" % accuracy)

    # The squared error is computed once and reused for the root.
    squared_error = mean_squared_error(y_actual, y_predict)
    print("Mean Squared Error: %.3f" % squared_error)
    print("Root Mean Squared Error: %.3f" % math.sqrt(squared_error))

    print("-----------------------------------------")
    
# Report the metrics for the validation and test splits.
eval_metrics(val_y, y_val_pred, "Val data")
eval_metrics(test_y, y_test_pred, "Test data")

# Collect, for every validation row: the true label, the prediction, the date,
# the close price and the one-hot ticker columns (used to filter by company).
pred_df = pd.DataFrame(val_y).assign(
    pred=y_val_pred,
    date=val_df['date'],
    close=val_df['close'],
)
for ticker in ticker_list:
    pred_df[ticker] = val_X[ticker]

# Back into chronological order (the validation split was shuffled).
pred_df = pred_df.sort_values(by=['date'])
pred_df.describe()

In [None]:
from sklearn.metrics import confusion_matrix
#from sklearn.metrics import ConfusionMatrixDisplay

# Confusion matrix of the test-set predictions against the true test labels
cm = confusion_matrix(test_y, y_test_pred)

# Alternative (disabled): sklearn's built-in plot instead of a plain table
#cm_display = ConfusionMatrixDisplay(cm).plot()
# Wrapped in a DataFrame so it renders as a table
cm_display = pd.DataFrame(cm)
cm_display
# Reading the table: columns (x axis) are the predicted class,
# rows (y axis) are the actual class

In [None]:
# prints prediction stats for company
def predByCompany(ticker_list):
    """Print summary statistics of labels and predictions, one table per ticker.

    For each ticker, the rows of the notebook-level ``pred_df`` whose one-hot
    column for that ticker equals 1 are selected and their ``describe()``
    output is printed as a table.

    Args:
        ticker_list: iterable of ticker symbols that are columns of ``pred_df``.
    """
    from tabulate import tabulate # nicer looking tables when printed

    for symbol in ticker_list:
        table_columns = ['log_label', 'pred', symbol]
        company_view = pd.DataFrame(pred_df, columns=table_columns)
        company_rows = company_view[company_view[symbol] == 1]
        print(tabulate(company_rows.describe(), headers=table_columns))
        print('\n')

# ticker_list defined above
predByCompany(ticker_list)

## 11/13 Plot of 4 companies close price and model predictions

This is a cool way to visually understand the results.  It shows a bit of the model's fickleness and how, as of right now, the predictions wouldn't translate nicely to buy/sell signals.  However, I've figured out how to separate results by company and plot them, which will continue to be useful in the future.

In [None]:
import matplotlib.pyplot as plt

def _plot_company_predictions(ax, ticker, title, n_points=50):
    """Draw one company's close price with the model's predictions on ``ax``.

    Uses the first ``n_points`` rows of the notebook-level ``pred_df`` whose
    one-hot column ``ticker`` equals 1.  Green dots mark rows predicted as 1,
    red dots rows predicted as 0, and the blue line is the close price.

    Args:
        ax: matplotlib Axes to draw on.
        ticker: one-hot column of ``pred_df`` identifying the company.
        title: title of the subplot.
        n_points: number of rows to plot.
    """
    # A local name is used on purpose: the original cell reused `test_df` here
    # and thereby overwrote the test split.
    company_df = pred_df[pred_df[ticker] == 1][0:n_points]
    predicted_up = company_df[company_df['pred'] == 1]
    predicted_down = company_df[company_df['pred'] == 0]

    # The dots sit on the close price itself.
    ax.scatter(predicted_up['date'], predicted_up['close'], color='Green', label='1')
    ax.scatter(predicted_down['date'], predicted_down['close'], color='Red', label='0')

    ax.plot(company_df['date'], company_df['close'], color='Blue', label='close price')
    ax.set_title(title)
    ax.legend(loc='lower right')


# (one-hot column, plot title) for every company to show
COMPANY_PLOTS = [
    ('AAPL', "Apple Stock with model predictions"),
    ('AMZN', "Amazon Stock with model predictions"),
    ('GOOG', "Google Stock with model predictions"),
    ('MSFT', "Microsoft Stock with model predictions"),
]
PLOTS_PER_FIGURE = 2

# One figure per pair of companies, side by side.
for first in range(0, len(COMPANY_PLOTS), PLOTS_PER_FIGURE):
    figure_plots = COMPANY_PLOTS[first:first + PLOTS_PER_FIGURE]
    fig, axes = plt.subplots(1, PLOTS_PER_FIGURE, figsize=(25, 7), squeeze=False)
    for ax, (symbol, plot_title) in zip(axes[0], figure_plots):
        _plot_company_predictions(ax, symbol, plot_title)
    plt.show()

In [None]:
# this might help instead of pd.get_dummies()

'''
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')
array = df["ticker"].to_numpy()
array = array.reshape(-1, 1)

d = enc.fit_transform(array)

import scipy.sparse
df2 = pd.DataFrame.sparse.from_spmatrix(d)
df['ticker_1hot'] = df2.values.tolist()
df

# [1,0] = APPL | [0,1] = AMZN

# ---________________---

from pandas.arrays import SparseArray

#array = df["ticker_1hot"].SparseArray
#df["ticker_1hot"] = SparseArray(df["ticker_1hot"])
df

d = scipy.sparse.csr_matrix(df.values)
d'''