In [35]:
#!/Users/soraward/opt/miniconda3/bin/python3 
data_root = "../../data/"
# ML stuff
import numpy as np
from numpy.fft import *
import torch
from sklearn.linear_model import Lasso
import pandas as pd


from PIL import Image
# plotting
import matplotlib.pyplot as plt
import seaborn as sns


# basic stuff
import datetime
import requests
import io
from collections import Counter

#stats stuff
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf


In [36]:
# set index as datetime
def date_index_nasdaq(nasdaq):
    nasdaq_c = nasdaq.copy()
    dates = pd.to_datetime(nasdaq_c.Date)
    nasdaq_c.set_index(dates, inplace=True)
    # set date as index
    nasdaq_c.drop("Date", axis=1, inplace=True)
    nasdaq_c = nasdaq_c["2012-05-18":]
    return nasdaq_c


# for prepare_stock
def date_range_df(start, end, column_name = "Time"):
    date_range = pd.date_range(start, end)
    df = pd.DataFrame(date_range, columns = [column_name])
    df.set_index(column_name, inplace=True)
    return df

# merging with date range df
def prepare_stock(nasdaq, start, end, stock_name="AAPL", drop=True):
    nasdaq = nasdaq.loc[nasdaq["Name"]==stock_name]
    dates = date_range_df(start, end)
    new_nasdaq = dates.merge(nasdaq, how="left", left_index=True, right_index=True)
    if drop:
        new_nasdaq.dropna(inplace=True)
    return new_nasdaq

# create features volatility, volume, adj close
def get_features(nasdaq):
    #rename Adj Close
    nasdaq.rename(columns={"Adj Close":"Adj_Close"}, inplace=True)
    nasdaq["log_Volatility"] = np.log(nasdaq.High - nasdaq.Low + 1)
    nasdaq["log_Volume"] = np.log(nasdaq.Volume + 1) 
    nasdaq["log_Adj_Close"] = np.log(nasdaq["Adj_Close"] + 1)
    # nasdaq["log_Adj_Close_diff"] = nasdaq["log_Adj_Close"].diff()
    nasdaq.drop(columns = ["Low", "High", "Close", "Open", "Name", "Volume"], inplace=True)
    # nasdaq.dropna(inplace = True)
    return nasdaq

# this will return feature engineered stock dataframe
def get_stock(nasdaq, stock_name="AAPL"):
    nasdaq_c = date_index_nasdaq(nasdaq)
    stock = prepare_stock(nasdaq_c, nasdaq_c.index[0], nasdaq_c.index[-1], stock_name)
    stock = get_features(stock)
    stock.fillna("ffill", inplace=True)
    return stock

# plot heatmap for top stocks
def plot_attribute(nasdaq, using,feature="log_Adj_Close"):
    stocks = pd.DataFrame()
    for name in using:
        stocks[name] = get_stock(nasdaq, name)[feature]
    stocks.dropna(inplace=True)
    stocks.plot()
    plt.show()

# for ARIMA or some shit    
def reindex(df):
    return df.reindex(pd.date_range(df.index[0], df.index[-1])).fillna(method="ffill")

In [37]:
nasdaq = pd.read_csv(data_root + "NASDAQ_100_Data_From_2010.csv", sep="\t")
# nasdaq.groupby("Name").apply(lambda df: print(df.shape))

# the line below is to get same sized data
    # first_names = list(set([name for name in nasdaq.Name]))
    # using = []
    # for i in first_names:
    #     if nasdaq.groupby("Name").get_group(i).shape == (2943, 8):
    #         using.append(i)
    # len(using)
features = {'Adj_Close', 'log_Volatility', 'log_Volume', 'log_Adj_Close'}
using = ['FB', 'TSLA', 'AAPL', 'AMZN', 'NVDA', 'MSFT', 'GOOGL']
# AAPL(Apple), MSFT(Microsoft), GOOGL(Google), AMZN(Amazon), TSLA(Tesla), FB(Facebook), NVDA(Nvidia)

In [80]:
def get_train_df(nasdaq, using, features):
    df_features_arr = reindex(get_stock(nasdaq, using[0])).to_numpy().T
    for name in using[1:]:
        adding = reindex(get_stock(nasdaq, name)).to_numpy().T
        df_features_arr = np.concatenate([df_features_arr, adding])
    df_features_arr = df_features_arr.T

    ## df_features = pd.DataFrame(data=df_features_arr, columns=pd.MultiIndex.from_tuples(zip(col_one, col_two)))
    
    # making columns
    # features must not include weekday here
    assert("weekday" not in features)
    col_one = []
    for element in using:
        for i in range(len(features)):
            col_one.append(element)
    col_two = list(features)*len(using)
    print(len(col_one), len(col_two))
    # scaling 
    scaler = MinMaxScaler((-1, 1))
    scaled = scaler.fit_transform(df_features_arr)
    df_features = pd.DataFrame(data=scaled, columns=pd.MultiIndex.from_tuples(zip(col_one, col_two)))

    df_features.index = pd.date_range("2012-05-18", "2021-09-10")

    day_of_week = np.array(list(map(lambda date: date.weekday(), df_features.index)))
    day_of_week = day_of_week.reshape(-1, 1)
    day_of_week = pd.Series(data=scaler.fit_transform(day_of_week).reshape(-1,), index = df_features.index)
    df_features["weekday"] = day_of_week

    return df_features

from sklearn.preprocessing import MinMaxScaler

# for feeding into network
def get_train_arr(nasdaq, using, features):
    df_features_arr = []
    for name in using:
        arr = reindex(get_stock(nasdaq, name)).to_numpy()
        # scaling for each column, for each stock_df in nasdaq
        scaler = MinMaxScaler(feature_range=(-1, 1))
        arr_scaled = scaler.fit_transform(arr)    

        # adding day of week
        day_of_week = np.array(list(map(lambda date: date.weekday(), pd.date_range("2012-05-18", "2021-09-10"))))
        day_of_week = day_of_week.reshape(-1, 1)
        day_of_week = scaler.fit_transform(day_of_week)
      
        arr_scaled = np.concatenate([arr_scaled, day_of_week], axis=1)

        df_features_arr.append(arr_scaled)


    df_features_arr = np.array(df_features_arr)
    features.add("weekday")
    df_features_arr = df_features_arr.reshape(-1, len(features), 7)

    return df_features_arr

In [84]:
get_train_arr(nasdaq, using, features).shape

(3403, 5, 7)

In [82]:
get_train_df(nasdaq, using, features={'Adj_Close', 'log_Volatility', 'log_Volume', 'log_Adj_Close'})

28 28


Unnamed: 0_level_0,FB,FB,FB,FB,TSLA,TSLA,TSLA,TSLA,AAPL,AAPL,...,NVDA,MSFT,MSFT,MSFT,MSFT,GOOGL,GOOGL,GOOGL,GOOGL,weekday
Unnamed: 0_level_1,Adj_Close,log_Volume,log_Adj_Close,log_Volatility,Adj_Close,log_Volume,log_Adj_Close,log_Volatility,Adj_Close,log_Volume,...,log_Volatility,Adj_Close,log_Volume,log_Adj_Close,log_Volatility,Adj_Close,log_Volume,log_Adj_Close,log_Volatility,Unnamed: 21_level_1
2012-05-18,-0.887502,0.271078,1.000000,-0.510124,-0.999335,-0.904507,-0.426573,-0.981489,-0.943105,-0.634701,...,-0.979027,-0.985425,-0.760133,0.153248,-0.933668,-0.984229,0.026879,0.631053,-0.939133,0.333333
2012-05-19,-0.887502,0.271078,1.000000,-0.510124,-0.999335,-0.904507,-0.426573,-0.981489,-0.943105,-0.634701,...,-0.979027,-0.985425,-0.760133,0.153248,-0.933668,-0.984229,0.026879,0.631053,-0.939133,0.666667
2012-05-20,-0.887502,0.271078,1.000000,-0.510124,-0.999335,-0.904507,-0.426573,-0.981489,-0.943105,-0.634701,...,-0.979027,-0.985425,-0.760133,0.153248,-0.933668,-0.984229,0.026879,0.631053,-0.939133,1.000000
2012-05-21,-0.910550,-0.100568,0.463670,-0.585156,-0.998783,-0.871515,-0.462524,-0.966767,-0.929994,-0.539763,...,-0.972909,-0.982635,-0.724027,-0.058080,-0.921593,-0.979000,-0.392194,0.297242,-0.919871,-1.000000
2012-05-22,-0.927178,-0.268563,0.244101,-0.645101,-0.997858,-0.924417,-0.276817,-0.943213,-0.931823,-0.633574,...,-0.977271,-0.982577,-0.876786,-0.047643,-0.921344,-0.984076,-0.329549,0.293385,-0.938565,-0.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-09-06,0.967513,-0.045294,-0.895508,0.989683,0.659357,0.029193,-0.177142,0.925245,0.966929,-0.342935,...,1.000000,0.975165,-0.279450,-0.609046,0.991072,0.977504,0.385508,-0.619427,0.991259,-1.000000
2021-09-07,1.000000,0.478093,-0.736901,1.000000,0.703441,0.344718,-0.069687,0.935736,1.000000,-0.008963,...,0.996184,0.968372,-0.176434,-0.522061,0.988612,0.985750,0.056621,-0.612534,0.994473,-0.666667
2021-09-08,0.974702,0.187063,-0.725966,0.991980,0.705606,0.397107,-0.094934,0.936244,0.978137,0.029051,...,0.989297,0.968584,-0.134340,-0.597614,0.988689,0.976765,0.264463,-0.659099,0.990970,-0.333333
2021-09-09,0.977061,-0.162289,-0.811581,0.992732,0.707861,0.055987,-0.208480,0.936772,0.963747,-0.169713,...,0.985806,0.947641,0.132019,-0.437561,0.981056,0.974479,0.151836,-0.749873,0.990076,0.000000


In [46]:
get_train_arr(nasdaq, using, features)[0][0]

array([-0.88750171,  0.27107764,  1.        , -0.5101242 ,  0.33333333,
       -0.88750171,  0.27107764])

In [57]:
features

{'Adj_Close', 'log_Adj_Close', 'log_Volatility', 'log_Volume', 'weekday'}