In [261]:
#!/Users/soraward/opt/miniconda3/bin/python3 
data_root = "../../data/"
# ML stuff
import numpy as np
from numpy.fft import *
import torch
from sklearn.linear_model import Lasso
import pandas as pd


from PIL import Image
# plotting
import matplotlib.pyplot as plt
import seaborn as sns


# basic stuff
import datetime
import requests
import io
from collections import Counter

#stats stuff
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf


In [262]:
def date_index_nasdaq(nasdaq, start="2012-05-18"):
    """Return a copy of ``nasdaq`` indexed by its parsed ``Date`` column.

    The ``Date`` column is converted with ``pd.to_datetime``, moved into the
    index (named "Date") and removed from the columns. Rows dated before
    ``start`` are discarded. The input frame is not modified.

    Args:
        nasdaq: DataFrame with a ``Date`` column that ``pd.to_datetime`` can parse.
        start: earliest date to keep (inclusive). Defaults to "2012-05-18",
            the cutoff this function has always used.

    Returns:
        A new DataFrame with a DatetimeIndex, keeping the original row order.
    """
    indexed = nasdaq.assign(Date=pd.to_datetime(nasdaq["Date"])).set_index("Date")
    # Filter with a boolean mask instead of label slicing (`df["2012-05-18":]`):
    # the table stacks many tickers, so the index is not sorted, and label
    # slicing on a non-monotonic DatetimeIndex raises KeyError whenever the
    # exact cutoff date is absent. The mask keeps the same rows in every case
    # where the slice worked.
    return indexed.loc[indexed.index >= pd.Timestamp(start)]


# helper for prepare_stock
def date_range_df(start, end, column_name="Time"):
    """Build a column-less DataFrame whose index is every day from start to end.

    Args:
        start: first day of the range (anything ``pd.date_range`` accepts).
        end: last day of the range, inclusive.
        column_name: name given to the resulting index.

    Returns:
        DataFrame with no columns and a daily datetime index named ``column_name``.
    """
    days = pd.date_range(start, end)
    calendar = pd.DataFrame({column_name: days})
    return calendar.set_index(column_name)

# align one stock's rows with a daily calendar
def prepare_stock(nasdaq, start, end, stock_name="AAPL", drop=True):
    """Left-join the rows of one stock onto a daily calendar.

    Args:
        nasdaq: date-indexed price table with a ``Name`` column.
        start: first calendar day.
        end: last calendar day (inclusive).
        stock_name: value of ``Name`` selecting the stock.
        drop: when True, rows containing any missing value (including calendar
            days that have no matching row) are removed.

    Returns:
        DataFrame indexed by calendar day, holding the selected stock's columns.
    """
    is_selected = nasdaq["Name"] == stock_name
    calendar = date_range_df(start, end)
    merged = calendar.merge(
        nasdaq.loc[is_selected],
        how="left",
        left_index=True,
        right_index=True,
    )
    if not drop:
        return merged
    merged.dropna(inplace=True)
    return merged

# derive log volatility, log volume and log adjusted close
def get_features(nasdaq):
    """Add log-scaled feature columns to ``nasdaq`` and drop the raw price columns.

    The frame is modified in place and the same object is returned. After the
    call it holds ``Adj_Close`` (renamed from "Adj Close") plus
    ``log_Volatility`` = log(High - Low + 1), ``log_Volume`` = log(Volume + 1)
    and ``log_Adj_Close`` = log(Adj_Close + 1); the Low, High, Close, Open,
    Name and Volume columns are removed.
    """
    def shifted_log(values):
        # the +1 shift keeps zero inputs finite: log(0 + 1) == 0
        return np.log(values + 1)

    nasdaq.rename(columns={"Adj Close": "Adj_Close"}, inplace=True)

    nasdaq["log_Volatility"] = shifted_log(nasdaq.High - nasdaq.Low)
    nasdaq["log_Volume"] = shifted_log(nasdaq.Volume)
    nasdaq["log_Adj_Close"] = shifted_log(nasdaq["Adj_Close"])

    raw_columns = ["Low", "High", "Close", "Open", "Name", "Volume"]
    nasdaq.drop(columns=raw_columns, inplace=True)
    return nasdaq

# feature-engineered frame for a single stock
def get_stock(nasdaq, stock_name="AAPL"):
    """Return the feature-engineered, date-indexed frame for one stock.

    Pipeline: ``date_index_nasdaq`` -> ``prepare_stock`` -> ``get_features``,
    followed by a forward fill of any values that are still missing.

    Args:
        nasdaq: raw price table with ``Date`` and ``Name`` columns.
        stock_name: ticker to extract.

    Returns:
        DataFrame indexed by date with the columns produced by ``get_features``.
    """
    nasdaq_c = date_index_nasdaq(nasdaq)
    # The table stacks several tickers, so its first/last rows are not
    # guaranteed to carry the earliest/latest dates; use min/max so the
    # calendar always spans the whole table.
    first_day, last_day = nasdaq_c.index.min(), nasdaq_c.index.max()
    stock = prepare_stock(nasdaq_c, first_day, last_day, stock_name)
    stock = get_features(stock)
    # Forward-fill gaps. The previous `fillna("ffill")` did not forward-fill:
    # it wrote the literal string "ffill" into every missing cell.
    return stock.ffill()

# line plot of one feature across several stocks
def plot_attribute(nasdaq, using, feature="log_Adj_Close"):
    """Plot ``feature`` over time for every stock in ``using`` on one set of axes.

    Args:
        nasdaq: raw price table accepted by ``get_stock``.
        using: iterable of ticker names; each becomes one line in the plot.
        feature: column of the ``get_stock`` result to draw.

    Rows where any of the stocks has no value are left out of the plot.
    """
    per_stock = pd.DataFrame()
    for ticker in using:
        # column assignment aligns each series on the first stock's dates
        per_stock[ticker] = get_stock(nasdaq, ticker)[feature]
    per_stock.dropna().plot()
    plt.show()

# gap-free daily index, e.g. for time-series models such as ARIMA
def reindex(df):
    """Expand ``df`` to a continuous daily index and forward-fill the gaps.

    Args:
        df: non-empty DataFrame with a date index; its first and last index
            entries are used as the bounds of the daily range.

    Returns:
        New DataFrame with one row per calendar day between those bounds; days
        missing from ``df`` repeat the most recent earlier row.
    """
    daily_index = pd.date_range(df.index[0], df.index[-1])
    # DataFrame.ffill() replaces the deprecated fillna(method="ffill") spelling.
    return df.reindex(daily_index).ffill()

In [263]:
# Load the raw price table; the file is read as tab-separated text from data_root.
nasdaq = pd.read_csv(data_root + "NASDAQ_100_Data_From_2010.csv", sep="\t")
# Debug helper (disabled): print the shape of each ticker's rows.
# nasdaq.groupby("Name").apply(lambda df: print(df.shape))

# Disabled exploration: collect the tickers whose group has shape (2943, 8),
# i.e. tickers with same-sized data.
# first_names = list(set([name for name in nasdaq.Name]))
# using = []
# for i in first_names:
#     if nasdaq.groupby("Name").get_group(i).shape == (2943, 8):
#         using.append(i)
# len(using)
# Feature column names, in the order get_features() leaves them in the frame.
features = ['Adj_Close', 'log_Volatility', 'log_Volume', 'log_Adj_Close']
# Tickers used in this notebook.
using = ['FB', 'TSLA', 'AAPL', 'AMZN', 'NVDA', 'MSFT', 'GOOGL']
# AAPL (Apple), MSFT (Microsoft), GOOGL (Google), AMZN (Amazon), TSLA (Tesla), FB (Facebook), NVDA (Nvidia)

In [358]:
def get_train_df_(nasdaq, using, features):
    """Build one scaled, date-indexed feature table covering every stock in ``using``.

    Each stock is passed through ``get_stock`` and ``reindex``; the resulting
    blocks are placed side by side, every column is min-max scaled to [-1, 1],
    and an unscaled ``weekday`` column (``DatetimeIndex.weekday``, Monday=0)
    is appended.

    Relies on ``MinMaxScaler`` being imported in the notebook namespace before
    this function is called.

    Args:
        nasdaq: raw price table accepted by ``get_stock``.
        using: sequence of ticker names; must not be empty.
        features: names of the columns returned by ``get_stock``, in order.
            They become the second level of the column MultiIndex.

    Returns:
        DataFrame indexed by day with ``(ticker, feature)`` columns plus
        ``weekday``.

    Raises:
        IndexError: if ``using`` is empty.
        ValueError: if the stocks do not all cover the same days.
    """
    frames = [reindex(get_stock(nasdaq, name)) for name in using]

    # Take the index from the data instead of a hard-coded date range, so the
    # labels can never drift away from the rows they describe.
    daily_index = frames[0].index
    for name, frame in zip(using, frames):
        if not frame.index.equals(daily_index):
            raise ValueError(
                f"{name} does not cover the same days as {using[0]}; "
                "the per-stock blocks cannot be placed side by side"
            )

    # (days, len(using) * n_features): one block of feature columns per stock.
    stacked = np.concatenate([frame.to_numpy() for frame in frames], axis=1)

    columns = pd.MultiIndex.from_tuples(
        [(name, feature) for name in using for feature in features]
    )
    scaled = MinMaxScaler((-1, 1)).fit_transform(stacked)
    df_features = pd.DataFrame(data=scaled, index=daily_index, columns=columns)

    # Added after scaling on purpose: the weekday stays a plain 0-6 integer.
    df_features["weekday"] = np.asarray(daily_index.weekday, dtype=np.int64)

    return df_features

from sklearn.preprocessing import MinMaxScaler

# array form of the features, for feeding into a network
def get_train_arr(nasdaq, using):
    """Build the scaled feature array for the stocks in ``using``.

    For every stock the ``get_stock`` + ``reindex`` frame is min-max scaled to
    [-1, 1] column by column (one scaler per stock) and the weekday of each
    row (``DatetimeIndex.weekday``, Monday=0) is appended as a last, unscaled
    column.

    Args:
        nasdaq: raw price table accepted by ``get_stock``.
        using: sequence of ticker names; must not be empty.

    Returns:
        Float array of shape ``(n_days, n_features + 1, len(using))``. With the
        notebook's four features and seven tickers this is ``(n_days, 5, 7)``,
        the same shape as before, but ``result[d, f, s]`` is now really
        feature ``f`` of stock ``s`` on day ``d``.

    Raises:
        ValueError: if ``using`` is empty or the stocks do not all have the
            same number of days.
    """
    if len(using) == 0:
        raise ValueError("`using` must contain at least one stock name")

    per_stock = []
    for name in using:
        frame = reindex(get_stock(nasdaq, name))
        scaled = MinMaxScaler(feature_range=(-1, 1)).fit_transform(frame.to_numpy())
        # Weekday comes from the frame's own index rather than a hard-coded
        # date range, so it always lines up with the rows.
        weekday = np.asarray(frame.index.weekday).reshape(-1, 1)
        per_stock.append(np.concatenate([scaled, weekday], axis=1))

    # Stack on a new trailing axis -> (days, features, stocks). The previous
    # `np.array(...).reshape(-1, 5, 7)` only reinterpreted the
    # (stocks, days, features) memory layout, which mixed days and features
    # along the "stock" axis instead of moving the axes.
    return np.stack(per_stock, axis=-1)