# Periodic download of IEX stock-ticker data

## Read IEX API credentials from `~/.config/iex.ini`

In [1]:
from pathlib import Path
config_path = Path.home() / '.config' / 'iex.ini'

from configparser import ConfigParser
config = ConfigParser()
config.read(str(config_path))
iex_config = config['iex']

api = 'https://cloud.iexapis.com'
public_key = iex_config['public_key']
secret_key = iex_config['secret_key']

In [2]:
# tickers = sorted("MMM ABT ABBV ABMD ACN ATVI ADBE AMD AAP AES AMG AFL A APD AKAM ALK ALB ARE ALXN ALGN ALLE AGN ADS LNT ALL GOOGL GOOG MO AMZN AMCR AEE AAL AEP AXP AIG AMT AWK AMP ABC AME AMGN APH ADI ANSS ANTM AON AOS APA AIV AAPL AMAT APTV ADM ARNC ANET AJG AIZ ATO T ADSK ADP AZO AVB AVY BKR BLL BAC BK BAX BBT BDX BRK.B BBY BIIB BLK HRB BA BKNG BWA BXP BSX BMY AVGO BR BF.B CHRW COG CDNS CPB COF CPRI CAH KMX CCL CAT CBOE CBRE CBS CDW CE CELG CNC CNP CTL CERN CF SCHW CHTR CVX CMG CB CHD CI XEC CINF CTAS CSCO C CFG CTXS CLX CME CMS KO CTSH CL CMCSA CMA CAG CXO COP ED STZ COO CPRT GLW CTVA COST COTY CCI CSX CMI CVS DHI DHR DRI DVA DE DAL XRAY DVN FANG DLR DFS DISCA DISCK DISH DG DLTR D DOV DOW DTE DUK DRE DD DXC ETFC EMN ETN EBAY ECL EIX EW EA EMR ETR EOG EFX EQIX EQR ESS EL EVRG ES RE EXC EXPE EXPD EXR XOM FFIV FB FAST FRT FDX FIS FITB FE FRC FISV FLT FLIR FLS FMC F FTNT FTV FBHS FOXA FOX BEN FCX GPS GRMN IT GD GE GIS GM GPC GILD GL GPN GS GWW HAL HBI HOG HIG HAS HCA HCP HP HSIC HSY HES HPE HLT HFC HOLX HD HON HRL HST HPQ HUM HBAN HII IEX IDXX INFO ITW ILMN IR INTC ICE IBM INCY IP IPG IFF INTU ISRG IVZ IPGP IQV IRM JKHY JEC JBHT SJM JNJ JCI JPM JNPR KSU K KEY KEYS KMB KIM KMI KLAC KSS KHC KR LB LHX LH LRCX LW LVS LEG LDOS LEN LLY LNC LIN LKQ LMT L LOW LYB MTB MAC M MRO MPC MKTX MAR MMC MLM MAS MA MKC MXIM MCD MCK MDT MRK MET MTD MGM MCHP MU MSFT MAA MHK TAP MDLZ MNST MCO MS MOS MSI MSCI MYL NDAQ NOV NTAP NFLX NWL NEM NWSA NWS NEE NLSN NKE NI NBL JWN NSC NTRS NOC NCLH NRG NUE NVDA NVR ORLY OXY OMC OKE ORCL PCAR PKG PH PAYX PYPL PNR PBCT PEP PKI PRGO PFE PM PSX PNW PXD PNC PPG PPL PFG PG PGR PLD PRU PEG PSA PHM PVH QRVO PWR QCOM DGX RL RJF RTN O REG REGN RF RSG RMD RHI ROK ROL ROP ROST RCL CRM SBAC SLB STX SEE SRE SHW SPG SWKS SLG SNA SO LUV SPGI SWK SBUX STT SYK STI SIVB SYMC SYF SNPS SYY TMUS TROW TTWO TPR TGT TEL FTI TFX TXN TXT TMO TIF TWTR TJX TSCO TDG TRV TRIP TSN UDR ULTA USB UAA UA UNP UAL UNH UPS URI UTX UHS UNM VFC VLO VAR VTR VRSN VRSK VZ VRTX VIAB V VNO VMC WAB WMT WBA DIS WM WAT WEC WCG WFC WELL WDC WU WRK WY WHR WMB WLTW WYNN XEL XRX XLNX XYL YUM ZBH ZION ZTS".split(" "))
# aapl = tickers.index('AAPL'); aapl
ticker = 'AAPL'
tickers = [ticker]
num_tickers = len(tickers)
num_tickers

1

In [60]:
from datetime import datetime as dt, timedelta as Δ
from dateutil.parser import parse

strptime = dt.strptime
time = dt.now
now = time()
today = now.date()
today.strftime('%Y-%m-%d')

'2019-11-03'

In [4]:
data_dir = Path.cwd() / 'data'
data_dir.mkdir(parents=True, exist_ok=True)

In [5]:
from sys import executable as python
!{python} -m pip install -Uq requests
from requests import get as GET

In [44]:
import json

def fetch(date, ticker, refetch_empty=False):
    date_str = date.strftime('%Y%m%d')
    out_path = data_dir / ('%s-%s' % (date_str, ticker))
    refetch = False
    if out_path.exists():
        if refetch_empty:
            with out_path.open('r') as f:
                data = json.load(f)
                if not data:
                    refetch = True
                    print('Re-fetching data for %s from %s' % (ticker, date_str))
                else:
                    return True
        else:
            return True
    else:
        print('Fetching data for %s from %s' % (ticker, date_str))

    url = f'https://cloud.iexapis.com/stable/stock/{ticker}/chart/date/{date_str}?token={secret_key}'
    resp = GET(url)
    resp.raise_for_status()
    with out_path.open('wb') as f:
        f.write(resp.content)

    data = json.loads(resp.content)
    if data:
        if refetch:
            print('Re-fetch found data for %s %s' % (date_str, ticker))
        return True

    return False

In [45]:
%%time
from concurrent.futures import ThreadPoolExecutor

end_date = today
start_date = datetime(2019, 4, 1).date()
N = 32
refetch_empty = False

def get_dates(start_date, end_date, step=1):
    date = start_date
    while date != end_date:
        if date.weekday() <= 4:
            yield date
        date += Δ(days=step)

dates = list(get_dates(start_date, end_date))

with ThreadPoolExecutor(max_workers = N) as p:
    results = p.map(
        lambda t: fetch(t[0], t[1], refetch_empty=refetch_empty), 
        [ (date, ticker) for date in dates for ticker in tickers ]
    )
    
    found_data = True in results
    if not found_data:
        print('No data found for %s' % date)

CPU times: user 12.6 ms, sys: 5.77 ms, total: 18.3 ms
Wall time: 14.5 ms


In [24]:
!{python} -m pip install -Uq pandas
from pandas import concat, DataFrame as DF, read_csv, read_json
import pandas as pd

In [13]:
minutes = 390  # [9:30am,4:00pm)

In [14]:
features = [ 'open', 'close', 'high', 'low', 'average', 'volume', 'notional', 'numberOfTrades' ]
num_features = len(features)

In [15]:
def load_data_arr(date, ticker):
    date_str = date.strftime('%Y%m%d')
    out_path = data_dir / ('%s-%s' % (date_str, ticker))
    if not out_path.exists():
        arr = zeros((minutes, len(features)))
        arr[:] = nan
        return arr
    df = read_json(out_path)
    if df.empty:
        arr = zeros((minutes, len(features)))
        arr[:] = nan
        return arr
    arr = df[features].values
    assert arr.shape == (minutes, len(features))
    return arr

In [16]:
!{python} -m pip install -Uq numpy joblib
import numpy as np
from numpy import array, nan, zeros, count_nonzero as cnz, isnan as na, mean, std, unique
from numpy.random import shuffle, permutation
from joblib import Parallel, delayed

In [17]:
def load_date_arr(date):
    arr = array([ 
        load_data_arr(start_date, ticker) 
        for ticker in tickers 
    ]) \
    .reshape((
        minutes, 
        len(tickers), 
        len(features),
    ))
    assert arr.shape == (minutes, num_tickers, len(features))
    return arr

In [80]:
def load_ticker_date_df(date, ticker):
    date_str = date.strftime('%Y%m%d')
    out_path = data_dir / ('%s-%s' % (date_str, ticker))
    if not out_path.exists():
        return None
    df = read_json(out_path)
    if df.empty:
        return None
    df['datetime'] = df['date'].apply(lambda d: d.strftime('%Y-%m-%d')) + ' ' + df['minute']
    df.drop(columns=['date', 'minute'])
    df = df[['datetime'] + features]
    df['datetime'] = df['datetime'].apply(lambda s: strptime(s, '%Y-%m-%d %H:%M'))
    return df

In [93]:
def load_ticker_df(ticker, N=None, limit=None):
    if limit is None:
        ds = dates
    else:
        ds = dates[:limit]

    if N is None:
        df = concat([ load_ticker_date_df(date, ticker) for date in ds ])
    else:
        df = concat(Parallel(n_jobs=N)( delayed(load_ticker_date_df)(date, ticker) for date in ds ))
    
    df.set_index('datetime', inplace=True)
    df.sort_index(inplace=True)

    for col in aapl.columns:
        df[col] = df[col].apply(lambda n: nan if n < 0 else n)

    return df

In [94]:
%%time
aapl = load_ticker_df('AAPL'); aapl

CPU times: user 4.61 s, sys: 62.7 ms, total: 4.67 s
Wall time: 4.73 s


Unnamed: 0_level_0,open,close,high,low,average,volume,notional,numberOfTrades
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-04-01 09:30:00,191.645,190.650,191.645,190.60,191.189,4320,825935.940,44
2019-04-01 09:31:00,190.700,190.980,190.980,190.64,190.761,3246,619210.510,32
2019-04-01 09:32:00,191.060,190.930,191.090,190.78,190.951,2253,430211.740,30
2019-04-01 09:33:00,190.980,190.830,191.010,190.76,190.946,2241,427911.290,27
2019-04-01 09:34:00,190.760,190.700,190.760,190.60,190.666,1069,203822.465,12
...,...,...,...,...,...,...,...,...
2019-11-01 15:55:00,255.350,255.540,255.620,255.29,255.528,4917,1256432.140,36
2019-11-01 15:56:00,255.530,255.730,255.750,255.52,255.616,4788,1223891.590,42
2019-11-01 15:57:00,255.740,255.730,255.880,255.73,255.822,3798,971613.530,38
2019-11-01 15:58:00,255.730,255.665,255.780,255.61,255.661,8396,2146526.195,88


In [95]:
aapl.count()

open              55577
close             55577
high              55577
low               55577
average           55577
volume            56367
notional          56367
numberOfTrades    56367
dtype: int64

In [101]:
nan_idxs = aapl.isna().any(axis=1); nan_idxs
nans = aapl[nan_idxs]
nans

Unnamed: 0_level_0,open,close,high,low,average,volume,notional,numberOfTrades
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-04-02 13:29:00,,,,,,0,0.0,0
2019-04-05 13:36:00,,,,,,0,0.0,0
2019-04-05 14:29:00,,,,,,0,0.0,0
2019-04-15 13:33:00,,,,,,0,0.0,0
2019-04-16 13:39:00,,,,,,0,0.0,0
...,...,...,...,...,...,...,...,...
2019-11-01 12:46:00,,,,,,0,0.0,0
2019-11-01 13:24:00,,,,,,0,0.0,0
2019-11-01 13:29:00,,,,,,0,0.0,0
2019-11-01 13:45:00,,,,,,0,0.0,0


In [124]:
last_valid_idx = -1
last_close = -1
last_closes = []
for idx, valid in enumerate(~nan_idxs):
    if valid:
        last_valid_idx = idx
        last_close = aapl['close'].iloc[idx]
    if last_valid_idx < 0:
        last_closes.append(nan)
    else:
        last_closes.append(last_close)
last_closes

[190.65,
 190.98,
 190.93,
 190.83,
 190.7,
 190.85,
 190.77,
 190.71,
 190.3,
 190.34,
 190.53,
 190.07,
 190.13,
 190.2,
 190.35,
 190.12,
 190.19,
 190.27,
 190.13,
 190.27,
 190.24,
 190.365,
 190.34,
 190.165,
 189.72,
 189.95,
 190.19,
 190.2,
 190.17,
 190.15,
 190.35,
 190.37,
 190.3,
 190.41,
 190.09,
 189.92,
 189.57,
 189.02,
 189.09,
 189.24,
 188.95,
 188.8,
 189.1,
 188.98,
 189.115,
 189.28,
 189.35,
 189.37,
 189.29,
 189.39,
 189.17,
 189.09,
 189.11,
 189.15,
 188.86,
 188.85,
 188.79,
 188.64,
 188.76,
 188.945,
 188.99,
 188.83,
 188.9,
 188.795,
 188.875,
 188.73,
 188.805,
 188.735,
 188.64,
 188.7,
 188.69,
 188.6,
 188.68,
 188.54,
 188.475,
 188.62,
 188.74,
 188.84,
 188.83,
 188.95,
 188.915,
 189.05,
 189.02,
 189.02,
 189.21,
 189.29,
 189.44,
 189.41,
 189.35,
 189.4,
 189.5,
 189.47,
 189.84,
 190.09,
 190.0,
 190.085,
 190.285,
 190.25,
 190.265,
 190.19,
 190.13,
 190.13,
 190.2,
 190.21,
 190.15,
 190.11,
 190.16,
 190.09,
 190.08,
 189.94,
 189.97,
 1

In [128]:
aapl['prev_close'] = last_closes
aapl

Unnamed: 0_level_0,open,close,high,low,average,volume,notional,numberOfTrades,prev_close
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2019-04-01 09:30:00,191.645,190.650,191.645,190.60,191.189,4320,825935.940,44,190.650
2019-04-01 09:31:00,190.700,190.980,190.980,190.64,190.761,3246,619210.510,32,190.980
2019-04-01 09:32:00,191.060,190.930,191.090,190.78,190.951,2253,430211.740,30,190.930
2019-04-01 09:33:00,190.980,190.830,191.010,190.76,190.946,2241,427911.290,27,190.830
2019-04-01 09:34:00,190.760,190.700,190.760,190.60,190.666,1069,203822.465,12,190.700
...,...,...,...,...,...,...,...,...,...
2019-11-01 15:55:00,255.350,255.540,255.620,255.29,255.528,4917,1256432.140,36,255.540
2019-11-01 15:56:00,255.530,255.730,255.750,255.52,255.616,4788,1223891.590,42,255.730
2019-11-01 15:57:00,255.740,255.730,255.880,255.73,255.822,3798,971613.530,38,255.730
2019-11-01 15:58:00,255.730,255.665,255.780,255.61,255.661,8396,2146526.195,88,255.665


In [145]:
nan_cols = [ 'open', 'close', 'high', 'low', 'average' ]
for col in nan_cols:
    aapl[col].fillna(aapl['prev_close'], inplace=True)

aapl.drop(columns='prev_close', inplace=True)
aapl[nan_cols][nan_idxs]

Unnamed: 0_level_0,open,close,high,low,average
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-04-02 13:29:00,193.000,193.000,193.000,193.000,193.000
2019-04-05 13:36:00,196.540,196.540,196.540,196.540,196.540
2019-04-05 14:29:00,196.795,196.795,196.795,196.795,196.795
2019-04-15 13:33:00,198.910,198.910,198.910,198.910,198.910
2019-04-16 13:39:00,199.725,199.725,199.725,199.725,199.725
...,...,...,...,...,...
2019-11-01 12:46:00,253.060,253.060,253.060,253.060,253.060
2019-11-01 13:24:00,253.485,253.485,253.485,253.485,253.485
2019-11-01 13:29:00,253.470,253.470,253.470,253.470,253.470
2019-11-01 13:45:00,253.750,253.750,253.750,253.750,253.750


In [146]:
aapl.count()

open              56367
close             56367
high              56367
low               56367
average           56367
volume            56367
notional          56367
numberOfTrades    56367
dtype: int64

In [161]:
next_avgs = aapl[['close']]
next_avgs['datetime'] = next_avgs.index
next_avgs['datetime'] = next_avgs['datetime'].apply(lambda dt: dt - Δ(minutes=1))
next_avgs.set_index('datetime', inplace=True)
next_avgs.rename(columns={'close': 'next_close'}, inplace=True)
next_avgs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0_level_0,next_close
datetime,Unnamed: 1_level_1
2019-04-01 09:29:00,190.650
2019-04-01 09:30:00,190.980
2019-04-01 09:31:00,190.930
2019-04-01 09:32:00,190.830
2019-04-01 09:33:00,190.700
...,...
2019-11-01 15:54:00,255.540
2019-11-01 15:55:00,255.730
2019-11-01 15:56:00,255.730
2019-11-01 15:57:00,255.665


In [172]:
aapl = aapl.merge(next_avgs, how='left', left_index=True, right_index=True).dropna(subset=['next_close']); aapl

Unnamed: 0_level_0,open,close,high,low,average,volume,notional,numberOfTrades,next_close
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2019-04-01 09:30:00,191.645,190.650,191.645,190.600,191.189,4320,825935.940,44,190.980
2019-04-01 09:31:00,190.700,190.980,190.980,190.640,190.761,3246,619210.510,32,190.930
2019-04-01 09:32:00,191.060,190.930,191.090,190.780,190.951,2253,430211.740,30,190.830
2019-04-01 09:33:00,190.980,190.830,191.010,190.760,190.946,2241,427911.290,27,190.700
2019-04-01 09:34:00,190.760,190.700,190.760,190.600,190.666,1069,203822.465,12,190.850
...,...,...,...,...,...,...,...,...,...
2019-11-01 15:54:00,255.200,255.300,255.360,255.135,255.267,3520,898538.565,37,255.540
2019-11-01 15:55:00,255.350,255.540,255.620,255.290,255.528,4917,1256432.140,36,255.730
2019-11-01 15:56:00,255.530,255.730,255.750,255.520,255.616,4788,1223891.590,42,255.730
2019-11-01 15:57:00,255.740,255.730,255.880,255.730,255.822,3798,971613.530,38,255.665


In [174]:
aapl.count()

open              56219
close             56219
high              56219
low               56219
average           56219
volume            56219
notional          56219
numberOfTrades    56219
next_close        56219
dtype: int64

In [175]:
from random import random, seed

In [262]:
validation_split = 0.1

# decide whether a given minute should be treated as a validation (otherwise: training) test case
# seed PRNG so that this info is stable over time, and we never train on samples marked for "validation"
def get_validation_flag(dt):
    seed(int(dt.strftime('%Y%m%d%H%M')))
    return random() < validation_split

vfs = aapl.index.to_series().apply(get_validation_flag); vfs
nt, nv = vfs[~vfs].count(), vfs[vfs].count()
n = nt + nv
nt / n, nv / n

(0.8985930023657482, 0.10140699763425176)

In [185]:
aapl['validation'] = vfs; aapl

Unnamed: 0_level_0,open,close,high,low,average,volume,notional,numberOfTrades,next_close,validation
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2019-04-01 09:30:00,191.645,190.650,191.645,190.600,191.189,4320,825935.940,44,190.980,False
2019-04-01 09:31:00,190.700,190.980,190.980,190.640,190.761,3246,619210.510,32,190.930,True
2019-04-01 09:32:00,191.060,190.930,191.090,190.780,190.951,2253,430211.740,30,190.830,False
2019-04-01 09:33:00,190.980,190.830,191.010,190.760,190.946,2241,427911.290,27,190.700,False
2019-04-01 09:34:00,190.760,190.700,190.760,190.600,190.666,1069,203822.465,12,190.850,False
...,...,...,...,...,...,...,...,...,...,...
2019-11-01 15:54:00,255.200,255.300,255.360,255.135,255.267,3520,898538.565,37,255.540,False
2019-11-01 15:55:00,255.350,255.540,255.620,255.290,255.528,4917,1256432.140,36,255.730,True
2019-11-01 15:56:00,255.530,255.730,255.750,255.520,255.616,4788,1223891.590,42,255.730,True
2019-11-01 15:57:00,255.740,255.730,255.880,255.730,255.822,3798,971613.530,38,255.665,False


In [202]:
aapl.to_csv(data_dir / 'aapl.csv')

In [255]:
cols = features + [ 'next_close' ]
trn = aapl[~aapl.validation][cols]
val = aapl[ aapl.validation][cols]
stats = \
    concat(
        [ 
            trn.mean(axis=0).rename('mean'), 
            trn.std(axis=0).rename('stddev') 
        ], 
        axis=1
    ) \
    .transpose(copy=True)

means = stats.loc['mean']
stddevs = stats.loc['stddev']

stats

Unnamed: 0,open,close,high,low,average,volume,notional,numberOfTrades,next_close
mean,207.866821,207.865959,207.922829,207.808743,207.86664,1725.638188,352696.439237,16.834673,207.866104
stddev,16.675256,16.676134,16.667464,16.68404,16.675257,3062.550611,647547.286346,17.739746,16.676562


In [256]:
normed = (aapl[cols] - means[cols]) / stddevs[cols]; normed

Unnamed: 0_level_0,open,close,high,low,average,volume,notional,numberOfTrades,next_close
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2019-04-01 09:30:00,-0.972808,-1.032371,-0.976623,-1.031449,-1.000143,0.847125,0.730818,1.531326,-1.012565
2019-04-01 09:31:00,-1.029479,-1.012582,-1.016521,-1.029052,-1.025810,0.496436,0.411575,0.854878,-1.015563
2019-04-01 09:32:00,-1.007890,-1.015581,-1.009921,-1.020661,-1.014416,0.172197,0.119706,0.742137,-1.021560
2019-04-01 09:33:00,-1.012687,-1.021577,-1.014721,-1.021859,-1.014715,0.168279,0.116153,0.573026,-1.029355
2019-04-01 09:34:00,-1.025881,-1.029373,-1.029720,-1.031449,-1.031507,-0.214409,-0.229904,-0.272533,-1.020360
...,...,...,...,...,...,...,...,...,...
2019-11-01 15:54:00,2.838528,2.844427,2.846094,2.836619,2.842556,0.585904,0.842938,1.136731,2.858737
2019-11-01 15:55:00,2.847523,2.858819,2.861693,2.845909,2.858208,1.042060,1.395629,1.080361,2.870130
2019-11-01 15:56:00,2.858318,2.870212,2.869493,2.859694,2.863486,0.999938,1.345377,1.418584,2.870130
2019-11-01 15:57:00,2.870911,2.870212,2.877293,2.872281,2.875839,0.676678,0.955787,1.193102,2.866232


In [261]:
concat(
    [ 
        normed[~aapl.validation].mean(axis=0).rename('train_mean'), 
        normed[~aapl.validation].std(axis=0).rename('train_stddev'),
        normed[ aapl.validation].mean(axis=0).rename('val_mean'), 
        normed[ aapl.validation].std(axis=0).rename('val_stddev'),
        normed                  .mean(axis=0).rename('mean'), 
        normed                  .std(axis=0).rename('stddev'),
    ], 
    axis=1
)

Unnamed: 0,train_mean,train_stddev,val_mean,val_stddev,mean,stddev
open,-8.449753e-14,1.0,0.011879,0.999344,0.001205,0.999931
close,1.834638e-14,1.0,0.011899,0.999314,0.001207,0.999928
high,-1.327459e-13,1.0,0.011984,0.99934,0.001215,0.999931
low,1.251468e-13,1.0,0.011832,0.999342,0.0012,0.999931
average,-3.050378e-14,1.0,0.011908,0.999346,0.001208,0.999931
volume,1.165807e-15,1.0,0.00795,1.148855,0.000806,1.016081
notional,-1.141307e-15,1.0,0.008522,1.160387,0.000864,1.017409
numberOfTrades,-4.350348e-15,1.0,0.015134,1.012939,0.001535,1.001321
next_close,-9.199683e-15,1.0,0.011911,0.999216,0.001208,0.999918


In [264]:
window = 30

def make_windowed_array(feature):        
    col = normed[feature]
    df = concat([ col.shift(lag) for lag in reversed(range(window)) ], axis=1)
    return df.values

array([ make_windowed_array for feature in features ])[(window-1):]

array([], dtype=object)

In [265]:
normed

Unnamed: 0_level_0,open,close,high,low,average,volume,notional,numberOfTrades,next_close
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2019-04-01 09:30:00,-0.972808,-1.032371,-0.976623,-1.031449,-1.000143,0.847125,0.730818,1.531326,-1.012565
2019-04-01 09:31:00,-1.029479,-1.012582,-1.016521,-1.029052,-1.025810,0.496436,0.411575,0.854878,-1.015563
2019-04-01 09:32:00,-1.007890,-1.015581,-1.009921,-1.020661,-1.014416,0.172197,0.119706,0.742137,-1.021560
2019-04-01 09:33:00,-1.012687,-1.021577,-1.014721,-1.021859,-1.014715,0.168279,0.116153,0.573026,-1.029355
2019-04-01 09:34:00,-1.025881,-1.029373,-1.029720,-1.031449,-1.031507,-0.214409,-0.229904,-0.272533,-1.020360
...,...,...,...,...,...,...,...,...,...
2019-11-01 15:54:00,2.838528,2.844427,2.846094,2.836619,2.842556,0.585904,0.842938,1.136731,2.858737
2019-11-01 15:55:00,2.847523,2.858819,2.861693,2.845909,2.858208,1.042060,1.395629,1.080361,2.870130
2019-11-01 15:56:00,2.858318,2.870212,2.869493,2.859694,2.863486,0.999938,1.345377,1.418584,2.870130
2019-11-01 15:57:00,2.870911,2.870212,2.877293,2.872281,2.875839,0.676678,0.955787,1.193102,2.866232


## Window size: 20

In [None]:
aapl = array([ load_data_arr(date, ticker) for date in dates ]); aapl.shape

In [None]:
shape = aapl.shape
aapl = aapl.reshape((shape[0] * shape[1], shape[2]))
avg = features.index('average')
avgs = aapl[:, avg]
shape = aapl.shape; shape

In [None]:
y = np.roll(avgs, -1)
y = y[:-1]
x = aapl[:-1]
x.shape, y.shape

In [None]:
window = 20
n = x.shape[0]
x = np.array([ x[i:(i+window)] for i in range(n-window+1) ])
y = y[(window-1):]
x.shape, y.shape

In [None]:
idxs = np.logical_and([ (cnz(na(row)) == 0) for row in x ], ~na(y))
y = y[idxs]
x = x[idxs]
x.shape, y.shape

In [None]:
n = x.shape[0]

In [None]:
xa = x[:, -1, avg]
xa[:10], y[:10]

In [None]:
cnz(xa > y), cnz(xa < y), cnz(xa == y)

In [None]:
(xa - y).max(), (xa - y).min()

In [None]:
mean((xa - y)**2)

-------

In [None]:
ox = x.copy()
oy = y.copy()

In [None]:
x = ox
y = oy

In [None]:
perm = permutation(n); perm

In [None]:
x = x[perm]
y = y[perm]

In [None]:
val_split = 0.1
tn = int((1 - val_split) * n)
vn = n - tn
tx = x[:tn]
ty = y[:tn]
vx = x[tn:]
vy = y[tn:]
[ a.shape for a in [tx,ty,vx,vy] ]

In [None]:
u = mean(tx[:, -1, :], axis=0)
s = std(tx[:, -1, :], axis=0)
x = (x - u) / s
y = (y - u[avg]) / s[avg]
mean(x[:, -1, :], axis=0), std(x[:, -1, :], axis=0), mean(y), std(y)

In [None]:
val_split = 0.1
tn = int((1 - val_split) * n)
vn = n - tn
tx = x[:tn]
ty = y[:tn]
vx = x[tn:]
vy = y[tn:]
[ a.shape for a in [tx,ty,vx,vy] ]

In [None]:
from tensorflow.keras.layers import Input, SimpleRNN, Dense
from tensorflow.keras import Sequential

In [None]:
model = Sequential()
model.add(SimpleRNN(4, input_shape=(window, len(features))))
model.add(Dense(1))
model.build()
model.compile(loss='mae', optimizer='adam')
model.summary()

In [None]:
px = model.predict(vx)
px.max() - px.min(), x[:10, -1, 0], y[:10], px[:10, -1]

In [None]:
mean(abs(tx[:, -1, avg] - ty)), mean(abs(vx[:, -1, avg] - vy)), mean(abs(x[:, -1, avg] - y))

In [None]:
%%time
model.fit(x, y, 
          validation_split=0.1,
          batch_size=n,
          epochs=100000)

px = model.predict(x)
res = np.swapaxes(array([x[:10, -1, 0], y[:10], px[:10, -1]]), 0, 1)
px.max() - px.min(), res

In [None]:
w = model.get_weights(); w, [ l.shape for l in w ]

In [None]:
!{python} -m pip install -Uq scipy
from scipy.stats import describe

In [None]:
describe(ty)