# Periodic download of IEX stock-ticker data

## Read IEX API credentials from `~/.config/iex.ini`

In [2]:
from pathlib import Path
config_path = Path.home() / '.config' / 'iex.ini'

from configparser import ConfigParser
config = ConfigParser()
config.read(str(config_path))
iex_config = config['iex']

api = 'https://cloud.iexapis.com'
public_key = iex_config['public_key']
secret_key = iex_config['secret_key']

In [3]:
tickers = sorted("MMM ABT ABBV ABMD ACN ATVI ADBE AMD AAP AES AMG AFL A APD AKAM ALK ALB ARE ALXN ALGN ALLE AGN ADS LNT ALL GOOGL GOOG MO AMZN AMCR AEE AAL AEP AXP AIG AMT AWK AMP ABC AME AMGN APH ADI ANSS ANTM AON AOS APA AIV AAPL AMAT APTV ADM ARNC ANET AJG AIZ ATO T ADSK ADP AZO AVB AVY BKR BLL BAC BK BAX BBT BDX BRK.B BBY BIIB BLK HRB BA BKNG BWA BXP BSX BMY AVGO BR BF.B CHRW COG CDNS CPB COF CPRI CAH KMX CCL CAT CBOE CBRE CBS CDW CE CELG CNC CNP CTL CERN CF SCHW CHTR CVX CMG CB CHD CI XEC CINF CTAS CSCO C CFG CTXS CLX CME CMS KO CTSH CL CMCSA CMA CAG CXO COP ED STZ COO CPRT GLW CTVA COST COTY CCI CSX CMI CVS DHI DHR DRI DVA DE DAL XRAY DVN FANG DLR DFS DISCA DISCK DISH DG DLTR D DOV DOW DTE DUK DRE DD DXC ETFC EMN ETN EBAY ECL EIX EW EA EMR ETR EOG EFX EQIX EQR ESS EL EVRG ES RE EXC EXPE EXPD EXR XOM FFIV FB FAST FRT FDX FIS FITB FE FRC FISV FLT FLIR FLS FMC F FTNT FTV FBHS FOXA FOX BEN FCX GPS GRMN IT GD GE GIS GM GPC GILD GL GPN GS GWW HAL HBI HOG HIG HAS HCA HCP HP HSIC HSY HES HPE HLT HFC HOLX HD HON HRL HST HPQ HUM HBAN HII IEX IDXX INFO ITW ILMN IR INTC ICE IBM INCY IP IPG IFF INTU ISRG IVZ IPGP IQV IRM JKHY JEC JBHT SJM JNJ JCI JPM JNPR KSU K KEY KEYS KMB KIM KMI KLAC KSS KHC KR LB LHX LH LRCX LW LVS LEG LDOS LEN LLY LNC LIN LKQ LMT L LOW LYB MTB MAC M MRO MPC MKTX MAR MMC MLM MAS MA MKC MXIM MCD MCK MDT MRK MET MTD MGM MCHP MU MSFT MAA MHK TAP MDLZ MNST MCO MS MOS MSI MSCI MYL NDAQ NOV NTAP NFLX NWL NEM NWSA NWS NEE NLSN NKE NI NBL JWN NSC NTRS NOC NCLH NRG NUE NVDA NVR ORLY OXY OMC OKE ORCL PCAR PKG PH PAYX PYPL PNR PBCT PEP PKI PRGO PFE PM PSX PNW PXD PNC PPG PPL PFG PG PGR PLD PRU PEG PSA PHM PVH QRVO PWR QCOM DGX RL RJF RTN O REG REGN RF RSG RMD RHI ROK ROL ROP ROST RCL CRM SBAC SLB STX SEE SRE SHW SPG SWKS SLG SNA SO LUV SPGI SWK SBUX STT SYK STI SIVB SYMC SYF SNPS SYY TMUS TROW TTWO TPR TGT TEL FTI TFX TXN TXT TMO TIF TWTR TJX TSCO TDG TRV TRIP TSN UDR ULTA USB UAA UA UNP UAL UNH UPS URI UTX UHS UNM VFC VLO VAR VTR VRSN VRSK VZ VRTX VIAB V VNO VMC WAB WMT WBA DIS WM WAT WEC WCG WFC WELL WDC WU WRK WY WHR WMB WLTW WYNN XEL XRX XLNX XYL YUM ZBH ZION ZTS".split(" "))
len(tickers)

505

In [4]:
from datetime import datetime
from dateutil.parser import parse
from datetime import timedelta as Δ

time = datetime.now
now = time()
today = now.date()
today

datetime.date(2019, 10, 27)

In [5]:
data_dir = Path.cwd() / 'data'
data_dir.mkdir(parents=True, exist_ok=True)

In [6]:
from sys import executable as python
!{python} -m pip install -Uq requests
from requests import get as GET

In [7]:
import json

def fetch(date_str, ticker):
    out_path = data_dir / ('%s-%s' % (date_str, ticker))
    if out_path.exists():
        return True

    print('Fetching data for %s from %s' % (ticker, date_str))

    url = f'https://cloud.iexapis.com/stable/stock/{ticker}/chart/date/{date_str}?token={secret_key}'
    resp = GET(url)
    resp.raise_for_status()
    with out_path.open('wb') as f:
        f.write(resp.content)

    data = json.loads(resp.content)
    if data:
        return True

    return False

In [85]:
%%time
from concurrent.futures import ThreadPoolExecutor

end_date = today
start_date = datetime(2019, 8, 1).date()
N = 32

def get_dates(start_date, end_date, step=1):
    date = start_date
    while date != end_date:
        if date.weekday() <= 4:
            yield date
        date += Δ(days=step)

dates = list(get_dates(start_date, end_date))

for date in dates:
    date_str = date.strftime('%Y%m%d')
    
    with ThreadPoolExecutor(max_workers = N) as p:
        results = p.map(lambda ticker: fetch(date_str, ticker), tickers)
    
    found_data = True in results
    if not found_data:
        print('No data found for %s; breaking' % date)

CPU times: user 1.89 s, sys: 547 ms, total: 2.43 s
Wall time: 2.07 s


In [11]:
!{python} -m pip install -Uq pandas
from pandas import DataFrame as DF, read_csv, read_json
import pandas as pd

In [12]:
def load_data(date, ticker):
    date_str = date.strftime('%Y%m%d')
    out_path = data_dir / ('%s-%s' % (date_str, ticker))
    if not out_path.exists():
        return None
    data = read_json(str(out_path))
    data['ticker'] = ticker
    return data

In [301]:
minutes = 390  # [9:30am,4:00pm)

In [306]:
def load_data_arr(date, ticker):
    date_str = date.strftime('%Y%m%d')
    out_path = data_dir / ('%s-%s' % (date_str, ticker))
    arr = zeros((minutes, len(features)))
    arr[:] = nan
    if not out_path.exists():
        return arr
    df = read_json(out_path)
    if df.empty:
        return arr
    arr = df[features].values
    assert arr.shape == (minutes, len(features))
    return arr

In [314]:
def load_date_arr(date):
    arr = np.array([ 
        load_data_arr(start_date, ticker) 
        for ticker in tickers 
    ]) \
    .reshape((
        minutes, 
        len(tickers), 
        len(features),
    ))
    assert arr.shape == (minutes, num_tickers, len(features))
    return arr

In [315]:
all = \
    np.array(
        Parallel(n_jobs=8)(delayed(load_date_arr)(date) for date in dates)
    )

In [316]:
shape =  all.shape; shape

(62, 390, 505, 8)

In [333]:
all = all.reshape(shape[0] * shape[1], *shape[2:]); all.shape

(24180, 505, 8)

In [13]:
!{python} -m pip install -Uq joblib
from joblib import Parallel, delayed

In [53]:
def clean_data(df):
    df['datetime'] = (df['date'].dt.strftime('%Y-%m-%d') + ' ' + df['minute']).apply(lambda s: datetime.strptime(s, '%Y-%m-%d %H:%M'))

    df.drop(columns=[ 'date', 'minute', 'label', 'changeOverTime', 'marketChangeOverTime', ], inplace=True)

    cols = list(df.columns)
    cols.remove('datetime')
    cols.remove('ticker')
    cols.remove('open')
    cols.remove('close')
    cols = [ 'datetime', 'ticker', 'open', 'close', ] + cols
    df = df.reindex(columns=cols)

    return df    

In [282]:
len(minutes) / 390

61.0

In [54]:
def make_date_csv(date):
    date_str = date.strftime('%Y%m%d')
    date_path = data_dir / ('%s.csv.gz' % date_str)
    if date_path.exists():
        print('Found %s; skipping' % date_path)
        return None

    print('Loading prices for %s' % date)
    dfs = [
        df
        for df in Parallel(n_jobs=8)(delayed(lambda t: load_data(*t))((date, ticker)) for ticker in tickers)
        if not df.empty
    ]
    if not dfs:
        print('%s: no data found' % date)
        return None

    df = pd.concat(dfs)
    df = clean_data(df)

    print('Writing: %s' % date_path)
    df.to_csv(date_path, index=False)
    print('Done')

In [55]:
%%time
Parallel(n_jobs=8)(delayed(make_date_csv)(date) for date in dates); None

CPU times: user 598 ms, sys: 79.8 ms, total: 677 ms
Wall time: 5min 42s


In [None]:
for date in dates:
    for minute in range(390):
        for ticker in tickers:
            

In [68]:
%%time
def load_date_csv(date):
    date_str = date.strftime('%Y%m%d')
    date_path = data_dir / ('%s.csv.gz' % date_str)
    if not date_path.exists():
        return None

    df = read_csv(date_path)

    cols = [ 'open', 'close', 'high', 'low', 'average', 'volume', 'notional', 'numberOfTrades' ]
    cols += [
        ('market%s%s' % (col[0].upper(), col[1:]))
        for col in cols
    ]
    cols = [ 'datetime', 'ticker' ] + cols
    df = df.reindex(columns=cols)

    return df

dfs = Parallel(n_jobs=8)(delayed(load_date_csv)(date) for date in dates)
df = pd.concat(dfs)
df.count()

CPU times: user 5.38 s, sys: 7.22 s, total: 12.6 s
Wall time: 16.9 s


datetime                11913720
ticker                  11913720
open                     7358232
close                    7358232
high                     7358232
low                      7358232
average                  7358232
volume                  11912614
notional                11912614
numberOfTrades          11912614
marketOpen              11345025
marketClose             11345025
marketHigh              11345025
marketLow               11345025
marketAverage           11345025
marketVolume            11912614
marketNotional          11912614
marketNumberOfTrades    11912614
dtype: int64

In [69]:
df

Unnamed: 0,datetime,ticker,open,close,high,low,average,volume,notional,numberOfTrades,marketOpen,marketClose,marketHigh,marketLow,marketAverage,marketVolume,marketNotional,marketNumberOfTrades
0,2019-08-01 09:30:00,AAL,30.400,30.380,30.400,30.380,30.387,300.0,9116.000,3.0,30.430,30.525,30.525,30.280,30.411,67173.0,2.042809e+06,148.0
1,2019-08-01 09:31:00,AAL,30.385,30.320,30.385,30.320,30.352,200.0,6070.500,2.0,30.470,30.320,30.470,30.320,30.381,18813.0,5.715578e+05,85.0
2,2019-08-01 09:32:00,AAL,30.410,30.460,30.460,30.405,30.425,2900.0,88233.000,11.0,30.320,30.440,30.477,30.320,30.432,40149.0,1.221800e+06,198.0
3,2019-08-01 09:33:00,AAL,30.370,30.370,30.370,30.370,30.370,10.0,303.700,1.0,30.440,30.360,30.440,30.360,30.387,5256.0,1.597126e+05,33.0
4,2019-08-01 09:34:00,AAL,30.405,30.385,30.425,30.385,30.414,2800.0,85160.000,10.0,30.370,30.390,30.440,30.360,30.392,20080.0,6.102805e+05,128.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196945,2019-10-25 15:55:00,ZTS,124.290,124.290,124.290,124.290,124.290,100.0,12429.000,1.0,124.245,124.260,124.290,124.245,124.268,9108.0,1.131836e+06,82.0
196946,2019-10-25 15:56:00,ZTS,124.270,124.300,124.330,124.270,124.305,400.0,49722.000,4.0,124.260,124.320,124.340,124.260,124.298,6224.0,7.736294e+05,53.0
196947,2019-10-25 15:57:00,ZTS,124.330,124.370,124.380,124.330,124.366,450.0,55964.790,7.0,124.310,124.360,124.380,124.310,124.339,10671.0,1.326819e+06,84.0
196948,2019-10-25 15:58:00,ZTS,124.370,124.405,124.405,124.370,124.384,804.0,100005.020,9.0,124.370,124.430,124.430,124.360,124.386,7517.0,9.350066e+05,70.0


In [70]:
!{python} -m pip install -Uq tensorflow
import tensorflow as tf
from tensorflow.keras import layers

In [71]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN

In [75]:
model = Sequential([
    SimpleRNN(128, input_shape=(504,8)),
    Dense(1),
])

In [76]:
model.build()

In [77]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_1 (SimpleRNN)     (None, 128)               17536     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 17,665
Trainable params: 17,665
Non-trainable params: 0
_________________________________________________________________


In [78]:
mnist = tf.keras.datasets.mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0
sample, sample_label = x_train[0], y_train[0]

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [80]:
x_train.shape

(60000, 28, 28)

In [82]:
bd = df.groupby('datetime')

In [86]:
!{python} -m pip install -Uq numpy
import numpy as np

In [98]:
num_tickers = len(tickers)

In [99]:
features = [ 'open', 'close', 'high', 'low', 'average', 'volume', 'notional', 'numberOfTrades' ]
cols = [ 'datetime', 'ticker' ] + features

In [100]:
data = df[cols]
shape = (len(minutes), num_tickers, len(features))
time_points = np.zeros(shape)

In [101]:
minutes = sorted(df['datetime'].unique())

In [103]:
minute_idx, minute = list(enumerate(minutes))[0]

In [104]:
time_point = np.zeros(shape[1:])

In [105]:
ticker_idx, ticker = list(enumerate(tickers))[0]

In [107]:
minute_df = data[data['datetime'] == minute]

In [116]:
d = minute_df[minute_df['ticker'] == ticker]

In [110]:
minute

'2019-08-01 09:30:00'

In [115]:
ticker = 'AAL'

In [117]:
d

Unnamed: 0,datetime,ticker,open,close,high,low,average,volume,notional,numberOfTrades
0,2019-08-01 09:30:00,AAL,30.4,30.38,30.4,30.38,30.387,300.0,9116.0,3.0


In [131]:
time_point[0] = list(d[features].to_records(index=False))[0]

ValueError: Can't cast from structure to non-structure, except if the structure only has a single field.

In [123]:
target = 'AAPL'

In [127]:
time_point = np.zeros(shape[1:])

In [128]:
time_point

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [130]:
time_point[0] = [0] * 8

In [278]:
from numpy import zeros, nan, empty

In [274]:
def get_minute_arr(df, minute):
    pass

def get_day_df(date):
    date_str = date.strftime('%Y%m%d')
    path = data_dir / ('%s.csv.gz' % date_str)
    if not path.exists():
        return None
    df = read_csv(path)
    return df

def get_day_arr(date):
    n = zeros(len(minutes), len(tickers), len(features))
    n[:] = nan
    d = get_day_df(date)[cols]
    n = d[features].values
    assert n.shape == (390 * 424, 8), "Date %s: %s" % (date, n.shape)
    return n.reshape((390, 424, 8))

In [275]:
np.array([
    get_day_arr(date)
    for date in
    dates
])

AssertionError: Date 2019-08-02: (150540, 8)

In [276]:
150540 / 390

386.0

In [261]:
d = get_day_df(start_date)[cols]

In [256]:
n = d[features].values; n.shape; n

array([[3.0400000e+01, 3.0380000e+01, 3.0400000e+01, ..., 3.0000000e+02,
        9.1160000e+03, 3.0000000e+00],
       [3.0385000e+01, 3.0320000e+01, 3.0385000e+01, ..., 2.0000000e+02,
        6.0705000e+03, 2.0000000e+00],
       [3.0410000e+01, 3.0460000e+01, 3.0460000e+01, ..., 2.9000000e+03,
        8.8233000e+04, 1.1000000e+01],
       ...,
       [1.1559000e+02, 1.1561000e+02, 1.1564000e+02, ..., 4.5300000e+02,
        5.2372310e+04, 7.0000000e+00],
       [1.1563000e+02, 1.1563000e+02, 1.1563000e+02, ..., 1.4500000e+02,
        1.6766350e+04, 2.0000000e+00],
       [1.1563000e+02, 1.1575000e+02, 1.1576000e+02, ..., 8.2900000e+02,
        9.5924195e+04, 1.1000000e+01]])

In [257]:
n.shape

(165360, 8)

In [258]:
n.reshape((390, 424, 8))

array([[[3.04000000e+01, 3.03800000e+01, 3.04000000e+01, ...,
         3.00000000e+02, 9.11600000e+03, 3.00000000e+00],
        [3.03850000e+01, 3.03200000e+01, 3.03850000e+01, ...,
         2.00000000e+02, 6.07050000e+03, 2.00000000e+00],
        [3.04100000e+01, 3.04600000e+01, 3.04600000e+01, ...,
         2.90000000e+03, 8.82330000e+04, 1.10000000e+01],
        ...,
        [           nan,            nan,            nan, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [           nan,            nan,            nan, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [1.50630000e+02, 1.50630000e+02, 1.50630000e+02, ...,
         4.00000000e+02, 6.02520000e+04, 1.00000000e+00]],

       [[1.50600000e+02, 1.50600000e+02, 1.50600000e+02, ...,
         3.00000000e+02, 4.51800000e+04, 1.00000000e+00],
        [1.50600000e+02, 1.50600000e+02, 1.50600000e+02, ...,
         2.00000000e+02, 3.01200000e+04, 1.00000000e+00],
        [           nan, 

In [222]:
d['features'] = d[features].values.tolist()

In [223]:
d = d.drop(columns=features)

In [262]:
d

Unnamed: 0,datetime,ticker,open,close,high,low,average,volume,notional,numberOfTrades
0,2019-08-01 09:30:00,AAL,30.400,30.380,30.400,30.380,30.387,300.0,9116.000,3.0
1,2019-08-01 09:31:00,AAL,30.385,30.320,30.385,30.320,30.352,200.0,6070.500,2.0
2,2019-08-01 09:32:00,AAL,30.410,30.460,30.460,30.405,30.425,2900.0,88233.000,11.0
3,2019-08-01 09:33:00,AAL,30.370,30.370,30.370,30.370,30.370,10.0,303.700,1.0
4,2019-08-01 09:34:00,AAL,30.405,30.385,30.425,30.385,30.414,2800.0,85160.000,10.0
...,...,...,...,...,...,...,...,...,...,...
165355,2019-08-01 15:55:00,ZTS,115.400,115.400,115.400,115.400,115.400,107.0,12347.800,2.0
165356,2019-08-01 15:56:00,ZTS,115.455,115.455,115.455,115.455,115.455,100.0,11545.500,1.0
165357,2019-08-01 15:57:00,ZTS,115.590,115.610,115.640,115.580,115.612,453.0,52372.310,7.0
165358,2019-08-01 15:58:00,ZTS,115.630,115.630,115.630,115.630,115.630,145.0,16766.350,2.0


In [264]:
g = d.groupby(['datetime', 'ticker'])[features].first(); g

Unnamed: 0_level_0,Unnamed: 1_level_0,open,close,high,low,average,volume,notional,numberOfTrades
datetime,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2019-08-01 09:30:00,AAL,30.400,30.380,30.400,30.380,30.387,300.0,9116.000,3.0
2019-08-01 09:30:00,AAP,150.875,150.650,150.875,150.650,150.762,200.0,30152.500,2.0
2019-08-01 09:30:00,AAPL,214.070,214.770,214.980,214.070,214.718,8564.0,1838848.750,61.0
2019-08-01 09:30:00,ABBV,66.975,66.975,66.975,66.975,66.975,58.0,3884.550,1.0
2019-08-01 09:30:00,ABC,,,,,,0.0,0.000,0.0
...,...,...,...,...,...,...,...,...,...
2019-08-01 15:59:00,XYL,78.750,78.850,78.865,78.740,78.785,700.0,55149.500,7.0
2019-08-01 15:59:00,YUM,116.970,116.860,117.030,116.860,116.950,2139.0,250156.180,26.0
2019-08-01 15:59:00,ZBH,137.360,137.500,137.500,137.340,137.459,3621.0,497739.405,37.0
2019-08-01 15:59:00,ZION,42.955,42.955,42.980,42.945,42.961,1833.0,78748.010,26.0


In [265]:
g.values

array([[3.04000000e+01, 3.03800000e+01, 3.04000000e+01, ...,
        3.00000000e+02, 9.11600000e+03, 3.00000000e+00],
       [1.50875000e+02, 1.50650000e+02, 1.50875000e+02, ...,
        2.00000000e+02, 3.01525000e+04, 2.00000000e+00],
       [2.14070000e+02, 2.14770000e+02, 2.14980000e+02, ...,
        8.56400000e+03, 1.83884875e+06, 6.10000000e+01],
       ...,
       [1.37360000e+02, 1.37500000e+02, 1.37500000e+02, ...,
        3.62100000e+03, 4.97739405e+05, 3.70000000e+01],
       [4.29550000e+01, 4.29550000e+01, 4.29800000e+01, ...,
        1.83300000e+03, 7.87480100e+04, 2.60000000e+01],
       [1.15630000e+02, 1.15750000e+02, 1.15760000e+02, ...,
        8.29000000e+02, 9.59241950e+04, 1.10000000e+01]])

In [266]:
g.values.shape

(165360, 8)

In [267]:
u = g.unstack(); u

Unnamed: 0_level_0,open,open,open,open,open,open,open,open,open,open,...,numberOfTrades,numberOfTrades,numberOfTrades,numberOfTrades,numberOfTrades,numberOfTrades,numberOfTrades,numberOfTrades,numberOfTrades,numberOfTrades
ticker,AAL,AAP,AAPL,ABBV,ABC,ABMD,ABT,ACN,ADBE,ADI,...,XEC,XEL,XLNX,XRAY,XRX,XYL,YUM,ZBH,ZION,ZTS
datetime,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2019-08-01 09:30:00,30.400,150.875,214.070,66.975,,223.705,87.155,193.785,298.995,117.265,...,0.0,2.0,9.0,1.0,6.0,0.0,0.0,2.0,0.0,9.0
2019-08-01 09:31:00,30.385,,214.820,66.950,93.090,217.125,87.105,,299.340,,...,0.0,4.0,2.0,1.0,0.0,0.0,0.0,4.0,1.0,3.0
2019-08-01 09:32:00,30.410,150.230,215.510,67.020,92.795,215.500,87.080,193.530,299.290,117.880,...,1.0,2.0,1.0,1.0,2.0,0.0,0.0,1.0,0.0,3.0
2019-08-01 09:33:00,30.370,150.385,215.550,67.060,93.410,218.255,86.850,,299.360,117.860,...,6.0,1.0,2.0,5.0,0.0,1.0,1.0,1.0,2.0,1.0
2019-08-01 09:34:00,30.405,150.320,215.430,67.100,93.445,220.250,87.110,,301.630,,...,3.0,1.0,7.0,3.0,1.0,2.0,4.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-08-01 15:55:00,29.780,149.770,207.545,65.670,91.530,203.820,85.730,194.025,298.000,114.465,...,10.0,25.0,30.0,71.0,24.0,8.0,29.0,10.0,15.0,2.0
2019-08-01 15:56:00,29.825,149.915,207.650,65.670,91.390,204.390,85.770,194.090,298.150,114.600,...,14.0,16.0,9.0,39.0,14.0,9.0,24.0,14.0,15.0,1.0
2019-08-01 15:57:00,29.875,150.050,207.750,65.745,91.490,204.830,85.910,194.310,298.390,114.630,...,8.0,19.0,14.0,48.0,26.0,4.0,15.0,13.0,15.0,7.0
2019-08-01 15:58:00,29.875,150.020,207.875,65.695,91.520,204.815,85.890,194.340,298.115,114.520,...,10.0,19.0,16.0,22.0,31.0,7.0,15.0,13.0,26.0,2.0


In [268]:
u.values.shape

(390, 3392)

In [237]:
np.vectorize(np.array)(u.values)

ValueError: setting an array element with a sequence.

In [253]:
d['features'].values.flatten()

array([list([30.4, 30.38, 30.4, 30.38, 30.386999999999997, 300.0, 9116.0, 3.0]),
       list([30.385, 30.32, 30.385, 30.32, 30.351999999999997, 200.0, 6070.5, 2.0]),
       list([30.41, 30.46, 30.46, 30.405, 30.425, 2900.0, 88233.0, 11.0]),
       ...,
       list([115.59, 115.61, 115.64, 115.58, 115.61200000000001, 453.0, 52372.31, 7.0]),
       list([115.63, 115.63, 115.63, 115.63, 115.63, 145.0, 16766.35, 2.0]),
       list([115.63, 115.75, 115.76, 115.61, 115.711, 829.0, 95924.195, 11.0])],
      dtype=object)

In [245]:
u.values.shape

(390, 424)

In [244]:
np.hstack(u.values).shape

(165360,)

In [239]:
np.matrix(u.values).shape

(390, 424)

In [133]:
def get_time_point(minute):
    minute_df = data[data['datetime'] == minute]

    time_point = np.zeros(shape[1:])

    for ticker_idx, ticker in enumerate(tickers):
        d = minute_df[minute_df['ticker'] == ticker]
        quotes = list(d[features].to_records(index=False))
        if not quotes:
            continue
        if len(quotes) > 1:
            raise Exception('Multiple quotes for %s %s: %s' % (minute, ticker, str(quotes)))
        quote = list(quotes[0])
        assert len(quote) == shape[-1]
        time_point[ticker_idx] = quote
    
    return time_point

time_points = Parallel(n_jobs=8)(delayed(get_time_point)(minute) for minute in minutes)

KeyboardInterrupt: 

In [96]:
df['datetime']

0         2019-08-01 09:30:00
1         2019-08-01 09:31:00
2         2019-08-01 09:32:00
3         2019-08-01 09:33:00
4         2019-08-01 09:34:00
                 ...         
196945    2019-10-25 15:55:00
196946    2019-10-25 15:56:00
196947    2019-10-25 15:57:00
196948    2019-10-25 15:58:00
196949    2019-10-25 15:59:00
Name: datetime, Length: 11913720, dtype: object

In [94]:
len(minutes)

23790

In [95]:
minutes[:10]

['2019-08-01 09:30:00',
 '2019-08-01 09:31:00',
 '2019-08-01 09:32:00',
 '2019-08-01 09:33:00',
 '2019-08-01 09:34:00',
 '2019-08-01 09:35:00',
 '2019-08-01 09:36:00',
 '2019-08-01 09:37:00',
 '2019-08-01 09:38:00',
 '2019-08-01 09:39:00']