# S&P 100 Data Estimates - Single Stock

In [1]:
from __future__ import print_function
import pandas as pd
import numpy as np
import datetime as dt
import quandl

## Data 

In [2]:
datadir='../../data/'
assets=pd.read_csv(datadir + 'SP100.csv', comment='#').set_index('Symbol')
QUANDL={
    'authtoken':"6XyApK2BBj_MraQg2TMD",
    'start_date':dt.date(2017, 1, 2),
    'end_date':dt.date(2018, 12, 31)
}
RISK_FREE_SYMBOL = "USDOLLAR"
data={}

#### Download loop

If it stops because of Quandl error codes 503 or 504, try re-running it (it won't download data already downloaded). If Quandl complains about the speed of requests, try adding sleep time.

In [3]:
# Download assets' data
from time import sleep

def to_quandl_eod_ticker(ticker):
    '''
    Converts ticker to format in Quandl EOD dataset
    '''
    if 'USDOLLAR' not in ticker:
        return 'EOD/' + ticker.replace('.', '_')
    else:
        return 'FRED/DTB3'

# Construct a data dictionary: {ticker: pd.DataFrame(price/volume)}
for ticker in assets.index:
    if ticker in data:
        continue
    print('downloading %s from %s to %s' %(ticker, QUANDL['start_date'], QUANDL['end_date']))
    try:
        data[ticker] = quandl.get(to_quandl_eod_ticker(ticker), **QUANDL)
    except quandl.NotFoundError:
        print('\tInvalid asset code')

downloading AAPL from 2017-01-02 to 2018-12-31
downloading ABBV from 2017-01-02 to 2018-12-31
downloading ABT from 2017-01-02 to 2018-12-31
downloading ACN from 2017-01-02 to 2018-12-31
downloading ADBE from 2017-01-02 to 2018-12-31
downloading AGN from 2017-01-02 to 2018-12-31
downloading AIG from 2017-01-02 to 2018-12-31
downloading ALL from 2017-01-02 to 2018-12-31
downloading AMGN from 2017-01-02 to 2018-12-31
downloading AMZN from 2017-01-02 to 2018-12-31
downloading AXP from 2017-01-02 to 2018-12-31
downloading BA from 2017-01-02 to 2018-12-31
downloading BAC from 2017-01-02 to 2018-12-31
downloading BIIB from 2017-01-02 to 2018-12-31
downloading BK from 2017-01-02 to 2018-12-31
downloading BKNG from 2017-01-02 to 2018-12-31
downloading BLK from 2017-01-02 to 2018-12-31
downloading BMY from 2017-01-02 to 2018-12-31
downloading BRK.B from 2017-01-02 to 2018-12-31
downloading C from 2017-01-02 to 2018-12-31
downloading CAT from 2017-01-02 to 2018-12-31
downloading CELG from 2017-01

#### Computation 

In [4]:
keys=[el for el in assets.index if not el in (set(assets.index)-set(data.keys()))]

def select_first_valid_column(df, columns):
    for column in columns:
        if column in df.columns:
            return df[column]

# extract prices
prices=pd.DataFrame.from_dict(dict(zip(keys, [select_first_valid_column(data[k], ["Adj. Close", "Close", "Value"])
                                              for k in keys])))

#compute sigmas
open_price=pd.DataFrame.from_dict(dict(zip(keys, [select_first_valid_column(data[k], ["Open"]) for k in keys])))
close_price=pd.DataFrame.from_dict(dict(zip(keys, [select_first_valid_column(data[k], ["Close"]) for k in keys])))
sigmas = np.abs(np.log(open_price.astype(float))-np.log(close_price.astype(float)))

# extract volumes
volumes=pd.DataFrame.from_dict(dict(zip(keys, [select_first_valid_column(data[k], ["Adj. Volume", "Volume"])
                                               for k in keys])))

# fix risk free
prices[RISK_FREE_SYMBOL]=10000*(1 + prices[RISK_FREE_SYMBOL]/(100*250)).cumprod()

#### Filtering 

In [5]:
# filter NaNs - threshold at 2% missing values
bad_assets = prices.columns[prices.isnull().sum()>len(prices)*0.02]
if len(bad_assets):
    print('Assets %s have too many NaNs, removing them' % bad_assets)

prices = prices.loc[:,~prices.columns.isin(bad_assets)]
sigmas = sigmas.loc[:,~sigmas.columns.isin(bad_assets)]
volumes = volumes.loc[:,~volumes.columns.isin(bad_assets)]

nassets=prices.shape[1]

# days on which many assets have missing values
bad_days1=sigmas.index[sigmas.isnull().sum(1) > nassets*.9]
bad_days2=prices.index[prices.isnull().sum(1) > nassets*.9]
bad_days3=volumes.index[volumes.isnull().sum(1) > nassets*.9]
bad_days=pd.Index(set(bad_days1).union(set(bad_days2)).union(set(bad_days3))).sort_values()
print ("Removing these days from dataset:")
print(pd.DataFrame({'nan price':prices.isnull().sum(1)[bad_days],
                    'nan volumes':volumes.isnull().sum(1)[bad_days],
                    'nan sigmas':sigmas.isnull().sum(1)[bad_days]}))

prices=prices.loc[~prices.index.isin(bad_days)]
sigmas=sigmas.loc[~sigmas.index.isin(bad_days)]
volumes=volumes.loc[~volumes.index.isin(bad_days)]

# extra filtering
print(pd.DataFrame({'remaining nan price':prices.isnull().sum(),
                    'remaining nan volumes':volumes.isnull().sum(),
                    'remaining nan sigmas':sigmas.isnull().sum()}))
prices=prices.fillna(method='ffill')
sigmas=sigmas.fillna(method='ffill')
volumes=volumes.fillna(method='ffill')
print(pd.DataFrame({'remaining nan price':prices.isnull().sum(),
                    'remaining nan volumes':volumes.isnull().sum(),
                    'remaining nan sigmas':sigmas.isnull().sum()}))

Assets Index(['DD', 'DOW'], dtype='object') have too many NaNs, removing them
Removing these days from dataset:
Empty DataFrame
Columns: [nan price, nan volumes, nan sigmas]
Index: []
          remaining nan price  remaining nan volumes  remaining nan sigmas
AAPL                        0                      0                     0
ABBV                        0                      0                     0
ABT                         0                      0                     0
ACN                         0                      0                     0
ADBE                        0                      0                     0
...                       ...                    ...                   ...
WBA                         0                      0                     0
WFC                         0                      0                     0
WMT                         0                      0                     0
XOM                         0                      0              

#### Save 

In [6]:
# make volumes in dollars
volumes = volumes*prices

# compute returns
returns = (prices.diff()/prices.shift(1)).fillna(method='ffill').iloc[1:]

bad_assets = returns.columns[((-.5>returns).sum()>0)|((returns > 2.).sum()>0)]
if len(bad_assets):
    print('Assets %s have dubious returns, removed' % bad_assets)
    
prices = prices.loc[:,~prices.columns.isin(bad_assets)]
sigmas = sigmas.loc[:,~sigmas.columns.isin(bad_assets)]
volumes = volumes.loc[:,~volumes.columns.isin(bad_assets)]
returns = returns.loc[:,~returns.columns.isin(bad_assets)]

# remove USDOLLAR except from returns
prices = prices.iloc[:,:-1]
sigmas = sigmas.iloc[:,:-1]
volumes = volumes.iloc[:,:-1]


# save data
prices.to_csv(datadir+'ss_prices.csv.gz', compression='gzip', float_format='%.3f')
volumes.to_csv(datadir+'ss_volumes.csv.gz', compression='gzip', float_format='%d')
returns.to_csv(datadir+'ss_returns.csv.gz', compression='gzip', float_format='%.3e')
sigmas.to_csv(datadir+'ss_sigmas.csv.gz', compression='gzip', float_format='%.3e')

## Estimates 

In [14]:
print("Typical variance of returns: %g"%returns.var().mean())

Typical variance of returns: 0.000205843


In [15]:
return_estimate = returns.ewm(alpha=0.1, min_periods=60).mean().shift(1).dropna()
return_estimate

Unnamed: 0_level_0,AAPL,ABBV,ABT,ACN,ADBE,AGN,AIG,ALL,AMGN,AMZN,...,UPS,USB,UTX,V,VZ,WBA,WFC,WMT,XOM,USDOLLAR
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-03-31,0.002700,0.000630,-0.000594,-0.001365,0.004031,0.000344,0.000434,0.000220,-0.003267,0.002952,...,0.000244,-0.002623,0.000219,1.753553e-04,-0.001105,-0.001223,-0.001127,0.002185,0.002251,0.000029
2017-04-03,0.002242,0.000108,-0.000849,-0.001760,0.004029,-0.000058,0.000166,0.000014,-0.003128,0.003822,...,0.000821,-0.003552,-0.000017,2.508829e-07,-0.001628,-0.001413,-0.002047,0.002652,0.000004,0.000029
2017-04-04,0.002045,-0.000103,-0.000629,-0.003455,0.003210,-0.000572,-0.000877,-0.000196,-0.002925,0.004001,...,-0.000102,-0.003099,-0.000265,5.523967e-04,-0.000581,-0.001392,-0.002148,0.002039,0.000077,0.000029
2017-04-05,0.002586,0.000031,-0.001219,-0.002769,0.003236,-0.000199,-0.000157,0.000205,-0.002345,0.005321,...,0.000125,-0.002963,0.000710,-1.528269e-04,-0.000258,-0.001796,-0.002456,0.002086,0.000435,0.000030
2017-04-06,0.001808,-0.000203,-0.001958,-0.002831,0.002797,-0.000456,-0.001043,-0.000073,-0.003402,0.005059,...,0.000216,-0.003484,0.000204,1.444012e-04,-0.000821,-0.003230,-0.002609,0.001376,0.000586,0.000030
2017-04-07,0.001377,0.000002,-0.002059,-0.003254,0.002717,-0.000170,-0.000581,-0.000041,-0.003055,0.003341,...,0.000129,-0.002527,0.000104,1.974104e-04,-0.001944,-0.002303,-0.001638,0.000931,0.001109,0.000030
2017-04-10,0.001016,0.000710,-0.001486,-0.002388,0.002499,0.000061,-0.001333,-0.000443,-0.002052,0.002628,...,-0.001461,-0.002664,0.000610,-2.155862e-04,-0.001274,-0.001251,-0.002432,0.002898,0.000697,0.000030
2017-04-11,0.000796,-0.000232,-0.001886,-0.001808,0.002203,0.000587,-0.000971,-0.000090,-0.002208,0.003725,...,0.000049,-0.003121,0.000451,-5.866624e-05,-0.001393,-0.000907,-0.002736,0.002827,0.001075,0.000030
2017-04-12,-0.000360,-0.001426,-0.001720,-0.003122,0.001821,0.000107,-0.001997,-0.000314,-0.001790,0.002836,...,-0.000586,-0.002592,0.000637,-6.405744e-05,-0.000924,-0.000610,-0.003160,0.003051,0.000618,0.000031
2017-04-13,-0.000204,-0.000987,-0.001777,-0.003241,0.001200,-0.000439,-0.002818,-0.000530,-0.001648,0.001872,...,-0.001966,-0.002706,-0.000223,-2.491142e-04,-0.000379,-0.000682,-0.004765,0.002760,0.000713,0.000031


In [16]:
agree_on_sign=np.sign(returns.iloc[60:,:-1]) == np.sign(return_estimate.iloc[:,:-1])
print("Return predictions have the right sign %.1f%% of the times"%
      (100*agree_on_sign.sum().sum()/(agree_on_sign.shape[0]*(agree_on_sign.shape[1]-1))))

Return predictions have the right sign 50.6% of the times


In [22]:
volume_estimate=volumes.ewm(alpha=0.1, min_periods=60).mean().shift(1).dropna()
volume_estimate.to_csv(datadir+'ss_volume_estimate.csv.gz', compression='gzip', float_format='%d')
sigma_estimate = returns.ewm(alpha=0.1, min_periods=60).cov().dropna()
sigma_estimate.to_csv(datadir+'ss_sigma_estimate.csv.gz', compression='gzip', float_format='%.3e')
return_estimate.to_hdf(datadir+'ss_model.h5', 'return_estimate')
volume_estimate.to_hdf(datadir+'ss_model.h5', 'volume_estimate')
sigma_estimate.to_hdf(datadir+'ss_model.h5', 'sigma_estimate')