**Load the libraries and the data**

Note that you need to install "linearmodels" first.  (try pip3 install linearmodels or sudo pip3 install linearmodels)


In [4]:
import numpy as np
import pandas as pd
import datetime as dt
from linearmodels import PooledOLS 

# load the data
path = '../input/'
df = pd.read_pickle(path + 'MM_stock_data_FINAL.pkl')

In [5]:
#smallCapTickers = ['KYN', 'DY', 'AY', 'GWLLF', 'OPK', 'MEDP', 'FIX', 'BEL', 'BZUN', 'VGR']

**Get tickers of all the 30 stocks. **

**Create a dictionary to store all the estimates.**

In [6]:
tickers = df['Adj Close'].columns.values

estimates = dict()
for ticker in tickers:
    estimates[ticker] = dict()

In [7]:
''' Roll (1984) estimate of the effective spread '''

adjClose = df['Adj Close']

for ticker in tickers:

    logReturn = np.log(adjClose[ticker] / adjClose[ticker].shift(1)).reset_index()
    logReturn['month'] = logReturn['Date'].apply(lambda x: x.month)
    
    returnCov = logReturn.groupby('month')[ticker].apply(lambda x: x.cov(x.shift(1)))
    
    # For cases where the covariance is positive, the Roll estimate must be set to 0.
    rollEstimate = returnCov.apply(lambda x: 2 * np.sqrt(-x) if x < 0 else 0)
    
    estimates[ticker]['rollEstimate'] = rollEstimate



In [8]:
''' The Abdi and Ranaldo (2017) estimate of the effective '''

Low = df['Low']
High = df['High']

for ticker in tickers:

    dfTemp = np.log(adjClose[ticker]).reset_index()
    dfTemp.columns = ['Date', 'c']
    
    dfTemp['eta'] = np.log((High[ticker] + Low[ticker]) / 2).values
    dfTemp['month'] = dfTemp['Date'].apply(lambda x: x.month)
    
    AbdiCov = dfTemp.groupby('month').apply(lambda x: (x.c - x.eta).cov(x.c - x.eta.shift(1)))

    # For cases where the covariance is negative, the Abdi estimate must be set to 0.    
    abdiEstimate = AbdiCov.apply(lambda x: 2 * np.sqrt(x) if x > 0 else 0)

    estimates[ticker]['abdiEstimate'] = abdiEstimate




In [9]:
''' The Amihud (2002) illiquidity ratio ''' 

Volume = df['Volume']

for ticker in tickers:

    absLogReturn = abs(np.log(adjClose[ticker] / adjClose[ticker].shift(1)))
    
    # Beware of the way to calculate the dollar volume
    dollarVol = Volume[ticker] * adjClose[ticker]
    
    amihudRatio = absLogReturn / dollarVol
    
    amihudRatio = amihudRatio.reset_index()
    amihudRatio['month'] = amihudRatio['Date'].apply(lambda x: x.month)
    
    amihudRatio = amihudRatio.groupby('month')[ticker].mean()
    
    estimates[ticker]['amihudRatio'] = amihudRatio



In [10]:
''' The average daily market cap ''' 

MarketCap = df['MarketCap']

for ticker in tickers:

    avgMarketCap = MarketCap[ticker].reset_index()
    
    avgMarketCap['month'] = avgMarketCap['Date'].apply(lambda x: x.month)
    
    avgMarketCap = avgMarketCap.groupby('month')[ticker].mean()
    
    estimates[ticker]['avgMarketCap'] = avgMarketCap




In [11]:
''' The average daily trading volume (in number of shares) '''

Volume = df['Volume']

for ticker in tickers:

    avgVolume = Volume[ticker].reset_index()
    
    avgVolume['month'] = avgVolume['Date'].apply(lambda x: x.month)
    
    avgVolume = avgVolume.groupby('month')[ticker].mean()
    
    estimates[ticker]['avgVolume'] = avgVolume

    

In [12]:
''' The daily volatility as the standard deviation of daily stock returns over the month '''

adjClose = df['Adj Close']

for ticker in tickers:

    logReturn = np.log(adjClose[ticker] / adjClose[ticker].shift(1)).reset_index()
    logReturn['month'] = logReturn['Date'].apply(lambda x: x.month)
    
    dailyVol = logReturn.groupby('month')[ticker].std()
    
    estimates[ticker]['dailyVol'] = dailyVol
    
    


In [13]:
'''  The average daily value of the inverse of the price ''' 
# Not sure I understand this. Is it just the avg of the 1 / price ?

adjClose = df['Adj Close']

for ticker in tickers:

    invPrice = (adjClose[ticker] ** (-1)).reset_index()
    
    invPrice['month'] = invPrice['Date'].apply(lambda x: x.month)
    
    invPriceAvg = invPrice.groupby('month')[ticker].mean()
    
    estimates[ticker]['invPriceAvg'] = invPriceAvg




In [14]:
'''  compute the correlation matrix between all the variables ''' 

# We end up with 30 corr matrices? 

# for ticker in tickers:
#     print(pd.DataFrame(estimates[ticker]).corr())
    
# Or use panel data? 

panel = pd.DataFrame()

for ticker in tickers:
    
    tickerData = pd.DataFrame(estimates[ticker])
    tickerData['ticker'] = ticker
    
    panel = panel.append(tickerData)
    
panel = panel.reset_index()    
    
corrVars = [x for x in panel.columns if x not in ['month', 'ticker']]
panel[corrVars].corr()
    


Unnamed: 0,rollEstimate,abdiEstimate,amihudRatio,avgMarketCap,avgVolume,dailyVol,invPriceAvg
rollEstimate,1.0,0.341068,0.041133,0.064803,0.087607,0.405357,0.155949
abdiEstimate,0.341068,1.0,0.200562,0.072121,0.227481,0.922053,0.259097
amihudRatio,0.041133,0.200562,1.0,-0.208016,-0.31382,0.17759,0.390294
avgMarketCap,0.064803,0.072121,-0.208016,1.0,0.120097,0.114168,-0.115413
avgVolume,0.087607,0.227481,-0.31382,0.120097,1.0,0.260772,-0.002676
dailyVol,0.405357,0.922053,0.17759,0.114168,0.260772,1.0,0.220022
invPriceAvg,0.155949,0.259097,0.390294,-0.115413,-0.002676,0.220022,1.0


In [15]:
''' run a pooled regression of each of the transaction cost estimates against the three explanatory variables  '''

# There seems to be four explanatory variables? 

regResult = dict()

dependent = ['rollEstimate', 'abdiEstimate', 'amihudRatio']
exog = ['avgVolume', 'avgMarketCap', 'dailyVol', 'invPriceAvg']

panel = panel.set_index(['ticker', 'month'])

for y in dependent:
    mod = PooledOLS(panel[y], panel[exog])
    res = mod.fit()
    regResult[y] = res
    


In [16]:
regResult['rollEstimate']

0,1,2,3
Dep. Variable:,rollEstimate,R-squared:,0.5000
Estimator:,PooledOLS,R-squared (Between):,0.9505
No. Observations:,360,R-squared (Within):,0.0748
Date:,"Fri, Nov 30 2018",R-squared (Overall):,0.5000
Time:,11:00:19,Log-likelihood,1151.5
Cov. Estimator:,Unadjusted,,
,,F-statistic:,88.983
Entities:,30,P-value,0.0000
Avg Obs:,12.000,Distribution:,"F(4,356)"
Min Obs:,12.000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
avgVolume,-3.533e-11,1.367e-10,-0.2585,0.7962,-3.041e-10,2.335e-10
avgMarketCap,3.77e-09,5.044e-09,0.7474,0.4553,-6.15e-09,1.369e-08
dailyVol,0.4451,0.0429,10.367,0.0000,0.3607,0.5295
invPriceAvg,0.0315,0.0176,1.7896,0.0744,-0.0031,0.0660


In [17]:
regResult['abdiEstimate']

0,1,2,3
Dep. Variable:,abdiEstimate,R-squared:,0.9637
Estimator:,PooledOLS,R-squared (Between):,0.9952
No. Observations:,360,R-squared (Within):,0.6715
Date:,"Fri, Nov 30 2018",R-squared (Overall):,0.9637
Time:,11:00:19,Log-likelihood,1443.9
Cov. Estimator:,Unadjusted,,
,,F-statistic:,2364.4
Entities:,30,P-value,0.0000
Avg Obs:,12.000,Distribution:,"F(4,356)"
Min Obs:,12.000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
avgVolume,1.968e-11,6.067e-11,0.3244,0.7458,-9.963e-11,1.39e-10
avgMarketCap,-7.901e-10,2.239e-09,-0.3529,0.7244,-5.193e-09,3.613e-09
dailyVol,1.0829,0.0191,56.824,0.0000,1.0455,1.1204
invPriceAvg,0.0343,0.0078,4.3958,0.0000,0.0190,0.0497


In [18]:
regResult['amihudRatio']

0,1,2,3
Dep. Variable:,amihudRatio,R-squared:,0.4890
Estimator:,PooledOLS,R-squared (Between):,0.5538
No. Observations:,360,R-squared (Within):,-0.0810
Date:,"Fri, Nov 30 2018",R-squared (Overall):,0.4890
Time:,11:00:19,Log-likelihood,7075.0
Cov. Estimator:,Unadjusted,,
,,F-statistic:,85.181
Entities:,30,P-value,0.0000
Avg Obs:,12.000,Distribution:,"F(4,356)"
Min Obs:,12.000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
avgVolume,-7.095e-17,9.769e-18,-7.2630,0.0000,-9.016e-17,-5.174e-17
avgMarketCap,-1.075e-15,3.605e-16,-2.9823,0.0031,-1.784e-15,-3.661e-16
dailyVol,2.469e-08,3.068e-09,8.0457,0.0000,1.865e-08,3.072e-08
invPriceAvg,1.04e-08,1.257e-09,8.2801,0.0000,7.933e-09,1.288e-08
