In [80]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import warnings
from datetime import timedelta
from pandas.tseries.offsets import MonthEnd
import statsmodels.api as sm
#import ordered dicts
from collections import OrderedDict
import scipy.stats as stats
import plotly.express as px
import plotly.graph_objects as go

In [None]:
#We have inflation factor for every company
#Pick companies with t-stat > 2 or < -2
#If inflation is going up, pick companies with beta > 0
#If inflation is going down, pick companies with beta < 0
#Sort companies by beta (How much they move with inflation)
#Create quartiles of companies based on beta

#Looking at date, and inflation prediction flip long/short based on direction of inflation

In [82]:
def get_cpi():
    cpi = pd.read_excel('data/CPIAUCSL (1).xls')
    #cpi = pd.read_csv("T5YIE.csv")
    cpi.columns = ["date", 'CPI']
    cpi["date"] = pd.to_datetime(cpi["date"]) - timedelta(days=1)
    return cpi
    

def get_stocks():
    stocks = pd.read_csv("data/comp_stocks.csv")
    stocks["datadate"] = pd.to_datetime(stocks["datadate"])
    return stocks


def merge_cpi_stock(stocks, cpi):
    merged = pd.merge(stocks, cpi, how="left", left_on="MthCalDt", right_on="date")
    return merged

def bin_inflation_regimes(cpi, labels = ["deflation","low", "mid", "high"], bins = [-10, 0,1,3,20], value = "CPI", col = "Inflation"):
    cpi[col] = pd.cut(cpi[value],bins, labels=labels)
    return cpi


def get_percent_change(cpi):
    cpi["pct_change"] = cpi["CPI"].pct_change()
    return cpi

# get difference in inflation between two months
def get_inflation_diff(cpi, col = "diff", value = "CPI"):
    cpi[col] = cpi[value].diff()
    return cpi

In [83]:
cpi = get_cpi()
cpi = get_inflation_diff(cpi)
cpi = get_inflation_diff(cpi, col = "diff_of_diff", value = "diff")

In [84]:
cpi.dropna()

Unnamed: 0,date,CPI,diff,diff_of_diff
2,1948-02-29,6.81818,-2.66378,-1.90365
3,1948-03-31,8.27273,1.45455,4.11833
4,1948-04-30,9.38497,1.11224,-0.34231
5,1948-05-31,9.37500,-0.00997,-1.12221
6,1948-06-30,9.76158,0.38658,0.39655
...,...,...,...,...
895,2022-07-31,8.22736,-0.18582,0.33399
896,2022-08-31,8.21485,-0.01251,0.17331
897,2022-09-30,7.76249,-0.45236,-0.43985
898,2022-10-31,7.13535,-0.62714,-0.17478


In [85]:
stocks = get_stocks()
# remove stocks with above 500 trt1m
stocks = stocks[stocks["trt1m"] < 1000]

# remove stocks with below -100 trt1m
stocks = stocks[stocks["trt1m"] > -100]
# drop na gics
stocks = stocks.dropna(subset=["ggroup","gind", "gsector", "gsubind"])
# drop missing trt1m
stocks = stocks.dropna(subset=["trt1m"])


Columns (1) have mixed types. Specify dtype option on import or set low_memory=False.



In [86]:
stocks.shape

(6394616, 22)

In [87]:
# get factors
def get_factors():

    ff_factors = pd.read_csv("data/ff_factors.csv")
    ff_factor_mom = pd.read_csv("data/ff_factor_umd.csv")
    ff_factors.rename({"Unnamed: 0": "date"}, axis=1, inplace=True)
    ff_factor_mom.rename({"Unnamed: 0": "date", "Mom":"UMD"}, axis=1, inplace=True)
    ff_factor_mom.rename({"Unnamed: 0": "date", "Mom   ":"UMD"}, axis=1, inplace=True)
    ff_factors = ff_factors.merge(ff_factor_mom, how="left", on="date")
    
    ff_factors["date"] = pd.to_datetime(ff_factors["date"], format="%Y%m")
    ff_factors["date"] += MonthEnd(0)


    return ff_factors

In [88]:
def get_company(stocks, gvkey, date):
    comp = stocks[(stocks["gvkey"] == gvkey) & (stocks["datadate"] <= date)]
    comp = comp.drop_duplicates(subset="datadate", keep="first")
    comp = comp.set_index("datadate")
    comp.index = comp.index + MonthEnd(0)

    return comp

In [89]:
def prep_reg(cpi, comp_data, ff_factors, factor):
    
    X = cpi[["date", factor]].set_index("date")


    comp_data["Excess Return"] = comp_data["trt1m"] - ff_factors.set_index("date")["RF"]
    y = comp_data['Excess Return']

    

    # get indies that are the same between X and y
    indies_X = X.index.intersection(y.index)

    indies_y = y.index.intersection(X.index)

    indies = indies_X.intersection(indies_y)

    # save those rows
    X = X.loc[indies]
    y = y.loc[indies]
    
    return X, y

In [90]:
def run_reg(X, y):
    X = sm.add_constant(X)
    model = sm.OLS(y, X)
    results = model.fit()
    results.summary()
    return results

In [91]:
from tqdm import tqdm

In [92]:
# stocks between 2005 and 2020
stocks = stocks[stocks["datadate"] > "2004-06-01"] # six months needed for regression
stocks = stocks[stocks["datadate"] < "2020-01-01"]

In [93]:
dates_gvkeys = list(zip(stocks["datadate"], stocks["gvkey"]))

In [94]:
len(dates_gvkeys)

2752459

In [95]:
2644186/35770 * 8

591.3751188146491

In [96]:
import pickle

In [97]:
gvkeys = stocks["gvkey"].unique()

In [98]:
# load pickle file
with open('beta_dict.pickle', 'rb') as f:
    # The protocol version used is detected automatically, so we do not
    # have to specify it.
    beta_dict = pickle.load(f)

In [99]:
# get dates_gvkeys that are not in beta_dict
dates_gvkeys = [x for x in dates_gvkeys if x not in beta_dict.keys()]

In [20]:
len(dates_gvkeys)

386231

In [18]:
import json
import pickle

In [23]:
# save the betas
ff_factors = get_factors()
factor = "diff"

for company in tqdm(dates_gvkeys):

    date = company[0]
    gvkey = company[1]

    # supress warnings
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        try:
            company_data = get_company(stocks, gvkey, date)
             
            if len(company_data) < 6:
                continue
            X, y = prep_reg(cpi, company_data, ff_factors, factor)
        
            
            
            if (X.shape[0] < 2) or (y.shape[0] < 2):
                continue
            
            
            
            reg_results = run_reg(X, y)
            beta_dict[company] = {"beta":reg_results.params[factor], "t-stat":reg_results.tvalues[factor]}

            # save dict to a file every 10000 iterations using pickle not json
            if len(beta_dict) % 1000 == 0:
                with open('beta_dict_final.pickle', 'wb') as f:
                # Pickle the 'data' dictionary using the highest protocol available.
                    pickle.dump(beta_dict, f, pickle.HIGHEST_PROTOCOL)            
                    
                    
           
        except ValueError:
            print("ValueError for gvkey: ", company)
            break

  0%|          | 66/386231 [00:00<44:12, 145.60it/s]


KeyboardInterrupt: 

In [100]:
# load pickle file
with open('beta_dict_final.pickle', 'rb') as f:
    # The protocol version used is detected automatically, so we do not
    # have to specify it.
    beta_dict = pickle.load(f)

In [101]:
beta_df = pd.DataFrame.from_dict(beta_dict, orient="index")
beta_df = beta_df.reset_index()
beta_df.columns = ["date", "gvkey", "beta", "t-stat"]
beta_df["date"] = pd.to_datetime(beta_df["date"])
beta_df["gvkey"] = beta_df["gvkey"].astype(int)

In [102]:
beta_df

Unnamed: 0,date,gvkey,beta,t-stat
0,2004-11-30,1003,0.000459,0.016964
1,2004-12-31,1003,0.020390,0.740262
2,2005-01-31,1003,0.011357,0.384415
3,2005-02-28,1003,0.006483,0.217023
4,2005-03-31,1003,-0.002816,-0.075046
...,...,...,...,...
2380995,2019-10-31,321764,-23.779050,-0.257696
2380996,2019-11-30,321764,-32.463695,-0.449178
2380997,2019-12-31,321764,-2.165565,-0.027525
2380998,2017-06-30,322494,3.729123,0.327351


In [164]:
# create results dataframe
results = pd.DataFrame(columns=["t_stat","mean", "std", "sharpe"])
t_stats = [j/100 for j in range(100, 525, 25)]

for t in t_stats:
    sig_stocks = beta_df[(beta_df["t-stat"] > t) | (beta_df["t-stat"] < -t)]
    # for each month get the count of stocks that are significant groupby date and count
    sig_stocks_count = sig_stocks.groupby("date").count().reset_index()
    # plot the count of significant stocks using plotly
    
    #merge sig_stocs with cpi and shift dates forward 1
    sig_stocks = sig_stocks.merge(cpi, how="left", on="date")
    sig_stocks["shifted_date"] = sig_stocks["date"] + MonthEnd(1)
    # sig_stocks = sig_stocks.drop("date", axis=1)
    sig_stocks
    #shift diff column back 1
    sig_stocks["pred_diff"] = sig_stocks.groupby("gvkey")["diff"].shift(-1)
    #shift diff of diff column back 1
    sig_stocks["pred_diff of diff"] = sig_stocks.groupby("gvkey")["diff_of_diff"].shift(-1)
    sig_stocks
    #Given a month, return the long and short stocks for next period
    def long_short(sig_stocks,date):
        sig_stocks= sig_stocks[sig_stocks["date"]==date]
        #get direction of month
        direction=sig_stocks["pred_diff"].mean()
        #get only ones that are significant
        if direction < 0:
            long_stocks=sig_stocks[sig_stocks["beta"]<0]
            short_stocks=sig_stocks[sig_stocks["beta"]>0]
        else:
            long_stocks=sig_stocks[sig_stocks["beta"]>0]
            short_stocks=sig_stocks[sig_stocks["beta"]<0]
        return long_stocks,short_stocks
        #split into long and short
    #loop through dates and get long and short stocks
    #make dictionary of long and short stocks
    long_dict={}
    short_dict={}
    for date in sig_stocks["date"].unique():
        long_stocks,short_stocks=long_short(sig_stocks,date)
        long_dict[date]=list(long_stocks.gvkey)
        short_dict[date]=list(short_stocks.gvkey)


    sig_stocks = sig_stocks.merge(stocks, how="left", left_on=["date", "gvkey"], right_on=["datadate", "gvkey"])
    #for each date get the return of each stock in the next month
    diff_dict={}
    for date in sig_stocks["date"].unique():
        #get the next month
        #if there is no next month, skip
        if date not in long_dict.keys():
            continue
        next_month=date+MonthEnd(1)
        #get the stocks that are long and short
        long_stocks=long_dict[date]
        short_stocks=short_dict[date]
        #get the returns of the stocks
        long_returns=stocks[(stocks["datadate"]==next_month) & (stocks["gvkey"].isin(long_stocks))]["trt1m"]
        short_returns=stocks[(stocks["datadate"]==next_month) & (stocks["gvkey"].isin(short_stocks))]["trt1m"]
        #get the mean of the returns
        long_mean=long_returns.mean()
        short_mean=short_returns.mean()
        #get the difference
        diff=long_mean-short_mean
        #create a dictionary of dates and differences
        diff_dict[date]=diff
        # #get the t-stat
        # t_stat=diff/np.sqrt((long_returns.var()/len(long_returns))+(short_returns.var()/len(short_returns)))
        # #get the p-value
        # p_value=1-stats.t.cdf(t_stat,len(long_returns)+len(short_returns)-2)
        # print("Date: ",date)
        # print("Long Mean: ",long_mean)
        # print("Short Mean: ",short_mean)
        # print("Difference: ",diff)
        # print("t-stat: ",t_stat)
        # print("p-value: ",p_value)
        # print("")
    #drop keys with nan values
    diff_dict={k: v for k, v in diff_dict.items() if not np.isnan(v)}
    #sort by date
    diff_dict=OrderedDict(sorted(diff_dict.items()))
    #convert to dataframe
    diff_df=pd.DataFrame.from_dict(diff_dict,orient="index")
    diff_df.columns=["diff"]
    diff_df
    diff_df["diff"].mean()
    ff_factors = get_factors()
    excess_returns = diff_df["diff"] - ff_factors.set_index("date")["RF"]
    excess_returns = excess_returns.dropna()
    # calculate sharpe ratio
    sharpe_ratio = excess_returns.mean() / excess_returns.std()
    excess_returns.mean()
    excess_returns.std()
    sharpe_ratio

    # add to new row
    results.loc[len(results)] = [t, excess_returns.mean(), excess_returns.std(), sharpe_ratio]


In [165]:
results

Unnamed: 0,date,mean,std,sharpe
0,4.75,-0.541703,4.136965,-0.130942
1,4.75,-0.545083,4.892785,-0.111406
2,4.75,-0.449041,5.78544,-0.077616
3,4.75,-0.680365,6.617465,-0.102813
4,4.75,-0.297868,8.033366,-0.037079
5,4.75,-0.08409,9.44589,-0.008902
6,4.75,-0.022394,11.128876,-0.002012
7,4.75,0.014696,14.325818,0.001026
8,4.75,0.984147,19.155775,0.051376
9,4.75,0.889586,17.186555,0.051761


In [166]:
results["t_stat"] = t_stats

In [168]:
results.drop("date", axis=1, inplace=True)

In [172]:
results = results[["t_stat", "sharpe", "mean", "std"]]

In [175]:
# plot the results, sharpe, mean, std
fig = go.Figure()
fig.add_trace(go.Scatter(x=results["t_stat"], y=results["sharpe"], mode="lines+markers", name="Sharpe Ratio"))
fig.add_trace(go.Scatter(x=results["t_stat"], y=results["mean"], mode="lines+markers", name="Mean"))
fig.add_trace(go.Scatter(x=results["t_stat"], y=results["std"], mode="lines+markers", name="Std"))

fig.update_layout(title="Results", xaxis_title="t-stat", yaxis_title="Value")

fig.show()


In [171]:
round(results[["t_stat", "sharpe", "mean", "std"]], 3)

Unnamed: 0,t_stat,sharpe,mean,std
0,1.0,-0.131,-0.542,4.137
1,1.25,-0.111,-0.545,4.893
2,1.5,-0.078,-0.449,5.785
3,1.75,-0.103,-0.68,6.617
4,2.0,-0.037,-0.298,8.033
5,2.25,-0.009,-0.084,9.446
6,2.5,-0.002,-0.022,11.129
7,2.75,0.001,0.015,14.326
8,3.0,0.051,0.984,19.156
9,3.25,0.052,0.89,17.187


In [140]:
sig_stocks = beta_df[(beta_df["t-stat"] > 4) | (beta_df["t-stat"] < -4)]

In [141]:
# for each month get the count of stocks that are significant groupby date and count
sig_stocks_count = sig_stocks.groupby("date").count().reset_index()

In [142]:
# plot the count of significant stocks using plotly
fig = px.line(sig_stocks_count, x="date", y="gvkey")
fig.show()

In [143]:
#merge sig_stocs with cpi and shift dates forward 1
sig_stocks = sig_stocks.merge(cpi, how="left", on="date")
sig_stocks["shifted_date"] = sig_stocks["date"] + MonthEnd(1)
# sig_stocks = sig_stocks.drop("date", axis=1)
sig_stocks

Unnamed: 0,date,gvkey,beta,t-stat,CPI,diff,diff_of_diff,shifted_date
0,2008-11-30,1166,9.128890,4.114896,-0.02223,-1.12215,1.50899,2008-12-31
1,2008-12-31,1166,9.123891,4.147361,-0.11359,-0.09136,1.03079,2009-01-31
2,2009-01-31,1166,8.997934,4.040090,0.00846,0.12205,0.21341,2009-02-28
3,2009-02-28,1166,9.069703,4.117109,-0.44648,-0.45494,-0.57699,2009-03-31
4,2008-10-31,1186,11.252240,4.670806,1.09992,-2.63114,-1.40888,2008-11-30
...,...,...,...,...,...,...,...,...
21391,2019-08-31,287622,52.698782,5.858985,1.71662,-0.03016,0.00282,2019-09-30
21392,2019-09-30,287622,58.460261,4.374261,1.76918,0.05256,0.08272,2019-10-31
21393,2016-03-31,318434,26.428691,5.390246,1.17263,0.28101,0.23667,2016-04-30
21394,2016-04-30,318434,23.551366,4.167943,1.07848,-0.09415,-0.37516,2016-05-31


In [144]:
#shift diff column back 1
sig_stocks["pred_diff"] = sig_stocks.groupby("gvkey")["diff"].shift(-1)
#shift diff of diff column back 1
sig_stocks["pred_diff of diff"] = sig_stocks.groupby("gvkey")["diff_of_diff"].shift(-1)
sig_stocks

Unnamed: 0,date,gvkey,beta,t-stat,CPI,diff,diff_of_diff,shifted_date,pred_diff,pred_diff of diff
0,2008-11-30,1166,9.128890,4.114896,-0.02223,-1.12215,1.50899,2008-12-31,-0.09136,1.03079
1,2008-12-31,1166,9.123891,4.147361,-0.11359,-0.09136,1.03079,2009-01-31,0.12205,0.21341
2,2009-01-31,1166,8.997934,4.040090,0.00846,0.12205,0.21341,2009-02-28,-0.45494,-0.57699
3,2009-02-28,1166,9.069703,4.117109,-0.44648,-0.45494,-0.57699,2009-03-31,,
4,2008-10-31,1186,11.252240,4.670806,1.09992,-2.63114,-1.40888,2008-11-30,,
...,...,...,...,...,...,...,...,...,...,...
21391,2019-08-31,287622,52.698782,5.858985,1.71662,-0.03016,0.00282,2019-09-30,0.05256,0.08272
21392,2019-09-30,287622,58.460261,4.374261,1.76918,0.05256,0.08272,2019-10-31,,
21393,2016-03-31,318434,26.428691,5.390246,1.17263,0.28101,0.23667,2016-04-30,-0.09415,-0.37516
21394,2016-04-30,318434,23.551366,4.167943,1.07848,-0.09415,-0.37516,2016-05-31,-0.21093,-0.21174


In [145]:
#Given a month, return the long and short stocks for next period
def long_short(sig_stocks,date):
    sig_stocks= sig_stocks[sig_stocks["date"]==date]
    #get direction of month
    direction=sig_stocks["pred_diff"].mean()
    #get only ones that are significant
    if direction < 0:
        long_stocks=sig_stocks[sig_stocks["beta"]<0]
        short_stocks=sig_stocks[sig_stocks["beta"]>0]
    else:
        long_stocks=sig_stocks[sig_stocks["beta"]>0]
        short_stocks=sig_stocks[sig_stocks["beta"]<0]
    return long_stocks,short_stocks
    #split into long and short

In [146]:
#loop through dates and get long and short stocks
#make dictionary of long and short stocks
long_dict={}
short_dict={}
for date in sig_stocks["date"].unique():
    long_stocks,short_stocks=long_short(sig_stocks,date)
    long_dict[date]=list(long_stocks.gvkey)
    short_dict[date]=list(short_stocks.gvkey)


In [147]:
'''
index vs gvkey
sig stocks only and not entire stock market...unknown if significant last year
sig_stocks and sig_t should be the same

'''

'\nindex vs gvkey\nsig stocks only and not entire stock market...unknown if significant last year\nsig_stocks and sig_t should be the same\n\n'

In [148]:
sig_stocks = sig_stocks.merge(stocks, how="left", left_on=["date", "gvkey"], right_on=["datadate", "gvkey"])

In [149]:
#for each date get the return of each stock in the next month
diff_dict={}
for date in sig_stocks["date"].unique():
    #get the next month
    #if there is no next month, skip
    if date not in long_dict.keys():
        continue
    next_month=date+MonthEnd(1)
    #get the stocks that are long and short
    long_stocks=long_dict[date]
    short_stocks=short_dict[date]
    #get the returns of the stocks
    long_returns=stocks[(stocks["datadate"]==next_month) & (stocks["gvkey"].isin(long_stocks))]["trt1m"]
    short_returns=stocks[(stocks["datadate"]==next_month) & (stocks["gvkey"].isin(short_stocks))]["trt1m"]
    #get the mean of the returns
    long_mean=long_returns.mean()
    short_mean=short_returns.mean()
    #get the difference
    diff=long_mean-short_mean
    #create a dictionary of dates and differences
    diff_dict[date]=diff
    # #get the t-stat
    # t_stat=diff/np.sqrt((long_returns.var()/len(long_returns))+(short_returns.var()/len(short_returns)))
    # #get the p-value
    # p_value=1-stats.t.cdf(t_stat,len(long_returns)+len(short_returns)-2)
    # print("Date: ",date)
    # print("Long Mean: ",long_mean)
    # print("Short Mean: ",short_mean)
    # print("Difference: ",diff)
    # print("t-stat: ",t_stat)
    # print("p-value: ",p_value)
    # print("")

In [150]:
#drop keys with nan values
diff_dict={k: v for k, v in diff_dict.items() if not np.isnan(v)}
#sort by date
diff_dict=OrderedDict(sorted(diff_dict.items()))
#convert to dataframe
diff_df=pd.DataFrame.from_dict(diff_dict,orient="index")
diff_df.columns=["diff"]
diff_df

Unnamed: 0,diff
2004-11-30,0.705597
2004-12-31,11.717035
2005-01-31,4.924956
2005-02-28,50.623050
2005-03-31,-0.628743
...,...
2019-07-31,-3.611662
2019-08-31,10.878370
2019-09-30,22.243458
2019-10-31,0.140807


In [151]:
diff_df["diff"].mean()

1.558134324145908

In [152]:
ff_factors = get_factors()

In [153]:
excess_returns = diff_df["diff"] - ff_factors.set_index("date")["RF"]

In [154]:
excess_returns = excess_returns.dropna()

In [155]:
# calculate sharpe ratio
sharpe_ratio = excess_returns.mean() / excess_returns.std()

In [156]:
excess_returns.mean()

1.4541119777771934

In [157]:
excess_returns.std()

26.49126531597928

In [158]:
sharpe_ratio

0.05489024251703398

In [68]:
#Histogram of differences
fig = px.histogram(diff_df, x="diff", nbins=100, title="Difference Distribution")
fig.show()


Unnamed: 0,date,gvkey,beta,t-stat,CPI,diff,diff_of_diff,shifted_date
0,2005-06-30,1009,-80.726283,-2.218897,3.06716,0.52613,0.85439,2005-07-31
1,2005-08-31,1009,-67.091382,-2.585226,4.74183,1.09490,0.51513,2005-09-30
2,2005-09-30,1009,-69.886995,-2.946334,4.35010,-0.39173,-1.48663,2005-10-31
3,2005-11-30,1009,-68.970612,-2.773012,3.33855,0.00000,1.01155,2005-12-31
4,2006-02-28,1009,-53.971757,-2.121254,3.41792,-0.22033,0.16021,2006-03-31
...,...,...,...,...,...,...,...,...
369622,2017-06-30,351371,-43.139448,-2.299032,1.72511,0.08454,0.30031,2017-07-31
369623,2017-07-31,351371,-43.319004,-2.434378,1.92812,0.20301,0.11847,2017-08-31
369624,2017-08-31,351371,-42.764813,-2.535639,2.18057,0.25245,0.04944,2017-09-30
369625,2018-10-31,351371,-34.040645,-2.168523,2.14733,-0.34470,-0.50467,2018-11-30


Unnamed: 0,date,gvkey,beta,t-stat,CPI,diff,diff_of_diff,shifted_date,pred_diff,pred_diff of diff
0,2005-06-30,1009,-80.726283,-2.218897,3.06716,0.52613,0.85439,2005-07-31,1.09490,0.51513
1,2005-08-31,1009,-67.091382,-2.585226,4.74183,1.09490,0.51513,2005-09-30,-0.39173,-1.48663
2,2005-09-30,1009,-69.886995,-2.946334,4.35010,-0.39173,-1.48663,2005-10-31,0.00000,1.01155
3,2005-11-30,1009,-68.970612,-2.773012,3.33855,0.00000,1.01155,2005-12-31,-0.22033,0.16021
4,2006-02-28,1009,-53.971757,-2.121254,3.41792,-0.22033,0.16021,2006-03-31,0.19592,0.41625
...,...,...,...,...,...,...,...,...,...,...
369622,2017-06-30,351371,-43.139448,-2.299032,1.72511,0.08454,0.30031,2017-07-31,0.20301,0.11847
369623,2017-07-31,351371,-43.319004,-2.434378,1.92812,0.20301,0.11847,2017-08-31,0.25245,0.04944
369624,2017-08-31,351371,-42.764813,-2.535639,2.18057,0.25245,0.04944,2017-09-30,-0.34470,-0.50467
369625,2018-10-31,351371,-34.040645,-2.168523,2.14733,-0.34470,-0.50467,2018-11-30,-0.14495,0.19975


In [118]:
sig_stocks = sig_stocks.sort_values("date")

# port const

In [203]:
#Given a month, return the long and short stocks for next period
def long_short(sig_stocks,date):
    sig_stocks= sig_stocks[sig_stocks["date"]==date]
    
    #get direction of month...all the same
    direction=sig_stocks["pred_diff"].mean()

    #get only ones that are significant
    if direction < 0:
        long_stocks=sig_stocks[sig_stocks["beta"]<0]
        # long_stocks["bin"]=pd.qcut(long_stocks["beta"], 4, labels=["L", "1", "2", "H"])

        short_stocks=sig_stocks[sig_stocks["beta"]>0]
        #short_stocks["bin"]=pd.qcut(short_stocks["beta"], 4, labels=["L", "1", "2", "H"])

        # long_stocks = long_stocks[long_stocks["bin"] == "L"]
        #short_stocks = short_stocks[short_stocks["bin"] == "H"]

    else:
        long_stocks=sig_stocks[sig_stocks["beta"]>0]
        #long_stocks["bin"]=pd.qcut(long_stocks["beta"], 4, labels=["L", "1", "2", "H"])

        short_stocks=sig_stocks[sig_stocks["beta"]<0]
        # short_stocks["bin"]=pd.qcut(short_stocks["beta"], 4, labels=["L", "1", "2", "H"])

        #long_stocks = long_stocks[long_stocks["bin"] == "H"]
        # short_stocks = short_stocks[short_stocks["bin"] == "L"]
        
    return long_stocks,short_stocks
    #split into long and short

In [214]:
#loop through dates and get long and short stocks
#make dictionary of long and short stocks
long_dict={}
short_dict={}

long_short_dict={}

for date in sig_stocks["date"].unique():
    long_stocks,short_stocks=long_short(sig_stocks,date)
    long_short_dict[date] = {}
    long_short_dict[date]["long"]=list(long_stocks.gvkey)
    long_short_dict[date]["short"]=list(short_stocks.gvkey)

    long_dict[date] = list(long_stocks.gvkey)
    short_dict[date] = list(short_stocks.gvkey)



In [215]:
# combine 
portfolio = pd.DataFrame.from_dict(long_short_dict).T
portfolio.reset_index(inplace=True)
portfolio.columns = ["date", "long", "short"]

In [216]:
portfolio

Unnamed: 0,date,long,short
0,2004-11-30,"[8, 13, 14, 16, 37, 43, 44, 47, 51, 53, 57, 66...","[0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 15, 17..."
1,2004-12-31,"[1285, 1286, 1289, 1290, 1292, 1293, 1294, 129...","[1287, 1288, 1291, 1296, 1308, 1309, 1310, 131..."
2,2005-01-31,"[2327, 2328, 2329, 2330, 2332, 2334, 2338, 233...","[2331, 2333, 2335, 2336, 2337, 2340, 2341, 234..."
3,2005-02-28,"[3273, 3274, 3278, 3279, 3281, 3282, 3284, 328...","[3275, 3276, 3277, 3280, 3283, 3285, 3286, 328..."
4,2005-03-31,"[4220, 4222, 4223, 4224, 4225, 4227, 4228, 422...","[4218, 4219, 4221, 4226, 4230, 4231, 4232, 423..."
...,...,...,...
177,2019-08-31,"[423710, 423711, 423712, 423713, 423715, 42371...","[423714, 423761, 423770, 423779, 423791, 42380..."
178,2019-09-30,"[426602, 426603, 426604, 426605, 426606, 42660...","[426692, 426693, 426948, 426979, 426987, 42717..."
179,2019-10-31,"[429489, 429490, 429491, 429492, 429493, 42949...","[429539, 429597, 429680, 429714, 429823, 42984..."
180,2019-11-30,"[432330, 432331, 432332, 432333, 432334, 43233...","[432501, 432520, 432521, 432581, 432670, 43269..."


In [217]:
# get size of long from each row
long_size = []
short_size = []
for i in range(len(portfolio)): 
    long_size.append(len(portfolio["long"][i]))
    short_size.append(len(portfolio["short"][i]))

In [218]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=portfolio["date"], y=long_size, name="long"))
fig.add_trace(go.Scatter(x=portfolio["date"], y=short_size, name="short"))
fig.update_layout(title="Size of Long and Short", xaxis_title="Date", yaxis_title="Size of Long and Short")

# plot cpi
fig.add_trace(go.Scatter(x=cpi["date"], y=cpi["CPI"], name="CPI", yaxis="y2"))
fig.update_layout(yaxis2=dict(overlaying="y", side="right", title="CPI"))

fig.show()

In [121]:
sig_stocks = sig_stocks.merge(stocks[["datadate", "gvkey", "trt1m"]], how="left", left_on=["gvkey", "date"], right_on=["gvkey", "datadate"])

In [179]:
portfolio

Unnamed: 0,date,long,short
0,2004-11-30,"[21519, 28111, 28109, 23218, 112622, 19159, 85...","[124438, 151908, 143664, 112544, 124320, 12435..."
1,2004-12-31,"[12550, 121662, 105478, 65809, 65809, 105570, ...","[24943, 24943, 205874, 3138, 29944, 10013, 285..."
2,2005-01-31,"[16454, 152808, 108172, 62521, 155032, 62206, ...","[30598, 205942, 106903, 26065, 110386, 205874,..."
3,2005-02-28,"[142380, 143625, 148240, 21311, 140190, 2204, ...","[63896, 63896, 62592, 64017, 21320, 62597, 157..."
4,2005-03-31,"[5330, 108365, 157235, 17286, 17197, 11708, 31...","[160926, 63826, 25364, 107541, 105478, 105488,..."
...,...,...,...
177,2019-08-31,"[186126, 183278, 183278, 160341, 285660, 19627...","[61090, 147564, 171114, 147891, 31668, 194855,..."
178,2019-09-30,"[184113, 19359, 19359, 175474, 175474, 107616,...","[186173, 186173, 177166, 184142, 180390, 31453..."
179,2019-10-31,"[30571, 32698, 180874, 180874, 177487, 18874, ...","[31848, 147891, 276613, 66597, 16705, 26045, 1..."
180,2019-11-30,"[111667, 32870, 5284, 5284, 31628, 26881, 2688...","[66042, 184747, 184747, 33858, 33852, 21765, 6..."


In [219]:
returns_dict={}
for date in portfolio["date"]:
    next_month=date+MonthEnd(1)
    # get all stocks we want to long
    long_stocks = portfolio[(portfolio["date"] == date)]["long"].values[0]
    short_stocks = portfolio[(portfolio["date"] == date)]["short"].values[0]
    long_returns=stocks[(stocks["datadate"]==next_month) & (stocks["gvkey"].isin(long_stocks))]["trt1m"]
    short_returns=stocks[(stocks["datadate"]==next_month) & (stocks["gvkey"].isin(short_stocks))]["trt1m"]

    long_mean=long_returns.mean()
    short_mean=short_returns.mean()

    long_short_return=long_mean-short_mean

    returns_dict[date]=long_short_return

In [220]:
#for each date get the return of each stock in the next month
returns_dict={}
for date in sig_stocks["date"].sort_values().unique():
    #get the next month
    #if there is no next month, skip
    # if date not in long_dict.keys():
    #     continue
    next_month=date+MonthEnd(1)
    #get the stocks that are long and short
    long_stocks=long_dict[date]
    short_stocks=short_dict[date]
    #get the returns of the stocks
    long_returns=stocks[(stocks["datadate"]==next_month) & (stocks["gvkey"].isin(long_stocks))]["trt1m"]
    short_returns=stocks[(stocks["datadate"]==next_month) & (stocks["gvkey"].isin(short_stocks))]["trt1m"]
    # break
    #get the mean of the returns
    long_mean=long_returns.mean()
    short_mean=short_returns.mean()
    #get the difference
    long_short_return=long_mean-short_mean
    #create a dictionary of dates and differences
    returns_dict[date]=long_short_return
    # #get the t-stat
    # t_stat=diff/np.sqrt((long_returns.var()/len(long_returns))+(short_returns.var()/len(short_returns)))
    # #get the p-value
    # p_value=1-stats.t.cdf(t_stat,len(long_returns)+len(short_returns)-2)
    # print("Date: ",date)
    # print("Long Mean: ",long_mean)
    # print("Short Mean: ",short_mean)
    # print("Difference: ",diff)
    # print("t-stat: ",t_stat)
    # print("p-value: ",p_value)
    # print("")

In [221]:
#drop keys with nan values
returns_dict={k: v for k, v in returns_dict.items() if not np.isnan(v)}
#sort by date
returns_dict=OrderedDict(sorted(returns_dict.items()))
#convert to dataframe
returns_dict=pd.DataFrame.from_dict(returns_dict,orient="index")
returns_dict.columns=["return"]
returns_dict

Unnamed: 0,return
2004-11-30,3.773637
2004-12-31,1.891985
2005-01-31,-3.497916
2005-02-28,-3.386312
2005-03-31,1.054663
...,...
2019-07-31,9.748178
2019-08-31,-12.306143
2019-09-30,-14.330279
2019-10-31,4.249741


In [222]:
# plot months
fig = px.line(returns_dict, x=returns_dict.index, y="return")
fig.show()


In [223]:
returns_dict["return"].mean()

-0.19147243037882578

In [224]:
#Histogram of long-short returns
fig = px.histogram(returns_dict, x="return", nbins=100, title="Long Short Distribution")
fig.show()
