# Size-Value-Momentum/Investment Sorts

In [1]:
import os
import pandas as pd

pd.__version__

'0.22.0'

In [2]:
def assign_bkts(df, var, brkpts, prefix=None, suffix="Bkt"):
    """
    Assigns buckets to column `var` based on percentiles in `brkpts`.
    
    Assumes breakpoints are already in dataframe `df` and are named the same as given in `brkpts`.
    Buckets are integers beginning with 1 ending with len(brkpts).
    """

    if not prefix:
        prefix = var
    varbkt = prefix + suffix

    df[varbkt] = pd.np.NaN

    for i, brkpt in enumerate(brkpts):  # index begins at 0
        if i==0:
            df.loc[df[var]<=df[brkpt], varbkt] = 1
        else:
            df.loc[(df[var]>df[brkpts[i-1]]) & (df[var]<=df[brkpt]), varbkt] = i + 1

    return df

In [3]:
%%time

df = pd.read_csv("C:/Data/Thesis/Combined_Data.csv")



Wall time: 41.1 s


In [15]:
df.date = pd.to_datetime(df.date)

In [4]:
size_bkts = pd.read_csv("C:/Data/Thesis/Bkts_Jun_ME_2.csv")

df = df.merge(size_bkts, on=["PERMNO", "HP"], how="left")

# Small

In [41]:
sdf = df[df.ME_JunBkt==1]

sdf.shape

(2370047, 41)

## $BM^m$ Buckets

In [42]:
brk = sdf[["date", "BMm"]][(sdf.EXCHCD==1) & (sdf.BMmOK)].groupby("date")
brk = brk.quantile(pd.np.array(range(1, 21))/20).unstack()
brk.columns = brk.columns.droplevel(0)

brk.to_csv("C:/Data/Thesis/Brks_Small_BMm.csv")  # column headers are decimals, potential floating point precision garbage

sdf = sdf.merge(brk.reset_index("date"), on="date", how="left")

sdf = assign_bkts(sdf, "BMm", [.25, .5, .75, 1.])

sdf = sdf.drop(pd.np.array(range(1, 21))/20, axis=1)

## Prior Buckets

In [43]:
brk = sdf[["date", "Prior"]][(sdf.EXCHCD==1) & (sdf.PriorOK)].groupby("date")
brk = brk.quantile(pd.np.array(range(1, 21))/20).unstack()
brk.columns = brk.columns.droplevel(0)

brk.to_csv("C:/Data/Thesis/Brks_Small_Prior.csv")  # column headers are decimals, potential floating point precision garbage

sdf = sdf.merge(brk.reset_index("date"), on="date", how="left")

sdf = assign_bkts(sdf, "Prior", [.25, .5, .75, 1.])

sdf = sdf.drop(pd.np.array(range(1, 21))/20, axis=1)

## $BM$ Buckets

In [44]:
brk = sdf[(sdf.EXCHCD==1) & (sdf.date.dt.month==7) & (sdf.BMOK)][["HP", "BMm"]].groupby("HP")
brk = brk.quantile(pd.np.array(range(1, 21))/20).unstack()
brk.columns = brk.columns.droplevel(0)

brk.to_csv("C:/Data/Thesis/Brks_Small_BM.csv")  # column headers are decimals, potential floating point precision garbage

sdf = sdf.merge(brk.reset_index("HP"), on="HP", how="left")

sdf = assign_bkts(sdf, "BM", [.25, .5, .75, 1.])

sdf = sdf.drop(pd.np.array(range(1, 21))/20, axis=1)

## Inv Buckets

In [45]:
brk = sdf[(sdf.EXCHCD==1) & (sdf.date.dt.month==7) & (~sdf.Inv.isna())][["HP", "Inv"]].groupby("HP")
brk = brk.quantile(pd.np.array(range(1, 21))/20).unstack()
brk.columns = brk.columns.droplevel(0)

brk.to_csv("C:/Data/Thesis/Brks_Small_Inv.csv")  # column headers are decimals, potential floating point precision garbage

sdf = sdf.merge(brk.reset_index("HP"), on="HP", how="left")

sdf = assign_bkts(sdf, "Inv", [.25, .5, .75, 1.])

sdf = sdf.drop(pd.np.array(range(1, 21))/20, axis=1)

## Size-Value-Momentum

In [46]:
grp, ix = ["date", "BMmBkt", "PriorBkt"], ((sdf.BMmOK) & (sdf.PriorOK))

sdf["BktSize"] = sdf[ix].groupby(grp)["L1_ME"].transform("sum")

for col in ["RET", "BMm", "BM", "Prior", "Inv", "BE_Growth_Future"]:
    print(col, end=" ")
    sdf["Wt"+col] = sdf[col] * sdf.L1_ME / sdf.BktSize
    bkt = sdf[ix].groupby(grp)["Wt"+col].sum()
    bkt = bkt.reset_index(grp[1:])
    bkt[grp[1]+grp[2]] = "S" + bkt[grp[1]].astype("int").astype("str") + bkt[grp[2]].astype("int").astype("str")
    bkt = bkt.pivot(columns=grp[1]+grp[2], values="Wt"+col).reset_index("date")
    bkt.to_csv("C:/Data/Thesis/Size_BMm_Prior_S_{}.csv".format(col))
print()

RET BMm BM Prior Inv BE_Growth_Future 


## Size-Value-Investment

In [48]:
grp, ix = ["date", "BMBkt", "InvBkt"], ((sdf.BMOK) & (~sdf.Inv.isna()))

sdf["BktSize"] = sdf[ix].groupby(grp)["Size"].transform("sum")

for col in ["RET", "BMm", "BM", "Prior", "Inv", "BE_Growth_Future"]:
    print(col, end=" ")
    sdf["Wt"+col] = sdf[col] * sdf.Size / sdf.BktSize
    bkt = sdf[ix].groupby(grp)["Wt"+col].sum()
    bkt = bkt.reset_index(grp[1:])
    bkt[grp[1]+grp[2]] = "S" + bkt[grp[1]].astype("int").astype("str") + bkt[grp[2]].astype("int").astype("str")
    bkt = bkt.pivot(columns=grp[1]+grp[2], values="Wt"+col).reset_index("date")
    bkt.to_csv("C:/Data/Thesis/Size_BM_Inv_S_{}.csv".format(col))
print()

RET BMm BM Prior Inv BE_Growth_Future 


# Big

In [49]:
sdf = df[df.ME_JunBkt==2]

sdf.shape

(581661, 41)

## $BM^m$ Buckets

In [50]:
brk = sdf[["date", "BMm"]][(sdf.EXCHCD==1) & (sdf.BMmOK)].groupby("date")
brk = brk.quantile(pd.np.array(range(1, 21))/20).unstack()
brk.columns = brk.columns.droplevel(0)

brk.to_csv("C:/Data/Thesis/Brks_Big_BMm.csv")  # column headers are decimals, potential floating point precision garbage

sdf = sdf.merge(brk.reset_index("date"), on="date", how="left")

sdf = assign_bkts(sdf, "BMm", [.25, .5, .75, 1.])

sdf = sdf.drop(pd.np.array(range(1, 21))/20, axis=1)

## Prior Buckets

In [51]:
brk = sdf[["date", "Prior"]][(sdf.EXCHCD==1) & (sdf.PriorOK)].groupby("date")
brk = brk.quantile(pd.np.array(range(1, 21))/20).unstack()
brk.columns = brk.columns.droplevel(0)

brk.to_csv("C:/Data/Thesis/Brks_Big_Prior.csv")  # column headers are decimals, potential floating point precision garbage

sdf = sdf.merge(brk.reset_index("date"), on="date", how="left")

sdf = assign_bkts(sdf, "Prior", [.25, .5, .75, 1.])

sdf = sdf.drop(pd.np.array(range(1, 21))/20, axis=1)

## $BM$ Buckets

In [52]:
brk = sdf[(sdf.EXCHCD==1) & (sdf.date.dt.month==7) & (sdf.BMOK)][["HP", "BMm"]].groupby("HP")
brk = brk.quantile(pd.np.array(range(1, 21))/20).unstack()
brk.columns = brk.columns.droplevel(0)

brk.to_csv("C:/Data/Thesis/Brks_Big_BM.csv")  # column headers are decimals, potential floating point precision garbage

sdf = sdf.merge(brk.reset_index("HP"), on="HP", how="left")

sdf = assign_bkts(sdf, "BM", [.25, .5, .75, 1.])

sdf = sdf.drop(pd.np.array(range(1, 21))/20, axis=1)

## Inv Buckets

In [53]:
brk = sdf[(sdf.EXCHCD==1) & (sdf.date.dt.month==7) & (~sdf.Inv.isna())][["HP", "Inv"]].groupby("HP")
brk = brk.quantile(pd.np.array(range(1, 21))/20).unstack()
brk.columns = brk.columns.droplevel(0)

brk.to_csv("C:/Data/Thesis/Brks_Big_Inv.csv")  # column headers are decimals, potential floating point precision garbage

sdf = sdf.merge(brk.reset_index("HP"), on="HP", how="left")

sdf = assign_bkts(sdf, "Inv", [.25, .5, .75, 1.])

sdf = sdf.drop(pd.np.array(range(1, 21))/20, axis=1)

## Size-Value-Momentum

In [54]:
grp, ix = ["date", "BMmBkt", "PriorBkt"], ((sdf.BMmOK) & (sdf.PriorOK))

sdf["BktSize"] = sdf[ix].groupby(grp)["L1_ME"].transform("sum")

for col in ["RET", "BMm", "BM", "Prior", "Inv", "BE_Growth_Future"]:
    print(col, end=" ")
    sdf["Wt"+col] = sdf[col] * sdf.L1_ME / sdf.BktSize
    bkt = sdf[ix].groupby(grp)["Wt"+col].sum()
    bkt = bkt.reset_index(grp[1:])
    bkt[grp[1]+grp[2]] = "B" + bkt[grp[1]].astype("int").astype("str") + bkt[grp[2]].astype("int").astype("str")
    bkt = bkt.pivot(columns=grp[1]+grp[2], values="Wt"+col).reset_index("date")
    bkt.to_csv("C:/Data/Thesis/Size_BMm_Prior_B_{}.csv".format(col))
print()

RET BMm BM Prior Inv BE_Growth_Future 


## Size-Value-Investment

In [55]:
grp, ix = ["date", "BMBkt", "InvBkt"], ((sdf.BMOK) & (~sdf.Inv.isna()))

sdf["BktSize"] = sdf[ix].groupby(grp)["Size"].transform("sum")

for col in ["RET", "BMm", "BM", "Prior", "Inv", "BE_Growth_Future"]:
    print(col, end=" ")
    sdf["Wt"+col] = sdf[col] * sdf.Size / sdf.BktSize
    bkt = sdf[ix].groupby(grp)["Wt"+col].sum()
    bkt = bkt.reset_index(grp[1:])
    bkt[grp[1]+grp[2]] = "B" + bkt[grp[1]].astype("int").astype("str") + bkt[grp[2]].astype("int").astype("str")
    bkt = bkt.pivot(columns=grp[1]+grp[2], values="Wt"+col).reset_index("date")
    bkt.to_csv("C:/Data/Thesis/Size_BM_Inv_B_{}.csv".format(col))
print()

RET BMm BM Prior Inv BE_Growth_Future 


# Combine Small and Big

In [48]:
prefix = "Size_BMm_Prior"

#for col in ["RET", "BMm", "BM", "Prior", "Inv", "BE_Growth_Future"]:
col = "BE_Growth_Future"
small = pd.read_csv("C:/Data/Thesis/{}_S_{}.csv".format(prefix, col)).iloc[:, 1:]
small.date = pd.to_datetime(small.date)
big = pd.read_csv("C:/Data/Thesis/{}_B_{}.csv".format(prefix, col)).iloc[:, 1:]
big.date = pd.to_datetime(big.date)
x = small.merge(big, how="left").set_index("date")["1963-07":"2017-12"].replace([pd.np.inf, -pd.np.inf], pd.np.nan).mean()

x.round(2).values.reshape([2, 4, 4])

array([[[ 0.42,  0.33,  0.36,  0.62],
        [ 0.08,  0.1 ,  0.11,  0.1 ],
        [ 0.01,  0.05,  0.05,  0.04],
        [-0.09, -0.02,  0.  , -0.02]],

       [[ 0.42,  0.25,  0.25,  0.5 ],
        [ 0.11,  0.1 ,  0.11,  0.13],
        [ 0.07,  0.08,  0.08,  0.09],
        [ 0.01,  0.04,  0.04,  0.03]]])

In [63]:
prefix = "Size_BM_Inv"

#for col in ["RET", "BMm", "BM", "Prior", "Inv", "BE_Growth_Future"]:
col = "BE_Growth_Future"
small = pd.read_csv("C:/Data/Thesis/{}_S_{}.csv".format(prefix, col)).iloc[:, 1:]
small.date = pd.to_datetime(small.date)
big = pd.read_csv("C:/Data/Thesis/{}_B_{}.csv".format(prefix, col)).iloc[:, 1:]
big.date = pd.to_datetime(big.date)
x = small.merge(big, how="left").set_index("date")["1963-07":"2017-12"].replace([pd.np.inf, -pd.np.inf], pd.np.nan).mean()

x.round(2).values.reshape([2, 4, 4])

array([[[ 0.6 ,  0.49,  0.28,  0.42],
        [ 0.06,  0.09,  0.12,  0.16],
        [ 0.03,  0.06,  0.09,  0.11],
        [-0.02,  0.01,  0.03,  0.05]],

       [[ 0.28,  0.21,  0.17,  0.4 ],
        [ 0.09,  0.11,  0.12,  0.17],
        [ 0.06,  0.08,  0.11,  0.14],
        [ 0.03,  0.05,  0.08,  0.1 ]]])