In [2]:
# packeges
import numpy as np
import pandas as pd

In [3]:
# hide warning messages
import warnings
warnings.filterwarnings("ignore")

In this notebook we're gonna create portfolios by decile for two factors, size and value from our intradaily sample.

In [4]:
# getting the daterange in daily frequency to create some dataframes (we're gonna use the marketcap dataset for this)
mktcap_path = '../../output/data/marketcap.csv'
marketcap = pd.read_csv(mktcap_path, index_col=0)
daterange = marketcap.index

In [5]:
# datetime object
daterange = pd.to_datetime(daterange)
daterange

DatetimeIndex(['2005-01-03', '2005-01-04', '2005-01-05', '2005-01-06',
               '2005-01-07', '2005-01-10', '2005-01-11', '2005-01-12',
               '2005-01-13', '2005-01-14',
               ...
               '2019-12-17', '2019-12-18', '2019-12-19', '2019-12-20',
               '2019-12-23', '2019-12-24', '2019-12-26', '2019-12-27',
               '2019-12-30', '2019-12-31'],
              dtype='datetime64[ns]', length=3775, freq=None)

In [6]:
# getting the factor column names from factors (we're gonna need this for the last loop)
factors_path = f'../../input/factors/20050103.csv'
factors = pd.read_csv(factors_path, index_col=0)

In [7]:
# all factors
pNYSE_factors = factors.columns[4:]

In [8]:
pNYSE_factors

Index(['pNYSE_size', 'pNYSE_value', 'pNYSE_prof', 'pNYSE_dur', 'pNYSE_valprof',
       'pNYSE_fscore', 'pNYSE_debtiss', 'pNYSE_repurch', 'pNYSE_nissa',
       'pNYSE_accruals', 'pNYSE_growth', 'pNYSE_aturnover', 'pNYSE_gmargins',
       'pNYSE_divp', 'pNYSE_ep', 'pNYSE_cfp', 'pNYSE_noa', 'pNYSE_inv',
       'pNYSE_invcap', 'pNYSE_igrowth', 'pNYSE_sgrowth', 'pNYSE_lev',
       'pNYSE_roaa', 'pNYSE_roea', 'pNYSE_sp', 'pNYSE_gltnoa', 'pNYSE_divg',
       'pNYSE_invaci', 'pNYSE_mom', 'pNYSE_indmom', 'pNYSE_valmom',
       'pNYSE_valmomprof', 'pNYSE_shortint', 'pNYSE_mom12', 'pNYSE_momrev',
       'pNYSE_lrrev', 'pNYSE_valuem', 'pNYSE_nissm', 'pNYSE_sue', 'pNYSE_roe',
       'pNYSE_rome', 'pNYSE_roa', 'pNYSE_strev', 'pNYSE_ivol', 'pNYSE_betaarb',
       'pNYSE_season', 'pNYSE_indrrev', 'pNYSE_indrrevlv', 'pNYSE_indmomrev',
       'pNYSE_ciss', 'pNYSE_price', 'pNYSE_age', 'pNYSE_shvol', 'pNYSE_exchsw',
       'pNYSE_ipo'],
      dtype='object')

In [9]:
# factors with just two percentiles (1 and 10)
pNYSE_factors_ = ['pNYSE_fscore', 'pNYSE_debtiss', 'pNYSE_repurch', 'pNYSE_exchsw', 'pNYSE_ipo']

In [9]:
# dropping the factors with just two percentiles
pNYSE_factors = list(set(pNYSE_factors) - set(pNYSE_factors_))

In [10]:
pNYSE_factors

['pNYSE_igrowth',
 'pNYSE_season',
 'pNYSE_divp',
 'pNYSE_indrrev',
 'pNYSE_divg',
 'pNYSE_lrrev',
 'pNYSE_strev',
 'pNYSE_inv',
 'pNYSE_price',
 'pNYSE_sue',
 'pNYSE_indmom',
 'pNYSE_roa',
 'pNYSE_age',
 'pNYSE_sp',
 'pNYSE_shortint',
 'pNYSE_growth',
 'pNYSE_roaa',
 'pNYSE_lev',
 'pNYSE_roe',
 'pNYSE_indmomrev',
 'pNYSE_mom',
 'pNYSE_momrev',
 'pNYSE_ep',
 'pNYSE_invcap',
 'pNYSE_gltnoa',
 'pNYSE_gmargins',
 'pNYSE_cfp',
 'pNYSE_roea',
 'pNYSE_noa',
 'pNYSE_valmom',
 'pNYSE_invaci',
 'pNYSE_aturnover',
 'pNYSE_sgrowth',
 'pNYSE_mom12',
 'pNYSE_indrrevlv',
 'pNYSE_dur',
 'pNYSE_nissa',
 'pNYSE_value',
 'pNYSE_valuem',
 'pNYSE_size',
 'pNYSE_accruals',
 'pNYSE_nissm',
 'pNYSE_shvol',
 'pNYSE_betaarb',
 'pNYSE_prof',
 'pNYSE_valmomprof',
 'pNYSE_ivol',
 'pNYSE_ciss',
 'pNYSE_valprof',
 'pNYSE_rome']

### Functions

drop_ticker function: receives two parameters, percentile and df_returns. 

* percentile is the percentile list (p1, p2, ..., p10).
* df_returns is the returns dataframe of any trade day.

This funtion returns the percentile list with just the ticks that are in the returns dataframe.

In [10]:
def drop_ticker(percentile, df_returns):
    drop_tickers = []
    for ticker in percentile:
        if ticker not in df_returns.columns:
            drop_tickers.append(ticker)
    for ticker in drop_tickers:
        percentile.remove(ticker)
    return percentile

portfolio_decile function: receives three parameters, col, df_factors and df_returns.

* col is the percentile NYSE factor column (pNYSE_size, pNYSE_value, ..., pNYSE_ipo)
* df_factors if the factors dataframe of any day.
* df_returns is the returns dataframe of any trade day.

This funtion returns all percentiles (p1, p2, ..., p10), each of them has tickers of firms whose are in this respective percentile and there is its matching column in returns dataframe.

In [11]:
def portfolio_decile(col, df_factors, df_returns): 
    # taking out all firms with NA for this specified factor
    temp = df_factors[df_factors[col].notna()]   

    # creating an empty list for 10 percentiles
    p1 = []
    p2 = [] 
    p3 = [] 
    p4 = [] 
    p5 = [] 
    p6 = [] 
    p7 = [] 
    p8 = [] 
    p9 = [] 
    p10 = [] 

    # filling the percentile lists with the respective tickers whose are in this percentile
    for permno in temp.index:
        if temp[col][permno] == 1:
            p1.append(temp['TAQ_TICKER'][permno])
        elif temp[col][permno] == 2:
            p2.append(temp['TAQ_TICKER'][permno])
        elif temp[col][permno] == 3:
            p3.append(temp['TAQ_TICKER'][permno])
        elif temp[col][permno] == 4:
            p4.append(temp['TAQ_TICKER'][permno])
        elif temp[col][permno] == 5:
            p5.append(temp['TAQ_TICKER'][permno])
        elif temp[col][permno] == 6:
            p6.append(temp['TAQ_TICKER'][permno])
        elif temp[col][permno] == 7:
            p7.append(temp['TAQ_TICKER'][permno])
        elif temp[col][permno] == 8:
            p8.append(temp['TAQ_TICKER'][permno])
        elif temp[col][permno] == 9:
            p9.append(temp['TAQ_TICKER'][permno])
        elif temp[col][permno] == 10:
            p10.append(temp['TAQ_TICKER'][permno])

    """
    Now, we need to use the drop_ticker function.
    Thus, we'll have the percentile lists with just the tickers that are in the returns dataframe.
    We'll use a loop to pass for all percentile lists
    """    
    percentiles = [p1,p2,p3,p4,p5,p6,p7,p8,p9,p10]
    for p in percentiles:
        drop_ticker(p, df_returns)

    return(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10)

value_weight function: receives two parameters, percentile and df_factors.

* percentile is the percentile list (p1, p2, ..., p10).
* df_factors if the factors dataframe of any day.

This funtion returns the value weight of the firms in this percentile (they are in the same sequence of the percentile list).

In [12]:
def value_weight(percentile, df_factors):
    # getting the marketcap value list
    marketcap = list(df_factors[df_factors['TAQ_TICKER'].isin(percentile)]['MARKETCAP'].values)
    # sum of the marketcaps of this percentile firms
    sum_marketcap = sum(marketcap)
    # getting the value weight
    weight = marketcap/sum_marketcap
    return weight

portfolio_formation function: receives four parameters, col, df, df_factors, df_returns.

* col is the percentile NYSE factor column (pNYSE_size, pNYSE_value, ..., pNYSE_ipo).
* df is the factor dataframe which will be filled with the value weighted portfolios.
* df_factors if the factors dataframe of any day.
* df_returns is the returns dataframe of any trade day.

This funtion returns nothing. It just fills the df inputed with portfolio returns.

In [13]:
def portfolio_formation(col, df, df_factors, df_returns):
    # getting the percentile lists from portfolio_decile function
    p1, p2, p3, p4, p5, p6, p7, p8, p9, p10 = portfolio_decile(col, df_factors, df_returns)

    """
    Now, we'll fill the df (input) dataframe with percentiles portfolios.
    Each percentile column of df will receive the value weighted portfolio of the stock returns in this specified percentile.
    """

    df['p1'] = (df_returns[p1]*value_weight(p1, df_factors)).sum(axis=1)
    df['p2'] = (df_returns[p2]*value_weight(p2, df_factors)).sum(axis=1)
    df['p3'] = (df_returns[p3]*value_weight(p3, df_factors)).sum(axis=1)
    df['p4'] = (df_returns[p4]*value_weight(p4, df_factors)).sum(axis=1)
    df['p5'] = (df_returns[p5]*value_weight(p5, df_factors)).sum(axis=1)
    df['p6'] = (df_returns[p6]*value_weight(p6, df_factors)).sum(axis=1)
    df['p7'] = (df_returns[p7]*value_weight(p7, df_factors)).sum(axis=1)
    df['p8'] = (df_returns[p8]*value_weight(p8, df_factors)).sum(axis=1)
    df['p9'] = (df_returns[p9]*value_weight(p9, df_factors)).sum(axis=1)
    df['p10'] = (df_returns[p10]*value_weight(p10, df_factors)).sum(axis=1)

portfolio_formation_ function: receives four parameters, col, df, df_factors, df_returns.

* col is the percentile NYSE factor column (pNYSE_size, pNYSE_value, ..., pNYSE_ipo).
* df is the factor dataframe which will be filled with the value weighted portfolios.
* df_factors if the factors dataframe of any day.
* df_returns is the returns dataframe of any trade day.

This funtion returns nothing. It just fills the df inputed with portfolio returns.

Obs.: This function do the same thing that portfolio_formation, but it does for binary factors.

In [14]:
def portfolio_formation_(col, df, df_factors, df_returns):
    # getting the percentile lists from portfolio_decile function
    p1, p2, p3, p4, p5, p6, p7, p8, p9, p10 = portfolio_decile(col, df_factors, df_returns)

    """
    Now, we'll fill the df (input) dataframe with percentiles portfolios.
    Each percentile column of df will receive the value weighted portfolio of the stock returns in this specified percentile.
    """

    df['p1'] = (df_returns[p1]*value_weight(p1, df_factors)).sum(axis=1)
    df['p10'] = (df_returns[p10]*value_weight(p10, df_factors)).sum(axis=1)

n_firms function: receives four parameters, col, df, df_factors, df_returns.

* col is the percentile NYSE factor column (pNYSE_size, pNYSE_value, ..., pNYSE_ipo).
* df is the factor dataframe which will be filled with the value weighted portfolios.
* df_factors if the factors dataframe of any day.
* df_returns is the returns dataframe of any trade day.

This funtion returns nothing. It just fills the df inputed with number of firms.

In [15]:
def n_firms(col, df, date, df_factors, df_returns):
    # getting the percentile lists from portfolio_decile function
    p1, p2, p3, p4, p5, p6, p7, p8, p9, p10 = portfolio_decile(col, df_factors, df_returns)

    """
    Now, we'll fill the df (input) dataframe with number of firms by percentile.
    Each percentile column of df will receive the number of firms in this specified percentile.
    """

    df['p1'][date] = len(p1)
    df['p2'][date] = len(p2)
    df['p3'][date] = len(p3)
    df['p4'][date] = len(p4)
    df['p5'][date] = len(p5)
    df['p6'][date] = len(p6)
    df['p7'][date] = len(p7)
    df['p8'][date] = len(p8)
    df['p9'][date] = len(p9)
    df['p10'][date] = len(p10)

### Data Generator Process

In [16]:
# we need to create a date range for the period we have
bdates = pd.bdate_range('2005-01-01', '2019-12-31')
bdates_ = []

# we need to convert in the csv's names format
for date in bdates:
    day = str(date)[:4] + str(date)[5:7] + str(date)[8:10]
    bdates_.append(day)

#### Portfolio-by-Deciles Loop

In [16]:
for pNYSE in pNYSE_factors:
    # getting just the factor name (for the output path folder)
    factor = pNYSE[6:]
    # number of firms dataframe
    n10 = pd.DataFrame(index=daterange, columns=['p1','p2','p3','p4','p5','p6','p7','p8','p9','p10'])
    for day in bdates_:
        try:
            # factors dataframe
            factors_path = f'../../input/factors/{day}.csv'
            factors = pd.read_csv(factors_path, index_col=0)
            # dropping all firms whose doesn't have TAQ_TICKER, because that is the only variable we can connect to returns database
            factors = factors[factors['TAQ_TICKER'] != '<undefined>']
            # getting the absolute value of Market Cap
            factors['MARKETCAP'] = factors['MARKETCAP'].abs()
            # filling with 0 the firms with NaN value for Market Cap
            factors['MARKETCAP'].fillna(0, inplace=True)
            
            # returns dataframe
            returns_path = f'../../input/returns/{day}.csv'
            returns = pd.read_csv(returns_path, index_col=0)
            
            # portfolio returns dataframe
            p10 = pd.DataFrame(index=returns.index, columns=['p1','p2','p3','p4','p5','p6','p7','p8','p9','p10'])

            # filling the portfolio returns dataframes (intradaily)
            portfolio_formation(pNYSE, p10, factors, returns) 
            
            # filling the number of firms dataframes (daily)
            date = pd.to_datetime(day)
            n_firms(pNYSE, n10, date, factors, returns)

            # converting portfolio returns dataframes to csv
            output_path = f'../../output/data/double_check/{factor}/{day}.csv'
            p10.to_csv(output_path, sep=',', encoding='utf-8')
        except:
            pass
    # converting number of firms dataframes to csv
    output_path = f'../../output/data/double_check/{factor}/n10.csv'
    n10.to_csv(output_path, sep=',', encoding='utf-8')

#### Binary Portfolio Loop

In [17]:
for pNYSE in pNYSE_factors_:
    # getting just the factor name (for the output path folder)
    factor = pNYSE[6:]
    # number of firms dataframe
    n10 = pd.DataFrame(index=daterange, columns=['p1','p2','p3','p4','p5','p6','p7','p8','p9','p10'])
    for day in bdates_:
        try:
            # factors dataframe
            factors_path = f'../../input/factors/{day}.csv'
            factors = pd.read_csv(factors_path, index_col=0)
            # dropping all firms whose doesn't have TAQ_TICKER, because that is the only variable we can connect to returns database
            factors = factors[factors['TAQ_TICKER'] != '<undefined>']
            # getting the absolute value of Market Cap
            factors['MARKETCAP'] = factors['MARKETCAP'].abs()
            # filling with 0 the firms with NaN value for Market Cap
            factors['MARKETCAP'].fillna(0, inplace=True)
            
            # returns dataframe
            returns_path = f'../../input/returns/{day}.csv'
            returns = pd.read_csv(returns_path, index_col=0)
            
            # portfolio returns dataframe
            p10 = pd.DataFrame(index=returns.index, columns=['p1','p2','p3','p4','p5','p6','p7','p8','p9','p10'])

            # filling the portfolio returns dataframes (intradaily)
            portfolio_formation_(pNYSE, p10, factors, returns) 
            
            # filling the number of firms dataframes (daily)
            date = pd.to_datetime(day)
            n_firms(pNYSE, n10, date, factors, returns)

            # converting portfolio returns dataframes to csv
            output_path = f'../../output/data/double_check/{factor}/{day}.csv'
            p10.to_csv(output_path, sep=',', encoding='utf-8')
        except:
            pass
    # converting number of firms dataframes to csv
    output_path = f'../../output/data/double_check/{factor}/n10.csv'
    n10.to_csv(output_path, sep=',', encoding='utf-8')