In [1]:
import wrds
import warnings
import pandas as pd
import numpy as np
import time
from tqdm import tqdm
import os
# import statsmodels.formula.api as smf
import datetime
from pandas.tseries.offsets import MonthEnd

In [2]:
# functions in /fire_pytools/utils

def pk_integrity(df, primary_key):

    """Check that df columns in primary_key consist no missing values or duplicates."""

    assert df[primary_key].notna().all().all(), 'Null values detected in primary key.'
    assert not df[primary_key].duplicated().any(), 'Duplicate values detected in primary key.'

    pass



def monthly_date(date):
    """
    Get year and month from a datestamp as a integer
    """
    mdate = date.year*100+date.month

    return mdate


def create_crsp_calendar(db):
    """"
    Create CRSP daily calender.
    Input:
        db: WRDS connection
    Return:
        tdays: pandas costume business days calendar
    """

    #db = wrds.Connection(wrds_username='lmota')

    # Erroneous dates: on these dates there are only 1 or 22 (2012-10-29) stock observations, all missing returns
    # seem to be erroneous holidays/weekenddays
    err_dates = pd.to_datetime(['1952-09-27', '1962-08-12', '1972-12-25', '1972-12-28',
                 '1973-01-25', '1973-09-09', '1976-08-29', '1977-01-15', '1977-07-14',
                 '1978-11-26', '1981-11-01', '1985-09-27', '1987-04-19', '1992-10-24',
                 '1995-09-30', '1996-02-19', '2001-09-11', '2012-10-29', '2017-09-30'])

    # Create dist_prc variable - count number of valid days since last
    sql = "SELECT DISTINCT date FROM crspq.dsf"
    dates = db.raw_sql(sql, date_cols=['date'])
    dates = dates.sort_values(['date']).reset_index().date

    # Notice that we are marking weekend as holidays because in the begging of the sample saturdays were trading days.
    all_days = pd.date_range(min(dates), max(dates))
    holidays = all_days[np.logical_not(all_days.isin(dates))]
    holidays = holidays.append(err_dates)
    holidays = holidays.unique()

    # Define CRSP's trading days calendar
    tdays = np.busdaycalendar(weekmask=[1, 1, 1, 1, 1, 1, 1], holidays=holidays.tolist())

    # Test
    # np.busday_count(min(dates), max(dates), busdaycal=TDays) == len(dates) - 1

    return tdays


def post_event_nan(df, event, vars, id_vars=['permno', 'date']):

    event_dt = df[event][id_vars]
    event_dt.rename(columns={id_vars[1]: 'event_date'}, inplace=True)
    pk_integrity(event_dt, id_vars[0])

    df = pd.merge(df, event_dt, on=id_vars[0], how='left')
    df.loc[df[id_vars[1]] > df['event_date'], vars] = np.nan
    df.drop(columns={'event_date'}, inplace=True)

    return df

TABLE = {'cusip': 'all',
         'permco': 'all',
         'issuno': 'all',
         'hexcd': 'all',
         'hsiccd': 'all',
         'shrout': 'all',
         'bidlo': 'sf',
         'askhi': 'sf',
         'prc': 'sf',
         'vol': 'sf',
         'ret': 'sf',
         'bid': 'sf',
         'ask': 'sf',
         'cfacpr': 'sf',
         'cfacshr': 'sf',
         'retx': 'sf',
         'altprc': 'msf',
         'spread': 'msf',
         'altprcdt': 'msf',
         'openprc': 'dsf',
         'numtrd': 'dsf',
         'comnam': 'seall',
         'dclrdt': 'seall',
         'dlamt': 'seall',
         'dlpdt': 'seall',
         'dlstcd': 'seall',
         'ncusip': 'seall',
         'nextdt': 'seall',
         'paydt': 'seall',
         'rcrddt': 'seall',
         'shrcls': 'seall',
         'shrflg': 'seall',
         'ticker': 'seall',
         'hsicmg': 'seall',
         'hsicig': 'seall',
         'nameendt': 'seall',
         'shrcd': 'seall',
         'exchcd': 'seall',
         'siccd': 'seall',
         'tsymbol': 'seall',
         'naics': 'seall',
         'primexch': 'seall',
         'trdstat': 'seall',
         'secstat': 'seall',
         'distcd': 'seall',
         'divamt': 'seall',
         'facpr': 'seall',
         'facshr': 'seall',
         'acperm': 'seall',
         'accomp': 'seall',
         'shrenddt': 'seall',
         'nwperm': 'seall',
         'nwcomp': 'seall',
         'dlretx': 'seall',
         'dlprc': 'seall',
         'dlret': 'seall',
         'trtscd': 'seall',
         'trtsendt': 'seall',
         'nmsind': 'seall',
         'mmcnt': 'seall',
         'nsdinx': 'seall',
         'year': 'mseall',
         'month': 'mseall'}

# MSEPAD is used when creating CRSP
# When merging mse and sf tables and for vars in MSEPAD we should copy information from
# event observations to succeeding non-event observations.
MSEPAD = ["ticker", "comnam", "ncusip", "shrout", "siccd", "exchcd",
          "shrcls", "shrcd",  "shrflg", "trtscd", "nmsind", "mmcnt",
          "naics", "nsdinx"]


# %% Function Definition


def crsp_sf(varlist, start_date, end_date, freq, permno_list=None, shrcd_list=None, exchcd_list=None, db=None):


    start_time = time.time()
    if db is None:
        db = wrds.Connection() # make sure to configure wrds connector before hand.

    assert freq in ['daily', 'monthly'], "freq must be either 'daily' or 'monthly'."

    if freq == 'daily' and len(varlist) > 1:
        warnings.warn('''Querying multiple variables from daily tables may take a long time if limited RAM is available (16GB or less).
                         WRDS connection may time out causing an error.''')

    prefix = 'd' if freq == 'daily' else 'm'

    sfvars = ['permno', 'date']
    sfvars += [i for i in varlist if TABLE[i] in ['{}sf'.format(prefix), 'sf']]
    sfvars += [i + ' AS {}_sf'.format(i) for i in varlist if TABLE[i] == 'all']

    seallvars = ['permno', 'date']

    if ('divamt' in set(varlist)) & ('facpr' not in set(varlist)):
        seallvars += ['facpr']
    if ('divamt' in set(varlist)) & ('distcd' not in set(varlist)):
        seallvars += ['distcd']

    seallvars += [i for i in varlist if TABLE[i] in ['{}seall'.format(prefix), 'seall']]
    seallvars += [i + ' AS {}_seall'.format(i) for i in varlist if TABLE[i] == 'all']

    permnos = 'AND permno IN ({})'.format(", ".join(map(str, permno_list))) if permno_list else ''

    sql = '''
          SELECT {}
          FROM crsp.{}sf
          WHERE date >= DATE '{}'
          AND date <= DATE '{}'
          {}
          '''.format(', '.join(sfvars), prefix, start_date, end_date, permnos)

    df = db.raw_sql(sql, date_cols=['date'])

    sql = '''
          SELECT {}
          FROM crsp.{}seall
          WHERE date >= DATE '{}'
          AND date <= DATE '{}'
          {}
          '''.format(', '.join(seallvars), prefix, start_date, end_date, permnos)

    df2 = db.raw_sql(sql, date_cols=['date'])

    # There are some duplicates in mseall that differ in variables related to cash distributions and acquisitions.
    # Cumulate if possible, or remove arbitrary row. Raise warning if any relevant variable was queried.
    nonunique_cols = ['rcrddt', 'dclrdt', 'acperm', 'accomp', 'paydt', 'distcd', 'facpr', 'facshr']

    if set(varlist).intersection(set(nonunique_cols)):
        warnings.warn('''Duplicate permno-date observations were deleted arbitrarily.
                             {} percent of the observations.
                             Please resolve manually the affected columns:
                             {}
                             '''.format(str((df2.duplicated(subset=['permno', 'date']).sum() / df2.shape[0]) * 100),
                                        ', '.join(nonunique_cols)))

    # Duplicated happens when there is two distribution types at the same day.
    # We need to have permno/date as primary key
    # df2.duplicated(subset=['permno', 'date', 'distcd', 'paydt']).sum() # TODO: why isn't this primary key?
    # We start with  52665 duplicated permno/date. 1867 non zero facpr (monthly).

    if 'divamt' in set(varlist):
        # Calculate (TOTAL) DIVAMT:
        # "If the Distribution code is 6225, a nonzero amount represents an offer price given to a certain amount of
        # shares. For these cases, the dollar value per share is actually DIVAMT multiplied by the percent of shares
        # accepted by the offer, where the percent of shares accepted can be derived by multiplying FACPR by negative
        # one. See FACPR." (WRDS)
        # print(df2[df2.distcd == 6225].to_string())

        df2.set_index(['permno', 'date'], inplace=True)

        div = df2[['distcd', 'divamt', 'facpr']].copy()
        div.reset_index(inplace=True)
        div.loc[div.distcd == 6225, 'divamt'] = div['facpr']*(-1)*div['divamt']

        div = div.groupby(['permno', 'date'])['divamt'].sum(min_count=1).to_frame('divamt')

        df2.reset_index(inplace=True)
        df2.drop(columns=['divamt'], inplace=True)

    # For all other entries keep last
    df2 = df2[~df2.duplicated(subset=['permno', 'date'], keep='last')].copy()
    pk_integrity(df2, ['date', 'permno'])

    # Merge tables.
    df['permno'] = df['permno'].astype('int')
    df2['permno'] = df2['permno'].astype('int')

    # Merge tables with following logic: for each sf observation, merge most recent seall observation with same permno.
    # If particular permno-date combination does not exist in sf but does in seall, include that observation.
    df.sort_values(['date', 'permno'], inplace=True)
    df2.sort_values(['date', 'permno'], inplace=True)

    df = pd.merge_asof(df, df2, on='date', by='permno', direction='backward')

    df2 = df2[(df2['date'] >= start_date) & (df2['date'] <= end_date)]
    df = df.merge(df2, on=['permno', 'date'], how='outer', validate='1:1', suffixes=('', '_y'), indicator=True)

    for i in df2.columns.tolist()[2:]:
        df.loc[df['_merge'] == 'right_only', i] = df[i + '_y']
    df.drop([s + '_y' for s in df2.columns.tolist()[2:]] + ['_merge'], axis=1, inplace=True)
    del df2

    # Make sure that downloaded columns that exist in both tables agree and keep union of them.
    for i in [j for j in varlist if TABLE[j] == 'all']:
        mask = df[[i + '_sf', i + '_seall']].notnull().all(axis=1)
        assert (df.loc[mask, i + '_sf'] == df.loc[mask, i + '_seall']).all(), \
            'sf and seall had conflicting values for {}.'.format(i)
        df.rename(columns={i + '_sf': i}, inplace=True)
        df.loc[df[i].isnull(), i] = df[i + '_seall']
        df.drop(i + '_seall', axis=1, inplace=True)

    # Add total divamt
    if 'divamt' in set(varlist):
        df = df.merge(div, on=['permno', 'date'], how='left')

    # delete wanted rows
    if exchcd_list:
        df = df[df.exchcd.isin(exchcd_list)]
    if shrcd_list:
        df = df[df.shrcd.isin(shrcd_list)]
    df = df[(df['date'] >= start_date) & (df['date'] <= end_date)]

    df = df[['permno', 'date']+varlist]

    pk_integrity(df, ['permno', 'date'])

    df.sort_values(['permno', 'date'], inplace=True)

    print("CRSP data was successfully downloaded in %s seconds." % str(time.time() - start_time))

    return df

def calculate_melag_weight(mdata):
    required_cols = ['lag_me', 'lag_dlret']

    set(required_cols).issubset(mdata.columns), "Required columns: {}.".format(', '.join(required_cols))

    df = mdata[required_cols].copy()
    df['melag'] = df.groupby('permno').lag_me.fillna(method='pad')
    df.reset_index(inplace=True)

    # Fill na after delisting
    df = post_event_nan(df=df, event=df.lag_dlret.notnull(), vars=['melag'], id_vars=['permno', 'date'])

    df.set_index(['permno', 'date'], inplace=True)

    return df[['melag']]

def get_adata_quarter(adata, data_name):

    data = adata.pivot_table(index='date', columns='permno', values= data_name).astype('float')
    # data = data.fillna(method='ffill', limit=2)
    data = data.loc[[x for x in data.index if x.month % 3 == 2]].reindex(data.index).ffill(limit=2).dropna(how='all')

    return data

def which_quantile(data_scalar, nyse_criteria_list):
  if np.isinf(data_scalar):
    return np.nan

  for q in range(1, len(nyse_criteria_list) - 1):
    if nyse_criteria_list[q-1] <= data_scalar < nyse_criteria_list[q]:
      return q

  return len(nyse_criteria_list) - 1


def get_quantile(data_series, nyse_bool, q=5):
    # NaN 값 처리
    data_series = data_series.dropna()

    if len(data_series) == 0:
        return np.nan

    # nyse_bool에서 NaN 값을 False로 변환
    tmp_nyse_bool = nyse_bool.loc[data_series.name].fillna(False)
    nyse_data_series = data_series[tmp_nyse_bool]

    nyse_criteria = [np.nanpercentile(nyse_data_series, x) for x in range(0, 100+int(100/q), int(100/q))]

    return data_series.apply(lambda x: which_quantile(x, nyse_criteria))



def get_port_rtn(ret_df, wl1_df):
  return (ret_df * wl1_df).sum(axis=1).replace(0, np.nan).dropna()


def get_quantile_rtn(ret_df, me_df, quantile_df):
  q = np.unique(quantile_df.values.flatten())
  q = int(max(q[~np.isnan(q)]))

  port_rtn = pd.DataFrame(index=ret_df.index, columns=range(1, q+1), dtype=float)

  for i in range(1, q+1):
    tmp_me = me_df[quantile_df == i]
    tmp_wl1 = tmp_me.divide(tmp_me.sum(axis=1), axis=0).shift(1).dropna(how='all')

    port_rtn[i] = get_port_rtn(ret_df, tmp_wl1)

  port_rtn[f'{q}-1'] = port_rtn[q] - port_rtn[1]
  return port_rtn.dropna(how='all')

def calculate_annual_volatility_and_sharpe_ratio(df, risk_free_rate=0.02):
    """
    월간 수익률 데이터프레임에서 연간 변동성과 샤프 비율을 계산합니다.

    :param df: 월간 수익률이 포함된 데이터프레임. 인덱스는 날짜, 컬럼은 'returns'.
    :param risk_free_rate: 연간 무위험 수익률 (기본값: 0.02)
    :return: 연간 변동성과 샤프 비율
    """
    # 연간 수익률 계산
    cumulative_returns = (1 + df['returns']).prod() ** (12 / len(df)) - 1

    # 연간 변동성 계산
    annual_volatility = df['returns'].std() * np.sqrt(12)

    # 샤프 비율 계산
    sharpe_ratio = (cumulative_returns - risk_free_rate) / annual_volatility

    return annual_volatility, sharpe_ratio


In [3]:
# %% Set Up
db = wrds.Connection()
start_time = time.time()

WRDS recommends setting up a .pgpass file.
Created .pgpass file successfully.
You can create this file yourself at any time with the create_pgpass_file() function.
Loading library list...
Done


In [4]:
from datetime import date
# from crsp import crsp_sf  # 가상의 함수, 실제 코드에 맞게 수정 필요

print('Stock monthly calculation started.')

# 필요한 변수 목록
varlist = ['dlret', 'dlretx', 'exchcd', 'naics', 'permco', 'prc', 'ret', 'shrcd', 'shrout', 'siccd', 'ticker', 'cusip']

start_date = '2000-01-01'
end_date = date.today().strftime("%Y-%m-%d")
# end_date = '2023-12-31'
freq = 'monthly'
permno_list = None  # 필요에 따라 특정 permno 리스트를 지정할 수 있습니다.
shrcd_list = [10, 11]
exchcd_list = [1, 2, 3]

# crsp_sf 함수 호출 (가상의 함수, 실제 코드에 맞게 수정 필요)
crspm = crsp_sf(varlist,
                start_date,
                end_date,
                freq=freq,
                permno_list=permno_list,
                shrcd_list=shrcd_list,
                exchcd_list=exchcd_list,
                db=db)

Stock monthly calculation started.
CRSP data was successfully downloaded in 170.67633652687073 seconds.


In [5]:
start_time = time.time()
if db is None:
    db = wrds.Connection() # make sure to configure wrds connector before hand.

assert freq in ['daily', 'monthly'], "freq must be either 'daily' or 'monthly'."

if freq == 'daily' and len(varlist) > 1:
    warnings.warn('''Querying multiple variables from daily tables may take a long time if limited RAM is available (16GB or less).
                     WRDS connection may time out causing an error.''')

prefix = 'd' if freq == 'daily' else 'm'

sfvars = ['permno', 'date']
sfvars += [i for i in varlist if TABLE[i] in ['{}sf'.format(prefix), 'sf']]
sfvars += [i + ' AS {}_sf'.format(i) for i in varlist if TABLE[i] == 'all']

seallvars = ['permno', 'date']

if ('divamt' in set(varlist)) & ('facpr' not in set(varlist)):
    seallvars += ['facpr']
if ('divamt' in set(varlist)) & ('distcd' not in set(varlist)):
    seallvars += ['distcd']

seallvars += [i for i in varlist if TABLE[i] in ['{}seall'.format(prefix), 'seall']]
seallvars += [i + ' AS {}_seall'.format(i) for i in varlist if TABLE[i] == 'all']

permnos = 'AND permno IN ({})'.format(", ".join(map(str, permno_list))) if permno_list else ''

# WRDS 데이터베이스 연결
db = wrds.Connection()

# SQL 쿼리 문자열
sql = '''
      SELECT {}
      FROM crsp.{}sf
      WHERE date >= DATE '{}'
      AND date <= DATE '{}'
      {}
      '''.format(', '.join(sfvars), prefix, start_date, end_date, permnos)

# SQL 쿼리 실행
df = db.raw_sql(sql, date_cols=['date'])

WRDS recommends setting up a .pgpass file.
Created .pgpass file successfully.
You can create this file yourself at any time with the create_pgpass_file() function.
Loading library list...
Done


In [6]:
crspm['rankyear'] = crspm.date.dt.year
crspm.loc[crspm.date.dt.month <= 6, 'rankyear'] = crspm.loc[crspm.date.dt.month <= 6, 'rankyear'] - 1

# Returns adjusted for delisting
crspm['retadj'] = ((1 + crspm['ret'].fillna(0)) * (1 + crspm['dlret'].fillna(0)) - 1)
crspm.loc[crspm[['ret', 'dlret']].isnull().all(axis=1), 'retadj'] = np.nan

crspm['me'] = abs(crspm['prc']) * (crspm['shrout'] / 1000)

# Adjust for delisting
crspm.loc[crspm.dlret.notnull(), 'me'] = np.nan

start_time1 = time.time()

crspm['edate'] = crspm['date'] + MonthEnd(0)
crspm.sort_values(['permno', 'edate'], inplace=True)
pk_integrity(crspm, ['permno', 'edate'])
crspm.set_index('edate', inplace=True)

# Resample to take care of missing months
scrspm = crspm[['permno', 'me', 'dlret']].groupby('permno').resample('M').mean().drop(columns='permno')  # mean maintains nan
scrspm = scrspm.groupby('permno').shift(1)
scrspm.columns = ['lag_' + i for i in scrspm.columns]

crspm.reset_index(inplace=True)
crspm.set_index(['permno', 'edate'], inplace=True)

crspm_full = crspm.join(scrspm, how='outer')

crspm_full.reset_index(inplace=True)

# Delete rows that were not in the original data set
crspm_full.dropna(subset=['date'], inplace=True)
crspm_full.drop(columns=['edate'], inplace=True)

print("Finish resampling. Time to resample data: %s minutes" % str((time.time() - start_time1)/60))

# Create MElag
crspm_full.set_index(['permno', 'date'], inplace=True)
crspm_full['melag_weights'] = calculate_melag_weight(crspm_full)
crspm_full.drop(columns=[x for x in crspm.columns if 'lag_' in x], inplace=True)

crspm_full.reset_index(inplace=True)
crspm_full.sort_values(['permno', 'date'], inplace=True)

print("Time to create CRSP monthly: %s seconds" % str(time.time() - start_time))

stock_monthly = crspm_full

print("Time to create CRSP monthly: %s minutes" % str(np.round((time.time() - start_time)/60, 2)))

Finish resampling. Time to resample data: 0.6724679271380106 minutes
Time to create CRSP monthly: 145.83034920692444 seconds
Time to create CRSP monthly: 2.43 minutes


In [7]:
# SQL 쿼리 작성 (Compustat 데이터베이스에서 필요한 데이터 추출)
query = """
    SELECT a.gvkey, a.datadate, c.tic, c.cusip, a.at, a.lt, a.sale, a.cogs, a.xsga, a.xrd, a.ebitda, a.ebit, a.xint, a.pi, a.txt, a.xido, a.ib, a.ni, a.dvc, a.dvt, a.capx, a.oancf, a.prstkc, a.sstk,  a.dltis, a.dlcch, a.fincf, a.act, a.rect, a.che, a.invt, a.intan, a.ivao, a.ppegt, a.ppent, a.lct,  a.ap, a.dlc, a.txp, a.dltt, a.txditc, a.pstkrv, a.seq, a.ceq, a.icapt, a.mib, a.ivst, a.ao, a.lo, a.dp
    FROM comp.funda as a
    LEFT JOIN comp.names as c
    ON a.gvkey = c.gvkey
    WHERE a.indfmt = 'INDL' AND a.datafmt = 'STD' AND a.popsrc = 'D' AND a.consol = 'C'
    AND a.datadate >= '2020-01-01'
"""

# 쿼리 실행 및 데이터프레임으로 변환
funda_df = db.raw_sql(query)

In [8]:
# SQL 쿼리 작성
query = """
    SELECT q.gvkey, q.datadate, c.tic, c.cusip,  q.actq, q.aoq, q.apq, q.atq, q.ceqq, q.cheq, q.cogsq, q.dlcq, q.dpq, q.epspiq, q.ibq, q.icaptq, q.intanq, q.invtq, q.ivaoq, q.ivstq, q.lctq, q.loq, q.ltq, q.mibtq, q.niq, q.piq, q.ppegtq, q.ppentq, q.pstkrq, q.recdq, q.rectq, q.rectrq, q.saleq, q.seqq, q.txdiq, q.txpq, q.txtq, q.xidoq, q.xintq,q.xrdq, q.xsgaq, q.capxy, q.chechy, q.dlcchy, q.dltisy, q.fincfy, q.oancfy, q.prstkcy, q.sstky
    FROM comp.fundq as q
    LEFT JOIN comp.names as c
    ON q.gvkey = c.gvkey
    WHERE q.indfmt = 'INDL' AND q.datafmt = 'STD' AND q.popsrc = 'D' AND q.consol = 'C'
    AND q.datadate >= '2000-01-01'
"""

# 쿼리 실행 및 데이터프레임으로 변환
fundq_df = db.raw_sql(query)

# 데이터프레임 출력
print(fundq_df.head())

    gvkey    datadate     tic      cusip      actq      aoq      apq  \
0  001013  2000-01-31  ADCT.1  000886309  1137.412  333.830  103.291   
1  001082  2000-01-31  SERV.1  81765M106    39.941    8.971   13.077   
2  001173  2000-01-31   AIM.1  008015307    17.435    0.877    1.968   
3  001183  2000-01-31   IDAI.  45169P106       NaN    8.982    0.632   
4  001189  2000-01-31   AGR.7  008489502   454.479  125.407      NaN   

        atq      ceqq     cheq  ...    xrdq    xsgaq   capxy  chechy  dlcchy  \
0  1814.519  1396.005  364.315  ...  75.200  205.600  46.785  21.138     NaN   
1   110.675    21.270    4.440  ...     NaN    2.169   9.597  -2.956  -7.367   
2    22.774    12.644    0.964  ...   0.801    1.839   0.558  -0.754   0.174   
3   103.962    83.879   54.333  ...     NaN      NaN   1.091  22.224   0.000   
4   839.170   363.297      NaN  ...     NaN      NaN  17.181 -12.630     NaN   

   dltisy  fincfy  oancfy  prstkcy   sstky  
0     NaN  16.289  41.582    0.000  42.65

In [9]:

# cusip의 마지막 자리수를 제거하는 함수 정의
def remove_last_digit(cusip):
    if pd.isnull(cusip):  # NaN 처리
        return cusip
    return cusip[:-1]

# apply 함수를 사용하여 cusip 열에 적용
funda_df['cusip'] = funda_df['cusip'].apply(remove_last_digit)
fundq_df['cusip'] = fundq_df['cusip'].apply(remove_last_digit)


funda_df.rename(columns={'datadate': 'edate'}, inplace=True)
funda_df['edate'] = pd.to_datetime(funda_df['edate'], format='%Y-%m-%d')
crspm_re = crspm.reset_index()
# adata = pd.merge(crspm_re, funda_df, on=['cusip', 'edate'], how='left')
# adata.to_csv(r'C:\Users\82108\PycharmProjects\Factor_Model\US\data\adata_2024_2q.csv')

In [30]:
fundq_df[fundq_df['tic'] == 'AAPL'][['tic', 'edate', 'aoq', 'saleq', 'cogsq', 'epspiq']].head(10)

Unnamed: 0,tic,edate,aoq,saleq,cogsq,epspiq
1998,AAPL,2000-03-31,1781.0,1945.0,1375.0,1.44
14735,AAPL,2000-06-30,1461.0,1825.0,1257.0,0.62
27252,AAPL,2000-09-30,1063.0,1870.0,1385.0,0.52
39555,AAPL,2000-12-31,735.0,1007.0,1004.0,-0.58
51481,AAPL,2001-03-31,454.0,1431.0,1024.0,0.12
63659,AAPL,2001-06-30,476.0,1475.0,1015.0,0.17
75613,AAPL,2001-09-30,314.0,1450.0,983.0,0.19
87369,AAPL,2001-12-31,277.0,1375.0,925.0,0.11
98801,AAPL,2002-03-31,263.0,1495.0,1057.0,0.11
110541,AAPL,2002-06-30,261.0,1429.0,1010.0,0.09


In [11]:
# save_path = r"C:\Users\MetaM\Downloads\fundq_2025-05-22.csv"
# fundq_df.to_csv(save_path, index=False)

In [12]:
# qdata.tail(12)

In [13]:
# fundaq_2024 = fundq_df[fundq_df['edate'] >= '2024-06-01']
# qticker_2024 = fundaq_2024['tic'].unique().tolist()
# qticker_2024 = [ticker for ticker in qticker_2024 if ticker is not None and len(ticker) != 5 and not any(char.isdigit() for char in ticker)]
# len(qticker_2024)

In [14]:
end_date = date.today().strftime('%Y-%m-%d')

In [15]:
import FinanceDataReader as fdr
# import yfinance as yf

In [16]:
sp = fdr.StockListing('SP500')
amex = fdr.StockListing('AMEX')
nyse = fdr.StockListing('NYSE')
nasdaq = fdr.StockListing('NASDAQ')
# 데이터 확인
sp500 = pd.DataFrame(sp)
amx = pd.DataFrame(amex)
nys = pd.DataFrame(nyse)
nsq = pd.DataFrame(nasdaq)

## 제외 되는 업종  'Utilities', 'Financials',  'Real Estate'
sp500_ex_fin= sp500[~sp500['Sector'].isin(['Utilities', 'Financials',  'Real Estate' ])].copy()

# amx['Industry'].unique().tolist()
ex_industry = ['금융, 상품 시장 운영 및 서비스 제공', '폐쇄형 펀드', '은행',  '투자 지주 회사', '부동산 임대, 개발 및 운영',
               '투자 관리 및 펀드 운영', '상업용 REITs', '전력 유틸리티','특수 REITs','주거용 REITs','소비자 대출',
               '복합보험 및 중개인', '기업 금융 서비스' ,'']

amx_ex_fin = amx[~amx['Industry'].isin(ex_industry)].copy()
nys_ex_fin = nys[~nys['Industry'].isin(ex_industry)].copy()
nsq_ex_fin = nsq[~nsq['Industry'].isin(ex_industry)].copy()

sp500_ex_fin.loc[:,'exchange']= 'SP500'
amx_ex_fin.loc[:,'exchange']= 'AMEX'
nys_ex_fin.loc[:,'exchange']= 'NYSE'
nsq_ex_fin.loc[:,'exchange']= 'NASDAQ'

filterd_df = pd.concat([sp500_ex_fin, amx_ex_fin, nys_ex_fin, nsq_ex_fin], join='inner')
ticker_list = filterd_df['Symbol'].unique().tolist()
# ticker_list
# .이나 ' '을 포함하지 않은 값을 필터링하는 함수
filtered_list = [item for item in ticker_list if '.' not in item and ' ' not in item]
len(filtered_list)

100%|██████████| 307/307 [00:00<00:00, 870.05it/s] 
100%|██████████| 2728/2728 [00:03<00:00, 876.34it/s] 
100%|██████████| 3650/3650 [00:03<00:00, 946.53it/s] 


4554

In [18]:
prc = []
error_list = []
for ticker in tqdm(filtered_list):
    try:
        df = fdr.DataReader(ticker, '2000-01-01', '2025-04-30')
        monthly_df = df[['Close', 'Adj Close']].resample('M').last()
        # print(monthly_df.head(5))
        monthly_df['ret'] = monthly_df['Adj Close'].pct_change(1)  
        monthly_df['ticker'] = ticker
        prc.append(monthly_df)
    except: error_list.append(ticker)

prc_df = pd.concat(prc)

100%|██████████| 4554/4554 [37:16<00:00,  2.04it/s] 


In [19]:
prc_df['edate'] = prc_df.index
data_after_specific_date = prc_df.loc[prc_df.index >= '2000-01-01'].copy()
data_after_specific_date = data_after_specific_date.rename(columns={'Close': 'prc'})
data_after_specific_date.index.name = 'date'
prc_df_re = data_after_specific_date.reset_index()
prc_df_re['rankyear'] = prc_df_re['date'].dt.year
concated_crspm = pd.concat([crspm_re, prc_df_re])
crspm_df = concated_crspm.groupby('ticker').apply(lambda group: group.ffill().infer_objects(copy=False)) 
# 데이터 프레임의 형식 일치 

fundq_df.rename(columns={'datadate': 'edate'}, inplace=True)

# 날짜 형식 일치화
crspm_df['edate'] = pd.to_datetime(crspm_df['edate'])
fundq_df['edate'] = pd.to_datetime(fundq_df['edate'])

fundq_df['cusip'] = fundq_df['cusip'].apply(lambda x: str(x)[:7] if len(str(x)) > 7 else x)
crspm_df['cusip'] = crspm_df['cusip'].apply(lambda x: str(x)[:7] if len(str(x)) > 7 else x)
qdata = pd.merge(crspm_df, fundq_df, on=['cusip', 'edate'], how='left')
adata = pd.merge(crspm_df, funda_df, on=['cusip', 'edate'], how='left')
mdata = crspm_df
mdata['me'] = (mdata['prc'] * mdata['shrout'])/1000
# common_list = sorted(list(set(crspm_re['ticker']) & set(prc_df['ticker'])))

In [20]:
mdata = mdata[~mdata['permno'].isna()].copy()
qdata = qdata[~qdata['permno'].isna()].copy()

mdata['permno'] = mdata['permno'].astype('int')
qdata['permno'] = qdata['permno'].astype('int')

qdata['siccd'] = qdata['siccd'].apply(lambda x: np.nan if np.isnan(x) else int(x))

# 한국, 미국데이터 공통 전처리

util_and_fin = list(range(4900, 4950)) + list(range(6000, 7000)) # 유틸리티, 금융업 제외 (한국은 금융업만 제외)
qdata = qdata[~qdata['siccd'].isin(util_and_fin)]

permno_list = sorted(list(set(mdata['permno']) & set(qdata['permno'])))
print(len(permno_list))

mdata = mdata[mdata['permno'].isin(permno_list)].copy()
qdata = qdata[qdata['permno'].isin(permno_list)].copy()

# naics	코드에서 상업은행 제외 14943
mdata = mdata[mdata['naics'] != 522110]
qdata = qdata[qdata['naics'] != 522110]

dates = pd.to_datetime(mdata['date'].sort_values().unique())

qdata = qdata[~np.isnan(qdata['date'])]

10687


In [21]:
# 주요 데이터
# ret은 수정주가 수익률
# Set option to opt into future behavior
# pd.set_option('future.no_silent_downcasting', True)

ret = mdata.pivot_table(index='date', columns='permno', values='ret').astype('float')
shrout = mdata.pivot_table(index='date', columns='permno', values='shrout').astype('float')

prc = abs(mdata.pivot_table(index='date', columns='permno', values='prc').astype('float'))
prc_jun = prc.loc[[x for x in prc.index if x.month % 3 == 0]].reindex(prc.index).ffill(limit=2).dropna(how='all')

me = prc * shrout / 1000
me = me[me > 0]

me_jun = me.loc[[x for x in me.index if x.month % 3 == 0]].reindex(me.index).ffill(limit=2).dropna(how='all')
# me_dec = me.shift(6).loc[[x for x in me.index if x.month == 6]].reindex(me.index).fillna(method='ffill', limit=11).dropna(how='all')

adata['be'] = adata['at'] - adata['lt']
be = adata.pivot_table(index='date', columns='permno', values='be').astype('float')
be_jun = be.loc[[x for x in be.index if x.month % 3 == 0]].reindex(be.index).ffill(limit=2).dropna(how='all')

market = mdata.pivot_table(index='date', columns='permno', values='exchcd', aggfunc=lambda x: x)
market_jun = market.loc[[x for x in be.index if x.month % 3 == 0]].reindex(be.index).ffill(limit=2).dropna(how='all')

# 예시: 데이터프레임이 혼합된 타입을 가지고 있을 경우
market = market.apply(pd.to_numeric, errors='coerce')
# NaN 값을 유지하면서 Int64 타입으로 변환
market = market.astype('Int64')

# 주가 수익률의 정확한 계산을 위한 수정주가 import and 수익률 계산

prc_bool = prc_jun > 5 # 미국 주식의 경우 prc_jun > 5
nyse_bool = (market == 1) |( market == 2) | (market == 3)
be_bool = be_jun > 0

me_nyse_criteria = me_jun[nyse_bool].apply(lambda x: np.nanpercentile(x, 5), axis=1)
me_bool = me_jun.ge(me_nyse_criteria, axis=0)
universe_bool = prc_bool & be_bool & me_bool

ret.replace([np.inf, -np.inf], np.nan, inplace=True)
ri = (1 + ret).cumprod()

qdata_fill = qdata.ffill(limit=2)

In [22]:
qdata_fill = qdata_fill.ffill(limit=1)
qdata_fill.tail(12)

Unnamed: 0,permno,edate,date,dlret,dlretx,exchcd,naics,permco,prc,ret,...,xrdq,xsgaq,capxy,chechy,dlcchy,dltisy,fincfy,oancfy,prstkcy,sstky
2763702,91205,2012-04-30,2012-04-30,,,1.0,337910,50616.0,2.06,0.019802,...,,100.124,3.822,-10.051,,0.702,-10.537,1.372,0.01,0.0
2763703,91205,2012-05-31,2012-05-31,,,1.0,337910,50616.0,1.68,-0.184466,...,,106.498,6.641,-26.774,,1.525,-37.635,13.857,0.188,0.046
2763704,91205,2012-06-30,2012-06-29,,,1.0,337910,50616.0,1.85,0.101191,...,,106.498,6.641,-26.774,,1.525,-37.635,13.857,0.188,0.046
2763705,91205,2012-07-31,2012-07-31,,,1.0,337910,50616.0,1.67,-0.097297,...,,106.498,6.641,-26.774,,1.525,-37.635,13.857,0.188,0.046
2763706,91205,2012-08-31,2012-08-31,,,1.0,337910,50616.0,1.6,-0.041916,...,,120.369,10.011,-19.142,,1.269,-47.753,31.402,2.905,0.062
2763707,91205,2012-09-30,2012-09-28,,,1.0,337910,50616.0,2.18,0.3625,...,,120.369,10.011,-19.142,,1.269,-47.753,31.402,2.905,0.062
2763708,91205,2012-10-31,2012-10-31,,,1.0,337910,50616.0,2.23,0.022936,...,,120.369,10.011,-19.142,,1.269,-47.753,31.402,2.905,0.062
2763709,91205,2012-11-30,2012-11-30,,,1.0,337910,50616.0,2.17,-0.026906,...,,124.358,15.914,20.179,,5.236,-46.123,82.108,3.059,0.104
2763710,91205,2012-12-31,2012-12-31,,,1.0,337910,50616.0,2.17,0.0,...,,124.358,15.914,20.179,,5.236,-46.123,82.108,3.059,0.104
2763711,91205,2013-01-31,2013-01-31,,,1.0,337910,50616.0,2.16,-0.004608,...,,124.358,15.914,20.179,,5.236,-46.123,82.108,3.059,0.104


In [27]:
qdata_fill[qdata_fill['ticker'] == 'NSC'][['date','saleq', 'niq']].head(12)

Unnamed: 0,date,saleq,niq
1735900,2000-01-31,235.622,-4.717
1735901,2000-02-29,235.622,-4.717
1735902,2000-03-31,1508.0,-48.0
1735903,2000-04-28,1508.0,-48.0
1735904,2000-05-31,1508.0,-48.0
1735905,2000-06-30,1592.0,116.0
1735906,2000-07-31,1592.0,116.0
1735907,2000-08-31,1592.0,116.0
1735908,2000-09-29,1535.0,99.0
1735909,2000-10-31,1535.0,99.0


In [34]:
# 저장 경로
save_path = fr"C:\Users\82108\OneDrive\바탕 화면\investment\investment_strategy\DATA"
os.makedirs(save_path, exist_ok=True)

# 파일 저장
# adata : 년간 데이터 
# mdata : 월간 데이터 
# qdata_fill : 분기 데이터 (월간 주기)
# fundq_df :  분기 데이터 (분기 주기) 
# adata.to_csv(os.path.join(save_path, f"adata_{end_date}.csv"), index=False)
# mdata.to_csv(os.path.join(save_path, f"mdata_{end_date}.csv"), index=False)
qdata_fill.to_csv(os.path.join(save_path, f"qdata_{end_date}.csv"), index=False)
fundq_df.to_csv(os.path.join(save_path, f"fundq_{end_date}.csv"), index=False)

In [33]:
fundq_df

Unnamed: 0,gvkey,edate,tic,cusip,actq,aoq,apq,atq,ceqq,cheq,...,xrdq,xsgaq,capxy,chechy,dlcchy,dltisy,fincfy,oancfy,prstkcy,sstky
0,001013,2000-01-31,ADCT.1,0008863,1137.412,333.830,103.291,1814.519,1396.005,364.315,...,75.200,205.600,46.785,21.138,,,16.289,41.582,0.000,42.657
1,001082,2000-01-31,SERV.1,81765M1,39.941,8.971,13.077,110.675,21.270,4.440,...,,2.169,9.597,-2.956,-7.367,9.503,-4.911,5.032,0.000,0.000
2,001173,2000-01-31,AIM.1,0080153,17.435,0.877,1.968,22.774,12.644,0.964,...,0.801,1.839,0.558,-0.754,0.174,1.974,0.528,-0.724,0.542,0.000
3,001183,2000-01-31,IDAI.,45169P1,,8.982,0.632,103.962,83.879,54.333,...,,,1.091,22.224,0.000,0.000,-2.582,-18.925,2.582,
4,001189,2000-01-31,AGR.7,0084895,454.479,125.407,,839.170,363.297,,...,,,17.181,-12.630,,38.728,32.244,-21.550,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166436,260849,2025-04-30,UTG,7561581,,,,,,,...,,,,,,,,,,
166437,260850,2025-04-30,SCD,50208A1,,,,,,,...,,,,,,,,,,
166438,264645,2025-04-30,CSQ,1281251,,,,,,,...,,,,,,,,,,
166439,339965,2025-04-30,SNOW,8334451,4785.974,2819.130,155.263,8157.407,2408.000,3910.684,...,472.404,1024.073,44.989,-379.270,0.000,0.000,-564.057,228.373,623.136,59.453


In [8]:
#Accounting variables

#Income statement
sale = get_adata_quarter(qdata_fill, 'saleq')
cogs = get_adata_quarter(qdata_fill, 'cogsq')
gp = sale - cogs
xsga = get_adata_quarter(qdata_fill, 'xsgaq')
# xad = get_adata_annual(adata, 'XAD')
xrd = get_adata_quarter(qdata_fill, 'xrdq')
# xlr = get_adata_annual(adata, 'XLR')
# opex = get_adata_annual(adata, 'XOPR')
# ebitda = get_adata_quarter(adata, 'ebitda')
# ebit = get_adata_quarter(adata, 'ebit')
xint = get_adata_quarter(qdata_fill, 'xintq')
op = get_adata_quarter(qdata_fill, 'xintq')
# ope = ebitda - xint.replace(np.nan, 0)
pi = get_adata_quarter(qdata_fill, 'piq')   # pretax income
tax = get_adata_quarter(qdata_fill, 'txtq')  # income tax
xido = get_adata_quarter(qdata_fill, 'xidoq')# extraordinary item and discontinue
ni = get_adata_quarter(qdata_fill, 'ibq')
nix = get_adata_quarter(qdata_fill, 'niq')
fi = nix + xint.replace(np.nan, 0)
# dvc = get_adata_quarter(adata, 'dvc')
# div = get_adata_annual(adata, 'dvt')


#Cash flow statement
capx = get_adata_quarter(qdata_fill, 'capxy') # capex
capex_sale = capx.divide(sale.replace(0, np.nan), axis=0)
ocf = get_adata_quarter(qdata_fill, 'oancfy')
fcf = ocf - capx
eqbb = get_adata_quarter(qdata_fill, 'prstkcy') # 자사주 매입
eqis = get_adata_quarter(qdata_fill, 'sstky')   # 자사주 매각
eqnetis = eqis.replace(np.nan, 0) - eqbb.replace(np.nan, 0)
# eqpo = div.replace(np.nan, 0) + eqbb.replace(np.nan, 0)
# eqnpo = div - eqnetis

dltnetis = get_adata_quarter(qdata_fill, 'dltisy') # 장기채권 발행
dstnetis = get_adata_quarter(qdata_fill, 'dlcchy') # 유동채권 변화
dbnetis = dltnetis - dstnetis
netis = eqnetis.replace(np.nan, 0) + dbnetis.replace(np.nan, 0)
# fincf = get_adata_annual(adata, 'fincf')


# Balance Sheet - Assets
at = get_adata_quarter(qdata_fill, 'atq') # 분기 자산
at = at[at > 0]
ca = get_adata_quarter(qdata_fill, 'actq')  # 분기 유동자산
rec = get_adata_quarter(qdata_fill, 'rectq') # receivables
cash = get_adata_quarter(qdata_fill, 'cheq') # 현금
inv = get_adata_quarter(qdata_fill, 'invtq') # inventory
nca = at - ca
intan = get_adata_quarter(qdata_fill, 'intanq') #무형자산
ivao = get_adata_quarter(qdata_fill, 'ivaoq') # investment
ppeg = get_adata_quarter(qdata_fill, 'ppegtq') # porperty plant and Equipment total
ppen = get_adata_quarter(qdata_fill, 'ppentq') # porperty plant and Equipment net

# #Balance Sheet - Liabilities
lt = get_adata_quarter(qdata_fill, 'ltq')  # 총부채
cl = get_adata_quarter(qdata_fill, 'lctq') # 유동부채
ap = get_adata_quarter(qdata_fill, 'atq')  # asset total
debtst = get_adata_quarter(qdata_fill, 'dlcq') # Debt in current liability
txp = get_adata_quarter(qdata_fill, 'txpq')    # Income taxes payable
ncl = lt - cl
debtlt = get_adata_quarter(qdata_fill, 'dltisy')  # long term debt total
txditc = get_adata_quarter(qdata_fill, 'txdiq') #  Income Taxes - Deferred (txdiq)

# #Balance Sheet - Financing
pstk = get_adata_quarter(qdata_fill, 'pstkrq') # 상환우선주
debt = (debtst.replace(np.nan, 0) + debtlt.replace(np.nan, 0)).replace(0, np.nan)
netdebt = debt - cash.replace(np.nan, 0)

seq = get_adata_quarter(qdata_fill, 'seqq')  # share holder equity
ceq = get_adata_quarter(qdata_fill, 'ceqq')  # common equit

# be = ceq + txditc.replace(np.nan, 0)
# be = be[be > 0]
icapt = get_adata_quarter(qdata_fill, 'icaptq') # invested capital
mib = get_adata_quarter(qdata_fill, 'mibtq')      # Noncontrolling Interests - Total - Balance Sheet (mibtq)
bev = icapt + debtst.replace(np.nan, 0) - cash.replace(np.nan, 0)
bev = bev[bev > 0]
# 
#Balance Sheet - Summary
nwc = ca - cl
coa = ca - cash
col = cl - debtst.replace(np.nan, 0)
cowc = coa - col
ncoa = at - ca - ivao
ncol = lt - cl - debtlt
nncoa = ncoa - ncol

# ivst = get_data(adata, 'IVST')

# fna = ivst.replace(np.nan, 0) + ivao.replace(np.nan, 0)
# fnl = debt + pstk.replace(np.nan, 0)
# nfna = fna - fnl
oa = coa + ncoa
ol = col + ncol
ao = get_adata_quarter(qdata_fill, 'aoq')# asset other
lo = get_adata_quarter(qdata_fill, 'loq')  # liabilities other
dp = get_adata_quarter(qdata_fill, 'dpq')
noa = oa - ol
lnoa = ppen + intan + ao - lo + dp
# invt = get_adata_annual(adata, 'invt')
# caliq = ca - invt
# ppeinv = ppen + invt
aliq = cash + 0.75*coa + 0.5*(at - ca - intan.replace(np.nan, 0))

be = at - lt
be = be[be > 0]

roeq = ni.divide(be.replace(0, np.nan), axis=0)
roaq = ni.divide(at.replace(0, np.nan), axis=0)

#Market based
oacc = ni - ocf
# tacc = oacc + nfna.diff(12)
# ocf_qtr = get_adata_annual(adata, 'oancfq')
# cop = ebitda + xrd.replace(np.nan, 0) - oacc
# emp = get_adata_annual(adata, 'emp')

mev = me + netdebt
mev = mev[mev > 0]
mev_jun = mev.loc[[x for x in mev.index if x.month == 6]].reindex(mev.index).ffill(limit=11).dropna(how='all')

#Accounting Characteristics

##Accounting based size measures
assets = at
sales = sale
book_equity = be
net_income = ni
enterprise_value = mev
mat = at + be + me

NameError: name 'get_adata_quarter' is not defined

In [9]:
me_quantile =  me_jun.iloc[1:].apply(lambda x: get_quantile(x, universe_bool, 10), axis=1)
roeq_quantile = roeq.iloc[1:].apply(lambda x: get_quantile(x, universe_bool, 5), axis=1)
roaq_quantile = roaq.iloc[1:].apply(lambda x: get_quantile(x, universe_bool, 5), axis=1)

start_year = '2001'
# eq_data = me.applymap(lambda x: np.nan if np.isnan(x) else 1)

NameError: name 'me_jun' is not defined

In [89]:
me_quantile.head(3)

permno,10026,10028,10032,10044,10051,10066,10104,10107,10145,10158,...,93384,93393,93397,93401,93420,93422,93423,93426,93434,93436
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-04-30,10.0,10.0,10.0,10.0,10.0,,10.0,10.0,10.0,10.0,...,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
2020-05-29,10.0,10.0,10.0,10.0,10.0,,10.0,10.0,10.0,10.0,...,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
2020-06-30,10.0,10.0,10.0,10.0,10.0,,10.0,10.0,10.0,10.0,...,,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0


In [81]:
me_jun

permno,10026,10028,10032,10044,10051,10066,10104,10107,10145,10158,...,93384,93393,93397,93401,93420,93422,93423,93426,93434,93436
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-03-31,2285.448,67.84848,1601.6088,29.112,583.14382,,152412.71472,1197019.0,94627.920657,500.3414,...,96.62715,252.69196,233.9398,204.27984,113.44375,81.009879,1061.59878,251.22088,68.4659,96940.0
2020-04-30,2285.448,67.84848,1601.6088,29.112,583.14382,,152412.71472,1197019.0,94627.920657,500.3414,...,96.62715,252.69196,233.9398,204.27984,113.44375,81.009879,1061.59878,251.22088,68.4659,96940.0
2020-05-29,2285.448,67.84848,1601.6088,29.112,583.14382,,152412.71472,1197019.0,94627.920657,500.3414,...,96.62715,252.69196,233.9398,204.27984,113.44375,81.009879,1061.59878,251.22088,68.4659,96940.0
2020-06-30,2401.23144,164.2364,2059.36416,25.77625,625.85208,,169606.05414,1540774.0,101480.20232,819.7878,...,,1039.89403,258.76242,260.90127,240.738,312.41478,1626.83727,308.33152,76.22724,200844.7
2020-07-31,2401.23144,164.2364,2059.36416,25.77625,625.85208,,169606.05414,1540774.0,101480.20232,819.7878,...,,1039.89403,258.76242,260.90127,240.738,312.41478,1626.83727,308.33152,76.22724,200844.7
2020-08-31,2401.23144,164.2364,2059.36416,25.77625,625.85208,,169606.05414,1540774.0,101480.20232,819.7878,...,,1039.89403,258.76242,260.90127,240.738,312.41478,1626.83727,308.33152,76.22724,200844.7
2020-09-30,2466.32685,115.50396,2065.64498,16.25956,602.75782,,179750.0136,1590936.0,115520.49963,992.8818,...,,1012.2612,254.7688,171.0828,89.873,218.77101,1720.6686,317.81664,82.95848,406701.5
2020-10-30,2466.32685,115.50396,2065.64498,16.25956,602.75782,,179750.0136,1590936.0,115520.49963,992.8818,...,,1012.2612,254.7688,171.0828,89.873,218.77101,1720.6686,317.81664,82.95848,406701.5
2020-11-30,2466.32685,115.50396,2065.64498,16.25956,602.75782,,179750.0136,1590936.0,115520.49963,992.8818,...,,1012.2612,254.7688,171.0828,89.873,218.77101,1720.6686,317.81664,82.95848,406701.5
2020-12-31,2945.19372,140.01,2249.78886,24.58755,838.01691,,190449.55946,1678381.0,149248.6122,1560.19984,...,,1050.616,293.95575,225.6,,580.61704,2897.7157,395.13696,100.10345,677340.2


In [90]:
universe_bool

permno,10026,10028,10032,10044,10051,10066,10104,10107,10145,10158,...,93384,93393,93397,93401,93420,93422,93423,93426,93434,93436
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-03-31,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2020-04-30,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2020-05-29,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2020-06-30,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2020-07-31,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2020-08-31,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2020-09-30,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2020-10-30,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2020-11-30,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2020-12-31,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
