In [2]:
import polars as pl
import pandas as pd
import os, textwrap
from pathlib import Path
from hackutilitiy import *
import matplotlib.pyplot as plt

## Load relevant data

In [3]:
# Load the rets data for all securities
rets_db = pd.read_csv('../data/quant_data.csv', index_col='Unnamed: 0').loc[:, 
            ["gvkey", "id", "iid", "date", "stock_ret", "excntry", "me", "year", "month"]]

rets_db.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6401414 entries, 0 to 6401413
Data columns (total 9 columns):
 #   Column     Dtype  
---  ------     -----  
 0   gvkey      float64
 1   id         object 
 2   iid        object 
 3   date       int64  
 4   stock_ret  float64
 5   excntry    object 
 6   me         float64
 7   year       int64  
 8   month      int64  
dtypes: float64(3), int64(3), object(3)
memory usage: 488.4+ MB


In [4]:
print(f"rets_db dims: {rets_db.shape}")
print(f"rets_db dims NA dropped: {rets_db.dropna().shape}")

rets_db dims: (6401414, 9)
rets_db dims NA dropped: (6391093, 9)


In [5]:
# Load the text features harvested from the corporate filings (10K and 10Q Risk Factors and MD&As)
txt_features_db = pd.read_csv('../data/txt_features.csv').loc[:,
                ["gvkey", "cik", "cusip", "date", "rf_feature", "mgmt_feature"]
]

txt_features_db.head()

Unnamed: 0,gvkey,cik,cusip,date,rf_feature,mgmt_feature
0,6831.0,16099,549282101,20050103,0.0,6.6e-05
1,11872.0,779544,040712101,20050103,0.0,9.5e-05
2,24783.0,831641,88162G103,20050103,0.0,4.1e-05
3,61721.0,866415,459412102,20050103,0.0,0.008602
4,146117.0,1141240,53634X100,20050103,0.0,0.002048


In [6]:
print(f"txt_features_db dims: {txt_features_db.shape}")
print(f"txt_features_db dims NA dropped: {txt_features_db.dropna().shape}")

txt_features_db dims: (358439, 6)
txt_features_db dims NA dropped: (353294, 6)


In [7]:
# Load the link tables for quant and text data(compustat gvkey flag and SEC EDGAR cik flag respectively)
cik_gvkey_lnktbl = pd.read_csv('../data/linktables/cik_gvkey_linktable_us_only.csv')
cik_gvkey_lnktbl.rename(columns={'datadate':'date'}, inplace=True)
cik_gvkey_lnktbl.head()

Unnamed: 0,gvkey,date,iid,tic,cusip,conm,tpci,cik
0,1003,2005-01-31,1,ANTQ,354100,A.A. IMPORTING CO INC,0,730052.0
1,1004,2005-01-31,1,AIR,361105,AAR CORP,0,1750.0
2,1009,2005-01-31,1,ABSI.1,781104,ABS INDUSTRIES INC,0,313368.0
3,1013,2005-01-31,1,ADCT.1,886309,ADC TELECOMMUNICATIONS INC,0,61478.0
4,1019,2005-01-31,1,AFAP,1038108,AFA PROTECTIVE SYSTEMS INC,0,2668.0


In [8]:
# Load and rectify the global data linktable
date_gvkey_iid_global_lnktbl =pd.read_csv('../data/linktables/date-gvkey_iid_linktable_global.csv')
date_gvkey_iid_global_lnktbl.head() 

  date_gvkey_iid_global_lnktbl =pd.read_csv('../data/linktables/date-gvkey_iid_linktable_global.csv')


Unnamed: 0,fic,gvkey,datadate,iid,conm
0,,5,2006-01-31,01W,
1,,5,2006-02-28,01W,
2,,5,2006-03-31,01W,
3,,5,2006-04-30,01W,
4,,5,2006-05-31,01W,


In [9]:
date_gvkey_iid_global_lnktbl.rename(columns={'datadate':'date'}, inplace=True)

In [10]:
date_gvkey_iid_global_lnktbl['date'] =date_gvkey_iid_global_lnktbl['date'].str.split('-').str.join('')

In [11]:
date_gvkey_iid_global_lnktbl = date_gvkey_iid_global_lnktbl.astype({"date": int})

## Merge quant and text features:

In [12]:
# drop data without NA identifiers and rectify dtypes
rets_db = rets_db.dropna(subset=['gvkey']).astype({"gvkey": int})
txt_features_db = txt_features_db.dropna(subset=['gvkey']).astype({"gvkey": int})
cik_gvkey_lnktbl['date'] = cik_gvkey_lnktbl['date'].str.split('-').str.join('')
cik_gvkey_lnktbl = cik_gvkey_lnktbl.dropna(subset=['cik']).astype({"date": int, "cik": int})

In [13]:
# Display the join keys:
print(f"rets_db links: {set(rets_db.columns).intersection(set(cik_gvkey_lnktbl.columns))}")
print(f"txt_features_db links: {set(txt_features_db.columns).intersection(set(cik_gvkey_lnktbl.columns))}")
print(f"rets_db and txt_features_db join keys: {set(rets_db.columns).intersection(set(txt_features_db.columns))}")
print(f"txt_features_db and date_gvkey_iid_global_lnktbl join keys: {set(txt_features_db.columns).intersection(set(date_gvkey_iid_global_lnktbl.columns))}")

rets_db links: {'gvkey', 'date', 'iid'}
txt_features_db links: {'cik', 'gvkey', 'cusip', 'date'}
rets_db and txt_features_db join keys: {'gvkey', 'date'}
txt_features_db and date_gvkey_iid_global_lnktbl join keys: {'gvkey', 'date'}


In [14]:
# Collect data relevant to the forecasting model:
# Text data are linked to market data with ["gvkey", "cik", "cusip", "date"] keys. 
# Same text data regardless of the iid flag for a given ["gvkey", "cik", "cusip", "date"] key
# US Quant and text data are merged on ["gvkey", "iid","date"] keys.
# Global data linked to returns ["gvkey", "iid", "fic", "date", "conm"].
modeling_data = (rets_db.merge(txt_features_db.merge(cik_gvkey_lnktbl.drop_duplicates(subset=["gvkey", "cik", "cusip", "date"]),
                                                      how='left', 
                                                      on =["gvkey", "cik", "cusip", "date"]),
                                    how='left', 
                                    on =["gvkey", "iid","date"])
                        .loc[:, # re-order the columns in the data frame
    ['date',  'tic', 'conm', 'excntry', 'me', 'rf_feature', 'mgmt_feature', 'stock_ret', 'year', 'month',  'tpci', 'gvkey', 'iid', 'cik', 'cusip', 'id']
                       ].rename(columns={'excntry':'fic', 'stock_ret':'target_ret'})
                        .merge(date_gvkey_iid_global_lnktbl, # link global identifiers
                                how='left',
                                on=["gvkey", "iid", "fic", "date", "conm"])
)

In [15]:
modeling_data.shape

(6400963, 16)

In [16]:
#tpci = 0 → common shares, i.e. the standard tradable equity issue
modeling_data.dropna() 

Unnamed: 0,date,tic,conm,fic,me,rf_feature,mgmt_feature,target_ret,year,month,tpci,gvkey,iid,cik,cusip,id
17590,20050228,XOM,EXXON MOBIL CORP,USA,332886.863760,0.000000,0.000038,0.232171,2005,2,0,4503,01,34088.0,30231G102,crsp_11850
17592,20050228,INCB,INDIANA COMMUNITY BANCORP,USA,101.545876,0.000000,0.000029,0.003570,2005,2,0,16974,01,867493.0,454674102,crsp_11866
17609,20050228,CHFC,CHEMICAL FINANCIAL CORP,USA,933.732762,0.000000,0.000040,-0.105121,2005,2,0,15197,01,814184.0,163731102,crsp_11992
17627,20050228,GR,GOODRICH CORP,USA,4079.367509,0.000000,0.010294,0.079592,2005,2,0,5229,01,42542.0,382388106,crsp_12140
17672,20050228,PEP,PEPSICO INC,USA,90456.577285,0.000000,0.016805,0.002980,2005,2,0,8479,01,77476.0,713448108,crsp_13856
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6322164,20250430,INN,SUMMIT HOTEL PROPERTIES INC,USA,593.920620,0.001738,0.004906,-0.247689,2025,4,0,185396,01,1497645.0,866082100,comp_185396_01
6322205,20250430,SXC,SUNCOKE ENERGY INC,USA,776.038400,0.000224,0.000175,-0.015217,2025,4,0,186773,01,1514705.0,86722A103,comp_186773_01
6322241,20250430,NBHC,NATIONAL BANK HLDGS CORP,USA,1456.364850,0.000270,0.000066,-0.055135,2025,4,0,190963,02,1475841.0,633707104,comp_190963_02
6328714,20250430,BLKB,BLACKBAUD INC,USA,3055.093800,0.000599,0.005578,-0.024335,2025,4,0,260893,01,1280058.0,09227Q100,comp_260893_01


In [18]:
# Load feature labels glossary
factor_glossary = pd.read_csv('../stock_char_appendix/table1.csv')

# Load accounting ratios
acc_ratios = pd.read_csv('../data/acc_ratios.csv')

# Load shortlisted security characteristics
features_short = pd.read_csv('../data/referenced_predictive_factors.csv')
factor_list = features_short['Acronym'].tolist()
factor_list.extend(acc_ratios.Variable.tolist())
factor_list.extend(['date',  'iid', 'id', 'gvkey']  )

In [19]:
# Collect the shorlisted features from the rets data base
df = pl.scan_csv('../MAIN DATA and SUPPORTING CODES/ret_sample.csv'
                 ).select(factor_list
                 ).collect().to_pandas()
df.head()

Unnamed: 0,market_equity,dolvol_126d,turnover_126d,ivol_ff3_21d,ni_ivol,beta_60m,z_score,f_score,netdebt_me,rd_sale,...,op_at,rd_me,be_me,div12m_me,ni_me,sale_me,date,iid,id,gvkey
0,2398.152284,15472220.0,0.005768,0.018152,0.013934,0.650452,0.907301,4.0,1.626597,,...,0.059734,,1.254719,0.013995,-0.025893,1.702724,20050228,01C,comp_001081_01C,1081.0
1,301.116426,170833.4,0.000595,0.012638,0.0072,0.423608,0.676485,3.0,3.594891,,...,0.068392,,1.512289,0.020393,-0.014616,0.956297,20050228,01C,comp_001096_01C,1096.0
2,32.8083,59343.47,0.002271,0.0251,0.095603,1.636192,4.253169,8.0,-0.069556,0.073753,...,0.207169,0.044349,0.329155,,0.063216,0.640234,20050228,02,comp_001117_02,1117.0
3,911.419063,5289732.0,0.006959,0.018742,0.06791,1.92985,2.85421,4.0,0.075822,0.135861,...,0.199529,0.109261,0.366407,,-0.040524,0.804217,20050228,01W,comp_001166_01W,1166.0
4,1099.753789,4953975.0,0.004108,0.010074,0.035117,0.220041,3.600732,6.0,0.021285,,...,0.095411,,0.45472,0.002284,0.031513,0.167404,20050228,01C,comp_001186_01C,1186.0


In [20]:
print(f"modeling data and factor features join keys: {set(modeling_data.columns).intersection(set(factor_list))}")

modeling data and factor features join keys: {'gvkey', 'id', 'date', 'iid'}


In [None]:
# Merge the collected forecasting model data and the shortlisted features
modeling_data = modeling_data.merge(df, how='left', on=['date',  'iid', 'id', 'gvkey'])

In [22]:
modeling_data.head()

Unnamed: 0,date,tic,conm,fic,me,rf_feature,mgmt_feature,target_ret,year,month,...,niq_be,ocf_at,gp_at,niq_at,op_at,rd_me,be_me,div12m_me,ni_me,sale_me
0,20050228,,,CAN,2398.152284,,,-0.143457,2005,2,...,0.051041,0.000749,0.092638,0.018483,0.059734,,1.254719,0.013995,-0.025893,1.702724
1,20050228,,,CAN,301.116426,,,0.028077,2005,2,...,0.007062,0.043963,0.089504,0.001693,0.068392,,1.512289,0.020393,-0.014616,0.956297
2,20050228,,,USA,32.8083,,,-0.168627,2005,2,...,0.04857,0.202308,0.668503,0.032732,0.207169,0.044349,0.329155,,0.063216,0.640234
3,20050228,,,NLD,911.419063,,,0.086271,2005,2,...,0.014161,0.091636,0.350565,0.004722,0.199529,0.109261,0.366407,,-0.040524,0.804217
4,20050228,,,CAN,1099.753789,,,0.149056,2005,2,...,0.017315,0.06565,0.113388,0.016225,0.095411,,0.45472,0.002284,0.031513,0.167404


In [19]:
modeling_data.columns

Index(['date', 'tic', 'conm', 'fic', 'me', 'rf_feature', 'mgmt_feature',
       'target_ret', 'year', 'month', 'tpci', 'gvkey', 'iid', 'cik', 'cusip',
       'id', 'market_equity', 'dolvol_126d', 'turnover_126d', 'ivol_ff3_21d',
       'ni_ivol', 'beta_60m', 'z_score', 'f_score', 'netdebt_me', 'rd_sale',
       'ebit_bev', 'ebit_sale', 'ni_be', 'niq_be', 'ocf_at', 'gp_at', 'niq_at',
       'op_at', 'rd_me', 'be_me', 'div12m_me', 'ni_me', 'sale_me'],
      dtype='object')

In [14]:
pl.from_pandas(modeling_data.loc[:,
    ['gvkey', 'iid', 'id', 'tpci', 'date', 'tic', 'conm', 'fic', 'market_equity', 'dolvol_126d', 'turnover_126d', 'ivol_ff3_21d',
       'ni_ivol', 'beta_60m', 'z_score', 'f_score', 'netdebt_me', 'rd_sale',
       'ebit_bev', 'ebit_sale', 'ni_be', 'niq_be', 'ocf_at', 'gp_at', 'niq_at',
       'op_at', 'rd_me', 'be_me', 'div12m_me', 'ni_me', 'sale_me', 'rf_feature', 'mgmt_feature', 'target_ret', 'year', 'month']
]).write_csv('../data/modeling_data.csv')

In [None]:
modeling_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6400963 entries, 0 to 6400962
Data columns (total 39 columns):
 #   Column         Dtype  
---  ------         -----  
 0   date           int64  
 1   tic            object 
 2   conm           object 
 3   fic            object 
 4   me             float64
 5   rf_feature     float64
 6   mgmt_feature   float64
 7   target_ret     float64
 8   year           int64  
 9   month          int64  
 10  tpci           object 
 11  gvkey          int64  
 12  iid            object 
 13  cik            float64
 14  cusip          object 
 15  id             object 
 16  market_equity  float64
 17  dolvol_126d    float64
 18  turnover_126d  float64
 19  ivol_ff3_21d   float64
 20  ni_ivol        float64
 21  beta_60m       float64
 22  z_score        float64
 23  f_score        float64
 24  netdebt_me     float64
 25  rd_sale        float64
 26  ebit_bev       float64
 27  ebit_sale      float64
 28  ni_be          float64
 29  niq_be        

## Explore text data

In [None]:
# Read the .pkl file
txt_sample = pd.read_pickle("TEXT DATA US by YEAR/2006/text_us_2006.pkl")
txt_sample['date'] = pd.to_datetime(txt_sample.date)
txt_sample = txt_sample.astype({"gvkey": int})
txt_sample.dropna()

Unnamed: 0,date,cik,file_type,rf,mgmt,gvkey,cusip,year
195739,2006-01-03,62234,10Q,,Item 2. \n Management's Discussion and Analysi...,7022,566330106,2006
195740,2006-01-03,890923,10K,Item 1A. Risk Factors\n- -------- ------------...,Item 7. Management's Discussion and Analysis o...,25918,98975W104,2006
195738,2006-01-03,1123360,10Q,,ITEM 2. \n\nMANAGEMENT S DISCUSSION AND ANALYS...,141913,37940X102,2006
195749,2006-01-04,16160,10Q,,Item 2. \n Management s Discussion and Analysi...,64275,128030202,2006
195745,2006-01-04,96879,10Q,,Item 2. \n \n Management s Discussion and Anal...,10391,879131100,2006
...,...,...,...,...,...,...,...,...
213523,2006-12-29,1063085,10K,ITEM 1A. \n\nRISK\n FACTORS \n\n13 Item 1A. \n...,ITEM 7. \n\nMANAGEMENT S\n DISCUSSION AND ANAL...,113361,149016107,2006
213506,2006-12-29,1133062,10K,Item\n 1A. \n\nRisk\n Factors \n\n1.\n We May ...,Item\n 7. \n\nManagement s\n Discussion and An...,147249,47077R109,2006
213509,2006-12-29,1183941,10K,ITEM 1A \n\nRISK FACTORS \n \n 11 ITEM 1A. Ris...,ITEM 7. \n\nMANAGEMENT S DISCUSSION AND ANALYS...,230796,00506P103,2006
213524,2006-12-29,1310094,10K,ITEM 1A. RISK FACTORS Risk Factors This report...,ITEM 7. MANAGEMENT S DISCUSSION AND ANALYSIS O...,162956,00430L103,2006


In [None]:
data_dct['us_cik_gvkey_linktable'][data_dct['us_cik_gvkey_linktable'].tpci=='1']

Unnamed: 0,gvkey,date,iid,tic,cusip,conm,tpci,cik
57,1225,2005-01-31,17,ALP.PN,010392595,ALABAMA POWER CO,1,3153.0
99,1393,2005-01-31,03,AO.PA,023586209,U-HAUL HOLDING CO,1,4457.0
109,1440,2005-01-31,02,AEP.Z,025537200,AMERICAN ELECTRIC POWER CO,1,4904.0
201,1831,2005-01-31,03,ASFZ,046008207,ASSOCIATES FIRST CAP -CL A,1,7974.0
249,2025,2005-01-31,06,BRE.PD,05564E601,BRE PROPERTIES INC,1,1011174.0
...,...,...,...,...,...,...,...,...
498281,157454,2007-05-31,01,BGE.PB,05541Q206,BGE CAPITAL TRUST II,1,1258417.0
498343,157877,2007-05-31,01,PNU,69350H202,PNC CAPITAL TRUST D,1,1060264.0
498880,161933,2007-05-31,86,MER.PN,59022Y840,MERRILL LYNCH & CO (DUPL 5),1,
499627,164561,2007-05-31,02,WRB.PA,08449Q203,BERKLEY (WR) CAP TR II,1,1267530.0


In [None]:
txt_sample.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16553 entries, 195739 to 213529
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   date       16553 non-null  datetime64[ns]
 1   cik        16553 non-null  int64         
 2   file_type  16553 non-null  object        
 3   rf         16553 non-null  object        
 4   mgmt       16553 non-null  object        
 5   gvkey      16553 non-null  int64         
 6   cusip      16553 non-null  object        
 7   year       16553 non-null  int64         
dtypes: datetime64[ns](1), int64(3), object(4)
memory usage: 1.1+ MB


In [None]:
df = (txt_sample.merge(data_dct['us_cik_gvkey_linktable'].drop_duplicates(subset=["gvkey", "cik", "cusip", "date"]),
              how='left', 
              on =["gvkey", "cik", "cusip", "date"])
        ).drop(['cik', 'cusip', 'gvkey', 'iid'], axis=1)

## Inspect the rf and mgmt reports

In [None]:
rf_sample1 = txt_sample.rf.reset_index(drop=True).dropna().iloc[16542]

In [None]:
rf_sample2 = txt_sample.rf.reset_index(drop=True).dropna().iloc[16535]

In [None]:
print('\n'.join(textwrap.fill(para, width=80) for para in rf_sample1.split('\n') if para.strip()))

Item 1A. Risk Factors 37 ITEM 1A. RISK FACTORS.
OUR OPERATIONS ARE SUBJECT TO INTEREST RATE RISK AND VARIATIONS IN INTEREST
RATES MAY NEGATIVELY AFFECT FINANCIAL PERFORMANCE.
 Our earnings and cash flows are largely dependent upon our net interest
income. Net interest income is the difference between interest income earned on
interest-earning assets such as loans and securities and interest expense paid
on interest-bearing liabilities such as deposits and borrowed money. Changes in
the general level of interest rates may have an adverse effect on our business,
financial condition and result of operations. Interest rates are highly
sensitive to many factors that are beyond our control, including general
economic conditions and policies of various governmental and regulatory agencies
and, in particular, the FRB. Changes in monetary policy, including changes in
interest rates, influence the amount of interest income that we receive on loans
and securities and the amount of interest that w

In [None]:
print('\n'.join(textwrap.fill(para, width=80) for para in rf_sample2.split('\n') if para.strip()))

Item 1A. Risk
Factors
 Because of the following
factors, as well as other factors affecting the Company s operating results and
financial condition, past financial performance should not be considered to be
a reliable indicator of future performance, and investors should not use
historical trends to anticipate results or trends in future periods.
 The matters relating to the investigation by the
Special Committee of the Board of Directors and the restatement of the Company s
consolidated financial statements may result in additional litigation and
governmental
enforcement actions.
 On June 29, 2006, the Company announced that an
internal review had discovered irregularities related to the issuance of
certain stock option grants made between 1997 and 2001, including a grant to
its Chief Executive Officer CEO ), Steve Jobs. The Company also announced a
Special Committee of outside directors Special Committee had been formed
and had hired independent counsel to conduct a full investigatio

In [None]:
mgmt_sample1 = txt_sample.mgmt.reset_index(drop=True).dropna().iloc[11]
mgmt_sample2 = txt_sample.mgmt.reset_index(drop=True).dropna().iloc[73]

In [None]:
print('\n'.join(textwrap.fill(para, width=80) for para in mgmt_sample1.split('\n') if para.strip()))

Item 2.
Management s Discussion and Analysis of Financial Condition and Results of
Operations
 12 Item 2. Management s Discussion and Analysis of Financial Condition and
Results of Operations Forward-Looking Statements This discussion contains
forward-looking statements within the meaning of the Private Securities
Litigation Reform Act of 1995. These
statements reflect our current views with respect to future events and financial
performance. The words believe, expect, anticipate, intend, estimate, forecast,
 project, should and similar expressions are intended to identify forward-
looking statements within the meaning of the Private Securities Litigation
Reform Act of 1995. All forecasts and projections in this
document are forward-looking statements, and are based on management s current
expectations or beliefs of the Company s results, based on current information
available pertaining to the Company, including the risk factors noted
below. From time to time, we also may provide oral

In [None]:
print('\n'.join(textwrap.fill(para, width=80) for para in mgmt_sample2.split('\n') if para.strip()))

Item 2 Management s Discussion and Analysis of Financial Condition and Results
of Operations Item 2 Management s Discussion and Analysis of Financial Condition
and Results of Operations
Cautionary Disclosures To Qualify Forward Looking Statements
This report on Form 10-Q includes forward-looking statements within the meaning
of Section
27A of the Securities Act of 1933, as amended, and Section 21E of the Securities
Exchange Act of
1934, as amended. All statements other than statements of historical facts
included in this Form
10-Q, including, without limitation, statements contained in this Management s
Discussion and
Analysis of Financial Condition and Results of Operations and Notes to
Consolidated Financial
Statements located elsewhere in this report regarding our financial position,
business strategy,
plans and objectives of management for future operations, future sales and
industry conditions, are
forward-looking statements. Although we believe that the expectations reflected
in 

# Construct Text data dataframe

In [None]:
txt_db = txt_sample.loc[:,["year", "date", "file_type", "rf", "mgmt", "gvkey", "cusip", "cik" ]].copy()
txt_db.tail()

Unnamed: 0,year,date,file_type,rf,mgmt,gvkey,cusip,cik
213523,2006,2006-12-29,10K,ITEM 1A. \n\nRISK\n FACTORS \n\n13 Item 1A. \n...,ITEM 7. \n\nMANAGEMENT S\n DISCUSSION AND ANAL...,113361,149016107,1063085
213506,2006,2006-12-29,10K,Item\n 1A. \n\nRisk\n Factors \n\n1.\n We May ...,Item\n 7. \n\nManagement s\n Discussion and An...,147249,47077R109,1133062
213509,2006,2006-12-29,10K,ITEM 1A \n\nRISK FACTORS \n \n 11 ITEM 1A. Ris...,ITEM 7. \n\nMANAGEMENT S DISCUSSION AND ANALYS...,230796,00506P103,1183941
213524,2006,2006-12-29,10K,ITEM 1A. RISK FACTORS Risk Factors This report...,ITEM 7. MANAGEMENT S DISCUSSION AND ANALYSIS O...,162956,00430L103,1310094
213529,2006,2006-12-29,10K,Item 1A. Risk Factors 33 ITEM 1A. RISK FACTORS...,Item 7. Management's Discussion and Analysis o...,162576,114039100,1310313


In [None]:
data_dct['global_name_merge'].columns

Index(['fic', 'gvkey', 'date', 'iid', 'conm'], dtype='object')

In [None]:
txt_db.merge(data_dct['global_name_merge'].drop_duplicates(subset=["gvkey", "date"]),
              how='left', 
              on =["gvkey",  "date"], 
              validate="one_to_one").dropna()

Unnamed: 0,year,date,file_type,rf,mgmt,gvkey,cusip,cik,fic,iid,conm
8498,2006,2006-06-30,10Q,,Item 2. Management's Discussion and Analysis o...,13498,143658300,815097,PAN,01W,CARNIVAL CORPORATION & PLC


In [None]:
data_dct['na_name_merge'].columns

Index(['gvkey', 'date', 'iid', 'tic', 'cusip', 'conm', 'cik'], dtype='object')

In [None]:
txt_db.merge(data_dct['na_name_merge'].drop_duplicates(subset=["gvkey", "cik", "cusip", "date"]),
              how='left', on =["gvkey", "cik", "cusip", "date"]).dropna().drop(['cik', 'cusip', 'gvkey', 'iid'], axis=1).reset_index(drop=True)   

Unnamed: 0,year,date,file_type,rf,mgmt,tic,conm
0,2006,2006-01-31,10Q,,Item 2. Management's Discussion and Analysis o...,SCX,STARRETT (L.S.) CO -CL A
1,2006,2006-01-31,10Q,,Item 2. Management s Discussion and Analysis o...,WWD,WOODWARD INC
2,2006,2006-01-31,10Q,,Item 2. \n\nManagement s Discussion and Analys...,CVCO,CAVCO INDUSTRIES INC
3,2006,2006-01-31,10K,,ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS O...,ANIX,ANIXA BIOSCIENCES INC
4,2006,2006-01-31,10Q,Item 1A Risk Factors 16 Item 1A Risk Factors\n...,Item 2 Management's Discussion and Analysis of...,UCIA,UCI MEDICAL AFFILIATES INC
...,...,...,...,...,...,...,...
591,2006,2006-11-30,10K,Item 1A Risk Factors 2 ITEM 1A. RISK FACTORS\n...,Item 7 Management's Discussion and Analysis of...,OFI,OVERHILL FARMS INC
592,2006,2006-11-30,10Q,Item 1A. Risk Factors Item 1A. Risk Factors \n...,Item 2. Management s Discussion and Analysis o...,AVNT,AVIENT CORP
593,2006,2006-11-30,10Q,Item 1A. Risk Factors............................,Item 2. Management's Discussion and Analysis o...,CONNQ,CONN'S INC
594,2006,2006-11-30,10Q,ITEM 1A. \n \n RISK FACTORS We have disclosed ...,ITEM 2.\n MANAGEMENT S DISCUSSION AND ANALYSIS...,EXLS,EXLSERVICE HOLDINGS INC


In [None]:
data_dct['us_cik_gvkey_linktable'].columns

Index(['gvkey', 'date', 'iid', 'tic', 'cusip', 'conm', 'tpci', 'cik'], dtype='object')

In [None]:
txt_db.merge(data_dct['us_cik_gvkey_linktable'].drop_duplicates(subset=["gvkey", "cik", "cusip", "date"]),
              how='left', 
              on =["gvkey", "cik", "cusip", "date"]).dropna().drop(['cik', 'cusip', 'gvkey', 'iid'], axis=1).reset_index(drop=True).iloc[300:400]  

Unnamed: 0,year,date,file_type,rf,mgmt,tic,conm,tpci
300,2006,2006-03-31,10K,Item 1A. \n\nRisk Factors \n\n20 ITEM 1A. RISK...,Item 7. \n\nManagement s Discussion and Analys...,SNDA,SONIDA SENIOR LIVING INC,0
301,2006,2006-03-31,10K,ITEM 1A. \n\nRisk Factors \n \n 6 Item 1A. Ris...,ITEM 7. \n\nManagement s Discussion and Analys...,VERT,VERTICALNET INC,0
302,2006,2006-03-31,10K,Item 1A. \n\nRisk\n Factors \n\nWe\n have a hi...,Item 7. \n\nManagement s\n Discussion and Anal...,ENHT,ENHERENT CORP,0
303,2006,2006-03-31,10K,Item\n 1A. Risk Factors \n\nAn\n investment in...,Item\n 7. Management's Discussion and Analysis...,GFED,GUARANTY FED BANCSHARES INC,0
304,2006,2006-03-31,10K,Item 1A. \n\nRisk Factors \n \n 15 ITEM 1A. RI...,Item 7. \n\nManagement s Discussion and Analys...,OMNI,OMNI ENERGY SERVICES CORP,0
...,...,...,...,...,...,...,...,...
395,2006,2006-03-31,10K,Item 1A \n \n Risk Factors \n\n40 Item 1A. RIS...,Item 7 \n \n Management s Discussion and Analy...,BKD,BROOKDALE SENIOR LIVING INC,0
396,2006,2006-03-31,10K,Item 1A \n\nRisk Factors \n\n14 Item 1A. \n \n...,Item 7 \n\nManagement s Discussion and Analysi...,CBOU,CARIBOU COFFEE CO,0
397,2006,2006-03-31,10K,Item 1A \n\nRisk Factors \n\n7 Item 1A. \n\nRi...,Item 7 \n\nManagement s\n Discussion and Analy...,CSA.3,COGDELL SPENCER INC,0
398,2006,2006-03-31,10K,Item 1A Risk Factors \n\n16 Item 1A. Risk Fact...,Item 7 Management s Discussion and Analysis of...,TCMIQ,TRIPLE CROWN MEDIA INC,0


🔹 cik

Meaning: Central Index Key.

Issued by the SEC’s EDGAR system to uniquely identify companies and individuals who file disclosure documents with the SEC.

Example: Apple’s CIK = 0000320193.

Used to link Compustat firms (gvkey) with SEC filings (10-K, 10-Q).

🔹 tpci

Meaning: Trading Public Company Indicator (Compustat variable).

A flag indicating the trading status of the security or company.

Values (from WRDS / Compustat docs):

"0" = Not a public company / not trading

"1" = Publicly traded company

Sometimes extended values exist depending on dataset vintage.

🔹 In your us_cik_gvkey_linktable context

gvkey → Compustat’s unique firm identifier.

cik → SEC’s identifier (so you can join to EDGAR filings).

tpci → Indicator whether the gvkey is linked to a publicly traded company (helps filter out private or non-trading entities when consolidating).

✅ So practically:

Use cik when you want to pull SEC filings for the firm.

Use tpci to filter down to actively traded companies (exclude stale or private).




🔹 cusip

Meaning: Committee on Uniform Securities Identification Procedures number.

It’s a 9-character alphanumeric identifier assigned to U.S. and Canadian securities (stocks and bonds).

Structure:

First 6 = issuer code.

Next 2 = issue identifier.

Last 1 = check digit.

In Compustat: lets you link to market data (CRSP, Bloomberg, etc.) at the security level.

Example: Apple Inc. common stock = 037833100.

🔹 iid

Meaning: Issue Identifier (Compustat).

Distinguishes different securities (issues) for the same firm (gvkey).

Example: A company might have common stock, preferred stock, bonds — all share the same gvkey but different iid.

Together, (gvkey, iid) pinpoints a unique security within a firm.

🔹 fic

Meaning: Foreign Incorporation Code.

Indicates the country of incorporation of the firm.

It’s a 3-character ISO country code (e.g., USA, CAN, GBR).

Useful when consolidating across global datasets or filtering U.S. vs foreign companies.

✅ In summary, in this context:

cusip → security identifier, links to markets.

iid → security issue within the firm.

fic → firm’s country of incorporation.