# Text Analysis Workbook for Bankrupcy Project

#### Notes:
- Explore/select features

### Initialize Libaries, Data, & Structures

In [1]:
# load libaries
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from statsmodels.api import OLS, add_constant
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import roc_auc_score
from sklearn.metrics import plot_roc_curve
import statsmodels.api as sm 
import pandas as pd
import numpy as np

### Full File Load & Merge

In [268]:
# load linking data
links = pd.read_csv('firm-links.csv')
links = links[['LPERMNO','cik','datadate']]
links['date_month'] = pd.to_datetime(links['datadate']).dt.to_period('M')
links = links.rename(columns={'cik':'CIK','LPERMNO':'PERMNO'})
links = links[['PERMNO','CIK','date_month']]

# load firm data
#data = pd.read_csv('data-clean.csv', low_memory=False)
data = pd.read_csv('data-exclude.csv', low_memory=False)
data['PRC'] = np.where(data['PRC'] >= 15, 15, data['PRC'])
data['year'] = pd.to_datetime(data['date_month']).dt.to_period('Y')
data['date_month'] = pd.to_datetime(data['date_month']).dt.to_period('M')

# define constants & structures
y_col = 'financialDistress_12_periods'

In [269]:
# load text data
text = pd.read_csv('text-data.csv', encoding='latin-1')
text['date_month'] = pd.to_datetime(text['FDATE'], format='%Y%m%d').dt.to_period('M')
target_forms = ['10-K','10-Q']#,'8-K']
text = text[text.Form.isin(target_forms)].reset_index(drop=True)
text.Form.value_counts()

10-Q    587983
10-K    191743
Name: Form, dtype: int64

### Merge Datasets & Save Text + Original Data

In [270]:
print('Size pre-cik merge:', data[data['date_month'].between('1994-01','2018-12')].shape)
print('Data date range:',data['date_month'].min(), data['date_month'].max())
print('Text date range:',text['date_month'].min(), text['date_month'].max())
print('Link date range:',links['date_month'].min(), links['date_month'].max(),'\n')

# merge linking information (PERMNO to CIK)
d = data[data['date_month'].between('1994-01','2018-12')]
merged = d.merge(links, on=['PERMNO','date_month'], how='left')
print('Size post-cik merge:', merged.shape)
print('Cik nan count:', merged.CIK.isna().sum())
print('Date range:',merged['date_month'].min(), merged['date_month'].max(),'\n')

# merge text data (on CIK, date_month)
merged = merged.merge(text, on=['CIK','date_month'], how='left')
print('Size post-text merge:', merged.shape)
print('Cik nan count:', merged.CIK.isna().sum())
print('Date range:', merged['date_month'].min(), merged['date_month'].max(),'\n')

# reduce firm-months w/ multipule reportings
m = merged.groupby(['date_month','PERMNO']).mean()
m['doc_count'] = merged.groupby(['date_month','PERMNO']).count()['RET']
m['doc_count'] = np.where(m['FinTerms_Litigious'].isna(),0,m['doc_count'])
m = m.reset_index().drop(columns=['CIK','FDATE'])
print('Final size:', m.shape)
vc = m['financialDistress_12_periods'].value_counts(dropna=False)
print('Final distress count:', vc[1], '/', (vc[0]+vc[1]))
print('Final date range:', m['date_month'].min(), m['date_month'].max())

Size pre-cik merge: (823090, 17)
Data date range: 1972-02 2018-11
Text date range: 1994-01 2019-09
Link date range: 1990-01 2020-12 

Size post-cik merge: (823090, 18)
Cik nan count: 527803
Date range: 1994-01 2018-11 

Size post-text merge: (823709, 47)
Cik nan count: 527803
Date range: 1994-01 2018-11 

Final size: (823090, 43)
Final distress count: 25906 / 823090
Final date range: 1994-01 2018-11


In [271]:
# replace nans with retroactive 
cols = ['Gunning_Fog_Index','ARI','SMOG_Index','ParagraphCount','CharCount','WordCount',
        'ComplexWordCount','SentenceCount','averageWordsPerParagraph','CharCountTokens',
        'FinTerms_Negative','FinTerms_ModalWeak_count','FinTerms_ModalStrong_count', 
        'FinTerms_Positive','HarvardIV_Negative','FinTerms_Litigious_count','FinTerms_Uncertainty_count',
        'FinTerms_ModalWeak','FinTerms_Litigious','LM_Master_Dictionary','FinTerms_Uncertainty', 
        'LM_Master_Dictionary_count','HarvardIV_Negative_count','FinTerms_Negative_count',
        'FinTerms_ModalStrong','FinTerms_Positive_count']

m2 = m.sort_values(['PERMNO','date_month']).reset_index(drop=True)
mask = (m2.PERMNO == m2.PERMNO.shift(1)) & (m2.WordCount.isna())
for col in cols[:]: m2[col] = np.where(mask, m2[col].shift(1), m2[col])
mask = (m2.PERMNO == m2.PERMNO.shift(1)) & (m2.WordCount.isna()) 
for col in cols[:]: m2[col] = np.where(mask, m2[col].shift(1), m2[col])
m2 = m2.replace(np.nan,0) # interaction variable btw presence & magnitude of report

In [272]:
#m2.to_csv('data-exclude-text.csv',index=False)
#m2.to_csv('data-clean-text.csv',index=False)

### Quick File Load

In [273]:
# load firm data & define constants & structures
#merged = pd.read_csv('data-clean-text.csv', low_memory=False)
merged = pd.read_csv('data-exclude-text.csv', low_memory=False)
merged['year'] = pd.to_datetime(merged['date_month']).dt.to_period('Y')
merged['date_month'] = pd.to_datetime(merged['date_month']).dt.to_period('M')
y_col = 'financialDistress_12_periods'
cols = list(merged.columns[16:-1])

### Feature Selection

In [279]:
# correlation analysis
cols = list(merged.columns[16:-1])
X = merged[cols]
corr_mat = X.corr()
corr_mat[(abs(corr_mat) > 0.75) & (abs(corr_mat) < 1.0)]
# ARI vs Gunning_Fog_Index
# count, sentiment, readibility predictors

Unnamed: 0,Gunning_Fog_Index,ARI,SMOG_Index,ParagraphCount,CharCount,WordCount,ComplexWordCount,SentenceCount,averageWordsPerParagraph,CharCountTokens,...,FinTerms_Uncertainty_count,FinTerms_ModalWeak,FinTerms_Litigious,LM_Master_Dictionary,FinTerms_Uncertainty,LM_Master_Dictionary_count,HarvardIV_Negative_count,FinTerms_Negative_count,FinTerms_ModalStrong,FinTerms_Positive_count
Gunning_Fog_Index,,0.99784,0.997044,,,,,,,,...,,0.76981,0.785764,0.988792,0.905852,,,,0.841717,
ARI,0.99784,,0.993949,,,,,,,,...,,0.757309,0.779013,0.987241,0.894815,,,,0.831968,
SMOG_Index,0.997044,0.993949,,,,,,,,,...,,0.77332,0.785686,0.996062,0.912155,,,,0.84688,
ParagraphCount,,,,,0.9499,0.94455,0.940432,0.922887,,0.948228,...,0.854563,,,,,0.930822,0.90435,0.868687,,0.886456
CharCount,,,,0.9499,,0.999085,0.996988,0.97069,,0.999769,...,0.9269,,,,,0.993456,0.975258,0.945661,,0.95542
WordCount,,,,0.94455,0.999085,,0.997308,0.972016,,0.999264,...,0.927615,,,,,0.996935,0.977938,0.950502,,0.957789
ComplexWordCount,,,,0.940432,0.996988,0.997308,,0.97849,,0.997944,...,0.940136,,,,,0.995626,0.981099,0.950553,,0.964249
SentenceCount,,,,0.922887,0.97069,0.972016,0.97849,,,0.973451,...,0.948016,,,,,0.974651,0.974912,0.918212,,0.955409
averageWordsPerParagraph,,,,,,,,,,,...,,,,,,,,,,
CharCountTokens,,,,0.948228,0.999769,0.999264,0.997944,0.973451,,,...,0.931682,,,,,0.99449,0.977766,0.947265,,0.958232


In [280]:
# For each X, calculate VIF and save in dataframe
vifcols = ['CharCountTokens','WordCount','CharCount','SMOG_Index','Gunning_Fog_Index',
           'LM_Master_Dictionary_count','ARI','LM_Master_Dictionary','ComplexWordCount',
           'HarvardIV_Negative_count','SentenceCount','FinTerms_Uncertainty_count',
           'FinTerms_Negative_count','FinTerms_Uncertainty','FinTerms_ModalWeak_count',
           'HarvardIV_Negative','FinTerms_Positive_count','ParagraphCount','FinTerms_Negative']
X2 = merged[cols].dropna()
X2 = X2.drop(columns=vifcols)
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(X2.values, i) for i in range(X2.shape[1])]
vif["features"] = X2.columns

# Inspect VIF scores
vifcols = list(vif.features)
vif = vif.round(1).sort_values('VIF Factor', ascending=False).reset_index(drop=True)
vif

Unnamed: 0,VIF Factor,features
0,5.2,FinTerms_ModalStrong
1,4.8,FinTerms_Positive
2,4.2,FinTerms_ModalStrong_count
3,3.8,FinTerms_Litigious
4,3.5,FinTerms_Litigious_count
5,3.3,FinTerms_ModalWeak
6,1.0,averageWordsPerParagraph


In [281]:
# isolate just text and 
cols = ['financialDistress_12_periods','date_month'] + list(text.columns[4:-1])
merged2 = merged[cols].dropna()

# split data into test, train
dropcols = [y_col, 'date_month']
X_train = merged2[merged2['date_month'] < '2010-1']
X_test = merged2[merged2['date_month'] >= '2010-1']
y_train = X_train[y_col]
y_test = X_test[y_col]
X_train, X_test = X_train.drop(columns=dropcols), X_test.drop(columns=dropcols)
print(X_train.shape, X_test.shape)

(628669, 26) (194421, 26)


### Evaluate Models

In [282]:
# build model and fit on VIF features
cols = vif.features.tolist()
X = X_train[cols].copy()
scl = StandardScaler()
X = scl.fit_transform(X)
X = sm.add_constant(X)
log_reg = sm.Logit(y_train, X).fit()

# printing the summary table 
cols = ['constant'] + cols
print(log_reg.summary(xname=cols))

Optimization terminated successfully.
         Current function value: 0.153525
         Iterations 8
                                Logit Regression Results                                
Dep. Variable:     financialDistress_12_periods   No. Observations:               628669
Model:                                    Logit   Df Residuals:                   628661
Method:                                     MLE   Df Model:                            7
Date:                          Thu, 22 Apr 2021   Pseudo R-squ.:                0.007273
Time:                                  13:16:23   Log-Likelihood:                -96516.
converged:                                 True   LL-Null:                       -97224.
Covariance Type:                      nonrobust   LLR p-value:                3.126e-301
                                 coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------

In [234]:
# build model and fit on LASSO features
cols = lasso_train.columns.tolist()
X = X_train[cols].copy()
scl = StandardScaler()
X = scl.fit_transform(X)
X2 = scl.transform(X_test[cols])
X = sm.add_constant(X)
log_reg = sm.Logit(y_train, X).fit()

# printing the summary table 
cols = ['constant'] + cols
print(log_reg.summary(xname=cols))

Optimization terminated successfully.
         Current function value: 0.131760
         Iterations 10
                                Logit Regression Results                                
Dep. Variable:     financialDistress_12_periods   No. Observations:               764180
Model:                                    Logit   Df Residuals:                   764175
Method:                                     MLE   Df Model:                            4
Date:                          Wed, 21 Apr 2021   Pseudo R-squ.:                0.008633
Time:                                  23:34:15   Log-Likelihood:            -1.0069e+05
converged:                                 True   LL-Null:                   -1.0156e+05
Covariance Type:                      nonrobust   LLR p-value:                     0.000
                                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------

In [323]:
# build model and fit on Significant features
cols = o.index.tolist()
X = X_train[cols].copy()
scl = StandardScaler()
X = scl.fit_transform(X)
X2 = scl.transform(X_test[cols])
X = sm.add_constant(X)
log_reg = sm.Logit(y_train, X).fit()

# printing the summary table 
cols = ['constant'] + cols
print(log_reg.summary(xname=cols))

Optimization terminated successfully.
         Current function value: 0.153765
         Iterations 8
                                Logit Regression Results                                
Dep. Variable:     financialDistress_12_periods   No. Observations:               628669
Model:                                    Logit   Df Residuals:                   628665
Method:                                     MLE   Df Model:                            3
Date:                          Thu, 22 Apr 2021   Pseudo R-squ.:                0.005724
Time:                                  14:59:16   Log-Likelihood:                -96667.
converged:                                 True   LL-Null:                       -97224.
Covariance Type:                      nonrobust   LLR p-value:                5.463e-241
                                 coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------

### Single Stepwise Feature Selection

In [283]:
# split data into test, train
dropcols = [y_col,'date_month']
cols = ['NITA','NIMTA','TLTA','TLMTA','CASHMTA','EXRET','RSIZE','PRC','SIGMA','SIGMA_TLTA',
        'Gunning_Fog_Index','ARI','SMOG_Index','ParagraphCount','CharCount','WordCount',
        'ComplexWordCount','SentenceCount','averageWordsPerParagraph','CharCountTokens',
        'FinTerms_Negative','FinTerms_ModalWeak_count','FinTerms_ModalStrong_count', 
        'FinTerms_Positive','HarvardIV_Negative','FinTerms_Litigious_count','FinTerms_Uncertainty_count',
        'FinTerms_ModalWeak','FinTerms_Litigious','LM_Master_Dictionary','FinTerms_Uncertainty', 
        'LM_Master_Dictionary_count','HarvardIV_Negative_count','FinTerms_Negative_count',
        'FinTerms_ModalStrong','FinTerms_Positive_count'] + dropcols
merged2 = merged[cols].dropna()
X_train = merged2[merged2['date_month'] < '2010-1']
X_test = merged2[merged2['date_month'] >= '2010-1']
y_train = X_train[y_col]
y_test = X_test[y_col]
X_train, X_test = X_train.drop(columns=dropcols), X_test.drop(columns=dropcols)
print(X_train.shape, X_test.shape)

(628669, 36) (194421, 36)


In [284]:
# define individual test function
def test(cols):
    # scale data
    scl = StandardScaler()
    X = scl.fit_transform(X_train[cols])
    X2 = scl.transform(X_test[cols])

    # test logit model
    lgt = LogisticRegression(class_weight='balanced', random_state=10, max_iter=10000)
    lgt.fit(X, y_train)
    y_pred = lgt.predict(X2)
    y_prob = pd.DataFrame(lgt.predict_proba(X2))[0]
    
    # statsmodels for easy beta and r2
    log_reg = sm.Logit(y_train, X).fit()
    log_r2 = float(log_reg.summary2().tables[0][3][0])
    log_beta = float(log_reg.summary2().tables[1]['Coef.'][-1])
    log_z = float(log_reg.summary2().tables[1]['z'][-1])
    
    # statsmodels for easy beta and r2
    reg = OLS(y_train, X).fit()
    reg_r2 = float(reg.summary2().tables[0][3][0])
    reg_beta = float(reg.summary2().tables[1]['Coef.'][-1])
    reg_t = float(reg.summary2().tables[1]['t'][-1])

    # get performance
    acc = lgt.score(X2, y_test)
    cr = classification_report(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)
    prf = precision_recall_fscore_support(y_test, y_pred)
    output =  [reg_r2, reg_beta, reg_t, log_r2, log_beta, log_z, acc, auc, prf[0][1],
            prf[1][1], prf[2][1], prf[3][1], cr]
    return output

# define stepwise selection function
def stepwise(start_cols, test_cols):
    out = pd.DataFrame()
    if start_cols != []: out['none'] = test(start_cols)
    else: out['none'] = 0
    for col in test_cols:
        cols = start_cols + [col]
        out[col]  = test(cols)

    # final output processing
    out.index = ['reg_r2','reg_beta','reg_t','log_r2','log_beta','log_z','accuracy','auc',
                 'percision','recall','fscore','support','CR']
    out.loc[['reg_beta','reg_t','log_beta','log_z'],'none'] = 0
    return out.T

In [319]:
# test VIF features
add = [] # ['FinTerms_Litigious_count']
start_cols = ['NITA','TLTA','CASHMTA','EXRET','RSIZE','PRC','SIGMA','SIGMA_TLTA'] + add
test_cols = ['Gunning_Fog_Index','ARI','SMOG_Index','ParagraphCount','CharCount','WordCount',
        'ComplexWordCount','SentenceCount','averageWordsPerParagraph','CharCountTokens',
        'FinTerms_Negative','FinTerms_ModalWeak_count','FinTerms_ModalStrong_count', 
        'FinTerms_Positive','HarvardIV_Negative','FinTerms_Litigious_count','FinTerms_Uncertainty_count',
        'FinTerms_ModalWeak','FinTerms_Litigious','LM_Master_Dictionary','FinTerms_Uncertainty', 
        'LM_Master_Dictionary_count','HarvardIV_Negative_count','FinTerms_Negative_count',
        'FinTerms_ModalStrong','FinTerms_Positive_count']
test_cols = vif.features.tolist()
#[test_cols.remove(a) for a in add]
#out = stepwise(start_cols, test_cols[:])
cols = ['reg_r2','reg_beta','reg_t','log_beta','log_z']
o = out[cols].apply(abs).sort_values('reg_beta', ascending=False)
o = o[o.reg_t >= 2]
o

Unnamed: 0,reg_r2,reg_beta,reg_t,log_beta,log_z
FinTerms_Positive,0.113,0.00129108,5.61571,0.00558541,2.13159
FinTerms_Litigious,0.113,0.00118197,5.17313,0.00358446,1.37507
FinTerms_ModalStrong_count,0.113,0.00105666,4.66689,0.00140006,0.540576


In [326]:
merged[o.index].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
FinTerms_Positive,823090.0,0.001838,0.003347,0.0,0.0,0.0,0.003222,0.030519
FinTerms_Litigious,823090.0,0.002478,0.005468,0.0,0.0,0.0,0.002786,0.079869
FinTerms_ModalStrong_count,823090.0,7.264345,24.165918,0.0,0.0,0.0,1.0,1715.0


In [320]:
# stepwise test as lone feature
#out_lone = stepwise([], test_cols[:])
cols = ['reg_r2','reg_beta','reg_t','log_beta','log_z']
ol = out_lone[cols].apply(abs).sort_values('log_beta', ascending=False)
ol = ol[ol.reg_t >= 2]
ol

Unnamed: 0,reg_r2,reg_beta,reg_t,log_beta,log_z
FinTerms_Positive,0.001,0.00669002,28.0176,0.0267674,10.6074
FinTerms_Litigious,0.001,0.00479209,20.063,0.0191793,7.597
FinTerms_ModalWeak,0.0,0.00392343,16.4245,0.0156999,6.22044
FinTerms_ModalStrong,0.0,0.00307376,12.8665,0.0122964,4.87401
FinTerms_Litigious_count,0.0,0.0010122,4.23649,0.00405328,1.60425
FinTerms_ModalStrong_count,0.0,0.000768463,3.21632,0.00307434,1.21851


### VIF for Full Features

In [275]:
# For each X, calculate VIF and save in dataframe
cols = ['NITA','NIMTA','TLTA','TLMTA','CASHMTA','EXRET','RSIZE','PRC','SIGMA','SIGMA_TLTA',
        'Gunning_Fog_Index','ARI','SMOG_Index','ParagraphCount','CharCount','WordCount',
        'ComplexWordCount','SentenceCount','averageWordsPerParagraph','CharCountTokens',
        'FinTerms_Negative','FinTerms_ModalWeak_count','FinTerms_ModalStrong_count', 
        'FinTerms_Positive','HarvardIV_Negative','FinTerms_Litigious_count','FinTerms_Uncertainty_count',
        'FinTerms_ModalWeak','FinTerms_Litigious','LM_Master_Dictionary','FinTerms_Uncertainty', 
        'LM_Master_Dictionary_count','HarvardIV_Negative_count','FinTerms_Negative_count',
        'FinTerms_ModalStrong','FinTerms_Positive_count']
vifcols = ['CharCountTokens','CharCount','WordCount','SMOG_Index','Gunning_Fog_Index',
           'LM_Master_Dictionary_count','ComplexWordCount','ARI','LM_Master_Dictionary',
           'HarvardIV_Negative_count','SentenceCount','FinTerms_Uncertainty_count',
           'HarvardIV_Negative','FinTerms_Negative_count','SentenceCount','FinTerms_Uncertainty', 
           'FinTerms_Positive_count','FinTerms_ModalWeak_count','NIMTA','TLMTA','RSIZE',
           'FinTerms_Negative','ParagraphCount','FinTerms_ModalStrong_count']
#cols = ['NITA','TLTA','CASHMTA','EXRET','PRC','SIGMA','SIGMA_TLTA'] # 'RSIZE'
X2 = merged[cols].dropna()
X2 = X2.drop(columns=vifcols)
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(X2.values, i) for i in range(X2.shape[1])]
vif["features"] = X2.columns

# Inspect VIF scores
vifcols = list(vif.features)
vif = vif.round(1).sort_values('VIF Factor', ascending=False).reset_index(drop=True)
vif

Unnamed: 0,VIF Factor,features
0,5.0,FinTerms_Positive
1,4.3,FinTerms_ModalStrong
2,3.5,FinTerms_Litigious
3,3.1,FinTerms_ModalWeak
4,2.9,TLTA
5,2.6,SIGMA_TLTA
6,1.7,PRC
7,1.6,SIGMA
8,1.6,FinTerms_Litigious_count
9,1.4,CASHMTA


In [336]:
print(vif.features.tolist())

['PRC', 'TLTA', 'FinTerms_Litigious', 'FinTerms_Positive', 'SIGMA_TLTA', 'FinTerms_ModalWeak_count', 'SIGMA', 'FinTerms_Litigious_count', 'FinTerms_Negative', 'FinTerms_ModalWeak', 'FinTerms_ModalStrong', 'CASHMTA', 'NITA', 'EXRET', 'averageWordsPerParagraph']


### Raw SEC Data Merge

In [36]:
# load linking data
links = pd.read_csv('firm-links.csv')
links = links[['LPERMNO','cik','datadate']]
links['date_month'] = pd.to_datetime(links['datadate']).dt.to_period('M')
links = links.rename(columns={'LPERMNO':'PERMNO'})
links = links[['PERMNO','cik','date_month']]

In [34]:
# load firm data
data = pd.read_csv('data-clean.csv', low_memory=False)
#data = pd.read_csv('data-exclude.csv', low_memory=False)
data['PRC'] = np.where(data['PRC'] >= 15, 15, data['PRC'])
data['year'] = pd.to_datetime(data['date_month']).dt.to_period('Y')
data['date_month'] = pd.to_datetime(data['date_month']).dt.to_period('M')

In [37]:
# load text data
text = pd.read_csv('sec_report_links.csv', encoding='latin-1')
text['date_month'] = pd.to_datetime(text['secpdate'], format='%Y%m%d').dt.to_period('M')
text.form.value_counts()

8-K     1790579
10-Q     719149
10-K     247967
Name: form, dtype: int64

In [39]:
# merge process
print('Size pre-cik merge:', data[data['date_month'].between('1995-01','2018-12')].shape)
print('Date range:',data['date_month'].min(), data['date_month'].max(),'\n')

d = data[data['date_month'].between('1995-01','2018-12')]
merged = d.merge(links, on=['PERMNO','date_month'], how='left')
print('Size post-cik merge:', merged.shape)
print('Cik nan count:', merged.cik.isna().sum())
print('Date range:',merged['date_month'].min(), merged['date_month'].max(),'\n')

merged = merged.merge(text, on=['cik','date_month'], how='left')
print('Size post-text merge:', merged.shape)
print('Cik nan count:', merged.cik.isna().sum())
print('Date range:', merged['date_month'].min(), merged['date_month'].max(),'\n')

Size pre-cik merge: (1065641, 17)
Date range: 1970-02 2018-12 

Size post-cik merge: (1065641, 18)
Cik nan count: 634851
Date range: 1995-01 2018-12 

Size post-text merge: (1237690, 25)
Cik nan count: 634851
Date range: 1995-01 2018-12 



In [41]:
merged.to_csv('data-text-links.csv',index=False)

In [42]:
merged

Unnamed: 0,date_month,PERMNO,RET,RET-shifted,MKTCAP,NITA,NIMTA,TLTA,TLMTA,CASHMTA,...,year,SIGMA_TLTA,cik,fdate,form,wrdsfname,fsize,doccount,fname,secpdate
0,1995-01,10001,-0.031250,-0.026210,1.723600e+04,-0.000220,-0.000022,0.012525,0.001273,0.000028,...,1995,0.007149,43350.0,,,,,,,
1,1995-02,10001,-0.026210,0.006377,1.678426e+04,0.000469,0.000048,0.013574,0.001381,0.000035,...,1995,0.008103,43350.0,,,,,,,
2,1995-03,10001,0.006377,0.000000,1.683000e+04,0.000468,0.000048,0.013538,0.001378,0.000035,...,1995,0.007774,43350.0,,,,,,,
3,1995-04,10001,0.000000,0.050000,1.683000e+04,0.000468,0.000048,0.013538,0.001378,0.000035,...,1995,0.007479,43350.0,,,,,,,
4,1995-05,10001,0.050000,0.060317,1.767150e+04,0.000581,0.000059,0.011584,0.001177,0.000003,...,1995,0.007466,43350.0,19950515.0,10-Q,000004/43350/0000043350-95-000003.txt,44265.0,2.0,edgar/data/43350/0000043350-95-000003.txt,19950530.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1237685,2018-07,93436,-0.130660,0.011806,5.086060e+07,-0.000139,-0.000014,0.004215,0.000424,0.000055,...,2018,0.002156,,,,,,,,
1237686,2018-08,93436,0.011806,-0.122290,5.146108e+07,-0.000139,-0.000014,0.004377,0.000440,0.000046,...,2018,0.002416,,,,,,,,
1237687,2018-09,93436,-0.122290,0.274011,4.542871e+07,-0.000157,-0.000016,0.004954,0.000498,0.000052,...,2018,0.003075,,,,,,,,
1237688,2018-10,93436,0.274011,0.039013,5.792898e+07,-0.000123,-0.000012,0.003890,0.000391,0.000041,...,2018,0.002847,,,,,,,,


In [None]:
# final processing
m = merged.groupby(['date_month','PERMNO']).mean()
m['doc_count'] = merged.groupby(['date_month','PERMNO']).count()['RET']
m['doc_count'] = np.where(m['FinTerms_Litigious'].isna(),0,m['doc_count'])
m = m.reset_index().drop(columns=['CIK','FDATE'])
m = m.replace(np.nan,0) # interaction variable btw presence & magnitude of report
print('Final size:', m.shape)
vc = m['financialDistress_12_periods'].value_counts(dropna=False)
print('Final distress count:', vc[1], '/', (vc[0]+vc[1]))
print('Final date range:', m['date_month'].min(), m['date_month'].max())