In [1]:
import pandas as pd
import numpy as np
import os
from tqdm.notebook import tqdm     # progress bar on loops
from NEAR_regex import NEAR_regex  # copy this file into the asgn folder
from bs4 import BeautifulSoup
import re

## Download the Compustat-EDGAR merged data

This dataset has 2007 and 2008 data for each of 169 firms. The 2007 observations have variables to help find the corresponding 10-K on EDGAR. 

In [2]:
# download and store the input file locally. only do this once, thereafter just load it...
    
ccm_path = "input/ccm_and_edgarinfo.dta"
ccm = pd.read_stata(ccm_path)
ccm = ccm.drop(labels=['level_0','index'],axis=1)    

## Defining the searches

### Tax risk exposure

Technology firms are often involved in large amounts of creative accounting to reduce tax bills. To identify when a firm is negatively exposed to possible tax changes, I look for a firm mentioning a "risk term" near "tax" (or similar) and "changes". 

**HIT:** "A change to tax policies could negatively affect profits"

**NOT A HIT:** "A change to tax policies is likely"

In [3]:
# this will look for mentions with 25 word gaps maximum
tax_risks = ['(risk|risks|could harm|negative|negatively|uncertain)',
            '(tax|taxes|taxation)',
            '(change|new|law|policy|policies|regulation|regulations)']

### Tariffs

Technology firms often ship product across international borders. To identify when a firm is negatively exposed to possible tariff changes, I look for a firm mentioning a "risk term" near "tariff" (or similar) and "changes". 

**HIT:** "A change to tariff policies could negatively affect profits"

**NOT A HIT:** "A change to tariff policies is likely"

In [4]:
# this will look for mentions with 25 word gaps maximum
tariff_search = ['(risk|risks|could harm|negative|negatively)',
                '(tariff|tariffs)',
                '(change|new|law|policy|policies|regulation|regulations)']

### Financial constraints

Technology firms tend to be younger and smaller than other public firms. According to published research, young and small firms also tend to be financially constrained.

Following [Hoberg and Maksimovic](https://poseidon01.ssrn.com/delivery.php?ID=875082005085007108066003027097109092018052053087053016092066101124083072025114105026038106063111031098097099020098001110068066029018023080043026109080070118114124088008042110092095070091123122124087109120115122022004003119096075106076087081087092093&EXT=pdf), I define firms as financially constrained if a firm discusses "curtailing" near "investment". The full lists, below, come from the paper.

In [5]:
# this list comes from page 9 of the WP version of Hoberg and Maksimovic (link above)

# allow for partial matches and a max gap of 25 (they use 12, but our text is messier)
fin_constraints = ['(delay|abandon|eliminate|curtail|scale back|postpone)',
                   '(construction|expansion|acquisition|restructuring|project|research|development|exploration|product|expenditure|manufactur|entry|renovat|growth|activities|capital improvement|capital spend|capital proj|commercial release|business plan|transmitter deployment|opening restaurants)' ]

### Proprietary Information Leak Risk

A crucial task for technology firms is protecting their IP. Following [Hoberg and Maksimovic](https://poseidon01.ssrn.com/delivery.php?ID=875082005085007108066003027097109092018052053087053016092066101124083072025114105026038106063111031098097099020098001110068066029018023080043026109080070118114124088008042110092095070091123122124087109120115122022004003119096075106076087081087092093&EXT=pdf) again, I define firms worried about IP leaks as those that discuss "protecting" near "trade secrets" or "proprietary information". I could use a larger list, but this definition has been vetted.


In [6]:
proprietary_information_risks = ['(protect|safeguard)',
                                '(trade secret|proprietary  information|confidential  information)']

In [7]:
# add blank new variables for each of the searches

ccm['tax_risks']   = np.nan
ccm['tariff_risk'] = np.nan
ccm['fincon']      = np.nan
ccm['proprietary'] = np.nan

## Loop over and parse/search filings


In [8]:
# go through each doc and look...

for index, row in tqdm(ccm.iterrows(), total=len(ccm)):
        
    # parse_filing() only works when CIK and FName are valid...
    if (np.isnan(row['CIK']) == False) & (row['FName'] != ''):
        
        # path to file
        path_dir = './edgar_filings/cik_' + str(int(row['CIK'])) + '/raw/'
        filename = row['FName'].split('/')[-1] 
        filepath = path_dir + filename
        
        # open file
        with open(filepath,'r') as f:
            text = f.read()
        
        # clean the 10k before searching
        lower = BeautifulSoup(text).get_text().lower()
        no_punc = re.sub(r'\W',' ',lower)
        cleaned = re.sub(r'\s+',' ',no_punc).strip()
        
        # search    
        rgx   = NEAR_regex(tax_risks,max_words_between=25)
        ccm.loc[index,"tax_risks"] = len(re.findall(rgx,cleaned)) 
        
        rgx   = NEAR_regex(tariff_search,max_words_between=25)
        ccm.loc[index,"tariff_risk"] = len(re.findall(rgx,cleaned)) 

        rgx   = NEAR_regex(fin_constraints,max_words_between=25,partial=True)
        ccm.loc[index,"fincon"] = len(re.findall(rgx,cleaned)) 

        rgx   = NEAR_regex(proprietary_information_risks,max_words_between=25,partial=True)
        ccm.loc[index,"proprietary"] = len(re.findall(rgx,cleaned)) 
                

HBox(children=(FloatProgress(value=0.0, max=338.0), HTML(value='')))




## Summary stats on new variables

In [9]:
ccm.iloc[:,-4:].describe()

Unnamed: 0,tax_risks,tariff_risk,fincon,proprietary
count,148.0,148.0,148.0,148.0
mean,0.932432,0.074324,6.959459,2.114865
std,1.176078,0.263189,6.550566,1.911382
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.75,0.0
50%,1.0,0.0,5.0,2.0
75%,2.0,0.0,10.25,3.0
max,5.0,1.0,26.0,8.0


## Save the new variables (with the whole dataset)

In [10]:
output_folder = 'output/'
if not os.path.exists(output_folder):
    os.makedirs(output_folder)
    
ccm.to_stata(output_folder+'ccm_with_risk.dta')