# Hoberg, Phillips (2010, 2016)

> 10-K Text-based Network Industry Classifications (TNIC) data

This module downloads and processes data developed by:

- Text-Based Network Industries and Endogenous Product Differentiation. Gerard Hoberg and Gordon Phillips, 2016, Journal of Political Economy 124 (5), 1423-1465.
- Product Market Synergies and Competition in Mergers and Acquisitions: A Text-Based Analysis. Gerard Hoberg and Gordon Phillips, 2010, Review of Financial Studies 23 (10), 3773-3811.

See the authors' dedicated website for more information on this dataset: <https://hobergphillips.tuck.dartmouth.edu/industryclass.htm>

In [None]:
#| default_exp papers.hoberg_phillips_2010

In [None]:
#| export
from __future__ import annotations
from pathlib import Path 
import os

import requests
import zipfile
import io
import pandas as pd

from finsets import wrds

In [None]:
#| exports
# --- Where the data comes from ---
PROVIDER = "Gerard Hoberg and Gordon Phillips, 2010, 2016"
HOST_WEBSITE = "https://hobergphillips.tuck.dartmouth.edu/industryclass.htm"
URL = "https://hobergphillips.tuck.dartmouth.edu/idata/tnic3_data.zip"
TXT_FILE = "tnic3_data.txt"  # name of the data file inside the zip archive found at `URL`

# --- Frequency and sample coverage ---
FREQ = "A"  # prefix of the cleaned time column name built below
# NOTE(review): the sample output in this notebook contains rows dated 1988 -- confirm MIN_YEAR
MIN_YEAR = 1989
MAX_YEAR = 2021

# --- Entity and time identifiers ---
ENTITY_ID_IN_RAW_DSET = "gvkey"
ENTITY_ID_IN_CLEAN_DSET = "gvkey"
# NOTE(review): `process_raw_data` reads a `year` column from the raw data, not `date` -- confirm
TIME_VAR_IN_RAW_DSET = "date"
TIME_VAR_IN_CLEAN_DSET = FREQ + "date"

In [None]:
#| eval: false
#| hide
# Root folder for the saved datasets, taken from the DATA_REPO environment variable.
# When the variable is not set, DATA_REPO and both save paths are None (instead of
# `Path(None)` raising a TypeError), so the `if SAVE_PATH_...:` guards in the saving
# cells simply skip writing to disk.
DATA_REPO = Path(os.environ['DATA_REPO']) if 'DATA_REPO' in os.environ else None

SAVE_PATH_RAW = DATA_REPO/'finsets/papers/hoberg_phillips_2010/raw.parquet' if DATA_REPO is not None else None
SAVE_PATH_PROCESSED = DATA_REPO/'finsets/papers/hoberg_phillips_2010/processed.pkl.zip' if DATA_REPO is not None else None

In [None]:
#| export
def get_raw_data(url: str=URL, # Address of the zip archive to download
                 txt_file: str=TXT_FILE, # Name of the data txt file inside the zip file found at `url` 
            ) -> pd.DataFrame:
    """Download the zip archive at `url` and return its tab-delimited `txt_file` member as a DataFrame.

    Raises `requests.HTTPError` when the server answers with an error status, and
    `KeyError` (from `zipfile`) when `txt_file` is not a member of the archive.
    """

    response = requests.get(url)
    # Fail with the HTTP error itself: a failed download used to print a message and
    # then crash on `return df` with an unrelated UnboundLocalError.
    response.raise_for_status()

    # The archive is held in memory only; pandas reads the member's file handle directly.
    with zipfile.ZipFile(io.BytesIO(response.content), 'r') as zip_ref:
        with zip_ref.open(txt_file) as data_file:
            return pd.read_csv(data_file, delimiter='\t', header=0)

In [None]:
#| eval: false
# Download the raw data using the default `URL` and `TXT_FILE`
raw = get_raw_data()

In [None]:
#| eval: false
#| hide
# Persist the raw download as parquet, creating the destination folder if needed
if SAVE_PATH_RAW:
    SAVE_PATH_RAW.parent.mkdir(parents=True, exist_ok=True)
    raw.to_parquet(SAVE_PATH_RAW)

In [None]:
#| export
def process_raw_data(df: pd.DataFrame=None, # Raw data, as returned by `get_raw_data`; must contain 'year', 'gvkey1' and 'gvkey2'
                     gvkey_to_permno: bool|pd.DataFrame=True, # Whether to download permno-gvkey link. If DataFrame, it is used as the link and must contain 'gvkey', 'permno' and 'Adate'
                     ) -> pd.DataFrame:
    """Cleans up dates and optionally adds CRSP permnos

    Replaces the `year` column with an annual-period column `Adate` and drops rows
    with missing values. Unless `gvkey_to_permno` is falsy, the link table is then
    left-merged twice, adding `permno1` (matched on `gvkey1`) and `permno2`
    (matched on `gvkey2`). Neither `df` nor a supplied link table is modified.

    Raises `ValueError` if `df` is not supplied.
    """

    if df is None:
        raise ValueError("`df` is required; pass the DataFrame returned by `get_raw_data`")

    # 'Y' is the annual period alias ('A' is deprecated since pandas 2.2); the column keeps its 'Adate' name.
    adate = pd.to_datetime(df.year.astype('string'), format="%Y").dt.to_period('Y')
    # `assign` returns a new frame, so the caller's `df` does not gain an 'Adate' column
    df = df.assign(Adate=adate).drop('year', axis=1).dropna()

    # Test for a DataFrame first: `not <DataFrame>` raises "truth value is ambiguous"
    if isinstance(gvkey_to_permno, pd.DataFrame):
        permnos = gvkey_to_permno
    elif gvkey_to_permno:
        permnos = wrds.linking.gvkey_permno_a()
    else:
        return df

    # Cast the link key to int64 on a copy, leaving a caller-supplied link table untouched
    permnos = permnos.assign(gvkey=permnos.gvkey.astype('int64'))

    df = (df.merge(permnos.rename(columns={'permno':'permno1', 'gvkey':'gvkey1'}), how='left', on=['Adate','gvkey1'])
            .merge(permnos.rename(columns={'permno':'permno2', 'gvkey':'gvkey2'}), how='left', on=['Adate','gvkey2']))
    return df

In [None]:
#| eval: false
# Build the cleaned dataset; with the default `gvkey_to_permno=True` this calls `wrds.linking.gvkey_permno_a()`
clean = process_raw_data(raw)

In [None]:
#| eval: false
# Display the processed DataFrame
clean

Unnamed: 0,gvkey1,gvkey2,score,Adate,permno1,permno2
0,1011,3226,0.1508,1988,10082,25022
1,1011,6282,0.0851,1988,10082,46747
2,1011,6734,0.0258,1988,10082,49606
3,1011,7609,0.0097,1988,10082,12058
4,1011,9526,0.0369,1988,10082,69519
...,...,...,...,...,...,...
25479601,349972,322154,0.0444,2021,15642,22523
25479602,349972,331856,0.0169,2021,15642,14615
25479603,349972,332115,0.0214,2021,15642,80577
25479604,349972,345556,0.0781,2021,15642,16069


In [None]:
#| eval: false
#| hide
# Persist the processed data as a pickle, creating the destination folder if needed
if SAVE_PATH_PROCESSED:
    SAVE_PATH_PROCESSED.parent.mkdir(parents=True, exist_ok=True)
    clean.to_pickle(SAVE_PATH_PROCESSED)

In [None]:
#| hide
# Run nbdev's export step for this notebook
import nbdev; nbdev.nbdev_export()