In [1]:
import os, re, html
import pandas as pd
from tqdm import tqdm

In [2]:
company_filepath = os.path.join('C:\DATA\ComBERT\data', 'company_info_sec_cik_mapper_12057_20220802.csv')

In [3]:
company_df = pd.read_csv(company_filepath)
company_df = company_df.astype({"CIK": int}, errors='raise')
company_df.head()

Unnamed: 0,CIK,Ticker,Name,Exchange
0,1750,AIR,Aar Corp,NYSE
1,1800,ABT,Abbott Laboratories,NYSE
2,1961,WDDD,Worlds Inc,OTC
3,2098,ACU,Acme United Corp,NYSE
4,2178,AE,"Adams Resources & Energy, Inc.",NYSE


In [4]:
def subnames_of_company_name(fullname): 
    strings = fullname.split(' ')
    subnames = [' '.join(strings[:i+1]) for i in range(len(strings))] # ['amazon', 'amazon com', 'amazon com inc' ]
    subnames.reverse() # ['amazon com inc', 'amazon com', 'amazon']
    subnames = [item.strip() for item in subnames if item.strip() != '']
    return subnames

fullname = 'Amazon Com Inc'.lower()
subnames = subnames_of_company_name(fullname)
subnames

['amazon com inc', 'amazon com', 'amazon']

In [5]:
subnames_for_all_companies = []
for fullname in company_df['Name'].values:
    subnames_for_all_companies.extend(subnames_of_company_name(fullname.lower()))

subnames_for_all_companies = list(set(subnames_for_all_companies)) # unique subnames    
print(len(subnames_for_all_companies), subnames_for_all_companies[:10])

28024 ['phillips', 'airbnb, inc.', 'us xpress enterprises', 'zw', 'tmc', 'blackstar enterprise group,', 'northern trust', 'altamont pharma acquisition', 'bbx', 'accustem sciences inc.']


### The number of company names in each model's vocabulary
* Araci's FinBERT is a model post-trained on BERT, not trained from scratch; thus, its vocabulary is identical to that of BERT.

In [6]:
def get_vocabs(filepath):
    with open(filepath, encoding='utf8') as var:
        lines = var.readlines()

    return [item.strip() for item in lines]

def intersection(lst1, lst2):
    return list(set(lst1) & set(lst2))

In [7]:
filepath = os.path.join('vocab', 'BERT_vocab.txt')

vocabs = get_vocabs(filepath)
print('The total number of tokens in {} is {}.'.format(os.path.basename(filepath), len(vocabs)))

overlap = intersection(vocabs, subnames_for_all_companies)
print('The number of overlapping tokens is {}.'.format(len(overlap)))

The total number of tokens in BERT_vocab.txt is 30522.
The number of overlapping tokens is 1858.


In [8]:
filepath = os.path.join('vocab', 'Yang_vocab.txt')

vocabs = get_vocabs(filepath)
print('The total number of tokens in {} is {}.'.format(os.path.basename(filepath), len(vocabs)))

overlap = intersection(vocabs, subnames_for_all_companies)
print('The number of overlapping tokens is {}.'.format(len(overlap)))

The total number of tokens in Yang_vocab.txt is 30873.
The number of overlapping tokens is 1822.


In [9]:
filepath = os.path.join('vocab', 'SECBERT_vocab.txt')

vocabs = get_vocabs(filepath)
print('The total number of tokens in {} is {}.'.format(os.path.basename(filepath), len(vocabs)))

overlap = intersection(vocabs, subnames_for_all_companies)
print('The number of overlapping tokens is {}.'.format(len(overlap)))

The total number of tokens in SECBERT_vocab.txt is 30000.
The number of overlapping tokens is 2446.
