In [1]:
import faiss
import time
import numpy as np
import pandas as pd
import random
import spacy
import os
import json

# Textual factors

Our first approach to process the 10-K reports data is to construct the textual factors.
First, we create a dictionary that contains each document's term-frequency.
Second, we create textual clusters using locality-sensitive hashing (LSH) with word2vec embeddings and a fast clustering technique.

## 1. Create term-frequency dictionary

In [2]:
with open("edgar_w2v_200.txt") as f:
    lines = f.readlines()[1:]
    w2v_embeddings = {str(line.split()[0]): [float(x) for x in line.split()[1:]] for line in lines}

w2v_vocab = w2v_embeddings.keys()

In [3]:
nlp = spacy.load("en_core_web_sm")

def spacy_tokenizer(corpus):
    """
    Input: a corpus
    Output: a list of tokens (strings)
    """
    tokens = nlp(corpus.lower())
    tokens = [str(token) for token in tokens]    
    return tokens

def tf_dict(corpus, vocab):
    """
    Input: a corpus and a vocabulary (set or dictionary-like)
    Output: a dictionary mapping token (if the token is in the dictionary) to the frequency of it appearing 
            in the corpus 
    """
    corpus_toks = spacy_tokenizer(corpus)
    res = {}
    for token in corpus_toks:
        if token in vocab:
            res[token] = res.get(token, 0) + 1
    return res

In [4]:
#for year in range(2010, 2020):
#    for filename in os.listdir("./{}".format(year)):
#        temp = {}
#        with open("./{}/{}".format(year, filename), 'r') as f:
#            corpus = json.load(f)
#        tf_7A_w2v = tf_dict(corpus["section_7A"], w2v_vocab) # only include terms that are in w2v_vocab
#        temp = {"year": year, "cik": corpus["cik"], "section_7A": tf_7A_w2v}
#        with open("./tf/tf_w2v/{}_{}.json".format(corpus["cik"], str(year)), 'w') as outfile:
#            json.dump(temp, outfile)
#        tf_7A_glove = tf_dict(corpus["section_7A"], glove_vocab) # only include terms that are in glove_vocab
#        temp = {"year": year, "cik": corpus["cik"], "section_7A": tf_7A_glove}
#        with open("./tf/tf_glove/{}_{}.json".format(corpus["cik"], str(year)), 'w') as outfile:
#            json.dump(temp, outfile)
#    print("Year {} completed.".format(year))

## 2. Clustering via LSH

In [5]:
def cs_similarity(w, V):
    """
    w: d by 1 array
    V: n by d array
    
    return: n values of inner products Vw
    """
    w = w.reshape(-1, 1)
    d = w.shape[0]
    V = V.reshape(-1, d)
    l2norms = np.sqrt((V * V).sum(axis = 1)).reshape(-1, 1)
    V = V / l2norms
    w = w / np.sqrt(np.sum(w * w))
    
    res = np.dot(V, w)
    
    return res.squeeze()

class RP:
    
    def __init__(self, nbits, d, emb, dup):
        self.nbits = nbits
        self.d = d
        self.dup = dup
        self.ids_not_visited = set(range(len(emb.keys())))
        self.stop = False
        self.cluster = []
        counter = 0
        
        self.emb_arr = []
        self.lookup = {}
        self.RPs = []
        for _ in range(dup):
            index = faiss.IndexLSH(d, nbits)
            index.rrot.init(random.randint(0, 2 ** 30))
            self.RPs.append(index)
        print("LSH initialized.")
            
        counter = 0
        for word in emb:
            self.lookup[counter] = word
            self.emb_arr.append(emb[word])
            counter += 1
            
        self.emb_arr = np.array(self.emb_arr, dtype = "float32")
        for i in range(dup):
            self.RPs[i].add(self.emb_arr)
        print("Embedding completed.")
        
    def nn(self, q_vec, k):
        cand_ids = set()
        for i in range(self.dup):
            _, I = self.RPs[i].search(q_vec, k)
            I = set(I.tolist()[0])
            cand_ids.update(I)
        return cand_ids
    
    def nn_refined(self, q_vec, k, thres, in_clustering = False):
        cand_ids = self.nn(q_vec, k)
        if in_clustering:
            cand_ids = [i for i in cand_ids if i in self.ids_not_visited]
            
        temp = cs_similarity(q_vec.reshape(-1, 1), self.emb_arr[cand_ids,:])
        res_ids = (temp > thres).nonzero()[0].astype("int")
        return (np.array(cand_ids)[res_ids]).tolist()
    
    def seq_one_step_update(self, k, thres):
        if self.stop:
            print("All words visited")
            return
        query = self.ids_not_visited.pop()
        q_vec = self.emb_arr[query,:].reshape(1, self.d)
        res_ids = self.nn_refined(q_vec, k, thres, in_clustering = True)
        
        self.cluster.append([self.lookup[i] for i in res_ids] + [self.lookup[query]])
        self.ids_not_visited = self.ids_not_visited.difference(set(res_ids))
        if len(self.ids_not_visited) == 0:
            self.stop = True
            
    def seq_clustering(self, k, thres):
        counter = 0
        start_time = time.time()
        while not self.stop:
            self.seq_one_step_update(k, thres)
            if counter > 99 and counter % 100 == 0:
                elps_time = time.time() - start_time
                print(f"  {len(self.ids_not_visited)} words in queue; {elps_time:.2f} sec passed")
            counter += 1

Power calculation

In [6]:
def sensitivity(cs_sim1, cs_sim2, nbits, dup):
    ang1 = np.arccos(cs_sim1) * 180 / np.pi
    ang2 = np.arccos(cs_sim2) * 180 / np.pi
    p1 = (180 - ang1) / 180
    p2 = (180 - ang2) / 180
    
    p1 = p1 ** nbits
    p2 = p2 ** nbits
    
    p1 = 1 - ((1 - p1) ** dup)
    p2 = 1 - ((1 - p2) ** dup)
    
    return p1, p2

print(sensitivity(0.5, 0, 10, 128))

(0.8934550065054789, 0.1175569941988367)


### 2.1 LSH using W2V embedding

In [7]:
w2v_RP = RP(10, 200, w2v_embeddings, 128)
w2v_RP.seq_clustering(300, 0.45)

LSH initialized.
Embedding completed.
  96680 words in queue; 9.47 sec passed
  91260 words in queue; 18.63 sec passed
  85829 words in queue; 27.84 sec passed
  82405 words in queue; 36.91 sec passed
  78144 words in queue; 45.91 sec passed
  74039 words in queue; 54.88 sec passed
  71684 words in queue; 63.87 sec passed
  68187 words in queue; 72.74 sec passed
  63670 words in queue; 81.60 sec passed
  59723 words in queue; 90.43 sec passed
  57803 words in queue; 99.25 sec passed
  56483 words in queue; 107.94 sec passed
  53949 words in queue; 116.68 sec passed
  51762 words in queue; 125.29 sec passed
  49491 words in queue; 133.94 sec passed
  47706 words in queue; 142.54 sec passed
  46247 words in queue; 151.08 sec passed
  43097 words in queue; 159.67 sec passed
  42226 words in queue; 168.31 sec passed
  40544 words in queue; 177.93 sec passed
  39001 words in queue; 187.28 sec passed
  34085 words in queue; 196.14 sec passed
  26433 words in queue; 205.06 sec passed
  16917 

  res_ids = (temp > thres).nonzero()[0].astype("int")


In [8]:
print(len(w2v_RP.cluster))
queries = ["healthcare", "economy", "competitors", "prices", "investors", "oil", "geopolitical",
           "recession", "wage", "inflation", "inventory"]

for q in queries:
    print(f"query: {q}")
    for i in range(len(w2v_RP.cluster)):
        if q in w2v_RP.cluster[i]:
            print(w2v_RP.cluster[i])
            print('-' * 20)

6677
query: healthcare
['healthcare', 'reimbursement', 'givers', 'medicare', 'education', 'trend', 'hospitals', 'hospital', 'medicaid', 'student', 'living', 'surgical', 'educational', 'emergency', 'pharmacy', 'affordable', 'vision', 'nursing', 'schools', 'fitness', 'rehabilitation', 'beneficiaries', 'dialysis', 'outpatient', 'eye', 'containment', 'pharmacies', 'beauty', 'cosmetic', 'respiratory', 'dietary', 'inpatient', 'nutrition', 'workplace', 'funeral', 'behavioral', 'aids', 'aca', 'portability', 'child', 'settings', 'sleep', 'wellness', 'hospice', 'orthopedic', 'nurses', 'prescriptions', 'correctional', 'doctor', 'ambulatory', 'curriculum', 'pps', 'mental', 'infant', 'veterans', 'psychiatric', 'acuity', 'nurse', 'pbm', 'elderly', 'counseling', 'affordability', 'dentists', 'youth', 'ppo', 'illnesses', 'seniors', 'anesthesia', 'capitation', 'mma', 'hospitalization', 'instructional', 'capitated', 'advocacy', 'hygiene', 'ambulance', 'stays', 'pharmacists', 'drg', 'urology', 'charity', 

## 3. Compute textual factors

In [9]:
pre_factors = {}

for i in range(len(w2v_RP.cluster)):
    pre_factors[i] = {}
    for tok in w2v_RP.cluster[i]:
        pre_factors[i][tok] = 0       
        
print(pre_factors[100])

{'commission': 0, 'currency': 0, '1934': 0, 'exchanges': 0, 'exchange': 0}


In [10]:
count = 0
for filename in os.listdir("./tf/tf_w2v"):
    yr = int(filename.split(".")[0].split("_")[1])
    
    # only use 2010-2017 data to train textual factors
    if yr > 2017:
        continue    
    with open("./tf/tf_w2v/{}".format(filename), "r") as f:
        dta = json.load(f)        
    tf = dta["section_7A"]
    for tok in tf:
        for i in range(len(w2v_RP.cluster)):
            if tok in pre_factors[i]:
                pre_factors[i][tok] += tf[tok]
                break
    count += 1
    if count > 999 and count % 1000 == 0:
        print(f"{count} document processed.")

1000 document processed.
2000 document processed.
3000 document processed.
4000 document processed.
5000 document processed.
6000 document processed.
7000 document processed.
8000 document processed.
9000 document processed.
10000 document processed.
11000 document processed.
12000 document processed.
13000 document processed.
14000 document processed.
15000 document processed.
16000 document processed.
17000 document processed.
18000 document processed.
19000 document processed.
20000 document processed.
21000 document processed.
22000 document processed.
23000 document processed.
24000 document processed.
25000 document processed.
26000 document processed.
27000 document processed.
28000 document processed.
29000 document processed.
30000 document processed.
31000 document processed.
32000 document processed.
33000 document processed.
34000 document processed.
35000 document processed.
36000 document processed.
37000 document processed.
38000 document processed.
39000 document proces

In [11]:
factor_word_counts = []
for i in range(len(w2v_RP.cluster)):
    temp = 0
    for tok in pre_factors[i]:
        temp += pre_factors[i][tok]
    factor_word_counts.append(temp)
    
factors = []
null_factor_ids = []
for i in range(len(w2v_RP.cluster)):
    toks = w2v_RP.cluster[i]
    temp = []
    for tok in toks:
        if factor_word_counts[i] == 0:
            null_factor_ids.append(i)
            temp.append(pre_factors[i][tok])
        else:
            temp.append(pre_factors[i][tok] / factor_word_counts[i])
    factors.append(temp)
    
null_factor_ids = list(set(null_factor_ids))

## 4. Compute factor loadings

In [12]:
file_name = "company_tickers.json"
cik_to_tickers_dict = {}

f = open(file_name)
file = json.load(f)
for company in file.values():
    if cik_to_tickers_dict.get(str(company["cik_str"]), None):
        cik_to_tickers_dict[str(company["cik_str"])].append(company["ticker"])
    else:
        cik_to_tickers_dict[str(company["cik_str"])] = [company["ticker"]]
    
f.close()

In [13]:
cik_span = {}

for filename in os.listdir("./tf/tf_w2v"):
    temp = filename.split(".")[0]
    temp = temp.split("_")
    cik_span[temp[0]] = cik_span.get(temp[0], []) + [int(temp[1])]
    
cik_span = [cik for cik, year in cik_span.items() if set(range(2010, 2020)).issubset(set(year))]

In [14]:
cik_final = []

for cik in cik_span:
    if cik not in cik_to_tickers_dict:
        continue
    save = True
    for year in range(2010, 2020):
        filename = "_".join([cik, str(year)]) + ".json"
        with open("./tf/tf_w2v/{}".format(filename), "r") as f:
            temp = json.load(f)
            temp = temp["section_7A"]
        if len(temp.keys()) < 30:
            save = False
            break
    if save:
        cik_final.append(cik)

In [15]:
factor_2norms = []

for i in range(len(factors)):
    factor_2norms.append(np.sum(np.array(factors[i]) ** 2))
    

def loadings(dt_vec, factor, toks, factor_2norm):
    """
    Input:
        dt_vec: doc-term vector (dictionary)
        factor: textual factor (list)
        toks: tokens corresponding to the factor support (list)
        factor_2norms: L2 norm of the factor (scalar)
    
    Output:
        factor loading (scalar)
    """
    if factor_2norm == 0:
        return 0
    res = 0
    for j in range(len(toks)):
        tok = toks[j]
        if tok not in dt_vec:
            continue
        res += dt_vec[tok] * factor[j]
    return res / factor_2norm

In [16]:
final_data = []
count = 0

for cik in cik_final:
    for year in range(2010, 2020):
        temp = [cik, year, cik_to_tickers_dict[cik][0]]
        filename = "_".join([cik, str(year)]) + ".json"
        with open("./tf/tf_w2v/{}".format(filename), "r") as f:
            doc_term_vec = json.load(f)["section_7A"]
        for i in range(len(factors)):
            if factor_2norms[i] == 0:
                temp.append(0)
            else:
                temp.append(loadings(doc_term_vec, factors[i], w2v_RP.cluster[i], factor_2norms[i]))
        final_data.append(temp)
    count += 1
    if count > 99 and count % 100 == 0:
        print(f"{count} ciks processed")

print(final_data)

100 ciks processed
200 ciks processed
300 ciks processed
400 ciks processed
500 ciks processed
600 ciks processed
700 ciks processed
800 ciks processed
900 ciks processed
1000 ciks processed
1100 ciks processed
1200 ciks processed
1300 ciks processed
1400 ciks processed
1500 ciks processed
1600 ciks processed
1700 ciks processed


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [17]:
df = pd.DataFrame(final_data)
df.to_csv("./w2v_textual_factors.csv", index = False, header = False)