In [16]:
import re
import heapq
import nltk
import numpy as np
import pandas as pd
from operator import itemgetter 
from sklearn.model_selection import train_test_split

In [3]:
# Load the first multi-word phrase list. The file is tab-separated and has no
# header row, so the resulting columns are labelled by position (0 and 1).
quality_phrases_0 = pd.read_csv(
    '../data/AutoPhrase2_multi-words.txt',
    sep='\t',
    header=None,
)
quality_phrases_0.head(n=5)

Unnamed: 0,0,1
0,0.991664,shuffle master
1,0.989858,estee lauder
2,0.989815,stifel nicolaus
3,0.98966,herman miller
4,0.989437,navigant consulting


In [4]:
# Load the second multi-word phrase list; it is read the same way as the first
# one (tab-separated, no header row, positional column labels 0 and 1).
quality_phrases_1 = pd.read_csv(
    '../data/AutoPhrase_multi-words.txt',
    sep='\t',
    header=None,
)

In [5]:
def clean(text):
    """Return ``text`` converted to ``str`` and lower-cased.

    Any input is accepted because it is passed through ``str()`` first, so
    non-string values come back as the lower-cased form of their string
    representation.
    """
    as_string = str(text)
    return as_string.lower()

# Store a lower-cased copy of column 1 alongside the original values.
quality_phrases_1['cleaned'] = quality_phrases_1[1].map(clean)

In [6]:
# Keep the cleaned phrases whose column-0 value is above 0.9.
# reset_index() is called without drop=True, so the previous row labels are
# kept as an 'index' column next to 'cleaned'.
above_cutoff = quality_phrases_1[0] > 0.9
top_phrases = quality_phrases_1.loc[above_cutoff, 'cleaned'].reset_index()
top_phrases.head(n=5)

Unnamed: 0,index,cleaned
0,0,personal property
1,1,credit rating
2,2,environmental liabilities
3,3,contractual obligations
4,4,severance benefits


In [7]:
# Load the feature-encoded filings table.
# NOTE(review): read_pickle executes whatever the pickle contains, so this
# file must come from a trusted source.
data = pd.read_pickle("../data/feature_encoded_data.pkl")
data.head(n=5)

Unnamed: 0,date,time,event_type,cleaned_event,full_text,symbol,Surprise(%),Reported EPS,Consensus EPS,hr,pre_market,date_idx,price_change_7,price_change_30,price_change_90,price_change_365,unigram_vec,phrase_vec
0,2005-08-18,92410,EVENTS:\tResults of Operations and Financial C...,results of operations and financial condition,\n<DOCUMENT>\nFILE:LANC/LANC-8K-20050818092410...,LANC,5.36,0.59,0.56,9.24,True,1021,-0.98,-1.5,-1.21,9.06,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,2005-10-28,80708,EVENTS:\tResults of Operations and Financial C...,results of operations and financial condition,\n<DOCUMENT>\nFILE:LANC/LANC-8K-20051028080708...,LANC,-10.17,0.53,0.59,8.07,True,1092,-2.49,-5.89,-8.48,-5.29,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,2006-01-30,81137,EVENTS:\tResults of Operations and Financial C...,results of operations and financial condition,\n<DOCUMENT>\nFILE:LANC/LANC-8K-20060130081137...,LANC,4.69,0.67,0.64,8.11,True,1186,3.92,8.86,2.6,-2.05,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,2006-04-28,81811,EVENTS:\tResults of Operations and Financial C...,results of operations and financial condition,\n<DOCUMENT>\nFILE:LANC/LANC-8K-20060428081811...,LANC,-28.0,0.36,0.5,8.18,True,1274,-2.27,-1.97,5.41,2.44,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,2006-10-30,82612,EVENTS:\tResults of Operations and Financial C...,results of operations and financial condition,\n<DOCUMENT>\nFILE:LANC/LANC-8K-20061030082612...,LANC,-21.82,0.43,0.55,8.26,True,1459,-9.19,-5.68,11.18,7.63,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


# Train, Val, Test Summary Stats

In [8]:
# Number of groups obtained when the full table is grouped by 'symbol'.
symbol_groups = data.groupby('symbol')
len(symbol_groups)

1444

In [9]:
# Two-stage split with a fixed seed: the first call holds out 51% of the rows,
# the second call splits that holdout again, sending 51% of it to the test set
# and the rest to validation.
X_train, X_holdout = train_test_split(data, test_size=0.51, random_state=42)
X_val, X_test = train_test_split(X_holdout, test_size=0.51, random_state=42)

In [10]:
# Train
#
# Summary row for the training split, in this order:
#   1. number of rows,
#   2. sum of len() over 'full_text' (len() of a str counts characters),
#   3. number of groups when grouped by 'symbol'.
train_stats = [
    X_train.shape[0],
    sum(X_train['full_text'].map(len)),
    len(X_train.groupby('symbol')),
]

In [11]:
# Val
#
# Summary row for the validation split, in this order:
#   1. number of rows,
#   2. sum of len() over 'full_text' (len() of a str counts characters),
#   3. number of groups when grouped by 'symbol'.
val_stats = [
    X_val.shape[0],
    sum(X_val['full_text'].map(len)),
    len(X_val.groupby('symbol')),
]

In [12]:
# Test
#
# Summary row for the test split, in this order:
#   1. number of rows,
#   2. sum of len() over 'full_text' (len() of a str counts characters),
#   3. number of groups when grouped by 'symbol'.
test_stats = [
    X_test.shape[0],
    sum(X_test['full_text'].map(len)),
    len(X_test.groupby('symbol')),
]

In [13]:
# One row per split, one column per summary statistic.
# The second entry of each *_stats list is a sum of len(full_text), i.e. a
# character total rather than a token count, so the column is labelled as
# characters instead of words.
indices = ['Train', 'Val', 'Test']
columns = ["# of 8-K's", "# of characters", "# of firms"]
pd.DataFrame([train_stats, val_stats, test_stats], index = indices, columns = columns)

Unnamed: 0,# of 8-K's,# of words,# of firms
Train,17098,313867921,1410
Val,8720,164041583,1372
Test,9076,163871871,1380


# Knowledge Base Analysis

In [14]:
# Put the first ten phrases of each phrase list side by side.
# concat(axis=1) aligns rows on the index labels; only the phrase columns
# (1 from the first frame, 'cleaned' from the second) are kept.
knowledge_base_comp = pd.concat(
    [quality_phrases_0.head(10), top_phrases.head(10)],
    axis=1,
)[[1, "cleaned"]]
# The renamed frame is only displayed; knowledge_base_comp keeps its column labels.
knowledge_base_comp.rename(columns={1: 'Wiki Base', 'cleaned': 'Investopedia Base'})

Unnamed: 0,Wiki Base,Investopedia Base
0,shuffle master,personal property
1,estee lauder,credit rating
2,stifel nicolaus,environmental liabilities
3,herman miller,contractual obligations
4,navigant consulting,severance benefits
5,sioux falls,accounting policies
6,teco coal's,annual salary
7,calvin klein,withholding tax
8,analog devices,service provider
9,novatel wireless,debt financing


# Unigram Analysis

In [15]:
# Create unigrams
#
# Builds word_count, a dict mapping each non-stopword token in the training
# texts to the number of times it occurs. Each text is normalised first:
# non-word characters become spaces, whitespace runs are collapsed, and the
# text is lower-cased before tokenisation.

word_count = {}
stopwords = nltk.corpus.stopwords.words('english')
# Membership is tested once per token, so use a set for O(1) lookups instead
# of scanning the stopword list every time.
stopword_set = set(stopwords)
# Compile the two patterns once instead of looking them up for every form.
non_word_pattern = re.compile(r'\W')
whitespace_pattern = re.compile(r'\s+')
count = 0

for form in X_train['full_text']:
    cleaned_form = non_word_pattern.sub(' ', form)
    cleaned_form = whitespace_pattern.sub(' ', cleaned_form)
    cleaned_form = cleaned_form.lower()
    for token in nltk.word_tokenize(cleaned_form):
        if token in stopword_set:
            continue
        word_count[token] = word_count.get(token, 0) + 1
    # Progress report every 1000 processed forms.
    count += 1
    if count % 1000 == 0:
        print(count)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000


In [49]:
# Element-wise sum of every training row's 'unigram_vec' into a 1000-slot
# integer accumulator.
# NOTE(review): 1000 is presumably the length of each unigram_vec - confirm
# against the step that produced the encoding.
count = np.zeros(1000, dtype=int)
for unigram_vec in X_train['unigram_vec'].values:
    np.add(count, np.array(unigram_vec), out=count)

In [53]:
# The ten tokens with the largest totals in word_count, largest first.
top_pairs = heapq.nlargest(10, word_count.items(), key=itemgetter(1))
most_freq = [token for token, token_total in top_pairs]
most_freq

['quarter',
 'million',
 'company',
 'financial',
 '1',
 'year',
 'results',
 '2',
 'net',
 'income']

In [56]:
# Table of the most frequent unigrams with count[:10] divided by the number of
# training rows, sorted from highest to lowest ratio.
#
# The frame is built column-wise from a dict so the ratio column keeps a
# numeric dtype; building it row-wise and transposing would turn both columns
# into object dtype.
#
# NOTE(review): this pairs most_freq[i] with count[i], i.e. it assumes the
# first ten positions of 'unigram_vec' correspond to most_freq in the same
# order - confirm against the step that produced the encoding.
# NOTE(review): `count` is also reassigned by the phrase-analysis cell, so this
# cell is only correct when run right after the unigram accumulation cell.
# The values are ratios (count / number of rows), not multiplied by 100.
unigram_count = pd.DataFrame({
    'unigram': most_freq,
    "% of 8-K's": count[:10] / X_train.shape[0],
})
unigram_count = unigram_count.sort_values(by = ["% of 8-K's"], ascending = False).reset_index(drop = True)

In [57]:
# Show the first ten rows of the sorted unigram table.
unigram_count.head(n=10)

Unnamed: 0,unigram,% of 8-K's
0,2,1.0
1,1,0.999649
2,financial,0.994034
3,results,0.97953
4,quarter,0.973681
5,company,0.972044
6,year,0.952451
7,net,0.944262
8,million,0.940578
9,income,0.886946


# Phrase Analysis

In [46]:
# Element-wise sum of every training row's 'phrase_vec' into an integer
# accumulator with one slot per row of top_phrases.
count = np.zeros(top_phrases.shape[0], dtype=int)
for phrase_vec in X_train['phrase_vec'].values:
    np.add(count, np.array(phrase_vec), out=count)

In [47]:
# Table of every top phrase with its accumulated count divided by the number
# of training rows, sorted from highest to lowest ratio.
#
# The frame is built column-wise from a dict so the ratio column keeps a
# numeric dtype; building it row-wise and transposing would turn both columns
# into object dtype.
#
# NOTE(review): this pairs top_phrases row i with count[i], i.e. it assumes
# 'phrase_vec' positions follow the row order of top_phrases - confirm against
# the step that produced the encoding.
# The values are ratios (count / number of rows), not multiplied by 100.
phrase_count = pd.DataFrame({
    'phrase': top_phrases['cleaned'].values,
    "% of 8-K's": count / X_train.shape[0],
})
phrase_count = phrase_count.sort_values(by = ["% of 8-K's"], ascending = False).reset_index(drop = True)

In [48]:
# Show the first ten rows of the sorted phrase table.
phrase_count.head(n=10)

Unnamed: 0,phrase,% of 8-K's
0,financial condition,0.951866
1,financial statement,0.946368
2,financial statements,0.945491
3,press release,0.895192
4,financial results,0.725056
5,net income,0.711604
6,quarter end,0.657562
7,executive officer,0.634987
8,quarter ended,0.633641
9,chief executive,0.627208
