In [1]:
import re
import heapq
import nltk
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Events Cleaning

In [2]:
merged_data = pd.read_csv('../data/processed/merged_all_data.csv')
merged_data.head()

Unnamed: 0,date,time,event_type,full_text,symbol,Surprise(%),Reported EPS,Consensus EPS,hr,pre_market,date_idx,price_change_7,price_change_30,price_change_90,price_change_365,targe_price_change,prev_vix_values
0,2004-07-20,143800,EVENTS:\t\tFinancial statements and exhibits\n,\n<DOCUMENT>\nFILE:FULT/FULT-8K-20040720143800...,FULT,-3.13,0.31,0.32,14.38,False,627,-0.5,2.57,3.01,8.21,0.414034,14.17
1,2004-10-19,174320,EVENTS:\tResults of Operations and Financial C...,\n<DOCUMENT>\nFILE:FULT/FULT-8K-20041019174320...,FULT,0.0,0.32,0.32,17.43,False,718,0.16,0.39,7.58,14.93,-1.208981,15.13
2,2005-01-18,123338,EVENTS:\tResults of Operations and Financial C...,\n<DOCUMENT>\nFILE:FULT/FULT-8K-20050118123338...,FULT,0.0,0.33,0.33,12.33,False,809,0.53,0.61,5.15,13.6,-0.25099,12.47
3,2005-04-13,140932,EVENTS:\tResults of Operations and Financial C...,\n<DOCUMENT>\nFILE:FULT/FULT-8K-20050413140932...,FULT,0.0,0.33,0.33,14.09,False,894,-1.75,-2.19,-1.37,8.56,0.070178,13.31
4,2005-07-19,132220,EVENTS:\tResults of Operations and Financial C...,\n<DOCUMENT>\nFILE:FULT/FULT-8K-20050719132220...,FULT,0.0,0.27,0.27,13.22,False,991,-1.28,2.88,11.54,17.17,0.604141,10.45


In [3]:
def event_clean(text):
        result = re.sub('\n', '', text)
        result = re.sub('\t+', '\t', result)
        result = re.split('\t', result)

        if len(result) > 0:
            # nonlocal counter
            # counter += 1
            # print(result)
            result = [s.lower() for s in result[1:]] # exclude the first item
            cleaned_result = []
            for s in result:
                if ';' in s:
                    for sub in s.split(';'):
                        cleaned_result.append(sub.strip())
                else:
                    cleaned_result.append(s.strip())
            # print(cleaned_result)
            # print()
            # print()
            return cleaned_result
        else:
            return ['Missing']

In [4]:
cleaned_event = merged_data['event_type'].apply(event_clean)
merged_data.insert(3, 'cleaned_event', cleaned_event)

In [5]:
merged_data.head()

Unnamed: 0,date,time,event_type,cleaned_event,full_text,symbol,Surprise(%),Reported EPS,Consensus EPS,hr,pre_market,date_idx,price_change_7,price_change_30,price_change_90,price_change_365,targe_price_change,prev_vix_values
0,2004-07-20,143800,EVENTS:\t\tFinancial statements and exhibits\n,[financial statements and exhibits],\n<DOCUMENT>\nFILE:FULT/FULT-8K-20040720143800...,FULT,-3.13,0.31,0.32,14.38,False,627,-0.5,2.57,3.01,8.21,0.414034,14.17
1,2004-10-19,174320,EVENTS:\tResults of Operations and Financial C...,[results of operations and financial condition...,\n<DOCUMENT>\nFILE:FULT/FULT-8K-20041019174320...,FULT,0.0,0.32,0.32,17.43,False,718,0.16,0.39,7.58,14.93,-1.208981,15.13
2,2005-01-18,123338,EVENTS:\tResults of Operations and Financial C...,[results of operations and financial condition...,\n<DOCUMENT>\nFILE:FULT/FULT-8K-20050118123338...,FULT,0.0,0.33,0.33,12.33,False,809,0.53,0.61,5.15,13.6,-0.25099,12.47
3,2005-04-13,140932,EVENTS:\tResults of Operations and Financial C...,[results of operations and financial condition...,\n<DOCUMENT>\nFILE:FULT/FULT-8K-20050413140932...,FULT,0.0,0.33,0.33,14.09,False,894,-1.75,-2.19,-1.37,8.56,0.070178,13.31
4,2005-07-19,132220,EVENTS:\tResults of Operations and Financial C...,[results of operations and financial condition...,\n<DOCUMENT>\nFILE:FULT/FULT-8K-20050719132220...,FULT,0.0,0.27,0.27,13.22,False,991,-1.28,2.88,11.54,17.17,0.604141,10.45


In [6]:
def clean_event_type_2(e):
    result = []
    for event in e:
        cleaned = event.replace('2.02', '').strip()
        if cleaned != '' and cleaned not in result:
            result.append(cleaned)
    return result

In [7]:
merged_data['cleaned_event'] = merged_data['cleaned_event'].apply(clean_event_type_2)

In [8]:
merged_data.head()

Unnamed: 0,date,time,event_type,cleaned_event,full_text,symbol,Surprise(%),Reported EPS,Consensus EPS,hr,pre_market,date_idx,price_change_7,price_change_30,price_change_90,price_change_365,targe_price_change,prev_vix_values
0,2004-07-20,143800,EVENTS:\t\tFinancial statements and exhibits\n,[financial statements and exhibits],\n<DOCUMENT>\nFILE:FULT/FULT-8K-20040720143800...,FULT,-3.13,0.31,0.32,14.38,False,627,-0.5,2.57,3.01,8.21,0.414034,14.17
1,2004-10-19,174320,EVENTS:\tResults of Operations and Financial C...,[results of operations and financial condition...,\n<DOCUMENT>\nFILE:FULT/FULT-8K-20041019174320...,FULT,0.0,0.32,0.32,17.43,False,718,0.16,0.39,7.58,14.93,-1.208981,15.13
2,2005-01-18,123338,EVENTS:\tResults of Operations and Financial C...,[results of operations and financial condition...,\n<DOCUMENT>\nFILE:FULT/FULT-8K-20050118123338...,FULT,0.0,0.33,0.33,12.33,False,809,0.53,0.61,5.15,13.6,-0.25099,12.47
3,2005-04-13,140932,EVENTS:\tResults of Operations and Financial C...,[results of operations and financial condition...,\n<DOCUMENT>\nFILE:FULT/FULT-8K-20050413140932...,FULT,0.0,0.33,0.33,14.09,False,894,-1.75,-2.19,-1.37,8.56,0.070178,13.31
4,2005-07-19,132220,EVENTS:\tResults of Operations and Financial C...,[results of operations and financial condition...,\n<DOCUMENT>\nFILE:FULT/FULT-8K-20050719132220...,FULT,0.0,0.27,0.27,13.22,False,991,-1.28,2.88,11.54,17.17,0.604141,10.45


# Target Creation

In [9]:
def up_down_stay(price):
    if abs(price) < 1:
        return 'STAY'
    if price < 0:
        return 'DOWN'
    else:
        return 'UP'

In [10]:
merged_data['target'] = merged_data['targe_price_change'].apply(up_down_stay)

In [11]:
merged_data.head()

Unnamed: 0,date,time,event_type,cleaned_event,full_text,symbol,Surprise(%),Reported EPS,Consensus EPS,hr,pre_market,date_idx,price_change_7,price_change_30,price_change_90,price_change_365,targe_price_change,prev_vix_values,target
0,2004-07-20,143800,EVENTS:\t\tFinancial statements and exhibits\n,[financial statements and exhibits],\n<DOCUMENT>\nFILE:FULT/FULT-8K-20040720143800...,FULT,-3.13,0.31,0.32,14.38,False,627,-0.5,2.57,3.01,8.21,0.414034,14.17,STAY
1,2004-10-19,174320,EVENTS:\tResults of Operations and Financial C...,[results of operations and financial condition...,\n<DOCUMENT>\nFILE:FULT/FULT-8K-20041019174320...,FULT,0.0,0.32,0.32,17.43,False,718,0.16,0.39,7.58,14.93,-1.208981,15.13,DOWN
2,2005-01-18,123338,EVENTS:\tResults of Operations and Financial C...,[results of operations and financial condition...,\n<DOCUMENT>\nFILE:FULT/FULT-8K-20050118123338...,FULT,0.0,0.33,0.33,12.33,False,809,0.53,0.61,5.15,13.6,-0.25099,12.47,STAY
3,2005-04-13,140932,EVENTS:\tResults of Operations and Financial C...,[results of operations and financial condition...,\n<DOCUMENT>\nFILE:FULT/FULT-8K-20050413140932...,FULT,0.0,0.33,0.33,14.09,False,894,-1.75,-2.19,-1.37,8.56,0.070178,13.31,STAY
4,2005-07-19,132220,EVENTS:\tResults of Operations and Financial C...,[results of operations and financial condition...,\n<DOCUMENT>\nFILE:FULT/FULT-8K-20050719132220...,FULT,0.0,0.27,0.27,13.22,False,991,-1.28,2.88,11.54,17.17,0.604141,10.45,STAY


# Train, Val, Test Split

In [12]:
X_train, X_test = train_test_split(merged_data, test_size = 0.51, random_state = 42)

In [13]:
X_val, X_test = train_test_split(X_test, test_size = 0.51, random_state = 42)

In [14]:
X_train.shape

(17096, 19)

# Unigram Encoding

In [15]:
def uni_encoding(data, category):
    word_count = {}
    stopwords = nltk.corpus.stopwords.words('english')
    temp = data.loc[data['target'] == category]
    for form in temp['full_text']:
        cleaned_form = re.sub(r'\W',' ', form)
        cleaned_form = re.sub(r'\s+',' ', cleaned_form)
        cleaned_form = re.sub(r'\d','', cleaned_form)
        cleaned_form = cleaned_form.lower()
        tokens = nltk.word_tokenize(cleaned_form)
        for token in tokens:
            if token in stopwords:
                continue
            if token not in word_count.keys():                 
                word_count[token] = 1
            else: 
                word_count[token] += 1
    return word_count

In [18]:
%%time

up_dict = uni_encoding(X_train, 'UP')
down_dict = uni_encoding(X_train, 'DOWN')
stay_dict = uni_encoding(X_train, 'STAY')

CPU times: user 5min 2s, sys: 914 ms, total: 5min 2s
Wall time: 5min 3s


In [31]:
all_word_count = {**up_dict, **stay_dict, **down_dict}
all_word_count = {key:val for key, val in all_word_count.items() if val > 10}

In [32]:
%%time

total_freq = sum(all_word_count.values())
pmi_dict = {}
for token in all_word_count.keys():
    p_x = all_word_count[token] / total_freq
    max_cond = []
    for i in [up_dict, down_dict, stay_dict]:
        if token in i.keys():
            temp_sum = sum(i.values())
            max_cond.append(i[token] / temp_sum)
        else:
            max_cond.append(0)
    pmi_dict[token] = np.log(np.mean(max_cond) / p_x)

CPU times: user 17.1 s, sys: 101 ms, total: 17.2 s
Wall time: 17.2 s


In [33]:
highest_pmi = heapq.nlargest(2319, pmi_dict, key = pmi_dict.get)

In [40]:
%%time

form_vectors = []
for form in tqdm(merged_data['full_text']):
    cleaned_form = re.sub(r'\W',' ', form)
    cleaned_form = re.sub(r'\s+',' ', cleaned_form)
    cleaned_form = re.sub(r'\d','', cleaned_form)
    cleaned_form = cleaned_form.lower()
    tokens = nltk.word_tokenize(cleaned_form)
    temp = []
    for token in highest_pmi:
        if token in cleaned_form:                 
            temp.append(1)
        else: 
            temp.append(0)
    form_vectors.append(temp)

CPU times: user 25min 51s, sys: 2.74 s, total: 25min 54s
Wall time: 25min 54s


In [41]:
merged_data['unigram_vec'] = form_vectors
merged_data.head()

Unnamed: 0,date,time,event_type,cleaned_event,full_text,symbol,Surprise(%),Reported EPS,Consensus EPS,hr,pre_market,date_idx,price_change_7,price_change_30,price_change_90,price_change_365,targe_price_change,prev_vix_values,target,unigram_vec
0,2004-07-20,143800,EVENTS:\t\tFinancial statements and exhibits\n,[financial statements and exhibits],\n<DOCUMENT>\nFILE:FULT/FULT-8K-20040720143800...,FULT,-3.13,0.31,0.32,14.38,False,627,-0.5,2.57,3.01,8.21,0.414034,14.17,STAY,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,2004-10-19,174320,EVENTS:\tResults of Operations and Financial C...,[results of operations and financial condition...,\n<DOCUMENT>\nFILE:FULT/FULT-8K-20041019174320...,FULT,0.0,0.32,0.32,17.43,False,718,0.16,0.39,7.58,14.93,-1.208981,15.13,DOWN,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,2005-01-18,123338,EVENTS:\tResults of Operations and Financial C...,[results of operations and financial condition...,\n<DOCUMENT>\nFILE:FULT/FULT-8K-20050118123338...,FULT,0.0,0.33,0.33,12.33,False,809,0.53,0.61,5.15,13.6,-0.25099,12.47,STAY,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,2005-04-13,140932,EVENTS:\tResults of Operations and Financial C...,[results of operations and financial condition...,\n<DOCUMENT>\nFILE:FULT/FULT-8K-20050413140932...,FULT,0.0,0.33,0.33,14.09,False,894,-1.75,-2.19,-1.37,8.56,0.070178,13.31,STAY,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,2005-07-19,132220,EVENTS:\tResults of Operations and Financial C...,[results of operations and financial condition...,\n<DOCUMENT>\nFILE:FULT/FULT-8K-20050719132220...,FULT,0.0,0.27,0.27,13.22,False,991,-1.28,2.88,11.54,17.17,0.604141,10.45,STAY,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


# Quality Phrase Encoding

In [46]:
quality_phrases = pd.read_csv('../data/AutoPhrase_multi-words.txt', sep = '\t', header = None)
quality_phrases.head(10)

Unnamed: 0,0,1
0,0.996891,personal property
1,0.995891,credit rating
2,0.995655,environmental liabilities
3,0.995593,contractual obligations
4,0.995225,severance benefits
5,0.995058,accounting policies
6,0.995004,annual salary
7,0.994275,withholding tax
8,0.994225,service provider
9,0.994216,debt financing


In [47]:
def clean(text):
    return str(text).lower()

In [48]:
quality_phrases['cleaned'] = quality_phrases[1].apply(clean)
quality_phrases.head()

Unnamed: 0,0,1,cleaned
0,0.996891,personal property,personal property
1,0.995891,credit rating,credit rating
2,0.995655,environmental liabilities,environmental liabilities
3,0.995593,contractual obligations,contractual obligations
4,0.995225,severance benefits,severance benefits


In [49]:
top_phrases = quality_phrases['cleaned'].loc[quality_phrases[0] > 0.9].values

In [50]:
%%time

phrase_vectors = []
for form in tqdm(merged_data['full_text']):
    cleaned_form = form.lower()
    temp = []
    for phrase in top_phrases:
        if phrase in cleaned_form:                 
            temp.append(1)
        else: 
            temp.append(0)
    phrase_vectors.append(temp)

100%|██████████| 34891/34891 [1:59:51<00:00,  4.85it/s]   

CPU times: user 1h 59min 27s, sys: 34.3 s, total: 2h 2s
Wall time: 1h 59min 51s





In [51]:
merged_data['phrase_vec'] = phrase_vectors

In [None]:
merged_data.head()

In [52]:
merged_data.to_pickle('../data/feature_encoded_data.pkl')