In [1]:
import re
import heapq
import nltk
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Events Cleaning

In [2]:
# Load the merged dataset from the CSV in the working directory and preview
# the first rows.
merged_data = pd.read_csv('merged_all_data.csv')
merged_data.head()

Unnamed: 0,date,time,event_type,full_text,symbol,Surprise(%),Reported EPS,Consensus EPS,hr,pre_market,date_idx,price_change_7,price_change_30,price_change_90,price_change_365
0,2005-08-18,92410,EVENTS:\tResults of Operations and Financial C...,\n<DOCUMENT>\nFILE:LANC/LANC-8K-20050818092410...,LANC,5.36,0.59,0.56,9.24,True,1021,-0.98,-1.5,-1.21,9.06
1,2005-10-28,80708,EVENTS:\tResults of Operations and Financial C...,\n<DOCUMENT>\nFILE:LANC/LANC-8K-20051028080708...,LANC,-10.17,0.53,0.59,8.07,True,1092,-2.49,-5.89,-8.48,-5.29
2,2006-01-30,81137,EVENTS:\tResults of Operations and Financial C...,\n<DOCUMENT>\nFILE:LANC/LANC-8K-20060130081137...,LANC,4.69,0.67,0.64,8.11,True,1186,3.92,8.86,2.6,-2.05
3,2006-04-28,81811,EVENTS:\tResults of Operations and Financial C...,\n<DOCUMENT>\nFILE:LANC/LANC-8K-20060428081811...,LANC,-28.0,0.36,0.5,8.18,True,1274,-2.27,-1.97,5.41,2.44
4,2006-10-30,82612,EVENTS:\tResults of Operations and Financial C...,\n<DOCUMENT>\nFILE:LANC/LANC-8K-20061030082612...,LANC,-21.82,0.43,0.55,8.26,True,1459,-9.19,-5.68,11.18,7.63


In [3]:
def event_clean(text):
    """Extract the lower-cased event description from a raw event string.

    Newlines are removed, the text is split on tab characters, and the
    second field (the text after the first tab) is returned in lower case.

    Parameters
    ----------
    text : str
        Raw event string, e.g. ``'EVENTS:\\tResults of Operations'``.

    Returns
    -------
    str
        The lower-cased second tab-separated field, or ``'Missing'`` when
        the input is not a string (e.g. NaN) or contains no tab.
    """
    # Missing cells arrive from pandas as float NaN; treat them as missing
    # instead of letting the string operations raise.
    if not isinstance(text, str):
        return 'Missing'

    fields = text.replace('\n', '').split('\t')
    # Splitting always yields at least one element, so the guard has to
    # require two fields before index 1 is safe to read.
    if len(fields) > 1:
        return fields[1].lower()
    return 'Missing'

In [4]:
# Derive the cleaned event label for every row and place it at column
# position 3.
cleaned_event = merged_data['event_type'].apply(event_clean)
if 'cleaned_event' in merged_data.columns:
    # DataFrame.insert raises ValueError when the column already exists, so
    # on a re-run of this cell overwrite the existing column in place.
    merged_data['cleaned_event'] = cleaned_event
else:
    merged_data.insert(3, 'cleaned_event', cleaned_event)
merged_data.head()

Unnamed: 0,date,time,event_type,cleaned_event,full_text,symbol,Surprise(%),Reported EPS,Consensus EPS,hr,pre_market,date_idx,price_change_7,price_change_30,price_change_90,price_change_365
0,2005-08-18,92410,EVENTS:\tResults of Operations and Financial C...,results of operations and financial condition,\n<DOCUMENT>\nFILE:LANC/LANC-8K-20050818092410...,LANC,5.36,0.59,0.56,9.24,True,1021,-0.98,-1.5,-1.21,9.06
1,2005-10-28,80708,EVENTS:\tResults of Operations and Financial C...,results of operations and financial condition,\n<DOCUMENT>\nFILE:LANC/LANC-8K-20051028080708...,LANC,-10.17,0.53,0.59,8.07,True,1092,-2.49,-5.89,-8.48,-5.29
2,2006-01-30,81137,EVENTS:\tResults of Operations and Financial C...,results of operations and financial condition,\n<DOCUMENT>\nFILE:LANC/LANC-8K-20060130081137...,LANC,4.69,0.67,0.64,8.11,True,1186,3.92,8.86,2.6,-2.05
3,2006-04-28,81811,EVENTS:\tResults of Operations and Financial C...,results of operations and financial condition,\n<DOCUMENT>\nFILE:LANC/LANC-8K-20060428081811...,LANC,-28.0,0.36,0.5,8.18,True,1274,-2.27,-1.97,5.41,2.44
4,2006-10-30,82612,EVENTS:\tResults of Operations and Financial C...,results of operations and financial condition,\n<DOCUMENT>\nFILE:LANC/LANC-8K-20061030082612...,LANC,-21.82,0.43,0.55,8.26,True,1459,-9.19,-5.68,11.18,7.63


In [8]:
# Event category labels, one string per category.
# NOTE(review): this list is not referenced by any other visible cell.
events = [
    "Material definitive agreements",
    "Bankruptcies or receiverships",
    "Director is elected",
    "Director departs",
    "Asset movement: acquisition or sale",
    "Results of operations and financial condition",
    "Material Direct Financial obligations",
    "Triggering events that accelerate material obligations",
    "Exit or disposal plans",
    "Material impairments",
    "Delisting or transfer exchange notices",
    "Unregistered equity sales",
    "Modifications to shareholder rights",
    "Change in accountant",
    "SEC investigations and internal reviews",
    "Financial non-reliance notices",
    "Changes in control of the company",
    "Changes in executive management",
    "Departure or appointment of company officers",
    "Amendments to company Governance Policies",
    "Trading suspension",
    "Change in credit",
    "Change in company status",
    "Other events",
]

# Train, Val, Test Split

In [5]:
# First split: test_size = 0.51 holds out 51% of the rows (bound to X_test
# here and subdivided in the next cell); the remainder becomes X_train.
# random_state is fixed so the shuffle is reproducible.
X_train, X_test = train_test_split(merged_data, test_size = 0.51, random_state = 42)

In [6]:
# Second split: divide the held-out rows into validation and test, again
# with test_size = 0.51 (51% of the held-out rows stay in X_test).
# NOTE(review): X_test is rebound from itself, so re-running only this cell
# splits the already-reduced X_test again; re-run the previous cell first.
X_val, X_test = train_test_split(X_test, test_size = 0.51, random_state = 42)

In [11]:
# Sanity check: (rows, columns) of the training split.
X_train.shape

(17098, 16)

# Unigram Encoding

In [7]:
# Count unigram frequencies over the training split only (X_train), skipping
# English stop words. Text is normalised by replacing non-word characters
# with spaces, collapsing whitespace and lower-casing before tokenising.
word_count = {}
stopwords = nltk.corpus.stopwords.words('english')
# Set copy for O(1) membership tests inside the token loop; `stopwords`
# itself is left as returned by NLTK.
stopword_set = set(stopwords)

for form in X_train['full_text']:
    cleaned_form = re.sub(r'\W', ' ', form)
    cleaned_form = re.sub(r'\s+', ' ', cleaned_form)
    cleaned_form = cleaned_form.lower()
    tokens = nltk.word_tokenize(cleaned_form)
    for token in tokens:
        if token in stopword_set:
            continue
        # dict.get supplies 0 the first time a token is seen.
        word_count[token] = word_count.get(token, 0) + 1

In [8]:
# Vocabulary for the unigram encoding: the 1000 keys of word_count with the
# highest counts, ordered from most to least frequent.
most_freq = heapq.nlargest(1000, word_count, key=word_count.get)

In [9]:
# Encode every document as a binary vector over `most_freq`: position i is 1
# when the i-th vocabulary token occurs in the document, else 0.
form_vectors = []
for form in merged_data['full_text']:
    # Same normalisation as the vocabulary-building cell so tokens line up.
    cleaned_form = re.sub(r'\W', ' ', form)
    cleaned_form = re.sub(r'\s+', ' ', cleaned_form)
    cleaned_form = cleaned_form.lower()
    tokens = nltk.word_tokenize(cleaned_form)
    # Test membership against the document's tokens, not the raw string:
    # `token in cleaned_form` is a substring match, so a short vocabulary
    # entry would also match inside longer words. The set also makes each
    # lookup O(1).
    form_tokens = set(tokens)
    temp = [1 if token in form_tokens else 0 for token in most_freq]
    form_vectors.append(temp)

In [10]:
# Attach the per-document unigram indicator lists as a new column
# (one Python list per row) and preview the result.
merged_data['unigram_vec'] = form_vectors
merged_data.head()

Unnamed: 0,date,time,event_type,cleaned_event,full_text,symbol,Surprise(%),Reported EPS,Consensus EPS,hr,pre_market,date_idx,price_change_7,price_change_30,price_change_90,price_change_365,unigram_vec
0,2005-08-18,92410,EVENTS:\tResults of Operations and Financial C...,results of operations and financial condition,\n<DOCUMENT>\nFILE:LANC/LANC-8K-20050818092410...,LANC,5.36,0.59,0.56,9.24,True,1021,-0.98,-1.5,-1.21,9.06,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ..."
1,2005-10-28,80708,EVENTS:\tResults of Operations and Financial C...,results of operations and financial condition,\n<DOCUMENT>\nFILE:LANC/LANC-8K-20051028080708...,LANC,-10.17,0.53,0.59,8.07,True,1092,-2.49,-5.89,-8.48,-5.29,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ..."
2,2006-01-30,81137,EVENTS:\tResults of Operations and Financial C...,results of operations and financial condition,\n<DOCUMENT>\nFILE:LANC/LANC-8K-20060130081137...,LANC,4.69,0.67,0.64,8.11,True,1186,3.92,8.86,2.6,-2.05,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ..."
3,2006-04-28,81811,EVENTS:\tResults of Operations and Financial C...,results of operations and financial condition,\n<DOCUMENT>\nFILE:LANC/LANC-8K-20060428081811...,LANC,-28.0,0.36,0.5,8.18,True,1274,-2.27,-1.97,5.41,2.44,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ..."
4,2006-10-30,82612,EVENTS:\tResults of Operations and Financial C...,results of operations and financial condition,\n<DOCUMENT>\nFILE:LANC/LANC-8K-20061030082612...,LANC,-21.82,0.43,0.55,8.26,True,1459,-9.19,-5.68,11.18,7.63,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, ..."


In [None]:
# Sanity check: (rows, columns) of the full frame after adding unigram_vec.
merged_data.shape

# Quality Phrase Encoding

In [11]:
# Load the phrase list: a tab-separated file with no header row, so pandas
# labels the columns 0 and 1. In the preview, column 0 holds a numeric score
# and column 1 the phrase text.
quality_phrases = pd.read_csv('AutoPhrase_multi-words.txt', sep = '\t', header = None)
quality_phrases.head(10)

Unnamed: 0,0,1
0,0.996891,personal property
1,0.995891,credit rating
2,0.995655,environmental liabilities
3,0.995593,contractual obligations
4,0.995225,severance benefits
5,0.995058,accounting policies
6,0.995004,annual salary
7,0.994275,withholding tax
8,0.994225,service provider
9,0.994216,debt financing


In [12]:
def clean(text):
    """Return the lower-cased string form of ``text``.

    The value is passed through ``str`` first, so non-string inputs
    (numbers, ``None``, NaN) are converted rather than rejected.
    """
    as_string = str(text)
    return as_string.lower()

In [13]:
# Add a lower-cased copy of the phrase text (column 1) so it can be matched
# against lower-cased document text later.
quality_phrases['cleaned'] = quality_phrases[1].apply(clean)
quality_phrases.head()

Unnamed: 0,0,1,cleaned
0,0.996891,personal property,personal property
1,0.995891,credit rating,credit rating
2,0.995655,environmental liabilities,environmental liabilities
3,0.995593,contractual obligations,contractual obligations
4,0.995225,severance benefits,severance benefits


In [14]:
# Keep only the cleaned phrases whose column-0 score is strictly above 0.9,
# as a NumPy array.
top_phrases = quality_phrases['cleaned'].loc[quality_phrases[0] > 0.9].values

In [15]:
%%time

# Encode every document as a binary vector over `top_phrases`: position i is
# 1 when the i-th phrase appears as a substring of the lower-cased text.
phrase_vectors = []
for document in merged_data['full_text']:
    lowered = document.lower()
    indicators = [int(phrase in lowered) for phrase in top_phrases]
    phrase_vectors.append(indicators)

CPU times: user 1h 36min, sys: 42.6 s, total: 1h 36min 43s
Wall time: 1h 37min 41s


In [16]:
# Attach the per-document phrase indicator lists as a new column.
merged_data['phrase_vec'] = phrase_vectors

In [None]:
# Preview the frame with both encoded columns attached.
merged_data.head()

In [18]:
# Persist the feature-encoded frame. Pickle keeps the list-valued columns
# as Python objects. Only unpickle this file from a trusted source.
merged_data.to_pickle('feature_encoded_data.pkl')