## Imports

In [1]:
import pandas as pd
import numpy as np
import spacy
from collections import Counter
from spacy import displacy
import re

In [2]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.util import bigrams, trigrams, ngrams
from joblib import dump

In [3]:
df = pd.read_csv("Data/FinancialNewsData.csv", encoding='Windows-1252', names=['Label', 'Headline'])

# Extracting Features from Parts of Speech, Bigrams, and Trigrams

Bag of words suffers from the curse of dimensionality quite a bit. Many words only appear in one or two headlines. So here we will extract features based on the part of speech that the word is tagged with. This is a very simple task with spacy and the parts of speech that we are interested in are nouns, verbs, adjectives, adverbs

In [4]:
nlp = spacy.load("en_core_web_md")

In [5]:
df.Headline = df.Headline.apply(nlp)

In [6]:
noun_counter = Counter()
verb_counter = Counter()
adj_counter = Counter()
adverb_counter = Counter()

bigram_counter = Counter()
trigram_counter = Counter()

In [7]:
def increment_counter(doc):
    tokens = [token for token in doc if not token.is_punct]
    
    building = [i for i in tokens if (i.lower_ == 'building' and i.pos == 'VERB')]
    
        
        
    bi_grams = list(bigrams(tokens))
    tri_grams = list(trigrams(tokens))
    
    for token in tokens:
        if token.pos_ == 'NOUN':
            noun_counter[token.lower_] += 1
        elif token.pos_ == 'VERB':
            verb_counter[token.lower_] += 1
            
        elif token.pos_ == 'ADJ':
            adj_counter[token.lower_] += 1
        
        elif token.pos_ == 'ADV':
            adverb_counter[token.lower_] += 1
    
    for bi in bi_grams:
        bis = list([bi[0].text, bi[1].text])
        # Ensure both words in the bigram are not stop words i.e (is, the) or (is, a) will not be included
        if ('`' not in bis) and ("'s" not in bis) and not (bi[0].is_stop and bi[1].is_stop):
            bigram_counter[bi[0].lower_, bi[1].lower_] += 1
    
    for tri in tri_grams:
        tris = list([tri[0].text, tri[1].text, tri[2].text])
        # Ensure all three words in trigram are not all stop words i.e (will, be, a) and (and, is, a) will not be included
        if ('`' not in tris) and ("'s" not in tris) and not (tri[0].is_stop and tri[1].is_stop and tri[2].is_stop) :
            trigram_counter[(tri[0].lower_, tri[1].lower_, tri[2].lower_)] += 1
   

In [8]:
_ = df.Headline.apply(lambda x: increment_counter(x))

In [9]:
def create_df(verb=200, adj=200, adv=200, noun=200, bigram=100, trigram=100):
    
    column_names = ["Headline"]
    
    # loop through the X most common verbs and add them to column names
    for i in verb_counter.most_common(verb):
        column_names.append(i[0] + "_VERB")
    
    for i in adj_counter.most_common(adj):
        column_names.append(i[0] + "_ADJ")
    
    for i in adverb_counter.most_common(adv):
        column_names.append(i[0] + "_ADV")
        
    for i in noun_counter.most_common(noun):
        column_names.append(i[0] + "_NOUN")
        
    #same for bigrams
    for i in bigram_counter.most_common(bigram):
        column_names.append(str(i[0]))
    #same for trigrams
    for i in trigram_counter.most_common(trigram):
        column_names.append(str(i[0]))
    
    #return empty dataframe with column names
    return pd.DataFrame(columns=column_names)

In [10]:
feature_df = create_df()

In [11]:
feature_df.head()

Unnamed: 0,Headline,said_VERB,m_VERB,operating_VERB,be_VERB,compared_VERB,based_VERB,according_VERB,has_VERB,increased_VERB,...,"('stock', 'exchange', 'release')","('a', 'net', 'loss')","('a', 'net', 'profit')","('of', 'the', 'board')","('net', 'sales', 'increased')","('in', 'the', 'fourth')","('fourth', 'quarter', 'of')","('period', 'in', '2007')","('month', 'period', 'increased')","('to', 'the', 'corresponding')"


Remove duplicate columns and instantiate everything to 0 so it can be incremented

In [12]:
feature_df['Headline'] = df.Headline
feature_df.iloc[:, 1:] = 0

In [13]:
feature_df.filter(regex='building').columns

Index(['building_VERB', 'building_NOUN'], dtype='object')

In [14]:
verb_counter.most_common(200)

[('said', 544),
 ('m', 252),
 ('operating', 220),
 ('be', 189),
 ('compared', 149),
 ('based', 127),
 ('according', 123),
 ('has', 110),
 ('increased', 109),
 ('rose', 102),
 ('expected', 83),
 ('decreased', 79),
 ('signed', 75),
 ('have', 72),
 ('announced', 70),
 ('is', 69),
 ('including', 69),
 ('increase', 57),
 ('includes', 56),
 ('fell', 55),
 ('reported', 53),
 ('made', 53),
 ('totalled', 51),
 ('include', 50),
 ('expects', 49),
 ('estimated', 49),
 ('totaled', 48),
 ('was', 47),
 ('says', 45),
 ('start', 42),
 ('developed', 41),
 ('sell', 41),
 ('completed', 41),
 ('leading', 39),
 ('agreed', 38),
 ('had', 38),
 ('added', 38),
 ('amounted', 37),
 ('acquired', 37),
 ('continue', 37),
 ('provide', 36),
 ('make', 36),
 ('awarded', 35),
 ('grew', 34),
 ('used', 34),
 ('won', 34),
 ('following', 33),
 ('related', 33),
 ('headquartered', 33),
 ('use', 33),
 ('been', 32),
 ('provides', 32),
 ('are', 32),
 ('owned', 31),
 ('delivered', 31),
 ('built', 31),
 ('set', 31),
 ('started', 31

As you can see the same word could be tagged as a noun in one sentence but tagged as a verb in another. This happened on 30 occasions, so our final feature df has 970 features

In [15]:
def increment_df(index, doc):
    
    tokens = [token for token in doc if not token.is_punct]
    
    
        
    
    bi_grams = list(bigrams(tokens))
    tri_grams = list(trigrams(tokens))
    
    for token in tokens:
        
        if token.pos_ == "NOUN":
            if token.lower_ + "_NOUN" in list(feature_df.columns):
                feature_df.loc[index, token.lower_ + "_NOUN"] += 1
        
        elif token.pos_ == "VERB":
            if token.lower_ + "_VERB" in list(feature_df.columns):
                feature_df.loc[index, token.lower_ + "_VERB"] += 1
        
        elif token.pos_ == "ADJ":    
            if token.lower_ + "_ADJ" in list(feature_df.columns):
                feature_df.loc[index, token.lower_ + "_ADJ"] += 1
        
        elif token.pos_ == "ADV":
            if token.lower_ + "_ADV" in list(feature_df.columns):
                feature_df.loc[index, token.lower_ + "_ADV"] += 1
    
    for bi in bi_grams:
        bis = str((bi[0].lower_, bi[1].lower_))
        # Ensure both words in the bigram are not stop words i.e (is, the) or (is, a) will not be included
        if bis in feature_df.columns:
            feature_df.loc[index, str((bi[0].lower_, bi[1].lower_))] += 1
    
    for tri in tri_grams:
        tris = str((tri[0].lower_, tri[1].lower_, tri[2].lower_))
        # Ensure all three words in trigram are not all stop words i.e (will, be, a) and (and, is, a) will not be included
        if tris in feature_df.columns :
            feature_df.loc[index, tris] += 1
    

In [16]:
# loop through each headline and increment the corresponding column
_ = feature_df['Headline'].apply(lambda x: increment_df(feature_df[feature_df.Headline == x].index[0], x))

In [17]:
feature_df.iloc[:, 1:].sum().sort_values(ascending=True).head(5)

80mn_ADV              1
aggressively_ADV      1
geographically_ADV    1
markedly_ADV          1
broad_ADV             1
dtype: object

In [18]:
feature_df.drop("Headline", axis=1, inplace=True)

All columns have atleast 1 occurance of the word. We do still have a very sparse dataframe; but not as sparce as the bag of words approach.

# Save Features to Disk

In [19]:
dump(feature_df, "Objects/pos.joblib")

['Objects/pos.joblib']