# Named Entity Extraction

In [None]:
import pandas as pd 
import re
import json
from nltk.tag import pos_tag
from nltk.chunk import conlltags2tree, tree2conlltags
import nltk
import math
stopwords = nltk.corpus.stopwords.words('english')

In [12]:
sample = pd.read_csv("sample.csv", index_col=[0])

In [3]:
## Named entities are NP that are an OPTIONAL DT followed by any num of JJ and then a NN
## https://www.nltk.org/book/ch07.html
## Regex is 
pattern = 'NP: {<DT>?<JJ>*<NN>}'

In [4]:
## this is what is taught in class
for line in sample[:1].iterrows():
    ## converts the content into lower and split thereby tokenising
    content = re.sub("[^0-9a-zA-Z\&]+", " ",line[1]['content']).split(" ")
    content = [ _ for _ in content if _ not in ["", "xli", "x", "xle", "xar", "xly"] ]

    ## tags the content by parts of speech
    content_pos = pos_tag(content)

    ## get named entities using our pattern
    cp = nltk.RegexpParser(pattern)
    chunk_sentence = cp.parse(content_pos)
    iob_tagged = tree2conlltags(chunk_sentence)

    for _ in iob_tagged:
        if _[1] == "NNP":
            print (_[0])

## results arent that good

Thursday
January
Thomas
Hughes
Industrial
Industrial
Sector
S&P
Industrial
Sector
ETF
XLI
Yes
Energy
Sector
EPS
Energy
Sector
XLE
EPS
Industrial
Sector
S&P
Energy
Sector
Industrial
Energy
Don
Bet
Boeing
Boeing
BA
Industrial
Sector
SPDR
Honeywell
HON
Max
Boeing
CEO
Dennis
Muilenberg
Muilenberg
Boeing
Max
NativeDisplayAdID
Buffett
Which
Wall
Street
Legend
Buffett
Buffett
America
XLI
Caterpillar
CAT
Deere
Company
DE
Cummins
Inc
CMI
Phase
One
Deal
Cummins
Inc
Move
Cummins
Inc
Cummins
August
Phase
One
Deal
Cummins
Cummins
XLI
Deere
Company
China
Key
Growth
Deere
Company
Phase
One
Deal
Phase
One
Deal
China
China
Deere
Company
EPS
EPS
Cummins
Deere
Company
Caterpillar
A
Dividend
Aristocrat
Capital
Gains
Shares
Caterpillar
Caterpillar
CEO
Flag
Pattern
Caterpillar
Dividend
Aristocrat
Stocks
Will
Benefit
Federal
Reserve
Federal
Reserve
Federal
Reserve
China
Fed
July
Fed
October
January
February
February
Fed
U
S
Covid
View
Federal
Reserve
Complete


In [5]:
## use nltk named entity function
for line in sample[:1].iterrows():
    ## converts the content into lower and split thereby tokenising
    content = re.sub("[^0-9a-zA-Z\&]+", " ",line[1]['content']).split(" ")
    content = [ _ for _ in content if _ not in ["", "xli", "x", "xle", "xar", "xly"] ]

    ## create named entity tree
    ne_tree = nltk.ne_chunk(pos_tag(content), binary=True)
    for entity in ne_tree:
        if len(entity) == 1:
            ## use entity[0] to get the token
            print(entity[0][0])

## slightly better but still pretty bad

Industrial
Industrial
ETF
Energy
EPS
Energy
XLE
EPS
Industrial
Energy
Industrial
Energy
Industrial
Honeywell
XLI
Buffett
Which
Buffett
Buffett
America
XLI
Cummins
Cummins
Cummins
XLI
China
China
Company
EPS
Company
CEO
China
Fed
Fed


In [6]:
## use spaCy
import spacy 
from spacy import displacy 
## python -m spacy download en_core_web_sm 
## theres md and lg 
## en_core_web_sm is a CNN for token vectors, POS tags
## pretrained statistical models for English
import en_core_web_sm
nlp = en_core_web_sm.load()

In [7]:
for line in sample[:1].iterrows(): 
    content = re.sub("[^a-zA-Z\&]+", " ",line[1]['content'])
    content = nlp(content)
    print ([(_.text, _.label) for _ in content.ents])

[('Thursday January', 391), ('Thomas Hughes', 380), ('the Energy Sector', 383), ('EPS', 383), ('XLE', 383), ('EPS', 383), ('this year', 391), ('The Industrial Sector', 388), ('next year', 391), ('S&P', 383), ('only this year', 391), ('two year', 391), ('Energy', 383), ('Don t', 380), ('Boeing Boeing BA', 383), ('the Industrial Sector SPDR', 385), ('Honeywell', 380), ('HON', 383), ('XLI', 383), ('today', 391), ('Max', 380), ('Boeing', 383), ('Dennis Muilenberg Muilenberg', 380), ('Boeing', 383), ('Max', 380), ('rel nofollow', 380), ('href https www', 380), ('aspx NativeDisplayAdID &ImpressionID &UserID &Placement', 383), ('this Wall Street Legend s', 387), ('Buffett', 380), ('Buffett', 380), ('America', 384), ('XLI', 383), ('three', 397), ('today', 391), ('Caterpillar CAT Deere & Company DE', 383), ('Cummins Inc CMI', 383), ('One', 397), ('Cummins', 383), ('last August', 391), ('One', 397), ('Cummins', 383), ('Cummins', 383), ('today', 391), ('XLI', 383), ('Deere & Company China', 383),

# Naive Bayes Classification (manual)

### P(news article|Topic1) == P(word1|Topic1) * P(word2|Topic1) * ... 

### P(news article|Topic2) == P(word1|Topic2) * P(word2|Topic2) * ... 

. 
.
.

### Find the probability of a news article to each topic to find which topic it is most likely

In [46]:
def createWordToTopic(df):
    word_to_topic = {}
    num_of_words = {}

    for row in df.iterrows():
        topic = row[1]["topic_area"]
        content = re.sub("[^a-zA-Z\&]+", " ", row[1]['content']).lower().split(" ")
        content = [ _ for _ in content if _ not in stopwords ]

        if word_to_topic.get(topic, None) == None:
            word_to_topic[topic] = {}

        if num_of_words.get(topic, None) == None:
            num_of_words[topic] = 0

        for word in content:
            if word_to_topic[topic].get(word, None) == None:
                word_to_topic[topic][word] = 0
            word_to_topic[topic][word] += 1
            num_of_words[topic] += 1
    
    # for topic, value in word_to_topic.items():
    #     for word, num_occurence in value.items():
    #         value[word] = num_occurence / num_of_words[topic] * 100
    
    return (word_to_topic)


In [50]:
trainTest = pd.read_csv("sample.csv", index_col=[0])

In [51]:
trainTest.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29047 entries, 0 to 29046
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         29047 non-null  object
 1   url           29047 non-null  object
 2   crawled_time  29047 non-null  object
 3   date          29047 non-null  object
 4   domain        29047 non-null  object
 5   author        19635 non-null  object
 6   content       29047 non-null  object
 7   topic_area    29047 non-null  object
dtypes: object(8)
memory usage: 2.0+ MB


In [52]:
train = trainTest[: int(len(trainTest) * 0.9) ]
test = trainTest[ int(len(trainTest) * 0.9): ]

## call the function to generate it again or used the pre generated JSON file
# word_to_topic = createWordToTopic(train)
# with open("word_to_topic_raw_count.json", "w+") as fp:
#     json.dump(word_to_topic, fp, indent=2)
word_to_topic = json.load(open("word_to_topic_raw_count.json"))


In [15]:
print (word_to_topic.keys())

dict_keys(['business', 'general', 'science', 'finance', 'tech', 'healthcare', 'automotive', 'environment', 'ai'])


In [55]:
vocab = set()
for key, val in word_to_topic.items():
    for word in val:
        vocab.add(word)

print (len(vocab))

123189


In [57]:
num_correct = 0
manual_result = []
for row in test[:2].iterrows():
    actual_topic = row[1]["topic_area"]
    
    tokens = re.sub("[^a-zA-Z\&]+", " ",row[1]["content"]).lower().split(" ")
    tokens = [ x for x in tokens if x != "" and x not in stopwords ]

    highest_score = 0
    guess_topic = ""
    for topic, array in word_to_topic.items():
        score = 0
        for word in tokens:
            score += math.log( (array.get(word, 0) + 1) / len(train[train["topic_area"].str.contains(topic)]) + len(vocab) )
        
        if score > highest_score:
            highest_score = score
            guess_topic = topic
    
    if topic == actual_topic:
        num_correct += 1
    manual_result.append( {"actual": actual_topic, "guess": guess_topic} )

In [58]:
for _ in manual_result:
    for k, v in _.items():
        print (f"{k}: {v}")
    print ()

actual: science
guess: environment

actual: science
guess: environment



# Naive Bayes Classifier (library)

https://towardsdatascience.com/naive-bayes-document-classification-in-python-e33ff50f937e

In [81]:
trainTest = pd.read_csv("train_test.csv")

## convert label to a number
topic_areas = list(trainTest["topic_area"].unique())
topic_to_number = { topic_areas[idx]:idx for idx in range(len(topic_areas)) }

trainTest["label"] = trainTest["topic_area"].apply(lambda x: topic_to_number[x])

In [82]:
trainTest["label"].value_counts()
## this might be bad

0    184128
1     51634
3     19115
4      3855
2      1768
5       359
7       335
6       222
8        15
Name: label, dtype: int64

In [83]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(trainTest["content"], trainTest["label"], random_state=69)

In [84]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(strip_accents="ascii", token_pattern=u"(?ui)\\b\\w*[a-z]+\\w*\\b", lowercase=True, stop_words="english")

X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

In [85]:
from sklearn.naive_bayes import MultinomialNB

naive_bayes = MultinomialNB()
naive_bayes.fit(X_train_cv, y_train)
predictions = naive_bayes.predict(X_test_cv)

In [86]:
from sklearn.metrics import accuracy_score
print('Accuracy score: ', accuracy_score(y_test, predictions))


Accuracy score:  0.591955078184767
