In [1]:
# Only need to run once per machine 
# import nltk
# nltk.download('popular', halt_on_error=False)
# nltk.download('all', halt_on_error=False)

In [2]:
import nltk as nltk
import nltk.corpus  
from nltk.text import Text
import pandas as pd
import re
import sys
import os
import pdb

In [3]:
# Dataset directory under the user's home (the leading ~ is expanded here).
directory = os.path.expanduser('~/Datasets/32018/')
news_path = os.path.join(directory, 'news_cat.pkl')

# Load the pickled frame, then keep only the rows whose language is 'english'.
df = pd.read_pickle(news_path)
df = df.loc[df['language'] == 'english']

In [4]:
# Preview the first rows of the English-only frame.
df.head()

Unnamed: 0,crawled,language,text,title
0,2018-01-30T23:03:51.004+02:00,english,by Abhishek K Global Telehandler Market 2023 D...,Global Telehandler Market 2023 Demand by Segme...
1,2018-01-30T23:06:46.024+02:00,english,favorite this post 2014 Caterpillar 314E LCR h...,2014 Caterpillar 314E LCR
2,2018-01-30T23:18:35.023+02:00,english,By: MAX NISEN The Amazon health care threat ha...,"Amazon, Berkshire, JPMorgan health announcemen..."
3,2018-01-30T23:20:54.012+02:00,english,QR Code Link to This Post MONTHLY PUBLIC AUCTI...,2005 Caterpillar CB534D Tandem Vibratory Rolle...
4,2018-01-30T23:28:30.000+02:00,english,QR Code Link to This Post 2007 CATERPILLAR D4G...,2007 CATERPILLAR D4G LGP CAB SCREEN/SWEEPS - O...


In [5]:
def labeler(text):
    """Extract named entities from ``text`` and count mentions per (entity, label).

    Every sentence is word-tokenized, POS-tagged and passed to ``nltk.ne_chunk``
    with ``binary=False``. Multi-token entities are joined with a single space.

    Parameters
    ----------
    text : str
        Raw text to scan. Values that are not strings (for example ``None`` or
        the NaN of a missing cell) and blank strings are treated as containing
        no entities instead of raising inside the tokenizer.

    Returns
    -------
    pandas.DataFrame
        Columns ``entity``, ``label`` and ``n_mentions``, one row per distinct
        (entity, label) pair. An empty frame with the same three columns is
        returned when nothing is found.
    """
    no_entities = pd.DataFrame({'entity': [], 'label': [], 'n_mentions': []})

    # Missing or blank text cannot be tokenized; report "no entities" instead of failing.
    if not isinstance(text, str) or not text.strip():
        return no_entities

    pairs = []
    for sent in nltk.sent_tokenize(text):
        tagged = nltk.pos_tag(nltk.word_tokenize(sent))
        for chunk in nltk.ne_chunk(tagged, binary=False):
            # Only chunks exposing a `label` attribute are kept as entities.
            if hasattr(chunk, 'label'):
                # Add a space between the tokens of multi-token entities.
                pairs.append((' '.join(tok[0] for tok in chunk), chunk.label()))

    if not pairs:
        return no_entities

    entities_df = pd.DataFrame(pairs, columns=['entity', 'label'])
    return entities_df.groupby(['entity']).label.value_counts().reset_index(name='n_mentions')

def label_aggregator(label_df):
    """Collapse per-(entity, label) counts to a single row per entity.

    Parameters
    ----------
    label_df : pandas.DataFrame
        Frame with ``entity``, ``label`` and ``n_mentions`` columns, as
        returned by ``labeler``.

    Returns
    -------
    pandas.DataFrame
        One row per entity with columns ``entity``, ``label`` and
        ``n_mentions``. ``label`` is the label with the highest count for that
        entity and ``n_mentions`` is the sum over all of its labels.
    """
    # A stable sort keeps the input order among equal counts, so the label
    # picked on a tie is reproducible from run to run.
    ranked = label_df.sort_values('n_mentions', ascending=False, kind='mergesort')
    return (
        ranked.groupby('entity')
        .agg({'label': lambda x: x.iloc[0], 'n_mentions': 'sum'})
        .reset_index()
    )

In [6]:
# Positional lookup: index label 0 is only present if row 0 survived the language filter.
labeler(df.title.iloc[0])

Unnamed: 0,entity,label,n_mentions
0,Demand,GPE,1
1,Demand Forecast,ORGANIZATION,1
2,Global,PERSON,1
3,Major Customer,PERSON,1
4,Segment,ORGANIZATION,1
5,Swot Analysis,PERSON,1
6,Telehandler,ORGANIZATION,1


This is a pretty poor list. I think the NER is having trouble with the title-case capitalization of the headline, where almost every word looks like a proper noun. The body of the article should do a better job.

In [7]:
# Positional lookup: index label 0 is only present if row 0 survived the language filter.
labeler(df.text.iloc[0]).head(10)

Unnamed: 0,entity,label,n_mentions
0,Abhishek K Global Telehandler,PERSON,1
1,Application,GPE,1
2,Application2 Market Segment,PERSON,1
3,Ask,PERSON,1
4,Business Analysis Telehandler Market Competition,ORGANIZATION,1
5,CAGR,ORGANIZATION,1
6,Canada,GPE,1
7,Caterpillar,GPE,1
8,Chain,PERSON,1
9,Channel,PERSON,1


Labeling on the text does somewhat better. Running the Stanford NER tagger would probably be best, but I can't get it to work on my machine.

In some cases, the same entity is picked up with two different labels.  
Example:

In [8]:
# Entities of the first article, narrowed to one entity that received more than one label.
# Positional lookup: index label 0 is only present if row 0 survived the language filter.
example = labeler(df.text.iloc[0])
example = example[example.entity == "Telehandler Market"]
example

Unnamed: 0,entity,label,n_mentions
84,Telehandler Market,ORGANIZATION,2
85,Telehandler Market,PERSON,1


In this case I am going to assign all of the instances to the dominant label.

In [9]:
# Collapse the example rows to one row carrying the most-mentioned label and the summed count.
label_aggregator(example)

Unnamed: 0,entity,label,n_mentions
0,Telehandler Market,ORGANIZATION,3


Now to do it on all the articles

In [10]:
# Run the labeler over every article body and every headline. Each per-document
# frame is tagged with n_articles = 1 so that summing it later counts documents.
text_labels_dfs = []
title_labels_dfs = []

for _, article in df.iterrows():
    for column, collected in (('text', text_labels_dfs), ('title', title_labels_dfs)):
        per_document = label_aggregator(labeler(article[column]))
        per_document['n_articles'] = 1
        collected.append(per_document)

df_text = pd.concat(text_labels_dfs)
df_title = pd.concat(title_labels_dfs)

In [11]:
# Sum mentions and document counts per (entity, label), separately for bodies and headlines.
merge_keys = ['entity', 'label']
count_spec = {'n_mentions': 'sum', 'n_articles': 'sum'}

text_counts = (
    df_text.groupby(merge_keys)
    .agg(count_spec)
    .reset_index()
    .rename(columns={'n_mentions': 'article_mentions'})
)
title_counts = (
    df_title.groupby(merge_keys)
    .agg(count_spec)
    .reset_index()
    .rename(columns={'n_mentions': 'title_mentions', 'n_articles': 'n_titles'})
)

# The outer join keeps pairs seen in only one of the two sources; their missing counts become 0.
df_out = pd.merge(text_counts, title_counts, how='outer', on=merge_keys).fillna(0)

In [12]:
# Combined body + headline mentions, ordered with the most-mentioned pairs first.
df_out = (
    df_out
    .assign(total_mentions=df_out.article_mentions + df_out.title_mentions)
    .sort_values('total_mentions', ascending=False)
)

In [13]:
# Show the first ten rows of the aggregated entity table.
df_out.head(10)

Unnamed: 0,entity,label,article_mentions,n_articles,title_mentions,n_titles,total_mentions
356,Caterpillar,GPE,253.0,38.0,14.0,13.0,267.0
358,Caterpillar,PERSON,131.0,11.0,5.0,5.0,136.0
366,Caterpillar Inc.,ORGANIZATION,83.0,23.0,13.0,13.0,96.0
1302,NYSE,ORGANIZATION,59.0,22.0,4.0,4.0,63.0
283,CAT,ORGANIZATION,43.0,25.0,12.0,12.0,55.0
357,Caterpillar,ORGANIZATION,39.0,11.0,0.0,0.0,39.0
351,Cat,ORGANIZATION,36.0,17.0,0.0,0.0,36.0
426,Company,ORGANIZATION,27.0,16.0,0.0,0.0,27.0
72,Amazon,PERSON,24.0,4.0,0.0,0.0,24.0
1598,SEC,ORGANIZATION,23.0,14.0,0.0,0.0,23.0


Okay, so I need to exclude Caterpillar in all its spellings and abbreviations. I'm not too thrilled with the label assignments: companies are being tagged as PERSON or GPE, so I can't use those labels to rule an entity out as a company, and I won't filter on that field for now.

In [14]:
# Spellings and abbreviations of Caterpillar to exclude from the ranking.
caterpillar_names = ['Caterpillar', 'Caterpillar Inc.', 'CAT', 'Cat']
df_out = df_out.loc[~df_out['entity'].isin(caterpillar_names)]
df_out.head(10)

Unnamed: 0,entity,label,article_mentions,n_articles,title_mentions,n_titles,total_mentions
1302,NYSE,ORGANIZATION,59.0,22.0,4.0,4.0,63.0
426,Company,ORGANIZATION,27.0,16.0,0.0,0.0,27.0
72,Amazon,PERSON,24.0,4.0,0.0,0.0,24.0
1598,SEC,ORGANIZATION,23.0,14.0,0.0,0.0,23.0
1867,U.S.,GPE,22.0,15.0,0.0,0.0,22.0
935,JPMorgan,ORGANIZATION,21.0,12.0,1.0,1.0,22.0
1925,Vista,GPE,22.0,2.0,0.0,0.0,22.0
1744,Stock,PERSON,20.0,3.0,1.0,1.0,21.0
1440,Peoria,GPE,21.0,3.0,0.0,0.0,21.0
1840,Transportation,ORGANIZATION,20.0,18.0,0.0,0.0,20.0


Much better. I can probably exclude GPEs now. Also 'Stock' and any SEC-related fragments (e.g. 'Securities', 'Exchange Commission').

In [15]:
# Drop every GPE-labelled row, plus a hand-picked list of entities to exclude.
excluded_entities = [
    'Stock', 'SEC', 'Securities', 'Transportation', 'Exchange Commission',
    'Company', 'Resource Industries', 'Energy', 'Financial Products', 'Construction Industries',
    'Get', 'NOT', 'LLC', 'Partners', 'Thomas', 'Investment', 'News', 'Bank',
]
keep_rows = (df_out['label'] != 'GPE') & ~df_out['entity'].isin(excluded_entities)
df_out = df_out.loc[keep_rows]
df_out.head(20)

Unnamed: 0,entity,label,article_mentions,n_articles,title_mentions,n_titles,total_mentions
1302,NYSE,ORGANIZATION,59.0,22.0,4.0,4.0,63.0
72,Amazon,PERSON,24.0,4.0,0.0,0.0,24.0
935,JPMorgan,ORGANIZATION,21.0,12.0,1.0,1.0,22.0
1917,Vetr,PERSON,17.0,15.0,1.0,1.0,18.0
407,Citigroup,PERSON,13.0,13.0,0.0,0.0,13.0
1438,Pellette,PERSON,13.0,13.0,0.0,0.0,13.0
263,Buffett,PERSON,12.0,2.0,0.0,0.0,12.0
1094,Lincolnian Online,ORGANIZATION,12.0,6.0,0.0,0.0,12.0
189,Beyond Wonderland,PERSON,12.0,4.0,0.0,0.0,12.0
1515,Ratings,ORGANIZATION,11.0,11.0,0.0,0.0,11.0


Still not a great list but pretty solid I think.