## Parts Of Speech Tagging and Name Entity Recognition

In [125]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy
import re
import pandas as pd
import matplotlib.pyplot as plt

#### Load Data

In [128]:
bbc_data = pd.read_csv('bbc_news.csv')

In [130]:
bbc_data.head()

Unnamed: 0.1,Unnamed: 0,index,title,pubDate,guid,link,description
0,0,6684,Can I refuse to work?,"Wed, 10 Aug 2022 15:46:18 GMT",https://www.bbc.co.uk/news/business-62147992,https://www.bbc.co.uk/news/business-62147992?a...,With much of the UK enduring another period of...
1,1,9267,'Liz Truss the Brief?' World reacts to UK poli...,"Mon, 17 Oct 2022 11:35:12 GMT",https://www.bbc.co.uk/news/world-63285480,https://www.bbc.co.uk/news/world-63285480?at_m...,The UK's political chaos has been watched arou...
2,2,7387,Rationing energy is nothing new for off-grid c...,"Wed, 31 Aug 2022 05:20:18 GMT",https://www.bbc.co.uk/news/uk-scotland-highlan...,https://www.bbc.co.uk/news/uk-scotland-highlan...,Scoraig in the north west Highlands has long h...
3,3,767,The hunt for superyachts of sanctioned Russian...,"Tue, 22 Mar 2022 14:37:01 GMT",https://www.bbc.co.uk/news/60739336,https://www.bbc.co.uk/news/60739336?at_medium=...,"Wealthy Russians sanctioned by the US, EU and ..."
4,4,3712,Platinum Jubilee: 70 years of the Queen in 70 ...,"Wed, 01 Jun 2022 23:17:33 GMT",https://www.bbc.co.uk/news/uk-61660128,https://www.bbc.co.uk/news/uk-61660128?at_medi...,A quick look back at the Queen's 70 years on t...


In [132]:
bbc_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   1000 non-null   int64 
 1   index        1000 non-null   int64 
 2   title        1000 non-null   object
 3   pubDate      1000 non-null   object
 4   guid         1000 non-null   object
 5   link         1000 non-null   object
 6   description  1000 non-null   object
dtypes: int64(2), object(5)
memory usage: 54.8+ KB


In [134]:
bbc_titles = pd.DataFrame(bbc_data['title'])

In [136]:
bbc_titles.head()

Unnamed: 0,title
0,Can I refuse to work?
1,'Liz Truss the Brief?' World reacts to UK poli...
2,Rationing energy is nothing new for off-grid c...
3,The hunt for superyachts of sanctioned Russian...
4,Platinum Jubilee: 70 years of the Queen in 70 ...


#### Clean Data

In [139]:
# lowercase
bbc_titles['lowercase'] = bbc_titles['title'].str.lower()
bbc_titles.head()

Unnamed: 0,title,lowercase
0,Can I refuse to work?,can i refuse to work?
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss the brief?' world reacts to uk poli...
2,Rationing energy is nothing new for off-grid c...,rationing energy is nothing new for off-grid c...
3,The hunt for superyachts of sanctioned Russian...,the hunt for superyachts of sanctioned russian...
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee: 70 years of the queen in 70 ...


In [141]:
# stop words removal
en_stopwords = stopwords.words('english')
bbc_titles['no_stopwords'] = bbc_titles['lowercase'].apply(lambda x: ' '.join([word for word in x.split() if word not in (en_stopwords)]))
bbc_titles.head()

Unnamed: 0,title,lowercase,no_stopwords
0,Can I refuse to work?,can i refuse to work?,refuse work?
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss the brief?' world reacts to uk poli...,'liz truss brief?' world reacts uk political t...
2,Rationing energy is nothing new for off-grid c...,rationing energy is nothing new for off-grid c...,rationing energy nothing new off-grid community
3,The hunt for superyachts of sanctioned Russian...,the hunt for superyachts of sanctioned russian...,hunt superyachts sanctioned russian oligarchs
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee: 70 years of the queen in 70 ...,platinum jubilee: 70 years queen 70 seconds


In [143]:
# punctuation removal
bbc_titles['no_stopwords_no_punct'] = bbc_titles.apply(lambda x: re.sub(r"([^\w\s])","",x['no_stopwords']),axis = 1)
bbc_titles.head()

Unnamed: 0,title,lowercase,no_stopwords,no_stopwords_no_punct
0,Can I refuse to work?,can i refuse to work?,refuse work?,refuse work
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss the brief?' world reacts to uk poli...,'liz truss brief?' world reacts uk political t...,liz truss brief world reacts uk political turmoil
2,Rationing energy is nothing new for off-grid c...,rationing energy is nothing new for off-grid c...,rationing energy nothing new off-grid community,rationing energy nothing new offgrid community
3,The hunt for superyachts of sanctioned Russian...,the hunt for superyachts of sanctioned russian...,hunt superyachts sanctioned russian oligarchs,hunt superyachts sanctioned russian oligarchs
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee: 70 years of the queen in 70 ...,platinum jubilee: 70 years queen 70 seconds,platinum jubilee 70 years queen 70 seconds


In [145]:
# tokenize
bbc_titles['tokens_clean'] = bbc_titles.apply(lambda x: word_tokenize(x['no_stopwords_no_punct']), axis=1)
bbc_titles.head()

Unnamed: 0,title,lowercase,no_stopwords,no_stopwords_no_punct,tokens_clean
0,Can I refuse to work?,can i refuse to work?,refuse work?,refuse work,"[refuse, work]"
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss the brief?' world reacts to uk poli...,'liz truss brief?' world reacts uk political t...,liz truss brief world reacts uk political turmoil,"[liz, truss, brief, world, reacts, uk, politic..."
2,Rationing energy is nothing new for off-grid c...,rationing energy is nothing new for off-grid c...,rationing energy nothing new off-grid community,rationing energy nothing new offgrid community,"[rationing, energy, nothing, new, offgrid, com..."
3,The hunt for superyachts of sanctioned Russian...,the hunt for superyachts of sanctioned russian...,hunt superyachts sanctioned russian oligarchs,hunt superyachts sanctioned russian oligarchs,"[hunt, superyachts, sanctioned, russian, oliga..."
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee: 70 years of the queen in 70 ...,platinum jubilee: 70 years queen 70 seconds,platinum jubilee 70 years queen 70 seconds,"[platinum, jubilee, 70, years, queen, 70, seco..."


In [147]:
# lemmatize
lemmatizer = WordNetLemmatizer()
bbc_titles['tokens_clean_lemmatized'] = bbc_titles['tokens_clean'].apply(lambda tokens: [lemmatizer.lemmatize(token) for token in tokens])
bbc_titles.head()

Unnamed: 0,title,lowercase,no_stopwords,no_stopwords_no_punct,tokens_clean,tokens_clean_lemmatized
0,Can I refuse to work?,can i refuse to work?,refuse work?,refuse work,"[refuse, work]","[refuse, work]"
1,'Liz Truss the Brief?' World reacts to UK poli...,'liz truss the brief?' world reacts to uk poli...,'liz truss brief?' world reacts uk political t...,liz truss brief world reacts uk political turmoil,"[liz, truss, brief, world, reacts, uk, politic...","[liz, truss, brief, world, reacts, uk, politic..."
2,Rationing energy is nothing new for off-grid c...,rationing energy is nothing new for off-grid c...,rationing energy nothing new off-grid community,rationing energy nothing new offgrid community,"[rationing, energy, nothing, new, offgrid, com...","[rationing, energy, nothing, new, offgrid, com..."
3,The hunt for superyachts of sanctioned Russian...,the hunt for superyachts of sanctioned russian...,hunt superyachts sanctioned russian oligarchs,hunt superyachts sanctioned russian oligarchs,"[hunt, superyachts, sanctioned, russian, oliga...","[hunt, superyachts, sanctioned, russian, oliga..."
4,Platinum Jubilee: 70 years of the Queen in 70 ...,platinum jubilee: 70 years of the queen in 70 ...,platinum jubilee: 70 years queen 70 seconds,platinum jubilee 70 years queen 70 seconds,"[platinum, jubilee, 70, years, queen, 70, seco...","[platinum, jubilee, 70, year, queen, 70, second]"


In [188]:
#  lists for just our tokens
tokens_clean_list = sum(bbc_titles['tokens_clean_lemmatized'], [])
tokens_clean_list

['refuse',
 'work',
 'liz',
 'truss',
 'brief',
 'world',
 'reacts',
 'uk',
 'political',
 'turmoil',
 'rationing',
 'energy',
 'nothing',
 'new',
 'offgrid',
 'community',
 'hunt',
 'superyachts',
 'sanctioned',
 'russian',
 'oligarch',
 'platinum',
 'jubilee',
 '70',
 'year',
 'queen',
 '70',
 'second',
 'red',
 'bull',
 'found',
 'guilty',
 'breaking',
 'formula',
 '1',
 'budget',
 'cap',
 'world',
 'triathlon',
 'championship',
 'series',
 'flora',
 'duffy',
 'beat',
 'georgia',
 'taylorbrown',
 'womens',
 'title',
 'terry',
 'hall',
 'coventry',
 'scooter',
 'rideout',
 'pay',
 'tribute',
 'singer',
 'post',
 'office',
 'fujitsu',
 'face',
 'inquiry',
 'horizon',
 'scandal',
 'pavement',
 'parking',
 'frightens',
 'me',
 'uk',
 'interest',
 'rate',
 'rise',
 'affect',
 'high',
 'could',
 'go',
 'stayed',
 'storm',
 'happens',
 'now',
 'six',
 'nation',
 'scotland',
 'best',
 'since',
 '99',
 'beat',
 'best',
 'ireland',
 'ever',
 'long',
 'liz',
 'truss',
 'survive',
 'prime',
 'm

#### POS Tagging

In [152]:
nlp = spacy.load('en_core_web_sm')

In [154]:
# raw text

bbc_titles['tokens_raw'] = bbc_titles.apply(lambda x: word_tokenize(x['title']), axis=1)

In [156]:
# unpack our text into a single list

tokens_raw_list = sum(bbc_titles['tokens_raw'], [])

In [158]:
# create a spacy doc from our raw text - better for pos tagging

spacy_doc = nlp(' '.join(tokens_raw_list))

In [166]:
# extract the tokens and pos tags into a dataframe

pos_df= pd.DataFrame(columns = ['token', 'pos_tag'])

for token in spacy_doc:
    pos_df = pd.concat([pos_df, pd.DataFrame.from_records([{'token': token.text, 'pos_tag':token.pos_}])],ignore_index=True)

pos_df.head()

Unnamed: 0,token,pos_tag
0,Can,AUX
1,I,PRON
2,refuse,VERB
3,to,PART
4,work,VERB


In [168]:
# token frequency count

pos_df_freq = pos_df.groupby(['token','pos_tag']).size().reset_index(name='counts').sort_values(by='counts',ascending=False)
pos_df_freq.head()

Unnamed: 0,token,pos_tag,counts
94,:,PUNCT,543
7,',PUNCT,315
2895,in,ADP,187
4072,to,PART,173
3262,of,ADP,172


In [182]:
# most common nouns
nouns = pos_df_freq[pos_df_freq.pos_tag=='NOUN'][0:10]
nouns

Unnamed: 0,token,pos_tag,counts
4258,war,NOUN,35
3545,record,NOUN,15
4350,year,NOUN,14
3999,tax,NOUN,13
3411,police,NOUN,13
3362,people,NOUN,12
2323,day,NOUN,12
4307,win,NOUN,11
2029,boss,NOUN,11
2565,fans,NOUN,11


In [186]:
# most common adjectives
adj = pos_df_freq[pos_df_freq.pos_tag=='ADJ'][0:10]
adj

Unnamed: 0,token,pos_tag,counts
3239,new,ADJ,28
1399,Russian,ADJ,22
2602,final,ADJ,16
18,-,ADJ,14
2621,first,ADJ,13
3195,more,ADJ,10
2832,high,ADJ,10
2998,last,ADJ,9
1992,big,ADJ,9
3298,other,ADJ,8


#### NER

In [201]:
# extract the tokens and entity tags into a dataframe

ner_df = pd.DataFrame(columns=['token','ner_tag'])

for token in spacy_doc.ents:
    if pd.isna(token.label_) is False:
        ner_df = pd.concat([ner_df, pd.DataFrame.from_records([{'token':token.text, 'ner_tag':token.label_}])], ignore_index=True)

ner_df.head()
    

Unnamed: 0,token,ner_tag
0,Liz Truss,PERSON
1,UK,GPE
2,superyachts,CARDINAL
3,Russian,NORP
4,Platinum Jubilee,PERSON


In [203]:
# token frequency count
ner_df_counts = ner_df.groupby(['token','ner_tag']).size().reset_index(name='counts').sort_values(by='counts', ascending=False)
ner_df_counts.head(10)

Unnamed: 0,token,ner_tag,counts
984,Ukraine,GPE,44
978,UK,GPE,37
320,England,GPE,34
979,US,GPE,22
837,Russian,NORP,21
1056,World Cup 2022,EVENT,18
1088,first,ORDINAL,15
780,Queen,PERSON,11
1054,World Cup,EVENT,11
219,China,GPE,11


In [205]:
# most common people
people = ner_df_counts[ner_df_counts.ner_tag == "PERSON"][0:10]
people

Unnamed: 0,token,ner_tag,counts
780,Queen,PERSON,11
245,Covid,PERSON,9
776,Putin,PERSON,8
163,Boris Johnson,PERSON,6
564,Liz Truss,PERSON,6
317,Emma Raducanu,PERSON,4
808,Rishi Sunak,PERSON,4
511,Jurgen Klopp,PERSON,4
827,Rory McIlroy,PERSON,3
110,Andy Murray,PERSON,3


In [207]:
# most common places
places = ner_df_counts[ner_df_counts.ner_tag == "GPE"][0:10]
places

Unnamed: 0,token,ner_tag,counts
984,Ukraine,GPE,44
978,UK,GPE,37
320,England,GPE,34
979,US,GPE,22
219,China,GPE,11
368,France,GPE,11
836,Russia,GPE,10
451,India,GPE,8
125,Australia,GPE,7
567,London,GPE,7
