## Loading required libraries

In [None]:
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt


import nltk
from nltk.tag import StanfordNERTagger
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 

import spacy 
from spacy import displacy

from sklearn.datasets import fetch_20newsgroups


# Fetch the NLTK resources used later in the notebook: the English stop-word
# list, WordNet (needed by WordNetLemmatizer) and the Punkt models (needed by
# word_tokenize).
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Required Functions

In [None]:
def decontracted(phrase):
    '''
      Function used to expand (decontract) the contractions in the phrase.
      E-mail-like tokens (anything matching "x@y") are blanked out first.
      Input: phrase (str)
      Output: decontracted phrase with runs of spaces squeezed to one space
    '''
    # Blank out e-mail-like tokens.
    phrase = re.sub(r"\S+@\S+", " ", phrase)

    # Irregular forms must be handled before the generic "n't" rule. The first
    # letter is captured so that "Won't" becomes "Will not" instead of the
    # meaningless "Wo not" the generic rule would produce.
    phrase = re.sub(r"([Ww])on't", r"\1ill not", phrase)
    phrase = re.sub(r"([Cc])an't", r"\1an not", phrase)

    # Generic suffix rules.
    phrase = re.sub(r"n't", " not", phrase)
    phrase = re.sub(r"'re", " are", phrase)
    phrase = re.sub(r"'s", " is", phrase)
    phrase = re.sub(r"'d", " would", phrase)
    phrase = re.sub(r"'ll", " will", phrase)
    phrase = re.sub(r"'t", " not", phrase)
    phrase = re.sub(r"'ve", " have", phrase)
    phrase = re.sub(r"'m", " am", phrase)

    # Squeeze any run of spaces (not just pairs) down to a single space.
    phrase = re.sub(r" {2,}", " ", phrase)

    return phrase
  

In [None]:
# Build the stop-word list: NLTK's English stop words plus two extras, and for
# every entry its Capitalised and UPPER-CASE variants, because the cleaning
# step does not lower-case the text before filtering.
base_stop_words = stopwords.words('english') + ['we', 'THE']

case_variants = []
for word in base_stop_words:
  case_variants.append(word[0].upper() + word[1:])
  case_variants.append(word.upper())

# dict.fromkeys drops duplicates while keeping first-seen order; the result is
# still a plain list.
stop_words = list(dict.fromkeys(base_stop_words + case_variants))

i
me
my
myself
we
our
ours
ourselves
you
you're
you've
you'll
you'd
your
yours
yourself
yourselves
he
him
his
himself
she
she's
her
hers
herself
it
it's
its
itself
they
them
their
theirs
themselves
what
which
who
whom
this
that
that'll
these
those
am
is
are
was
were
be
been
being
have
has
had
having
do
does
did
doing
a
an
the
and
but
if
or
because
as
until
while
of
at
by
for
with
about
against
between
into
through
during
before
after
above
below
to
from
up
down
in
out
on
off
over
under
again
further
then
once
here
there
when
where
why
how
all
any
both
each
few
more
most
other
some
such
no
nor
not
only
own
same
so
than
too
very
s
t
can
will
just
don
don't
should
should've
now
d
ll
m
o
re
ve
y
ain
aren
aren't
couldn
couldn't
didn
didn't
doesn
doesn't
hadn
hadn't
hasn
hasn't
haven
haven't
isn
isn't
ma
mightn
mightn't
mustn
mustn't
needn
needn't
shan
shan't
shouldn
shouldn't
wasn
wasn't
weren
weren't
won
won't
wouldn
wouldn't
we
THE


In [None]:
def pipeline(data):
  """Clean an iterable of raw text documents.

  For every document: expand contractions with `decontracted`, replace every
  run of non-alphabetic characters with a space, tokenize into words, drop
  tokens found in the module-level `stop_words`, lemmatize the remaining
  tokens with WordNet, and join them back with single spaces.

  Input: data - iterable of strings
  Output: list of cleaned strings, one per input document, in input order
  """
  # Built once instead of once per document; the set gives constant-time
  # stop-word lookups.
  lemmatizer = WordNetLemmatizer()
  stop_word_set = set(stop_words)

  processed_data = []
  for sentence in tqdm(data):
    sentence = decontracted(sentence)  # expand contractions
    sentence = re.sub('[^A-Za-z]+', ' ', sentence)  # retain only alphabets

    # Word tokenization
    word_tokens = word_tokenize(sentence)

    # Stop word removal followed by lemmatization
    kept_tokens = [lemmatizer.lemmatize(w) for w in word_tokens
                   if w not in stop_word_set]

    processed_data.append(' '.join(kept_tokens))
  return processed_data

In [None]:
def frequency(lst):
  """Count how often each tag occurs and rank the tags by count.

  Counting rules:
    * single-character tags are ignored;
    * a tag already in the table has its count incremented;
    * a new tag without an underscore is added with count 1;
    * a new tag containing an underscore is split on '_'. Every part other
      than "New" increments the count of that part, but only if the part is
      already in the table. For each "New" part the full tag is recorded with
      count 1, so that e.g. "New_York" and "New_Zealand" stay distinct.
      Underscored tags without a "New" part are never stored themselves.

  Input: lst - list of tag strings
  Output: list of (tag, count) tuples sorted by decreasing count
  """
  counts = {}
  for tag in tqdm(lst):
    # Skip single-character tags
    if len(tag) == 1:
      continue
    # Tag seen before: just bump its count
    if tag in counts:
      counts[tag] += 1
      continue
    # New single-word tag
    if "_" not in tag:
      counts[tag] = 1
      continue
    # New multi-word tag: credit the already-known component words
    for part in tag.split('_'):
      if part == "New":
        counts[tag] = 1
      elif part in counts:
        counts[part] += 1
  # Sort in the decreasing order of the counts
  return sorted(counts.items(), key=lambda pair: pair[1], reverse=True)

## Downloading the 20 Newsgroups dataset with scikit-learn

In [None]:
# Load every split (subset='all') in a fixed, unshuffled order, with message
# headers and quoted replies stripped from each post.
twenty_data = fetch_20newsgroups(subset='all', shuffle=False, remove=('headers', 'quotes'))

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [None]:
# Sanity check: number of target labels in the loaded dataset.
len(twenty_data.target)

18846

In [None]:
#twenty_data.target_names

## Cleaning the data

In [None]:
# Run the cleaning pipeline over every raw post; `data` holds one cleaned
# string per post.
data = pipeline(twenty_data.data)

100%|██████████| 18846/18846 [00:42<00:00, 444.69it/s]


In [None]:
# Peek at the first cleaned document.
data[0]

'morgan guzman era run higher last year cub idiot pitch harkey much hibbard castillo good think stud pitcher season far Morgan Guzman helped lead Cubs top ERA even better rotation Atlanta Cubs ERA Braves know early season Cubs fan learned enjoy short triumph still'

## Named Entities using Stanford NER

### Downloading and loading the Stanford NER package

In [None]:
!gdown https://drive.google.com/uc?id=1N0xYCtTK12H83-4avQugpM_JuoJXbsT8
!unzip './stanford-ner-4.2.0.zip'
!rm './stanford-ner-4.2.0.zip'

Downloading...
From: https://drive.google.com/uc?id=1N0xYCtTK12H83-4avQugpM_JuoJXbsT8
To: /content/stanford-ner-4.2.0.zip
180MB [00:03, 49.7MB/s]
Archive:  ./stanford-ner-4.2.0.zip
   creating: stanford-ner-2020-11-17/
   creating: stanford-ner-2020-11-17/lib/
  inflating: stanford-ner-2020-11-17/lib/jollyday-0.4.9.jar  
  inflating: stanford-ner-2020-11-17/lib/stanford-ner-resources.jar  
  inflating: stanford-ner-2020-11-17/lib/joda-time.jar  
  inflating: stanford-ner-2020-11-17/stanford-ner-4.2.0.jar  
  inflating: stanford-ner-2020-11-17/NERDemo.java  
  inflating: stanford-ner-2020-11-17/LICENSE.txt  
  inflating: stanford-ner-2020-11-17/sample-conll-file.txt  
  inflating: stanford-ner-2020-11-17/stanford-ner-4.2.0-javadoc.jar  
  inflating: stanford-ner-2020-11-17/stanford-ner-4.2.0-sources.jar  
  inflating: stanford-ner-2020-11-17/stanford-ner.jar  
  inflating: stanford-ner-2020-11-17/sample.txt  
  inflating: stanford-ner-2020-11-17/build.xml  
  inflating: stanford-ner-202

In [None]:
# Paths into the unpacked Stanford NER distribution: the classifier model file
# and the tagger jar.
model = './stanford-ner-2020-11-17/classifiers/english.all.3class.distsim.crf.ser.gz'
jar = './stanford-ner-2020-11-17/stanford-ner.jar'

# NOTE(review): NLTK prints a deprecation warning for this wrapper and points
# to the CoreNLP-based taggers instead.
tagger = StanfordNERTagger(model, jar,encoding='utf-8')

The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.tag.corenlp.CoreNLPPOSTagger[0m or [91mnltk.tag.corenlp.CoreNLPNERTagger[0m instead.
  super(StanfordNERTagger, self).__init__(*args, **kwargs)


### Entities and Labels list creation

In [None]:
# Split the cleaned documents into 10 contiguous chunks and merge each chunk
# into one string, so the tagger is called once per chunk rather than once per
# document.
#
# Documents are joined with a single space: concatenating them directly would
# fuse the last word of one document with the first word of the next and
# create tokens that never occurred in the data. Chunk boundaries use integer
# arithmetic so that float rounding can never drop or shift a document.
n_chunks = 10
lst_list_data = []
for i in range(n_chunks):
  start = len(data) * i // n_chunks
  stop = len(data) * (i + 1) // n_chunks
  lst_list_data.append(' '.join(data[start:stop]))

100%|██████████| 1884/1884 [00:00<00:00, 201331.72it/s]
100%|██████████| 1885/1885 [00:00<00:00, 393738.20it/s]
100%|██████████| 1884/1884 [00:00<00:00, 381724.01it/s]
100%|██████████| 1885/1885 [00:00<00:00, 297216.76it/s]
100%|██████████| 1885/1885 [00:00<00:00, 392877.31it/s]
100%|██████████| 1884/1884 [00:00<00:00, 392571.35it/s]
100%|██████████| 1885/1885 [00:00<00:00, 400074.03it/s]
100%|██████████| 1884/1884 [00:00<00:00, 408206.88it/s]
100%|██████████| 1885/1885 [00:00<00:00, 444946.99it/s]
100%|██████████| 1885/1885 [00:00<00:00, 505838.97it/s]


In [None]:
# Tokenize and tag each merged chunk, gathering all tagger output in one list.
NER = []
for chunk_text in tqdm(lst_list_data):
  NER.extend(tagger.tag(nltk.word_tokenize(chunk_text)))

100%|██████████| 10/10 [03:27<00:00, 20.74s/it]


Remove the tokens tagged 'O', i.e. the background tag given to words that do not fit any of the named-entity categories.

In [None]:
# Keep only the tagged items whose label (second element) is not 'O'.
final_NER = [tagged_item for tagged_item in NER if tagged_item[1] != 'O']

### Finding the top 100 LOC and PERSON entities from the data set

In [None]:
#Finding Location and person Entity
loc = []
person = []
for i in final_NER:
  if(i[1]== 'LOCATION'):
    loc.append(i[0])
  if(i[1]== 'PERSON'):
    person.append(i[0])


In [None]:
# The words having spaces will be joined using underscore
for i in range(len(loc)):
  if (" " in loc[i]):
    loc[i] = loc[i].replace(" ", "_")
for i in range(len(person)):
  if (" " in person[i]):
    person[i] = person[i].replace(" ", "_")

In [None]:
# Use the frequency function defined in the code to get the tags its corresponding count in the dataset
lst_loc_stan = frequency(loc)  
lst_person_stan = frequency(person)

100%|██████████| 22137/22137 [00:00<00:00, 778623.60it/s]
100%|██████████| 75187/75187 [00:00<00:00, 1114245.61it/s]


In [None]:
# Take the 100 highest-ranked (entity, count) pairs of each kind and split
# them into parallel entity / frequency lists.
top_loc_s = [lst_loc_stan[rank] for rank in range(100)]
top_person_s = [lst_person_stan[rank] for rank in range(100)]

lst_loc_entity_s = [entity for entity, count in top_loc_s]
lst_loc_freq_s = [count for entity, count in top_loc_s]
lst_person_entity_s = [entity for entity, count in top_person_s]
lst_person_freq_s = [count for entity, count in top_person_s]

In [None]:
# Side-by-side table of the top locations and top persons. Row k pairs the
# k-th ranked location with the k-th ranked person; the two halves of a row
# are otherwise unrelated.
loc_person_NER_stan = {
    'Location': lst_loc_entity_s,
    'Location Frequency': lst_loc_freq_s,
    'Person': lst_person_entity_s,
    'Person Frequency': lst_person_freq_s,
}
df_stan = pd.DataFrame(data=loc_person_NER_stan)
df_stan

Unnamed: 0,Location,Location Frequency,Person,Person Frequency
0,Israel,848,Jesus,1190
1,US,811,David,889
2,Armenia,409,John,851
3,New,406,Paul,598
4,United,380,Clinton,457
...,...,...,...,...
95,Norway,41,Dick,84
96,London,40,Carl,83
97,Houston,40,Sam,83
98,Pluto,40,Tommy,82


In [None]:
# Save the top 100 location and person entities found by Stanford NER. The row
# index is written as well, as the first (unnamed) column of the CSV.
df_stan.to_csv('Top_100_Stan.csv') #saving the top 100 location and person entities 

## Named Entities using Spacy NER

In [None]:
# Load spaCy's "en_core_web_sm" pipeline; its named-entity output is used below.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Collect every entity spaCy finds, as parallel lists of entity text and label.
#
# nlp.pipe processes the documents as a batched stream instead of one call per
# text. The entity text is stored rather than the Span object, because a Span
# keeps its whole parent Doc alive and would hold every processed document in
# memory.
ent_spacy = []
lab_spacy = []
for doc in tqdm(nlp.pipe(data), total=len(data)):
  for ent in doc.ents:
    ent_spacy.append(ent.text)
    lab_spacy.append(ent.label_)

100%|██████████| 18846/18846 [21:49<00:00, 14.39it/s]


### Finding the top 100 LOC and PERSON entities from the data set

In [None]:
# Sort the spaCy entities into locations (labels LOC and GPE) and persons.
Loc = []
Person = []
for idx in tqdm(range(len(ent_spacy))):
  label = lab_spacy[idx]
  if label in ('LOC', 'GPE'):
    Loc.append(str(ent_spacy[idx]))
  if label == 'PERSON':
    Person.append(str(ent_spacy[idx]))


100%|██████████| 202183/202183 [00:00<00:00, 367428.37it/s]


In [None]:
# The words having spaces will be joined using underscore
for i in range(len(Loc)):
  if (" " in Loc[i]):
    Loc[i] = Loc[i].replace(" ", "_")
for i in range(len(Person)):
  if (" " in Person[i]):
    Person[i] = Person[i].replace(" ", "_")

In [None]:
# Use the frequency function defined in the code to get the tags its corresponding count in the dataset
lst_loc = frequency(Loc)  
lst_person = frequency(Person)

100%|██████████| 18876/18876 [00:00<00:00, 713946.62it/s]
100%|██████████| 53738/53738 [00:00<00:00, 535634.78it/s]


In [None]:
# Take the 100 highest-ranked (entity, count) pairs of each kind and split
# them into parallel entity / frequency lists.
top_loc = [lst_loc[rank] for rank in range(100)]
top_person = [lst_person[rank] for rank in range(100)]

lst_loc_entity = [entity for entity, count in top_loc]
lst_loc_freq = [count for entity, count in top_loc]
lst_person_entity = [entity for entity, count in top_person]
lst_person_freq = [count for entity, count in top_person]

In [None]:
# Side-by-side table of the top locations and top persons. Row k pairs the
# k-th ranked location with the k-th ranked person; the two halves of a row
# are otherwise unrelated.
loc_person_NER = {
    'Location': lst_loc_entity,
    'Location Frequency': lst_loc_freq,
    'Person': lst_person_entity,
    'Person Frequency': lst_person_freq,
}
df = pd.DataFrame(data=loc_person_NER)
df

Unnamed: 0,Location,Location Frequency,Person,Person Frequency
0,Israel,809,Jesus,1347
1,US,804,John,1058
2,Earth,338,David,996
3,Turkey,324,Paul,653
4,Canada,262,Mike,586
...,...,...,...,...
95,England,37,Sam,86
96,Austria,37,Roy,84
97,IL,36,Adams,84
98,Cambridge,36,Dale,84


In [None]:
# Save the top 100 location and person entities found by spaCy. The row index
# is written as well, as the first (unnamed) column of the CSV.
df.to_csv('Top_100_Spacy.csv') #saving the top 100 location and person entities 

## Degree of correlation between Stanford NER and spaCy NER on the generated LOCATION and PERSON entities

In [None]:
def _matched_frequencies(spacy_entities, spacy_freqs, stan_entities, stan_freqs):
  """Find the spaCy entities that also appear in the Stanford list.

  Returns three parallel lists: the matched entities (in spaCy rank order),
  their Stanford frequencies and their spaCy frequencies. The Stanford
  frequency is looked up by entity name; the two rankings are ordered
  differently, so reading it at the spaCy rank position would pair an entity
  with the count of whichever unrelated entity holds that rank.
  """
  stan_freq_by_entity = dict(zip(stan_entities, stan_freqs))
  matched, matched_stan, matched_spacy = [], [], []
  for entity, spacy_freq in zip(spacy_entities, spacy_freqs):
    if entity in stan_freq_by_entity:
      matched.append(entity)
      matched_stan.append(stan_freq_by_entity[entity])
      matched_spacy.append(spacy_freq)
  return matched, matched_stan, matched_spacy


m_words_loc, m_word_freq_stan_loc, m_word_freq_spacy_loc = _matched_frequencies(
    lst_loc_entity, lst_loc_freq, lst_loc_entity_s, lst_loc_freq_s)
m_words_person, m_word_freq_stan_person, m_word_freq_spacy_person = _matched_frequencies(
    lst_person_entity, lst_person_freq, lst_person_entity_s, lst_person_freq_s)

# Number of top spaCy entities that are also among the top Stanford entities.
count_loc = len(m_words_loc)
count_person = len(m_words_person)

In [None]:
# Table of the location entities found by both taggers, with the frequency
# recorded for each tagger.
loc_match_NER = {
    'Location Entity': m_words_loc,
    'Frequency_Stanford_NER': m_word_freq_stan_loc,
    'Frequency_Spacy_NER': m_word_freq_spacy_loc,
}
df_m_loc = pd.DataFrame(data=loc_match_NER)
df_m_loc

Unnamed: 0,Location Entity,Frequency_Stanford_NER,Frequency_Spacy_NER
0,Israel,848,809
1,US,811,804
2,Earth,409,338
3,Turkey,406,324
4,Canada,380,262
...,...,...,...
66,Italy,44,39
67,Cyprus,43,38
68,England,41,37
69,Cambridge,40,36


In [None]:
# Table of the person entities found by both taggers, with the frequency
# recorded for each tagger.
person_match_NER = {
    'PERSON Entity': m_words_person,
    'Frequency_Stanford_NER': m_word_freq_stan_person,
    'Frequency_Spacy_NER': m_word_freq_spacy_person,
}
df_m_person = pd.DataFrame(data=person_match_NER)
df_m_person

Unnamed: 0,PERSON Entity,Frequency_Stanford_NER,Frequency_Spacy_NER
0,Jesus,1190,1347
1,John,889,1058
2,David,851,996
3,Paul,598,653
4,Mike,457,586
...,...,...,...
70,Larson,86,87
71,Bruce,85,87
72,Clayton,84,86
73,Sam,84,86


In [None]:
# Persist the entities that both taggers ranked in their top 100, one file per
# entity type.
df_m_loc.to_csv('Top_100_matched_LOC.csv') #saving the matched LOCATION entity
df_m_person.to_csv('Top_100_matched_PERSON.csv') #saving the matched PERSON entity 

In [None]:
# Reported value is count_loc / 100: the share of spaCy's top-100 location
# entities that also appear in Stanford's top 100 (an overlap ratio rather
# than a statistical correlation coefficient).
print("Degree of correlation between LOC tags from stanford NER and Spacy NER is {}".format(count_loc/100))

Degree of correlation between LOC tags from stanford NER and Spacy NER is 0.71


In [None]:
# Reported value is count_person / 100: the share of spaCy's top-100 person
# entities that also appear in Stanford's top 100 (an overlap ratio rather
# than a statistical correlation coefficient).
print("Degree of correlation between PERSON tags from stanford NER and Spacy NER is {}".format(count_person/100))

Degree of correlation between PERSON tags from stanford NER and Spacy NER is 0.75
