<a href="https://colab.research.google.com/github/rypoko/NLP/blob/main/NLP_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import spacy
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import re
from contractions import CONTRACTION_MAP
import unicodedata

%matplotlib inline

In [23]:
# Category landing pages to scrape; the last path segment doubles as the category label.
seed_urls = [
    'https://inshorts.com/en/read/business',
    'https://inshorts.com/en/read/technology',
    'https://inshorts.com/en/read/world',
]

In [24]:
def build_dataset(seed_urls):
    """Scrape headline/article pairs from each seed URL into one DataFrame.

    Parameters
    ----------
    seed_urls : iterable of str
        Category pages to fetch. The last non-empty path segment of each URL
        is used as the ``news_category`` label.

    Returns
    -------
    pandas.DataFrame
        Columns ``news_headline``, ``news_article`` and ``news_category`` (in
        that order). The frame is empty, but still has these columns, when
        nothing was scraped.

    Raises
    ------
    requests.RequestException
        If a page cannot be fetched (connection error, timeout, or an HTTP
        error status).
    """
    columns = ['news_headline', 'news_article', 'news_category']
    news_data = []
    for url in seed_urls:
        # Tolerate a trailing slash so the category is never the empty string.
        news_category = url.rstrip('/').split('/')[-1]

        # A timeout keeps the notebook from hanging on a stalled connection;
        # raise_for_status stops an HTTP error page from being parsed as news.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        headline_cards = soup.find_all('div',
                                       class_=['news-card-title news-right-box'])
        article_cards = soup.find_all('div',
                                      class_=['news-card-content news-right-box'])

        # Headline and body cards are paired positionally.
        for headline, article in zip(headline_cards, article_cards):
            news_data.append({
                'news_headline': headline.find('span',
                                               attrs={'itemprop': 'headline'}).string,
                'news_article': article.find('div',
                                             attrs={'itemprop': 'articleBody'}).string,
                'news_category': news_category,
            })

    # Passing `columns` fixes the column order and keeps the schema even when
    # no article was found (selecting columns afterwards would raise KeyError).
    return pd.DataFrame(news_data, columns=columns)

In [25]:
# Scrape every seed category into a single DataFrame and preview the first rows.
news_df = build_dataset(seed_urls)
news_df.head(10)

Unnamed: 0,news_headline,news_article,news_category
0,India-born buyer of ₹504 cr art couldn't affor...,Indian-origin Vignesh Sundaresan and Anand Ven...,business
1,Boats honk in celebration after cargo ship stu...,A video has surfaced on social media showing t...,business
2,World Bank clears ₹9.6k cr loan to Pak days af...,The World Bank has signed agreements with Paki...,business
3,What could soon be in short supply due to the ...,The Suez Canal jam could soon cause shortages ...,business
4,Egypt Prez orders to prepare for 'third scenar...,Egyptian President Abdel Fattah Al-Sisi has or...,business
5,Latest satellite pics show excavation of sand ...,The latest satellite pictures by Maxar Technol...,business
6,Adani to buy Warora-Kurnool Transmission Limit...,Adani Transmission said it has entered into a ...,business
7,"Disinvestment target of FY22 achievable, LIC I...",Chief Economic Adviser (CEA) KV Subramanian on...,business
8,NPCI issues guidelines for capping market shar...,The NPCI has issued guidelines for capping UPI...,business
9,US cryptocurrency exchange Coinbase to set up ...,US cryptocurrency exchange Coinbase said the c...,business


In [26]:
# Number of scraped articles per category.
news_df.news_category.value_counts()

technology    25
world         24
business      24
Name: news_category, dtype: int64

In [31]:
# Fetch the NLTK stopword corpus (no-op when it is already present).
nltk.download('stopwords')

# Load the small English spaCy pipeline with its default components.
# The legacy parse/tag/entity keyword flags are not part of the current
# spacy.load signature, so they are no longer passed.
nlp = spacy.load('en_core_web_sm')

# Tokenizer and stopword list shared by remove_stopwords().
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')

# Keep the negations: dropping them from the text would lose polarity.
stopword_list.remove('no')
stopword_list.remove('not')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [32]:
#strip HTML
def strip_html_tags(text):
    """Return ``text`` with its HTML markup removed, keeping only the text content."""
    return BeautifulSoup(text, "html.parser").get_text()

# Quick check: the tags are dropped and only the enclosed text is returned.
strip_html_tags('<html><h2>Some important text</h2></html>')

'Some important text'

In [33]:
#strip accents
def remove_accented_chars(text):
    """Return ``text`` reduced to ASCII.

    The string is NFKD-normalised and then encoded to ASCII with errors
    ignored, so every character that has no ASCII form after normalisation
    is dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

# Quick check: accented letters come back as their plain ASCII base letters.
remove_accented_chars('Sómě Áccěntěd těxt')

'Some Accented text'

In [35]:
def expand_contractions(text, contraction_mapping=None):
    """Replace contractions in ``text`` with their expanded forms.

    Parameters
    ----------
    text : str
        Input text.
    contraction_mapping : dict, optional
        Maps a contraction (e.g. ``"can't"``) to its expansion
        (e.g. ``"cannot"``). Defaults to the module-level ``CONTRACTION_MAP``.

    Returns
    -------
    str
        ``text`` with every known contraction expanded (matching is
        case-insensitive and the first character of the matched contraction
        is preserved) and all remaining apostrophes removed.
    """
    # Resolve the default lazily instead of binding it at definition time.
    if contraction_mapping is None:
        contraction_mapping = CONTRACTION_MAP

    # Nothing to expand: an empty alternation would match the empty string
    # everywhere, so only the apostrophe clean-up applies.
    if not contraction_mapping:
        return re.sub("'", "", text)

    # Longest keys first so that e.g. "can't've" wins over its prefix "can't";
    # keys are escaped so they are always matched literally.
    ordered_keys = sorted(contraction_mapping, key=len, reverse=True)
    contractions_pattern = re.compile(
        '({})'.format('|'.join(re.escape(key) for key in ordered_keys)),
        flags=re.IGNORECASE | re.DOTALL)

    # Case-insensitive fallback for keys that are not stored in lower case.
    lowered_mapping = {key.lower(): value
                       for key, value in contraction_mapping.items()}

    def expand_match(contraction):
        match = contraction.group(0)
        expanded_contraction = (contraction_mapping.get(match)
                                or contraction_mapping.get(match.lower())
                                or lowered_mapping.get(match.lower()))
        if not expanded_contraction:
            # No usable expansion: leave the matched text untouched.
            return match
        # Keep the original first character so sentence casing is preserved.
        return match[0] + expanded_contraction[1:]

    expanded_text = contractions_pattern.sub(expand_match, text)
    # Drop whatever apostrophes are left (possessives, unknown contractions).
    return re.sub("'", "", expanded_text)

# Quick check: contractions are expanded and the leading capital is preserved.
expand_contractions("Y'all can't expand contractions I'd think")

'You all cannot expand contractions I would think'

In [36]:
def remove_special_characters(text, remove_digits=False):
    """Strip every character that is not an ASCII letter, digit or whitespace.

    Parameters
    ----------
    text : str
        Input text.
    remove_digits : bool, default False
        When True, digits are removed as well.

    Returns
    -------
    str
        ``text`` with the unwanted characters deleted (not replaced by spaces).
    """
    # The upper-case range must be A-Z: the range A-z also spans the ASCII
    # characters between 'Z' and 'a' ([ \ ] ^ _ `), which would then be kept.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

# Quick check: punctuation, symbols and (with remove_digits=True) digits are deleted.
remove_special_characters("Well this was fun! What do you think? 123#@!", 
                          remove_digits=True)

'Well this was fun What do you think '

In [37]:
def simple_stemmer(text):
    """Stem every whitespace-separated word of ``text`` with NLTK's Porter stemmer.

    Returns the stems joined back together with single spaces.
    """
    stemmer = nltk.porter.PorterStemmer()
    stems = []
    for word in text.split():
        stems.append(stemmer.stem(word))
    return ' '.join(stems)

# Quick check: inflected forms collapse to a common stem, which need not be a real word.
simple_stemmer("My system keeps crashing his crashed yesterday, ours crashes daily")

'My system keep crash hi crash yesterday, our crash daili'

In [38]:
def lemmatize_text(text):
    """Replace each token of ``text`` with its lemma from the ``nlp`` pipeline.

    Tokens whose lemma is the ``'-PRON-'`` placeholder keep their original
    surface text. The result is the tokens joined with single spaces.
    """
    lemmas = []
    for token in nlp(text):
        if token.lemma_ == '-PRON-':
            lemmas.append(token.text)
        else:
            lemmas.append(token.lemma_)
    return ' '.join(lemmas)

# Quick check: verbs are reduced to their base form; punctuation becomes separate tokens.
lemmatize_text("My system keeps crashing! his crashed yesterday, ours crashes daily")

'My system keep crash ! his crash yesterday , ours crash daily'

In [39]:
def remove_stopwords(text, is_lower_case=False):
    """Drop every token of ``text`` that appears in ``stopword_list``.

    Parameters
    ----------
    text : str
        Input text; it is split with the module-level ``tokenizer``.
    is_lower_case : bool, default False
        Set to True when ``text`` is already lower-cased, so tokens are
        compared as-is. Otherwise each token is lower-cased for the
        comparison only.

    Returns
    -------
    str
        The surviving tokens joined with single spaces.
    """
    stripped_tokens = (token.strip() for token in tokenizer.tokenize(text))
    if is_lower_case:
        kept = [token for token in stripped_tokens
                if token not in stopword_list]
    else:
        kept = [token for token in stripped_tokens
                if token.lower() not in stopword_list]
    return ' '.join(kept)

# Quick check: stopwords are dropped, while punctuation tokens and 'not' are kept.
remove_stopwords("The, and, if are stopwords, computer is not")

', , stopwords , computer not'

In [40]:
def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=True,
                     text_lemmatization=True, special_char_removal=True,
                     stopword_removal=True, remove_digits=True):
    """Run the text-normalisation pipeline over every document in ``corpus``.

    Steps are applied in this order, each one guarded by its flag: strip HTML,
    remove accents, expand contractions, lower-case, collapse newlines
    (always), lemmatize, remove special characters, collapse repeated spaces
    (always), remove stopwords.

    Parameters
    ----------
    corpus : iterable of str
        Documents to normalise.
    html_stripping, contraction_expansion, accented_char_removal,
    text_lower_case, text_lemmatization, special_char_removal,
    stopword_removal : bool
        Enable or disable the corresponding step.
    remove_digits : bool
        Forwarded to ``remove_special_characters``; only relevant when
        ``special_char_removal`` is True.

    Returns
    -------
    list of str
        One normalised string per input document, in input order.
    """
    # Compile once instead of once per document.
    # Only CR/LF belong in the class: a '|' inside [...] is a literal pipe
    # and would be stripped from the text.
    newline_pattern = re.compile(r'[\r\n]+')
    # The hyphen is escaped so it is a literal '-' rather than forming the
    # range '(' to ')', which would leave hyphens un-isolated and glue
    # hyphenated words together once special characters are removed.
    special_char_pattern = re.compile(r'([{.()\-!}])')
    multi_space_pattern = re.compile(' +')

    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # strip HTML
        if html_stripping:
            doc = strip_html_tags(doc)
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # expand contractions
        if contraction_expansion:
            doc = expand_contractions(doc)
        # lowercase the text
        if text_lower_case:
            doc = doc.lower()
        # replace runs of newlines with a single space
        doc = newline_pattern.sub(' ', doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters and/or digits
        if special_char_removal:
            # pad special characters with spaces first so the words around
            # them stay separate after the characters are deleted
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)
        # collapse repeated spaces
        doc = multi_space_pattern.sub(' ', doc)
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)

        normalized_corpus.append(doc)

    return normalized_corpus

In [41]:
# combining headline and article text into one document per row
# (the headline is coerced with str; the article column is concatenated as-is)
news_df['full_text'] = news_df["news_headline"].map(str)+ '. ' + news_df["news_article"]

# pre-process text with every normalize_corpus step enabled and store the result
news_df['clean_text'] = normalize_corpus(news_df['full_text'])
# plain-list copy of the cleaned documents
norm_corpus = list(news_df['clean_text'])

# show a sample news article (row 1) as raw vs. cleaned text
news_df.iloc[1][['full_text', 'clean_text']].to_dict()

{'clean_text': 'boat honk celebration cargo ship stick suez canal move slightly video surface social medium show tugboat suez canal honking celebration able slightly move massive container ship stick suez canal tonne sand dredge tugboat pull push ship try dislodge ever give ship weigh ton',
 'full_text': 'Boats honk in celebration after cargo ship stuck in Suez Canal moves slightly. A video has surfaced on social media showing tugboats in the Suez Canal honking in celebration after they were able to slightly move the massive container ship stuck in Suez Canal. About 20,000 tonnes of sand was dredged and 14 tugboats pulled and pushed the ship to try to dislodge it. The Ever Given ship weighs 2,20,000 tons.'}

In [43]:
# Persist the scraped and cleaned articles to news.csv (UTF-8, index column omitted).
news_df.to_csv('news.csv', index=False, encoding='utf-8')

In [46]:
# create a basic pre-processed corpus, don't lowercase to get POS context
# (lemmatization and special-character removal are switched off as well)
# NOTE(review): `corpus` is not read again in the cells visible here — confirm it is used later
corpus = normalize_corpus(news_df['full_text'], text_lower_case=False, 
                          text_lemmatization=False, special_char_removal=False)

# demo for POS tagging for sample news headline (row 1, raw headline text)
sentence = str(news_df.iloc[1].news_headline)
sentence_nlp = nlp(sentence)

# POS tagging with Spacy: one (token, fine-grained tag, coarse tag) triple per token
spacy_pos_tagged = [(word, word.tag_, word.pos_) for word in sentence_nlp]
pd.DataFrame(spacy_pos_tagged, columns=['Word', 'POS tag', 'Tag type'])

# POS tagging with nltk (alternative, currently disabled)
#nltk_pos_tagged = nltk.pos_tag(sentence.split())
#pd.DataFrame(nltk_pos_tagged, columns=['Word', 'POS tag'])

Unnamed: 0,Word,POS tag,Tag type
0,Boats,NNS,NOUN
1,honk,VBP,VERB
2,in,IN,ADP
3,celebration,NN,NOUN
4,after,IN,ADP
5,cargo,NN,NOUN
6,ship,NN,NOUN
7,stuck,VBD,VERB
8,in,IN,ADP
9,Suez,NNP,PROPN


In [49]:
from nltk.corpus import conll2000

# Chunk-annotated sentences from the CoNLL-2000 corpus.
data = conll2000.chunked_sents()
# The first 10,900 sentences are used for training; the remainder is held out for evaluation.
train_data = data[:10900]
test_data = data[10900:] 

# Split sizes, followed by one training sentence as an example of the chunk-tree format.
print(len(train_data), len(test_data))
print(train_data[1]) 

10900 48
(S
  Chancellor/NNP
  (PP of/IN)
  (NP the/DT Exchequer/NNP)
  (NP Nigel/NNP Lawson/NNP)
  (NP 's/POS restated/VBN commitment/NN)
  (PP to/TO)
  (NP a/DT firm/NN monetary/JJ policy/NN)
  (VP has/VBZ helped/VBN to/TO prevent/VB)
  (NP a/DT freefall/NN)
  (PP in/IN)
  (NP sterling/NN)
  (PP over/IN)
  (NP the/DT past/JJ week/NN)
  ./.)


In [55]:
from nltk import conlltags2tree, tree2conlltags
def conll_tag_chunks(chunk_sents):
    """Convert chunk trees into per-sentence lists of ``(pos_tag, chunk_tag)`` pairs.

    Each tree is flattened with ``tree2conlltags`` into
    ``(word, pos_tag, chunk_tag)`` triples and the word is then discarded.
    """
    pos_chunk_sents = []
    for tree in chunk_sents:
        triples = tree2conlltags(tree)
        pos_chunk_sents.append([(pos_tag, chunk_tag)
                                for (_word, pos_tag, chunk_tag) in triples])
    return pos_chunk_sents


def combined_tagger(train_data, taggers, backoff=None):
    """Build a backoff chain of taggers trained on ``train_data``.

    Each class in ``taggers`` is called in order as
    ``tagger(train_data, backoff=<previously built tagger>)``; the first one
    receives the ``backoff`` argument. The last tagger built is returned, or
    ``backoff`` itself when ``taggers`` is empty.
    """
    chain = backoff
    for tagger_cls in taggers:
        chain = tagger_cls(train_data, backoff=chain)
    return chain

In [56]:
from nltk.tag import UnigramTagger, BigramTagger
from nltk.chunk import ChunkParserI

# define the chunker class
class NGramTagChunker(ChunkParserI):
    """Chunker that predicts a chunk tag for each POS tag with n-gram taggers."""

    def __init__(self, train_sentences, tagger_classes=[UnigramTagger, BigramTagger]):
        """Train the tagger chain on ``(pos_tag, chunk_tag)`` pairs from chunk trees.

        ``tagger_classes`` are chained by ``combined_tagger``, each one using
        the previously built tagger as its backoff.
        """
        pos_chunk_pairs = conll_tag_chunks(train_sentences)
        self.chunk_tagger = combined_tagger(pos_chunk_pairs, tagger_classes)

    def parse(self, tagged_sentence):
        """Chunk a POS-tagged sentence given as ``(word, pos_tag)`` pairs.

        Returns the chunk tree built by ``conlltags2tree``, or ``None`` for
        an empty sentence.
        """
        if not tagged_sentence:
            return None

        # The tagger only sees the POS sequence and labels each tag with a chunk tag.
        pos_sequence = [pos for _word, pos in tagged_sentence]
        tagged_pos = self.chunk_tagger.tag(pos_sequence)

        # Re-attach the words to obtain (word, pos_tag, chunk_tag) triples.
        word_pos_chunk = [(word, pos, chunk)
                          for (word, pos), (_pos, chunk)
                          in zip(tagged_sentence, tagged_pos)]
        return conlltags2tree(word_pos_chunk)
  
# train chunker model on the training split (default unigram -> bigram tagger chain)
ntc = NGramTagChunker(train_data)

# evaluate chunker model performance on the held-out sentences
# NOTE(review): newer NLTK releases appear to prefer accuracy() over evaluate() — confirm for the installed version
print(ntc.evaluate(test_data))

ChunkParse score:
    IOB Accuracy:  90.0%%
    Precision:     82.1%%
    Recall:        86.3%%
    F-Measure:     84.1%%


In [57]:
# POS-tag the sample headline with NLTK: the chunker expects (word, tag) pairs.
# Defining it here keeps the cell runnable on a fresh kernel (the name was
# otherwise never assigned). nltk.pos_tag needs NLTK's POS tagger resource
# to be downloaded beforehand.
nltk_pos_tagged = nltk.pos_tag(sentence.split())

# Chunk the tagged headline and print the resulting shallow parse tree.
chunk_tree = ntc.parse(nltk_pos_tagged)
print(chunk_tree)

(S
  (NP Boats/NNS)
  (VP honk/VBP)
  (PP in/IN)
  (NP celebration/NN)
  (PP after/IN)
  (NP cargo/NN ship/NN)
  (VP stuck/VBN)
  (PP in/IN)
  (NP Suez/NNP Canal/NNP)
  (VP moves/VBZ slightly/RB))


In [58]:
from IPython.display import display

## download and install ghostscript from https://www.ghostscript.com/download/gsdnld.html

# On Windows the Ghostscript bin directory often has to be added to PATH manually.
# Only do this on Windows, use the platform's PATH separator, and skip the
# append when the entry is already present so re-running the cell does not
# keep growing PATH (and non-Windows PATHs are never corrupted).
gs_bin_dir = "C:\\Program Files\\gs\\gs9.09\\bin\\"
current_path = os.environ.get('PATH', '')
if os.name == 'nt' and gs_bin_dir not in current_path.split(os.pathsep):
    os.environ['PATH'] = current_path + os.pathsep + gs_bin_dir

# NOTE(review): the recorded output shows a TclError with a fallback to the
# plain repr — graphical tree rendering presumably needs a display; confirm.
display(chunk_tree)

TclError: ignored

Tree('S', [Tree('NP', [('Boats', 'NNS')]), Tree('VP', [('honk', 'VBP')]), Tree('PP', [('in', 'IN')]), Tree('NP', [('celebration', 'NN')]), Tree('PP', [('after', 'IN')]), Tree('NP', [('cargo', 'NN'), ('ship', 'NN')]), Tree('VP', [('stuck', 'VBN')]), Tree('PP', [('in', 'IN')]), Tree('NP', [('Suez', 'NNP'), ('Canal', 'NNP')]), Tree('VP', [('moves', 'VBZ'), ('slightly', 'RB')])])