In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline

In [2]:
# Inshorts category listing pages to scrape. build_dataset() uses the last
# path segment of each URL as the category label.
seed_urls = [
    'https://inshorts.com/en/read/technology',
    'https://inshorts.com/en/read/sports',
    'https://inshorts.com/en/read/world',
]

def build_dataset(seed_urls, timeout=30):
    """Scrape news cards from the given listing pages into a DataFrame.

    For every URL the page is downloaded, parsed with BeautifulSoup, and the
    headline / article-body elements are paired up positionally.

    Parameters
    ----------
    seed_urls : iterable of str
        Listing-page URLs. The last non-empty path segment of each URL is
        stored as that page's ``news_category``.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        Columns ``news_headline``, ``news_article``, ``news_category`` (in
        that order). The frame is empty, but still has these columns, when
        no cards were found.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts, or a non-2xx HTTP response.
    """
    columns = ['news_headline', 'news_article', 'news_category']
    news_data = []
    for url in seed_urls:
        # Tolerate a trailing slash so the category is never an empty string.
        news_category = url.rstrip('/').split('/')[-1]
        response = requests.get(url, timeout=timeout)
        # Fail loudly instead of silently parsing an error page into zero rows.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        headline_cards = soup.find_all('div',
                                       class_=["news-card-title news-right-box"])
        article_cards = soup.find_all('div',
                                      class_=["news-card-content news-right-box"])
        # NOTE(review): headlines and bodies are matched purely by position;
        # this assumes both lists come back in the same order and length.
        for headline, article in zip(headline_cards, article_cards):
            news_data.append({
                'news_headline': headline.find('span',
                                               attrs={"itemprop": "headline"}).string,
                'news_article': article.find('div',
                                             attrs={"itemprop": "articleBody"}).string,
                'news_category': news_category,
            })

    # Passing `columns` fixes the column order and keeps the schema intact
    # even when nothing was scraped (selecting the columns afterwards raises
    # KeyError on an empty frame).
    return pd.DataFrame(news_data, columns=columns)

In [3]:
# Scrape every seed page into one DataFrame, then preview its first ten rows.
news_df = build_dataset(seed_urls)
news_df.head(10)

Unnamed: 0,news_headline,news_article,news_category
0,Netflix unveils posters of Sacred Games 2 shot...,Netflix collaborated OnePlus to unveil the fir...,technology
1,Nat Geo to release special edition magazine sh...,National Geographic revealed its cover for the...,technology
2,OnePlus 7 Pro to feature 200% more powerful vi...,OnePlus founder Pete Lau revealed in an interv...,technology
3,OnePlus to host ‘Experience Pop-Up’ event in D...,OnePlus will be hosting one-of-its-kind 'OnePl...,technology
4,Facebook Co-founder says it's time to break up...,"Chris Hughes, who co-founded Facebook with Har...",technology
5,Uber falls below $70bn value on 1st trading da...,Uber ended its first day of trading with a mar...,technology
6,Couple shoots sex video in a Tesla on Autopilo...,Reacting to a video of two people having sex i...,technology
7,4-year degree not necessary for coding: Apple ...,Apple CEO Tim Cook has said he believes that a...,technology
8,"Facebook rejects breakup call, asks for govt r...",Facebook has rejected a call from Co-founder C...,technology
9,Louis Vuitton unveils handbags with built-in f...,French luxury fashion house Louis Vuitton unve...,technology


In [11]:
import spacy
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import re
from bs4 import BeautifulSoup
import unicodedata
# Load the small English spaCy pipeline. The legacy `parse`/`tag`/`entity`
# keyword arguments are dropped: they are not options of `spacy.load`, and the
# sentence-segmentation cell later in this notebook loads the same model
# without them.
nlp = spacy.load('en_core_web_sm')
# Optional alternative model (left disabled):
#nlp_vec = spacy.load('en_vecs')
tokenizer = ToktokTokenizer()
# English stopwords minus the negations 'no' and 'not', which are kept in the
# text. Filtering (rather than list.remove) cannot raise if a word is absent.
stopword_list = [word for word in nltk.corpus.stopwords.words('english')
                 if word not in ('no', 'not')]

In [13]:
# Add 'pep' as an extra custom stopword. The membership guard keeps the cell
# idempotent, so re-running it does not append duplicates.
if 'pep' not in stopword_list:
    stopword_list.append('pep')

In [14]:
# Display the customised stopword list for inspection.
stopword_list

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 'her',
 'hers',
 'herself',
 'it',
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each',
 'few',
 'more',
 'most',
 'other',
 'some',
 'such',
 'nor',
 'only',


In [15]:
def strip_html_tags(text):
    """Return the text content of `text` with all HTML markup removed.

    The input is parsed with BeautifulSoup's 'html.parser' backend and the
    parsed document's text is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

strip_html_tags('<html><h2>Some important text</h2></html>')

'Some important text'

In [16]:
def remove_accented_chars(text):
    """Replace accented characters with their plain ASCII base characters.

    The text is NFKD-normalised, which splits an accented letter into its base
    letter plus combining marks. Encoding to ASCII with errors ignored then
    drops every non-ASCII code point, including those marks.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

remove_accented_chars('Sómě Áccěntěd těxt')

'Some Accented text'

In [18]:
def remove_special_characters(text, remove_digits=False):
    """Delete every character that is not an ASCII letter or whitespace.

    Parameters
    ----------
    text : str
        Text to clean.
    remove_digits : bool, optional
        When False (default) the digits 0-9 are kept as well; when True they
        are removed together with the other non-letter characters.

    Returns
    -------
    str
        `text` with the disallowed characters deleted (not replaced by
        spaces), so surrounding whitespace is left as it was.
    """
    # `A-Z`, not `A-z`: the range A-z also covers the ASCII punctuation between
    # 'Z' and 'a' (brackets, backslash, caret, underscore, backtick), which
    # would let those characters survive the clean-up.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

remove_special_characters("Well this was fun! What do you think? 123#@!", 
                          remove_digits=True)

'Well this was fun What do you think '

In [19]:
def simple_stemmer(text):
    """Stem each whitespace-separated word of `text` with NLTK's Porter stemmer.

    The stemmed words are re-joined with single spaces. Punctuation attached
    to a word is passed to the stemmer as part of that word.
    """
    stemmer = nltk.porter.PorterStemmer()
    stems = map(stemmer.stem, text.split())
    return ' '.join(stems)

simple_stemmer("My system keeps crashing his crashed yesterday, ours crashes daily")

'My system keep crash hi crash yesterday, our crash daili'

In [20]:
def lemmatize_text(text):
    """Return `text` with every token replaced by its spaCy lemma.

    Tokens come from the notebook-level `nlp` pipeline and are re-joined with
    single spaces, so punctuation ends up as separate space-delimited tokens.
    """
    lemmas = []
    for token in nlp(text):
        # When the lemma is the '-PRON-' placeholder, keep the original token
        # text instead of emitting the placeholder.
        lemmas.append(token.text if token.lemma_ == '-PRON-' else token.lemma_)
    return ' '.join(lemmas)

lemmatize_text("My system keeps crashing! his crashed yesterday, ours crashes daily")

'My system keep crash ! his crashed yesterday , ours crash daily'

In [21]:
def remove_stopwords(text, is_lower_case=False):
    """Drop every token of `text` that appears in the notebook's `stopword_list`.

    Parameters
    ----------
    text : str
        Text to filter; it is split with the notebook-level `tokenizer`.
    is_lower_case : bool, optional
        True means the text is already lower-cased, so tokens are compared
        as-is. False (default) lower-cases each token for the comparison only;
        kept tokens retain their original casing.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    kept = []
    for raw_token in tokenizer.tokenize(text):
        token = raw_token.strip()
        candidate = token if is_lower_case else token.lower()
        if candidate not in stopword_list:
            kept.append(token)
    return ' '.join(kept)

remove_stopwords("The, and, if are stopwords, computer is not")

', , stopwords , computer not'

In [22]:
def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=True,
                     text_lemmatization=True, special_char_removal=True,
                     stopword_removal=True, remove_digits=True):
    """Apply the notebook's text-cleaning steps to every document in `corpus`.

    Steps run in this fixed order, each one controlled by its flag:
    HTML stripping, accent removal, contraction expansion, lower-casing,
    newline collapsing (always), lemmatisation, special-character removal,
    whitespace collapsing (always), stopword removal.

    Parameters
    ----------
    corpus : iterable of str
        Documents to normalise.
    html_stripping, accented_char_removal, contraction_expansion,
    text_lower_case, text_lemmatization, special_char_removal,
    stopword_removal : bool, optional
        Enable (True, default) or skip the corresponding step.
    remove_digits : bool, optional
        Forwarded to remove_special_characters(); only has an effect when
        `special_char_removal` is True.

    Returns
    -------
    list of str
        One normalised string per input document, in input order.
    """
    # Matches punctuation that should be padded with spaces before special
    # characters are stripped, so neighbouring words are not fused together.
    # The hyphen is escaped: an unescaped `(-)` inside a character class is
    # the range from '(' to ')' and would never match '-' itself.
    special_char_pattern = re.compile(r'([{.(\-)!}])')

    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # strip HTML
        if html_stripping:
            doc = strip_html_tags(doc)
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # expand contractions
        if contraction_expansion:
            # NOTE(review): expand_contractions is not defined in the visible
            # part of this notebook - confirm it exists before this runs.
            doc = expand_contractions(doc)
        # lowercase the text
        if text_lower_case:
            doc = doc.lower()
        # collapse runs of CR/LF into one space; `|` is not part of the class,
        # so literal pipe characters are left alone
        doc = re.sub(r'[\r\n]+', ' ', doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters and/or digits
        if special_char_removal:
            # insert spaces around special characters to isolate them
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)
        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)

        normalized_corpus.append(doc)

    return normalized_corpus

In [23]:
# Build one text field per item: "<headline>. <article body>".
headline_text = news_df["news_headline"].map(str)
news_df['full_text'] = headline_text + '. ' + news_df["news_article"]

# Run the default normalisation pipeline and store the result next to the
# raw text; also keep it as a plain Python list.
news_df['clean_text'] = normalize_corpus(news_df['full_text'])
norm_corpus = news_df['clean_text'].tolist()

# Show raw vs. cleaned text for one sample article.
sample_row = news_df.iloc[1]
sample_row[['full_text', 'clean_text']].to_dict()

{'full_text': 'Nat Geo to release special edition magazine shot on OnePlus 7 Pro. National Geographic revealed its cover for the upcoming special-edition magazine, shot on the OnePlus 7 Pro. Notably, all images of the magazine will be shot on the smartphone. "With the OnePlus 7 Pro, you have a whole camera bag in your pocket which...allowed us to be able to shoot a whole magazine issue on a smartphone," said photographer Krystle Wright.',
 'clean_text': 'nat geo release special edition magazine shoot oneplus pro national geographic reveal cover upcoming special edition magazine shoot oneplus pro notably image magazine shoot smartphone oneplus pro whole camera bag pocket allow us able shoot whole magazine issue smartphone say photographer krystle wright'}

In [24]:
# Persist the scraped and cleaned frame to 'news.csv' in the working
# directory, as UTF-8 and without the index column.
news_df.to_csv('news.csv', index=False, encoding='utf-8')

In [26]:
# Lightly normalised corpus: lower-casing, lemmatisation and special-character
# removal are switched off so the text keeps its POS context.
corpus = normalize_corpus(
    news_df['full_text'],
    text_lower_case=False,
    text_lemmatization=False,
    special_char_removal=False,
)

# Sample headline used for the POS-tagging demo.
sentence = str(news_df.iloc[1].news_headline)
sentence_nlp = nlp(sentence)

# One row per spaCy token: the token, its `tag_` and its `pos_`.
spacy_pos_tagged = [(token, token.tag_, token.pos_) for token in sentence_nlp]
pd.DataFrame(spacy_pos_tagged, columns=['Word', 'POS tag', 'Tag type'])

Unnamed: 0,Word,POS tag,Tag type
0,Nat,NNP,PROPN
1,Geo,NNP,PROPN
2,to,TO,PART
3,release,VB,VERB
4,special,JJ,ADJ
5,edition,NN,NOUN
6,magazine,NN,NOUN
7,shot,VBN,VERB
8,on,IN,ADP
9,OnePlus,NNP,PROPN


In [27]:
# POS tagging with nltk
# The sentence is split on whitespace only (no tokenizer), so any punctuation
# stays attached to its neighbouring word before tagging.
nltk_pos_tagged = nltk.pos_tag(sentence.split())
pd.DataFrame(nltk_pos_tagged, columns=['Word', 'POS tag'])

Unnamed: 0,Word,POS tag
0,Nat,NNP
1,Geo,NNP
2,to,TO
3,release,VB
4,special,JJ
5,edition,NN
6,magazine,NN
7,shot,NN
8,on,IN
9,OnePlus,CC


In [34]:
# Run the spaCy pipeline over one full article (headline + body).
sentence = str(news_df.iloc[5].full_text)
sentence_nlp = nlp(sentence)

# print named entities in article: keep only tokens that carry an entity
# label and pair each one with that label
named_entities = [(token, token.ent_type_)
                  for token in sentence_nlp
                  if token.ent_type_]
print(named_entities)

[(70bn, 'MONEY'), (1st, 'ORDINAL'), ($, 'MONEY'), (69.7, 'MONEY'), (billion, 'MONEY'), ($, 'MONEY'), (76, 'MONEY'), (billion, 'MONEY'), (Dara, 'PERSON'), (Khosrowshahi, 'PERSON'), (US, 'GPE'), (China, 'GPE'), (45, 'MONEY'), ($, 'MONEY'), (75.5, 'MONEY'), (billion, 'MONEY')]


In [44]:
from spacy import displacy
# Visualize the named entities of the most recently parsed `sentence_nlp`
# document using displaCy's entity style; `jupyter=True` is passed so the
# markup is rendered in the notebook output.
displacy.render(sentence_nlp, style='ent', jupyter=True)

In [43]:
# Noun-chunk demo: print every noun phrase spaCy finds in the sample text.
doc = nlp(u'Is it a rational decision? The cat and the dog , Morgan Stanley, sleep in the basket, Fidelity Investments near the door, Soviet Union is a Democratic country.')
# The loop variable must not be called `np`: that name is the notebook's NumPy
# alias, and rebinding it here would break any later `np.` call.
for chunk in doc.noun_chunks:
    print(chunk.text)

it
a rational decision
The cat
the dog
Morgan Stanley
the basket
Fidelity Investments
the door
Soviet Union
a Democratic country


In [45]:
# Tokenization demo: print the text of each token spaCy produces.
doc = nlp(u"Apple is looking at buying U.K. startup for $1 billion")
for token in doc:
    print(token.text)

Apple
is
looking
at
buying
U.K.
startup
for
$
1
billion


In [48]:
import spacy

# Sentence segmentation demo: print each sentence span spaCy detects.
# NOTE(review): this rebinds the notebook-wide `nlp` object that an earlier
# cell already created from the same model name - the reload looks redundant;
# confirm before removing.
nlp = spacy.load("en_core_web_sm")
doc = nlp(u"This is a sentence. This is another sentence.")
for sent in doc.sents:
    print(sent.text)

This is a sentence.
This is another sentence.


In [46]:
from nltk.tokenize import sent_tokenize, word_tokenize

# Word tokenization with NLTK: prints the list of word and punctuation tokens.
data = "All work and no play makes jack a dull boy, all work and no play"
print(word_tokenize(data))

['All', 'work', 'and', 'no', 'play', 'makes', 'jack', 'a', 'dull', 'boy', ',', 'all', 'work', 'and', 'no', 'play']


In [47]:
from nltk.tokenize import sent_tokenize, word_tokenize

# Sentence tokenization with NLTK: prints the text split into sentences.
data = "All work and no play makes jack dull boy. All work and no play makes jack a dull boy."
print(sent_tokenize(data))

['All work and no play makes jack dull boy.', 'All work and no play makes jack a dull boy.']


In [53]:
from textblob import TextBlob

In [54]:
# Spelling correction demo: the input has three misspellings
# ('gret', 'platfrm', 'scence'); correct() returns a corrected TextBlob.
blob = TextBlob('Great Learning is a gret platfrm to learn data scence')
blob.correct()

TextBlob("Great Learning is a great platform to learn data science")

In [55]:
# List spelling candidates with their scores for the word at index 4.
# NOTE(review): assumes `blob` is still the misspelled sentence from the
# correction demo, where index 4 is 'gret' - confirm the execution order.
blob.words[4].spellcheck()

[('great', 0.5351351351351351),
 ('get', 0.3162162162162162),
 ('grew', 0.11216216216216217),
 ('grey', 0.026351351351351353),
 ('greet', 0.006081081081081081),
 ('fret', 0.002702702702702703),
 ('grit', 0.0006756756756756757),
 ('cret', 0.0006756756756756757)]

In [56]:
# Language detection demo on a German sentence.
# NOTE(review): detect_language() presumably calls an external translation
# service and may be deprecated or removed in newer TextBlob releases -
# confirm against the installed version.
blob = TextBlob("Er lebt mit seinen Eltern und seiner Schwester in Berlin")
blob.detect_language()

'de'

In [57]:
# Translate the current `blob` into English.
# NOTE(review): translate() presumably needs network access and may be
# deprecated in newer TextBlob releases - confirm.
blob.translate(to= 'en')

TextBlob("He lives with his parents and sister in Berlin")

In [61]:
# N-gram demo: print every bigram (n=2) of the sentence, one per line.
# 'platfrm' is left misspelled in this input.
blob = TextBlob('Great Learning is a great platfrm to learn data science')
for ngram in blob.ngrams(2):
    print (ngram)

['Great', 'Learning']
['Learning', 'is']
['is', 'a']
['a', 'great']
['great', 'platfrm']
['platfrm', 'to']
['to', 'learn']
['learn', 'data']
['data', 'science']


In [62]:
# Print the current sentence, then display its sentiment
# (polarity and subjectivity) as the cell result.
print (blob)
blob.sentiment

Great Learning is a great platfrm to learn data science


Sentiment(polarity=0.8, subjectivity=0.75)