# Data retrieval

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

%matplotlib inline

In [2]:
url = 'https://inshorts.com/en/read/technology'

In [3]:
news_data = []

In [4]:
news_category = url.split('/')
news_category

['https:', '', 'inshorts.com', 'en', 'read', 'technology']

In [5]:
news_category = news_category[-1]

In [6]:
data = requests.get(url)
data.content

b'<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="dns-prefetch" href="https://ajax.googleapis.com"><link rel="dns-prefetch" href="https://assets.inshorts.com"><link rel="dns-prefetch" href="https://static.inshorts.com"><link rel="dns-prefetch" href="https://cdn.ampproject.org"><meta name="theme-color" content="#ffffff"><link rel="shortcut icon" href="/assets/images/favicon.png" type="image/x-icon"/><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta http-equiv="X-UA-Compatible" content="IE=edge"/><link rel="icon" sizes="192x192" href="/assets/images/logo_192.png"/><link href="https://inshorts.com" rel="canonical" />\n  <link href="https://inshorts.com" rel="alternate"/>\n  <meta name="description" content="read latest news with inshorts in less than 60 words related to business sports bollywood and technology in india and around the world in both english  hindi"/>\n\n  <meta

In [7]:
soup = BeautifulSoup(data.content, 'html.parser')

In [8]:
headlines = soup.find_all('span', itemprop='headline')

In [9]:
articles = soup.find_all('div', itemprop='articleBody')

In [10]:
news_articles = [{'news_headline': headline.text.strip(),'news_article': article.text.strip(),'news_category': news_category}
                         for headline, article in zip(headlines, articles)]

In [11]:
news_data.extend(news_articles)
news_data

[{'news_headline': 'TRAI releases 2 new mobile number series for calling and SMS',
  'news_article': 'A new number series has been released by government bodies TRAI and DoT for mobile calling and messaging. Also, the government can issue new regulations for OTT apps. Apart from this, the government is preparing to tighten the noose on messaging platforms like WhatsApp.',
  'news_category': 'technology'},
 {'news_headline': "Startup unveils world's first head transplant machine concept",
  'news_article': 'US-based startup BrainBridge has unveiled what it claims is the "world\'s first concept for a head transplant system". It integrates advanced robotics and AI to execute complete head and face transplantation procedures, the startup said. "It offers new hope to patients suffering from untreatable conditions such as stage-4 cancer, paralysis, and diseases like Alzheimer\'s and Parkinson\'s," it added.',
  'news_category': 'technology'},
 {'news_headline': "'Use glue to stick cheese to 

In [12]:
df = pd.DataFrame(news_data)

In [13]:
df = df[['news_headline', 'news_article', 'news_category']]

In [14]:
df.head(3)

Unnamed: 0,news_headline,news_article,news_category
0,TRAI releases 2 new mobile number series for c...,A new number series has been released by gover...,technology
1,Startup unveils world's first head transplant ...,US-based startup BrainBridge has unveiled what...,technology
2,"'Use glue to stick cheese to pizza,' says Goog...","On being asked how to stick cheese to pizza, G...",technology


# Prepare user defined function to extract data from inshorts.com

In [15]:
urls_list = ['https://inshorts.com/en/read/technology',
             'https://inshorts.com/en/read/sports',
             'https://inshorts.com/en/read/world']

In [16]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def datasetPrepare(urls_list):
    news_data = []

    for url in urls_list:
        news_category = url.split('/')[-1]
        data = requests.get(url)
        soup = BeautifulSoup(data.content, 'html.parser')


        headlines = soup.find_all('span', itemprop='headline')
        articles = soup.find_all('div', itemprop='articleBody')


        # Extracting data from the selected elements
        news_articles = [{'news_headline': headline.text.strip(),
                          'news_article': article.text.strip(),
                          'news_category': news_category}
                         for headline, article in zip(headlines, articles)]

        news_data.extend(news_articles)

    df = pd.DataFrame(news_data)


    df = df[['news_headline', 'news_article', 'news_category']]

    return df

In [17]:
news_df = datasetPrepare(urls_list)

In [18]:
news_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29 entries, 0 to 28
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   news_headline  29 non-null     object
 1   news_article   29 non-null     object
 2   news_category  29 non-null     object
dtypes: object(3)
memory usage: 824.0+ bytes


In [19]:
news_df.head(5)

Unnamed: 0,news_headline,news_article,news_category
0,TRAI releases 2 new mobile number series for c...,A new number series has been released by gover...,technology
1,Startup unveils world's first head transplant ...,US-based startup BrainBridge has unveiled what...,technology
2,"'Use glue to stick cheese to pizza,' says Goog...","On being asked how to stick cheese to pizza, G...",technology
3,iPhone-maker Foxconn to make Google's Pixel ph...,Google is in advanced talks with iPhone manufa...,technology
4,New study places origin of Sun’s magnetic fiel...,"Till now, scientists have believed the Sun's p...",technology


In [20]:
news_df.news_category.value_counts()

news_category
technology    10
sports        10
world          9
Name: count, dtype: int64

# Text Wrangling and Pre-processing

In [21]:
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import re
import unicodedata

In [22]:
import spacy

In [23]:
nlp = spacy.load("en_core_web_sm")

In [24]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [25]:
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')

## Remove HTML tags

In [26]:
def strip_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    stripped_text = soup.get_text()
    return stripped_text

strip_html_tags('<html><h2>Some important text</h2></html>')

'Some important text'

## Remove accented characters

In [27]:
def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

remove_accented_chars('Sómě Áccěntěd těxt')

'Some Accented text'

## Remove special characters

In [28]:
def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text

In [29]:
remove_special_characters("Well this was fun! What do you think? 123#@!", remove_digits=True)

'Well this was fun What do you think '

## Text lemmatization

In [30]:
def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

In [31]:
lemmatize_text("My system keeps crashing! his crashed yesterday, ours crashes daily")

'my system keep crash ! his crashed yesterday , ours crash daily'

## Text stemming

In [32]:
def simple_stemmer(text):
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

simple_stemmer("My system keeps crashing his crashed yesterday, ours crashes daily")

'my system keep crash hi crash yesterday, our crash daili'

## Remove stopwords

In [33]:
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

remove_stopwords("The, and, if are stopwords, computer is not")

', , stopwords , computer not'

## Building a text normalizer

In [34]:
def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=True,
                     text_lemmatization=True, special_char_removal=True,
                     stopword_removal=True, remove_digits=True):

    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # strip HTML
        if html_stripping:
            doc = strip_html_tags(doc)
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # lowercase the text
        if text_lower_case:
            doc = doc.lower()
        # remove extra newlines
        doc = re.sub(r'[\r|\n|\r\n]+', ' ',doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters and\or digits
        if special_char_removal:
            # insert spaces between special characters to isolate them
            special_char_pattern = re.compile(r'([{.(-)!}])')
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)
        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)

        normalized_corpus.append(doc)

    return normalized_corpus

## Pre-process and normalize news articles

In [35]:
news_df['full_text'] = news_df["news_headline"].map(str)+ '. ' + news_df["news_article"]

In [36]:
news_df['clean_text'] = normalize_corpus(news_df['full_text'])
norm_corpus = list(news_df['clean_text'])
news_df.iloc[1][['full_text', 'clean_text']].to_dict()

{'full_text': 'Startup unveils world\'s first head transplant machine concept. US-based startup BrainBridge has unveiled what it claims is the "world\'s first concept for a head transplant system". It integrates advanced robotics and AI to execute complete head and face transplantation procedures, the startup said. "It offers new hope to patients suffering from untreatable conditions such as stage-4 cancer, paralysis, and diseases like Alzheimer\'s and Parkinson\'s," it added.',
 'clean_text': 'startup unveil world first head transplant machine concept us base startup brainbridge unveil claim world first concept head transplant system integrate advanced robotic ai execute complete head face transplantation procedure startup say offer new hope patient suffer untreatable condition stage cancer paralysis disease like alzheimer parkinson add'}

# Save the news articles

In [37]:
news_df.to_csv('news.csv', index=False, encoding='utf-8')

# Tagging Parts of Speech

In [38]:
news_df = pd.read_csv('news.csv')

In [39]:
corpus = normalize_corpus(news_df['full_text'], text_lower_case=False,
                          text_lemmatization=False, special_char_removal=False)

sentence = str(news_df.iloc[1].news_headline)
sentence_nlp = nlp(sentence)

In [40]:
spacy_pos_tagged = [(word, word.tag_, word.pos_) for word in sentence_nlp]
pd.DataFrame(spacy_pos_tagged, columns=['Word', 'POS tag', 'Tag type'])

Unnamed: 0,Word,POS tag,Tag type
0,Startup,NNP,PROPN
1,unveils,VBZ,VERB
2,world,NN,NOUN
3,'s,POS,PART
4,first,JJ,ADJ
5,head,NN,NOUN
6,transplant,NN,NOUN
7,machine,NN,NOUN
8,concept,NN,NOUN


In [41]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [42]:
nltk.download('maxent_treebank_pos_tagger')

[nltk_data] Downloading package maxent_treebank_pos_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/maxent_treebank_pos_tagger.zip.


True

In [43]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [44]:
nltk_pos_tagged = nltk.pos_tag(sentence.split())

In [45]:
pd.DataFrame(nltk_pos_tagged, columns=['Word', 'POS tag'])

Unnamed: 0,Word,POS tag
0,Startup,NNP
1,unveils,NNS
2,world's,VBP
3,first,JJ
4,head,NN
5,transplant,NN
6,machine,NN
7,concept,NN


# Named Entity Recognition

In [46]:
sentence = str(news_df.iloc[1].full_text)

In [47]:
sentence

'Startup unveils world\'s first head transplant machine concept. US-based startup BrainBridge has unveiled what it claims is the "world\'s first concept for a head transplant system". It integrates advanced robotics and AI to execute complete head and face transplantation procedures, the startup said. "It offers new hope to patients suffering from untreatable conditions such as stage-4 cancer, paralysis, and diseases like Alzheimer\'s and Parkinson\'s," it added.'

In [48]:
sentence_nlp = nlp(sentence)

In [49]:
sentence_nlp

Startup unveils world's first head transplant machine concept. US-based startup BrainBridge has unveiled what it claims is the "world's first concept for a head transplant system". It integrates advanced robotics and AI to execute complete head and face transplantation procedures, the startup said. "It offers new hope to patients suffering from untreatable conditions such as stage-4 cancer, paralysis, and diseases like Alzheimer's and Parkinson's," it added.

In [50]:
print([(word, word.ent_type_) for word in sentence_nlp if word.ent_type_])

[(first, 'ORDINAL'), (US, 'GPE'), (BrainBridge, 'ORG'), (first, 'ORDINAL'), (AI, 'ORG')]


In [51]:
named_entities = []
for sentence in corpus:
    temp_entity_name = ''
    temp_named_entity = None
    sentence = nlp(sentence)
    for word in sentence:
        term = word.text
        tag = word.ent_type_
        if tag:
            temp_entity_name = ' '.join([temp_entity_name, term]).strip()
            temp_named_entity = (temp_entity_name, tag)
        else:
            if temp_named_entity:
                named_entities.append(temp_named_entity)
                temp_entity_name = ''
                temp_named_entity = None

entity_frame = pd.DataFrame(named_entities,
                            columns=['Entity Name', 'Entity Type'])

In [52]:
top_entities = (entity_frame.groupby(by=['Entity Name', 'Entity Type'])
                           .size()
                           .sort_values(ascending=False)
                           .reset_index().rename(columns={0 : 'Frequency'}))
top_entities.T.iloc[:,:15]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
Entity Name,RCB,US,first,China,Kabosu,USA,AI,Google,Ebrahim Raisi,CSK,IPL,India,one,100,Iran
Entity Type,ORG,GPE,ORDINAL,GPE,ORG,GPE,ORG,ORG,PERSON,ORG,ORG,GPE,CARDINAL,CARDINAL,GPE
Frequency,9,6,6,5,4,4,4,4,3,3,3,3,3,3,3


In [53]:
top_entities = (entity_frame.groupby(by=['Entity Type'])
                           .size()
                           .sort_values(ascending=False)
                           .reset_index().rename(columns={0 : 'Frequency'}))
top_entities.T.iloc[:,:15]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
Entity Type,ORG,PERSON,GPE,CARDINAL,DATE,ORDINAL,TIME,NORP,LOC,PRODUCT,FAC,MONEY,LAW,PERCENT,LANGUAGE
Frequency,76,48,44,26,24,8,3,3,2,2,2,1,1,1,1


# Emotion and Sentiment Analysis

In [54]:
from afinn import Afinn

ModuleNotFoundError: No module named 'afinn'

In [None]:
af = Afinn()

In [None]:
sentiment_scores = [af.score(article) for article in corpus]

In [None]:
sentiment_category = ['positive' if score > 0
                          else 'negative' if score < 0
                              else 'neutral'
                                  for score in sentiment_scores]

In [None]:
df = pd.DataFrame([list(news_df['news_category']), sentiment_scores, sentiment_category]).T

In [None]:
df.columns = ['news_category', 'sentiment_score', 'sentiment_category']

In [None]:
df['sentiment_score'] = df.sentiment_score.astype('float')

In [None]:
df.groupby(by=['news_category']).describe()

In [None]:
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 4))
sp = sns.stripplot(x='news_category', y="sentiment_score",  hue='news_category', data=df, ax=ax1)
bp = sns.boxplot(x='news_category', y="sentiment_score", hue='news_category', data=df, palette="Set2", ax=ax2)
t = f.suptitle('Visualizing News Sentiment', fontsize=14)

In [None]:
pos_idx = df[(df.news_category=='technology') & (df.sentiment_score == 2)].index[0]

In [None]:
neg_idx = df[(df.news_category=='technology') & (df.sentiment_score == -2)].index[0]

In [None]:
print('Most Negative Tech News Article:', news_df.iloc[neg_idx][['news_article']][0])
print()
print('Most Positive Tech News Article:', news_df.iloc[pos_idx][['news_article']][0])

In [None]:
df1 = df[df.news_category=='world']
pd.unique(df1.sentiment_score)

In [None]:
from textblob import TextBlob

In [None]:
sentiment_scores_tb = [round(TextBlob(article).sentiment.polarity, 3) for article in news_df['clean_text']]

In [None]:
sentiment_category_tb = ['positive' if score > 0
                             else 'negative' if score < 0
                                 else 'neutral'
                                     for score in sentiment_scores_tb]

In [None]:
df = pd.DataFrame([list(news_df['news_category']), sentiment_scores_tb, sentiment_category_tb]).T
df.columns = ['news_category', 'sentiment_score', 'sentiment_category']
df['sentiment_score'] = df.sentiment_score.astype('float')
df.groupby(by=['news_category']).describe()

In [None]:
df.head()

In [None]:
pos_idx = df[(df.news_category=='world') & (df.sentiment_score > 0.)].index[0]
neg_idx = df[(df.news_category=='world') & (df.sentiment_score < -0.)].index[0]

print('Most Negative World News Article:', news_df.iloc[neg_idx][['news_article']][0])
print()
print('Most Positive World News Article:', news_df.iloc[pos_idx][['news_article']][0])