In [81]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline

# Scaping news articles from inshorts.com from three categories technology,sports and world.

In [82]:
seed_urls = ['https://inshorts.com/en/read/technology',
             'https://inshorts.com/en/read/sports',
             'https://inshorts.com/en/read/world']

def build_dataset(seed_urls):
    news_data = []
    for url in seed_urls:
        news_category = url.split('/')[-1]
        data = requests.get(url)
        soup = BeautifulSoup(data.content, 'html.parser')
        
        news_articles = [{'news_headline': headline.find('span', 
                                                         attrs={"itemprop": "headline"}).string,
                          'news_article': article.find('div', 
                                                       attrs={"itemprop": "articleBody"}).string,
                          'news_category': news_category}
                         
                            for headline, article in 
                             zip(soup.find_all('div', 
                                               class_=["news-card-title news-right-box"]),
                                 soup.find_all('div', 
                                               class_=["news-card-content news-right-box"]))
                        ]
        news_data.extend(news_articles)
        
    df =  pd.DataFrame(news_data)
    df = df[['news_headline', 'news_article', 'news_category']]
    return df

In [83]:
news_df = build_dataset(seed_urls)
news_df.head(10)

Unnamed: 0,news_headline,news_article,news_category
0,OnePlus 8 Pro to feature super fast 30W wirele...,The first OnePlus phone to support wireless ch...,technology
1,Twitter CEO to donate 28% of wealth for COVID-...,Twitter CEO Jack Dorsey has pledged $1 billion...,technology
2,Google Search for 'Living room concert' increa...,Google Trends data shows the search for 'Livin...,technology
3,"Apple plans to hire more than 1,000 interns am...",Apple in a statement to Axios said it plans to...,technology
4,Apple CEO Tim Cook to hold virtual Q&A with em...,Apple will organise a company-wide virtual mee...,technology
5,Google launches 'braille keyboard' for vision-...,Google has started rolling out a braille keybo...,technology
6,Facebook launches messaging app 'Tuned' for co...,Facebook's New Product Experimentation (NPE) T...,technology
7,WhatsApp makes group calling easier amid coron...,WhatsApp has made it easier to start a group c...,technology
8,Google bans Zoom from employees' laptops over ...,Google has banned Zoom video conferencing appl...,technology
9,Kenyan runners run solo half marathons for vir...,A group of 30 Kenyan runners from across the c...,technology


In [84]:
news_df.news_category.value_counts()

world         25
technology    24
sports        23
Name: news_category, dtype: int64

In [89]:
news_df.head()

Unnamed: 0,news_headline,news_article,news_category
0,OnePlus 8 Pro to feature super fast 30W wirele...,The first OnePlus phone to support wireless ch...,technology
1,Twitter CEO to donate 28% of wealth for COVID-...,Twitter CEO Jack Dorsey has pledged $1 billion...,technology
2,Google Search for 'Living room concert' increa...,Google Trends data shows the search for 'Livin...,technology
3,"Apple plans to hire more than 1,000 interns am...",Apple in a statement to Axios said it plans to...,technology
4,Apple CEO Tim Cook to hold virtual Q&A with em...,Apple will organise a company-wide virtual mee...,technology


# Scoring subjectivity and polarity of news articles using Text Blob

In [90]:
from textblob import TextBlob

In [91]:
# Create a function to get the subjectivity
def getSubjectivity(text):
   return TextBlob(text).sentiment.subjectivity

# Create a function to get the polarity
def getPolarity(text):
   return  TextBlob(text).sentiment.polarity

In [92]:
# Create two new columns 'Subjectivity' & 'Polarity'
news_df['Subjectivity'] = news_df["news_article"].apply(getSubjectivity)
news_df['Polarity'] = news_df["news_article"].apply(getPolarity)

# Show the new dataframe with columns 'Subjectivity' & 'Polarity'
news_df.head()

Unnamed: 0,news_headline,news_article,news_category,Subjectivity,Polarity
0,OnePlus 8 Pro to feature super fast 30W wirele...,The first OnePlus phone to support wireless ch...,technology,0.333333,0.25
1,Twitter CEO to donate 28% of wealth for COVID-...,Twitter CEO Jack Dorsey has pledged $1 billion...,technology,0.391667,-0.083333
2,Google Search for 'Living room concert' increa...,Google Trends data shows the search for 'Livin...,technology,0.6125,0.279167
3,"Apple plans to hire more than 1,000 interns am...",Apple in a statement to Axios said it plans to...,technology,0.333333,0.125
4,Apple CEO Tim Cook to hold virtual Q&A with em...,Apple will organise a company-wide virtual mee...,technology,0.0,0.0


In [93]:
# Create a function to compute negative (-1), neutral (0) and positive (+1) analysis
def getAnalysis(score):
 if score < 0:
  return 'Negative'
 elif score == 0:
  return 'Neutral'
 else:
  return 'Positive'


news_df['Analysis'] = news_df['Polarity'].apply(getAnalysis)

# Show the dataframe
news_df.head()

Unnamed: 0,news_headline,news_article,news_category,Subjectivity,Polarity,Analysis
0,OnePlus 8 Pro to feature super fast 30W wirele...,The first OnePlus phone to support wireless ch...,technology,0.333333,0.25,Positive
1,Twitter CEO to donate 28% of wealth for COVID-...,Twitter CEO Jack Dorsey has pledged $1 billion...,technology,0.391667,-0.083333,Negative
2,Google Search for 'Living room concert' increa...,Google Trends data shows the search for 'Livin...,technology,0.6125,0.279167,Positive
3,"Apple plans to hire more than 1,000 interns am...",Apple in a statement to Axios said it plans to...,technology,0.333333,0.125,Positive
4,Apple CEO Tim Cook to hold virtual Q&A with em...,Apple will organise a company-wide virtual mee...,technology,0.0,0.0,Neutral


In [94]:
fea_map_1 = {'technology':1, 'sports':2, 'world':3}
news_df['news_category'] = news_df['news_category'].map(fea_map_1)
news_df.head()

Unnamed: 0,news_headline,news_article,news_category,Subjectivity,Polarity,Analysis
0,OnePlus 8 Pro to feature super fast 30W wirele...,The first OnePlus phone to support wireless ch...,1,0.333333,0.25,Positive
1,Twitter CEO to donate 28% of wealth for COVID-...,Twitter CEO Jack Dorsey has pledged $1 billion...,1,0.391667,-0.083333,Negative
2,Google Search for 'Living room concert' increa...,Google Trends data shows the search for 'Livin...,1,0.6125,0.279167,Positive
3,"Apple plans to hire more than 1,000 interns am...",Apple in a statement to Axios said it plans to...,1,0.333333,0.125,Positive
4,Apple CEO Tim Cook to hold virtual Q&A with em...,Apple will organise a company-wide virtual mee...,1,0.0,0.0,Neutral


In [95]:
fea_map = {'Neutral':0, 'Positive':1, 'Negative':-1}
news_df['Analysis'] = news_df['Analysis'].map(fea_map)
news_df.head()

Unnamed: 0,news_headline,news_article,news_category,Subjectivity,Polarity,Analysis
0,OnePlus 8 Pro to feature super fast 30W wirele...,The first OnePlus phone to support wireless ch...,1,0.333333,0.25,1
1,Twitter CEO to donate 28% of wealth for COVID-...,Twitter CEO Jack Dorsey has pledged $1 billion...,1,0.391667,-0.083333,-1
2,Google Search for 'Living room concert' increa...,Google Trends data shows the search for 'Livin...,1,0.6125,0.279167,1
3,"Apple plans to hire more than 1,000 interns am...",Apple in a statement to Axios said it plans to...,1,0.333333,0.125,1
4,Apple CEO Tim Cook to hold virtual Q&A with em...,Apple will organise a company-wide virtual mee...,1,0.0,0.0,0


In [96]:
data = news_df.drop(["news_headline","Subjectivity", "Polarity","Analysis"],axis=1)

In [97]:
data.head()

Unnamed: 0,news_article,news_category
0,The first OnePlus phone to support wireless ch...,1
1,Twitter CEO Jack Dorsey has pledged $1 billion...,1
2,Google Trends data shows the search for 'Livin...,1
3,Apple in a statement to Axios said it plans to...,1
4,Apple will organise a company-wide virtual mee...,1


In [112]:
import re
nlp_data = str(data.iloc[:,0])
nlp_data = re.sub("[^a-zA-Z]"," ",nlp_data)

In [113]:
nlp_data

'      The first OnePlus phone to support wireless ch          Twitter CEO Jack Dorsey has pledged    billion          Google Trends data shows the search for  Livin          Apple in a statement to Axios said it plans to          Apple will organise a company wide virtual mee                                                                  A unilateral two week ceasefire called by the           Doctors in Zimbabwe have filed a lawsuit again          All the lawmakers in Botswana s parliament hav          Maritime and Mercantile International  MMI   a          Mustafa Al Kadhimi  who is the chief of the Ir    Name  news article  Length      dtype  object'

In [114]:
#After return lower case
nlp_data = nlp_data.lower()

In [115]:
import nltk as nlp
nlp_data = nlp.word_tokenize(nlp_data)
#nlp_data = nlp_data.split() or we can do so

In [116]:
#we have to find word root
lemma = nlp.WordNetLemmatizer()
nlp_data = [lemma.lemmatize(word) for word in nlp_data]

In [117]:
nlp_data

['the',
 'first',
 'oneplus',
 'phone',
 'to',
 'support',
 'wireless',
 'ch',
 'twitter',
 'ceo',
 'jack',
 'dorsey',
 'ha',
 'pledged',
 'billion',
 'google',
 'trend',
 'data',
 'show',
 'the',
 'search',
 'for',
 'livin',
 'apple',
 'in',
 'a',
 'statement',
 'to',
 'axios',
 'said',
 'it',
 'plan',
 'to',
 'apple',
 'will',
 'organise',
 'a',
 'company',
 'wide',
 'virtual',
 'mee',
 'a',
 'unilateral',
 'two',
 'week',
 'ceasefire',
 'called',
 'by',
 'the',
 'doctor',
 'in',
 'zimbabwe',
 'have',
 'filed',
 'a',
 'lawsuit',
 'again',
 'all',
 'the',
 'lawmaker',
 'in',
 'botswana',
 's',
 'parliament',
 'hav',
 'maritime',
 'and',
 'mercantile',
 'international',
 'mmi',
 'a',
 'mustafa',
 'al',
 'kadhimi',
 'who',
 'is',
 'the',
 'chief',
 'of',
 'the',
 'ir',
 'name',
 'news',
 'article',
 'length',
 'dtype',
 'object']

In [118]:
#We join our data
nlp_data = " ".join(nlp_data)

In [120]:
nlp_data

'the first oneplus phone to support wireless ch twitter ceo jack dorsey ha pledged billion google trend data show the search for livin apple in a statement to axios said it plan to apple will organise a company wide virtual mee a unilateral two week ceasefire called by the doctor in zimbabwe have filed a lawsuit again all the lawmaker in botswana s parliament hav maritime and mercantile international mmi a mustafa al kadhimi who is the chief of the ir name news article length dtype object'

#  Punctuation Removal,  lower case conversion, Tokenization and Lemmatization

In [122]:
import re
description_list = []
for description in data["news_article"]:
    description = re.sub("[^a-zA-Z]"," ",description)
    description = description.lower()   
    description = nlp.word_tokenize(description)
    lemma = nlp.WordNetLemmatizer()
    description = [ lemma.lemmatize(word) for word in description]
    description = " ".join(description)
    description_list.append(description)

# Extracting Features using covent vectorizer

In [123]:
from sklearn.feature_extraction.text import CountVectorizer 
max_features = 3000 #We use the most common word
count_vectorizer = CountVectorizer(max_features = max_features, stop_words = "english")
sparce_matrix = count_vectorizer.fit_transform(description_list).toarray()
print("the most using {} words: {}".format(max_features,count_vectorizer.get_feature_names()))

the most using 3000 words: ['abd', 'able', 'absolutely', 'access', 'accessible', 'according', 'account', 'accurate', 'accused', 'action', 'adair', 'added', 'addicted', 'addiction', 'adequate', 'adil', 'admitted', 'aged', 'ago', 'agree', 'agriculture', 'ahead', 'aic', 'aid', 'aim', 'aimed', 'airbrushed', 'airline', 'akhtar', 'al', 'alcohol', 'alex', 'alipay', 'allah', 'allegation', 'allow', 'allowed', 'allows', 'amazon', 'american', 'amid', 'android', 'animal', 'announced', 'ant', 'anthony', 'anti', 'anytime', 'apart', 'apologised', 'app', 'appearing', 'apple', 'appliance', 'application', 'apply', 'appointment', 'apps', 'april', 'arabia', 'area', 'arises', 'army', 'arrested', 'ask', 'asked', 'assigned', 'associate', 'association', 'attack', 'attention', 'attracted', 'attributed', 'auctioning', 'australia', 'australian', 'automated', 'automatic', 'available', 'avoid', 'axios', 'bachelet', 'backed', 'ball', 'bangalore', 'banned', 'barham', 'barring', 'based', 'batsman', 'batting', 'bayley

In [127]:
#We separate our data is train and test
y = data.iloc[:,1].values   # male or female classes
x = sparce_matrix
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.3, random_state = 42)

# Building model

In [128]:
#We make model for predict
from sklearn.naive_bayes import MultinomialNB
nb =  MultinomialNB(alpha=0.7)
nb.fit(x_train,y_train)
print("the accuracy of our model: {}".format(nb.score(x_test,y_test)))

the accuracy of our model: 0.8181818181818182


In [129]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter = 200)
lr.fit(x_train,y_train)
print("our accuracy is: {}".format(lr.score(x_test,y_test)))

our accuracy is: 0.6363636363636364


In [133]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 2)
knn.fit(x_train,y_train)
#print('Prediction: {}'.format(prediction))
print('With KNN (K=3) accuracy is: ',knn.score(x_test,y_test))

With KNN (K=3) accuracy is:  0.5909090909090909
