# Import packages

In [1]:
from requests import get
from bs4 import BeautifulSoup
import re
import pandas as pd

# Gets url of twenty pages

In [2]:
urls = []
for i in range(1,21,1):
    url = "https://www.ndtv.com/latest/page-"+str(i)
    urls.append(url)

# Fetch article Function

In [3]:
def fetch_article(url):
    data = get(url)
    soup = BeautifulSoup(data.content,"html.parser")
    articles = []
    for i in soup.find_all("h2",class_=["newsHdng"]):
        articles.append(i.find('a').text)
    return articles

# Get articles of all pages

In [4]:
all_articles = []
for i in urls:
    all_articles.extend(fetch_article(i))

# Article text processing

In [5]:
p_art = []
for i in all_articles:
    q = i.upper()
    q = re.sub("[^0-9A-Z ]","",q)
    from nltk.stem import PorterStemmer
    tk_q = i.split(" ")
    sent = ""
    for j in tk_q:
        ps = PorterStemmer()
        sent = sent + " " + ps.stem(j).upper()
    p_art.append(sent)

# Matrix conversion to TfidfVectorizer

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer()
A = tf.fit_transform(p_art).toarray()

# Clustering

In [7]:
from sklearn.cluster import KMeans
kc = KMeans(n_clusters=5)
cl = kc.fit(A)

In [8]:
cl.labels_

array([0, 3, 0, 1, 4, 0, 0, 3, 0, 0, 0, 0, 3, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 4, 4, 1, 2, 2, 1, 0, 0, 1, 0, 0, 0, 0, 0, 4,
       0, 0, 3, 0, 4, 0, 3, 1, 0, 4, 1, 0, 1, 3, 1, 2, 3, 1, 2, 1, 2, 0,
       0, 4, 2, 1, 0, 0, 4, 3, 0, 0, 3, 0, 0, 0, 0, 0, 2, 0, 3, 4, 0, 1,
       0, 0, 0, 3, 1, 1, 0, 0, 1, 1, 1, 1, 0, 3, 0, 3, 0, 2, 0, 0, 0, 0,
       4, 1, 1, 0, 1, 0, 0, 0, 0, 0])

# Create dataframe

In [9]:
news = pd.DataFrame(p_art,columns=["Article"])

In [10]:
news["Cluster"]=cl.labels_

In [11]:
news

Unnamed: 0,Article,Cluster
0,RALLI DRIVER MOHAM BEN SULAYEM ELECT AS NEW F...,0
1,"TIKTOK ""SCHOOL SHOOT CHALLENGE"" SPARK FEAR OF...",3
2,"DENMARK' CHRISTIAN ERIKSEN LEAV INTER MILAN ""...",0
3,"OVER 93,000 COVID CASE IN UK TODAY, THIRD CON...",1
4,OPINION: THE PATHET PETTI OF NOT MENTION INDI...,4
...,...,...
115,ACTIVIST VARAVARA RAO MEDIC STABL AS PER DOCT...,0
116,UP HA TOP POSIT IN...: AKHILESH YADAV' JIBE A...,0
117,"NEENA GUPTA AND VIVEK MEHRA, DO ""SLEEPING"" AN...",0
118,VIRAL VIDEO: THI MASSIV PARATHA FROM NAGPUR L...,0


# Give the appropriate labels to the cluster

In [17]:
news[news["Cluster"]==5]

Unnamed: 0,Article,Cluster


In [19]:
E = {
    0:"Politics",
    1:"Geopolitics",
    2:"State",
    3:"Technologhy",
    4:"Politics"
}

In [21]:
R = []
for i in news.Cluster:
    R.append(E[i])
    
news["Category"] = R

In [22]:
news

Unnamed: 0,Article,Cluster,Category
0,RALLI DRIVER MOHAM BEN SULAYEM ELECT AS NEW F...,0,Politics
1,"TIKTOK ""SCHOOL SHOOT CHALLENGE"" SPARK FEAR OF...",3,Technologhy
2,"DENMARK' CHRISTIAN ERIKSEN LEAV INTER MILAN ""...",0,Politics
3,"OVER 93,000 COVID CASE IN UK TODAY, THIRD CON...",1,Geopolitics
4,OPINION: THE PATHET PETTI OF NOT MENTION INDI...,4,Politics
...,...,...,...
115,ACTIVIST VARAVARA RAO MEDIC STABL AS PER DOCT...,0,Politics
116,UP HA TOP POSIT IN...: AKHILESH YADAV' JIBE A...,0,Politics
117,"NEENA GUPTA AND VIVEK MEHRA, DO ""SLEEPING"" AN...",0,Politics
118,VIRAL VIDEO: THI MASSIV PARATHA FROM NAGPUR L...,0,Politics
