## Scrapping news data of past 24 hours

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# we need current time + 24 hour time delta for this program
from datetime import datetime, timedelta

In [2]:
# Define the URL for Google News RSS feed (Rich Site Summary)
# (ps. URL will be provided for exams)
url = "https://news.google.com/rss?h1=en-US&gl=US&ceid=US:en"

response = requests.get(url)
response

<Response [200]>

In [3]:
# Parse the XML content using BeautifulSoup
# To use xml parser you need to install lxml package
soup = BeautifulSoup(response.content, "xml")

# Find all <item> elements which represent individual news articles
articles = soup.find_all("item")

In [4]:
# list to store scrapped data
data = []

# Calculate the cutoff time for the past 24 hours
cutoff_time = datetime.now() - timedelta(hours=24)

for article in articles:
    publish_time = datetime.strptime(article.pubDate.text, "%a, %d %b %Y %H:%M:%S %Z")

    # Check if the article was published within the last 24 hours
    if publish_time >= cutoff_time:
        title = article.title.text
        link = article.link.text
        description = article.description.text

        data.append([title, link, description])

df = pd.DataFrame(data, columns=["Title", "Link", "Description"])
df

Unnamed: 0,Title,Link,Description
0,Live news: US FTC fails in legal bid to block ...,https://news.google.com/rss/articles/CBMiP2h0d...,"<ol><li><a href=""https://news.google.com/rss/a..."
1,Trump wants classified documents trial delayed...,https://news.google.com/rss/articles/CBMiXGh0d...,"<ol><li><a href=""https://news.google.com/rss/a..."
2,Catastrophic flooding swamps Vermont's capital...,https://news.google.com/rss/articles/CBMiYWh0d...,"<ol><li><a href=""https://news.google.com/rss/a..."
3,Zelenskyy blasts ‘absurd’ draft text that hedg...,https://news.google.com/rss/articles/CBMiY2h0d...,"<ol><li><a href=""https://news.google.com/rss/a..."
4,All 6 aboard helicopter carrying Mexican touri...,https://news.google.com/rss/articles/CBMiYmh0d...,"<ol><li><a href=""https://news.google.com/rss/a..."
5,Tuberville refuses to denounce White nationali...,https://news.google.com/rss/articles/CBMiW2h0d...,"<ol><li><a href=""https://news.google.com/rss/a..."
6,Rolling Hills Estates landslide: Heavy rains s...,https://news.google.com/rss/articles/CBMid2h0d...,"<ol><li><a href=""https://news.google.com/rss/a..."
7,Grand jury handling potential indictments in T...,https://news.google.com/rss/articles/CBMiVWh0d...,"<ol><li><a href=""https://news.google.com/rss/a..."
8,Ex-Congressman suggests Hunter Biden alleged l...,https://news.google.com/rss/articles/CBMidWh0d...,"<ol><li><a href=""https://news.google.com/rss/a..."
9,Northern Lights in Maryland: How you can see t...,https://news.google.com/rss/articles/CCAiC2xBY...,"<ol><li><a href=""https://news.google.com/rss/a..."


---
## Apply unsupervised clustering algorithm to get the labelled data
We are using unsupervised learning to categorize the data, ie labelling the news.  
(ps. we can apply clustering to title or description or any other attribute)  
(here we are applying KMeans clustering to Description)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [6]:
# Convert the description text into numerical features
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["Description"])

In [7]:
# Apply K-means clustering
k = 5  # Number of clusters
# or use hyperparameter tuning to find the best k value

kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(X)



In [8]:
labels = kmeans.labels_
# here, labels range from 0 to 4 because k = 5

labels

array([2, 3, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 0, 4, 0, 1, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [9]:
# now we can finally add the labels for the articles
df["Cluster"] = labels

df.head()

Unnamed: 0,Title,Link,Description,Cluster
0,Live news: US FTC fails in legal bid to block ...,https://news.google.com/rss/articles/CBMiP2h0d...,"<ol><li><a href=""https://news.google.com/rss/a...",2
1,Trump wants classified documents trial delayed...,https://news.google.com/rss/articles/CBMiXGh0d...,"<ol><li><a href=""https://news.google.com/rss/a...",3
2,Catastrophic flooding swamps Vermont's capital...,https://news.google.com/rss/articles/CBMiYWh0d...,"<ol><li><a href=""https://news.google.com/rss/a...",0
3,Zelenskyy blasts ‘absurd’ draft text that hedg...,https://news.google.com/rss/articles/CBMiY2h0d...,"<ol><li><a href=""https://news.google.com/rss/a...",0
4,All 6 aboard helicopter carrying Mexican touri...,https://news.google.com/rss/articles/CBMiYmh0d...,"<ol><li><a href=""https://news.google.com/rss/a...",0


In [10]:
cluster_counts = df["Cluster"].value_counts().sort_index()
cluster_counts

0    26
1     4
2     1
3     2
4     1
Name: Cluster, dtype: int64

# Further
- after labelling the news data, 
- preprocess the labelled data
- apply supervised machine learning
- predict the labels/output...?