# Scrapping news data for last 24 hours

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# we need current time + 24 hour time delta for this program
from datetime import datetime, timedelta

In [2]:
# Define the URL for Google News RSS feed (Rich Site Summary)
# (ps. URL will be provided for exams)
url = "https://news.google.com/rss?h1=en-US&gl=US&ceid=US:en"

In [3]:
response = requests.get(url)
response

<Response [200]>

In [4]:
# Parse the XML content using BeautifulSoup
soup = BeautifulSoup(response.content, "xml")

# Find all <item> elements which represent individual news articles
articles = soup.find_all("item")

In [5]:
# list to store scrapped data
data = []

# Calculate the cutoff time for the past 24 hours
cutoff_time = datetime.now() - timedelta(hours=24)

In [6]:
for article in articles:
    # Extract relevant data from each article
    pub_date = datetime.strptime(article.pubDate.text, "%a, %d %b %Y %H:%M:%S %Z")
    # Check if the article was published within the last 24 hours
    if pub_date >= cutoff_time:
        title = article.title.text  # Extract title
        link = article.link.text  # Extract link
        description = article.description.text  # Extract description

        # Append the extracted data to the list
        data.append([title, link, description])

In [7]:
# Create a DataFrame from the scrapped data
df = pd.DataFrame(data, columns=["Title", "Link", "Description"])

In [8]:
df.head()

Unnamed: 0,Title,Link,Description
0,Biden officials must limit contact with social...,https://news.google.com/rss/articles/CBMiLGh0d...,"<ol><li><a href=""https://news.google.com/rss/a..."
1,Cocaine found in White House sparks brief evac...,https://news.google.com/rss/articles/CBMiMWh0d...,"<ol><li><a href=""https://news.google.com/rss/a..."
2,The Russia-Ukraine War Changed This Finland Co...,https://news.google.com/rss/articles/CBMiVWh0d...,"<a href=""https://news.google.com/rss/articles/..."
3,The killer in one of Philadelphia’s deadliest ...,https://news.google.com/rss/articles/CBMidGh0d...,"<ol><li><a href=""https://news.google.com/rss/a..."
4,A Solemn Walk Through Highland Park One Year A...,https://news.google.com/rss/articles/CBMiVGh0d...,"<ol><li><a href=""https://news.google.com/rss/a..."


# Apply unsupervised clustering algorithm to get the labelled data
We are using unsupervised learning to categorize the data, ie labelling the news.  
(ps. we can apply clustering to title or description or any other attribute)  
(here we are applying KMeans clustering to Description)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [10]:
# Convert the description text into numerical features
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["Description"])

In [11]:
# Apply K-means clustering
k = 5  # Number of clusters
# or use hyperparameter tuning to find the best k value

kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(X)

In [12]:
labels = kmeans.labels_
# here, labels range from 0 to 4 because k = 5

labels

array([3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4,
       1, 1, 0, 0, 1, 2], dtype=int32)

In [13]:
# now we can finally add the labels for the articles
df["Cluster"] = labels

df.head()

Unnamed: 0,Title,Link,Description,Cluster
0,Biden officials must limit contact with social...,https://news.google.com/rss/articles/CBMiLGh0d...,"<ol><li><a href=""https://news.google.com/rss/a...",3
1,Cocaine found in White House sparks brief evac...,https://news.google.com/rss/articles/CBMiMWh0d...,"<ol><li><a href=""https://news.google.com/rss/a...",1
2,The Russia-Ukraine War Changed This Finland Co...,https://news.google.com/rss/articles/CBMiVWh0d...,"<a href=""https://news.google.com/rss/articles/...",1
3,The killer in one of Philadelphia’s deadliest ...,https://news.google.com/rss/articles/CBMidGh0d...,"<ol><li><a href=""https://news.google.com/rss/a...",1
4,A Solemn Walk Through Highland Park One Year A...,https://news.google.com/rss/articles/CBMiVGh0d...,"<ol><li><a href=""https://news.google.com/rss/a...",1


In [14]:
cluster_counts = df["Cluster"].value_counts().sort_index()
cluster_counts

0     2
1    23
2     1
3     1
4     1
Name: Cluster, dtype: int64

# Further
- after labelling the news data, 
- preprocess the labelled data
- apply supervised machine learning
- predict the labels/output...?