# Scrapping Hacker news data for last 24 hours

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta

In [30]:
# Define the URL for Google News RSS feed (Rich Site Summary)
url = "https://news.google.com/rss?hl=en-US&gl=US&ceid=US:en"

In [31]:
# Make a request to the RSS feed URL
response = requests.get(url)


ConnectionError: HTTPSConnectionPool(host='news.google.com', port=443): Max retries exceeded with url: /rss?hl=en-US&gl=US&ceid=US:en (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000001531F25F2B0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))

In [6]:
# Parse the XML content using BeautifulSoup
soup = BeautifulSoup(response.content, "xml")

In [11]:
# Find all <item> elements which represent individual news articles
articles = soup.find_all("item")

In [27]:
articles

[<item><title>Trump arrives in Miami for arraignment on first-ever federal charges against an ex-president - CNBC</title><link>https://news.google.com/rss/articles/CBMiY2h0dHBzOi8vd3d3LmNuYmMuY29tLzIwMjMvMDYvMTIvZG9uYWxkLXRydW1wLWluZGljdG1lbnQtdHJ1bXAtYXJyaXZlcy1pbi1taWFtaS1mb3ItYXJyYWlnbm1lbnQuaHRtbNIBZ2h0dHBzOi8vd3d3LmNuYmMuY29tL2FtcC8yMDIzLzA2LzEyL2RvbmFsZC10cnVtcC1pbmRpY3RtZW50LXRydW1wLWFycml2ZXMtaW4tbWlhbWktZm9yLWFycmFpZ25tZW50Lmh0bWw?oc=5</link><guid isPermaLink="false">2128778852</guid><pubDate>Mon, 12 Jun 2023 21:49:44 GMT</pubDate><description>&lt;ol&gt;&lt;li&gt;&lt;a href="https://news.google.com/rss/articles/CBMiY2h0dHBzOi8vd3d3LmNuYmMuY29tLzIwMjMvMDYvMTIvZG9uYWxkLXRydW1wLWluZGljdG1lbnQtdHJ1bXAtYXJyaXZlcy1pbi1taWFtaS1mb3ItYXJyYWlnbm1lbnQuaHRtbNIBZ2h0dHBzOi8vd3d3LmNuYmMuY29tL2FtcC8yMDIzLzA2LzEyL2RvbmFsZC10cnVtcC1pbmRpY3RtZW50LXRydW1wLWFycml2ZXMtaW4tbWlhbWktZm9yLWFycmFpZ25tZW50Lmh0bWw?oc=5" target="_blank"&gt;Trump arrives in Miami for arraignment on first-ever federal charge

In [13]:
data = []  # List to store scraped data

In [14]:
# Calculate the cutoff time for the past 24 hours
cutoff_time = datetime.now() - timedelta(hours=24)

In [15]:
for article in articles:
    # Extract relevant data from each article
    pub_date = datetime.strptime(article.pubDate.text, "%a, %d %b %Y %H:%M:%S %Z")
    # Check if the article was published within the last 24 hours
    if pub_date >= cutoff_time:
        title = article.title.text  # Extract title
        link = article.link.text  # Extract link
        description = article.description.text  # Extract description
        
        # Append the extracted data to the list
        data.append([title, link, description])
# Create a DataFrame from the scraped data
df = pd.DataFrame(data, columns=["Title", "Link", "Description"])

In [16]:
df.head()

Unnamed: 0,Title,Link,Description
0,Trump arrives in Miami for arraignment on firs...,https://news.google.com/rss/articles/CBMiY2h0d...,"<ol><li><a href=""https://news.google.com/rss/a..."
1,Russia-Ukraine war at a glance: what we know o...,https://news.google.com/rss/articles/CBMidGh0d...,"<ol><li><a href=""https://news.google.com/rss/a..."
2,Fact checking Chris Christie’s CNN town hall -...,https://news.google.com/rss/articles/CBMiWmh0d...,"<ol><li><a href=""https://news.google.com/rss/a..."
3,'First of its kind' Illinois law will penalize...,https://news.google.com/rss/articles/CBMiW2h0d...,"<ol><li><a href=""https://news.google.com/rss/a..."
4,"Kryvyi Rih Hit, S-300 Wreckage Found In Dnipro...",https://news.google.com/rss/articles/CCAiC3VGV...,"<ol><li><a href=""https://news.google.com/rss/a..."


# Apply unsupervised clustering algorithm to get the labelled data 


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [20]:
# Convert the description text into numerical features
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["Description"])

In [21]:
# Apply K-means clustering
k = 5  # Number of clusters
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(X)



In [22]:
# Get the cluster labels for each news article
labels = kmeans.labels_

In [23]:
labels

array([4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 4, 4, 2, 4, 4, 2, 4, 4, 4,
       4, 4, 4, 4, 0, 4, 0, 1, 4, 4, 4, 4, 3, 4, 4])

In [24]:
# Add the cluster labels to the DataFrame
df["Cluster"] = labels

In [25]:
# Count the number of articles in each cluster
cluster_counts = df["Cluster"].value_counts().sort_index()

In [26]:
cluster_counts

0     2
1     1
2     2
3     4
4    28
Name: Cluster, dtype: int64