**News Article Scraping**

We scrape news articles from The Star (thestar.com.my) in bulk, extract features from each article, and assemble them into a dataset.

In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
# Listing-page URL template: the first placeholder takes the page number,
# the second takes the section tag.
base_url = "https://www.thestar.com.my/news/latest?pgno={}&tag={}"

# Section tags whose listing pages are crawled, one per line for easy editing.
tags = [
    "Nation",
    "Business",
    "Education",
    "Aseanplus",
    "Sport",
    "Metro",
    "Tech",
    "World",
    "Lifestyle",
    "Food",
]

In [3]:
# Collect article links from the listing pages of every tag.
# - A timeout keeps a stalled connection from hanging the cell forever.
# - HTTP errors are reported and the page is skipped, so one bad response
#   does not throw away the URLs gathered so far.
# - Links already collected are skipped, in case the same article shows up on
#   more than one listing page, so it is not scraped twice later on.
all_news_urls = []
seen_urls = set()

with requests.Session() as session:  # reuse one connection for all listing pages
    for tag in tags:
        for page_num in range(1, 11):  # Pages 1 to 10
            url = base_url.format(page_num, tag)
            try:
                response = session.get(url, timeout=30)
                response.raise_for_status()
            except requests.RequestException as exc:
                print(f"[Skip] Could not fetch listing page: {url}\nError: {exc}")
                continue

            soup = BeautifulSoup(response.text, 'html.parser')

            # Article headlines are the <h2 class="f18"> elements wrapping a link.
            for h2_tag in soup.find_all('h2', class_='f18'):
                a_tag = h2_tag.find('a')
                href = a_tag.get('href') if a_tag else None
                if href and href not in seen_urls:
                    seen_urls.add(href)
                    all_news_urls.append(href)

In [4]:
# Sanity check on the crawl: preview the first ten links and report the total.
sample_urls = all_news_urls[:10]
total_urls = len(all_news_urls)
print("Sample article URLs:", sample_urls)
print("Total articles collected:", total_urls)

Sample article URLs: ['https://www.thestar.com.my/news/nation/2025/08/04/attempted-cable-theft-led-to-power-outage-at-kk039s-intan-campus-surrounding-areas', 'https://www.thestar.com.my/news/nation/2025/08/04/putra-heights-inferno-selangor-to-review-progress-on-homes-rebuild-on-aug-8', 'https://www.thestar.com.my/news/nation/2025/08/04/zayn-rayyan-not-neglected-or-abused-child-psychologist-tells-court', 'https://www.thestar.com.my/news/nation/2025/08/04/sabah-focusing-on-skilled-human-capital-and-strengthening-tvet-says-cm', 'https://www.thestar.com.my/news/nation/2025/08/04/police-resubmit-investigation-paper-to-dpp-on-doctor039s-alleged-indecent-acts', 'https://www.thestar.com.my/news/nation/2025/08/04/malaysia-proposes-asean-monitoring-team-for-thai-cambodian-border', 'https://www.thestar.com.my/news/nation/2025/08/04/education-reform-in-13mp-vital-for-malaysias-economic-evolution-says-rafizi', 'https://www.thestar.com.my/news/nation/2025/08/04/mindset-change-needed-for-13mp-to-succ

In [6]:
import pandas as pd
import newspaper
from newspaper import Article, Config
from tqdm import tqdm
import time

# Shared newspaper configuration, read by fetch_article_with_retries below.
config = Config()
config.request_timeout = 90  # generous per-request timeout (presumably seconds -- confirm against newspaper docs)

# Accumulates one dict per successfully processed article.
article_details_list = []

def fetch_article_with_retries(url, retries=3, delay=5):
    """Download and parse a single article, retrying when an attempt fails.

    Args:
        url: Address of the article page.
        retries: Maximum number of attempts; must be at least 1.
        delay: Seconds to sleep between a failed attempt and the next one.

    Returns:
        The parsed ``Article`` object.

    Raises:
        ValueError: If ``retries`` is less than 1.
        Exception: Whatever the final attempt raised, once every attempt
            has failed.
    """
    if retries < 1:
        # Without this guard the loop never runs and the function would
        # silently return None, which only fails later at the call site.
        raise ValueError(f"retries must be >= 1, got {retries}")

    for attempt in range(1, retries + 1):
        try:
            article = Article(url=url, language='en', config=config)
            article.download()
            article.parse()
            return article
        except Exception:
            if attempt == retries:
                # Out of attempts: propagate the last failure to the caller.
                raise
            time.sleep(delay)

# Main loop
def _build_article_record(article, url):
    """Flatten a parsed article and its page metadata into one dataset row."""
    meta = article.meta_data

    # The category lives in a nested metadata entry. If that entry is missing
    # or is not a mapping, fall back to an empty dict so the article is kept
    # with a null category instead of being dropped by an AttributeError.
    cxense = meta.get("cXenseParse")
    if not isinstance(cxense, dict):
        cxense = {}

    # str(None) would store the literal text "None"; keep a real null instead.
    published = article.publish_date
    published_date = str(published) if published is not None else None

    return {
        "content_id": meta.get("content_id"),
        "title": article.title,
        "text": article.text,
        "section": meta.get("article_section_name"),
        "category": cxense.get("kicker_name"),
        "content_tier": meta.get("content_tier"),
        "content_length": meta.get("content_length"),
        "authors": meta.get("author"),
        "published_date": published_date,
        "keywords": meta.get("content_tags"),
        "summary": meta.get("description"),
        "url": url,
        "top_image": article.top_image,
    }


# Main loop: fetch each article, turn it into a row, and keep going on failure
# so that one bad URL does not abort the whole run.
for url in tqdm(all_news_urls, desc="Processing articles"):
    try:
        article = fetch_article_with_retries(url)
        article_details_list.append(_build_article_record(article, url))
        time.sleep(1)  # short pause between successful requests

    except newspaper.ArticleException as e:
        print(f"[Timeout] Failed to process URL: {url}\nError: {e}")
    except Exception as e:
        print(f"[Error] Failed to process URL: {url}\nError: {e}")

# Convert to DataFrame (built once from the list of row dicts)
df = pd.DataFrame(article_details_list)


Processing articles: 100%|███████████████████████████████████████████████████████| 1800/1800 [1:21:58<00:00,  2.73s/it]


In [7]:
# Persist the scraped dataset; index=False leaves the row index out of the file.
df.to_csv('news_dataset.csv', index=False)

In [8]:
# Show the scraped DataFrame as the cell's output.
df

Unnamed: 0,content_id,title,text,section,category,content_tier,content_length,authors,published_date,keywords,summary,url,top_image
0,1679260,Attempted cable theft led to power outage at K...,KOTA KINABALU: A power outage at the Intan cam...,News,Sabah & Sarawak,Complimentary,Short,,2025-08-04 00:00:00,"Sabah & Sarawak,Sabah Electric,Power Outage,Ca...",KOTA KINABALU: A power outage at the Intan cam...,https://www.thestar.com.my/news/nation/2025/08...,https://apicms.thestar.com.my/uploads/images/2...
1,1679254,Putra Heights inferno: Selangor to review prog...,SHAH ALAM: The Selangor government will review...,News,Nation,Complimentary,Short,,2025-08-04 00:00:00,"Putra Heights,Explosion,Fire,Inferno,Gas,Amiru...",SHAH ALAM: The Selangor government will review...,https://www.thestar.com.my/news/nation/2025/08...,https://apicms.thestar.com.my/uploads/images/2...
2,1679245,"Zayn Rayyan not neglected or abused, child psy...",PETALING JAYA: A child psychologist testified ...,News,Nation,Complimentary,Medium,,2025-08-04 00:00:00,"Courts Crime,Court,Zayn Rayyan,Neglect,Medical...",PETALING JAYA: A child psychologist testified ...,https://www.thestar.com.my/news/nation/2025/08...,https://apicms.thestar.com.my/uploads/images/2...
3,1679224,Sabah focusing on skilled human capital and st...,KOTA KINABALU: Sabah needs a highly educated w...,News,Sabah & Sarawak,Complimentary,Medium,,2025-08-04 00:00:00,"Sabah & Sarawak,Sabah,Chief Minister,Hajiji No...",KOTA KINABALU: Sabah needs a highly educated w...,https://www.thestar.com.my/news/nation/2025/08...,https://apicms.thestar.com.my/uploads/images/2...
4,1679223,Police resubmit investigation paper to DPP on ...,KEPALA BATAS: Penang police have resubmitted t...,News,Nation,Complimentary,Medium,,2025-08-04 00:00:00,"Courts Crime,PDRM,Police,Investigation Paper,D...",KEPALA BATAS: Penang police have resubmitted t...,https://www.thestar.com.my/news/nation/2025/08...,https://apicms.thestar.com.my/uploads/images/2...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1795,1627351,Thai twist on tipples,VISITING multiple bars to experience different...,Metro,Metro News,Complimentary,Medium,,2025-05-23 00:00:00,"bar hop,penang,cocktails",VISITING multiple bars to experience different...,https://www.thestar.com.my/metro/metro-news/20...,https://apicms.thestar.com.my/uploads/images/2...
1796,1627344,Packaged delights for Dragon Boat fest,Concorde Hotel Kuala Lumpur\n\nConcorde Hotel ...,Metro,Metro News,Complimentary,Medium,,2025-05-23 00:00:00,"Dragon Boat Festival,Dumpling Festival,glutino...",Xin Cuisine Chinese restaurant marks the Drago...,https://www.thestar.com.my/metro/metro-news/20...,https://apicms.thestar.com.my/uploads/images/2...
1797,1627349,Australian meats get Japanese treatment,Lovers of lamb and beef can take the opportuni...,Metro,Metro News,Complimentary,Short,,2025-05-23 00:00:00,"Australian meats,PARKROYAL COLLECTION,Koji Fuk...",Lovers of lamb and beef can take the opportuni...,https://www.thestar.com.my/metro/metro-news/20...,https://apicms.thestar.com.my/uploads/images/2...
1798,1622562,Do Italians themselves actually follow the Med...,My husband and I travelled to Italy in 2019.\n...,Lifestyle,Nutrition,Complimentary,Medium,,2025-05-22 00:00:00,"Nutrition,Diet,Mediterranean diet,nutrition,he...",Current dietary trends in Mediterranean countr...,https://www.thestar.com.my/lifestyle/health/20...,https://apicms.thestar.com.my/uploads/images/2...
