In [1]:
!pip install BeautifulSoup4



In [2]:
import json
import os
import random
import time
import zipfile
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Class Explanation: `NewsScraper`

## Overview
The `NewsScraper` class is designed for scraping news articles from five different Urdu news websites: Express, Geo, Jang, Dunya, and Dawn. The class has one method per site, each tailored to that site's page structure. Below, we go through the class and its methods, detailing what each function does, the input it takes, and the output it returns.

## Class Definition

```python
class NewsScraper:
    def __init__(self, id_=0):
        self.id = id_
```


## Method 1: `get_express_articles`

### Description
Scrapes news articles from the Express website across categories like saqafat (entertainment), business, sports, science-technology, and world. The method navigates through multiple pages for each category to gather a more extensive dataset.

### Input
- **`max_pages`**: The number of pages to scrape for each category (default is 7).

### Process
- Iterates over each category and page.
- Requests each category page and finds article cards within `<ul class='tedit-shortnews listing-page'>`.
- Extracts the article's headline, link, and content by navigating through `<div class='horiz-news3-caption'>` and `<span class='story-text'>`.

### Output
- **Returns**: A Pandas DataFrame containing the columns `id`, `title`, `link`, `content`, and `gold_label`.
  - `content` holds the article text, joined from the article's paragraphs.
  - `gold_label` holds the article's category (for example, `saqafat` is stored as `entertainment`).

### Data Structure
- Article cards are identified by `<li>` tags.
- Content is structured within `<span class='story-text'>` and `<p>` tags.



In [3]:
class NewsScraper:
    """Scrape Urdu news articles from Express, Geo, Jang, Dunya and Dawn.

    Every ``get_*_articles`` method returns a ``pandas.DataFrame`` with the
    columns ``id``, ``title``, ``link``, ``content`` and ``gold_label``.
    ``self.id`` is a running counter that every method increments once per
    stored article, so ids stay unique when one instance scrapes several sites.

    Listing pages are fetched strictly: an HTTP or network error on a listing
    page propagates to the caller. Individual articles are best-effort: any
    error while scraping one article is printed and that article is skipped.
    """

    # Seconds to wait for a single HTTP response. Without a timeout a stalled
    # server would block the whole scrape indefinitely.
    REQUEST_TIMEOUT = 30

    # Column names (and order) of every returned DataFrame.
    _COLUMNS = ("id", "title", "link", "content", "gold_label")

    # Site-specific URL slug -> label written to `gold_label`.
    _EXPRESS_LABELS = {
        'saqafat': 'entertainment',   # saqafat is the entertainment category
        'business': 'business',
        'sports': 'sports',
        'science': 'science-technology',
        'world': 'world',
    }
    _DUNYA_LABELS = {
        'Entertainment': 'entertainment',
        'Business': 'business',
        'Sports': 'sports',
        'Technology': 'science-technology',
        'World': 'world',
    }
    _DAWN_LABELS = {
        'life-style': 'entertainment',
        'business': 'business',
        'sport': 'sports',
        'tech': 'science-technology',
        'world': 'world',
    }
    # Geo and Jang use the gold labels directly as URL slugs.
    _GEO_JANG_CATEGORIES = ('entertainment', 'business', 'sports', 'science-technology', 'world')

    def __init__(self, id_=0):
        """Create a scraper whose first stored article receives id ``id_``."""
        self.id = id_

    @classmethod
    def _empty_records(cls):
        """Return a fresh column-name -> empty-list mapping for one scrape."""
        return {column: [] for column in cls._COLUMNS}

    @staticmethod
    def _join_paragraphs(texts):
        """Join paragraph strings with single spaces.

        Non-breaking spaces become regular spaces, zero-width spaces are
        removed, and paragraphs that are empty after cleaning are dropped.
        """
        cleaned = (
            text.replace('\xa0', ' ').replace('\u200b', '').strip()
            for text in texts
        )
        return " ".join(text for text in cleaned if text)

    @staticmethod
    def _children_of(soup, tag, class_, child_tag):
        """Return all ``child_tag`` elements inside the first ``tag.class_``.

        Returns an empty list when the container is absent, so a layout
        change on one listing page does not abort the whole scrape.
        """
        container = soup.find(tag, class_=class_)
        return container.find_all(child_tag) if container is not None else []

    def _fetch_soup(self, url):
        """Download ``url`` and parse it with the built-in HTML parser.

        Raises:
            requests.RequestException: on network errors, timeouts or a
                non-2xx status code.
        """
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        return BeautifulSoup(response.text, "html.parser")

    def _scrape_cards(self, cards, records, *, label, fail_prefix, base_url,
                      content_tag, content_class,
                      container_class=None, link_class=None):
        """Scrape every article card of one listing page into ``records``.

        Args:
            cards: Parsed elements, one per article teaser.
            records: Column-name -> list mapping that is appended to in place.
            label: Value stored in ``gold_label`` for every article.
            fail_prefix: Text printed before ``": <error>"`` when a card fails.
            base_url: Site root used to resolve relative article links.
            content_tag: Tag name of the element wrapping the article body.
            content_class: CSS class of that element.
            container_class: Class of the ``<div>`` inside the card that holds
                the headline anchor; ``None`` means the card itself holds it.
            link_class: Class of the anchor carrying the article URL; ``None``
                means the first anchor (the headline anchor) is used.

        Returns:
            The number of articles stored. ``self.id`` advances by that amount.
        """
        scraped = 0
        # The same article can be matched by several cards of one listing
        # (e.g. nested wrapper <div>s), so each link is stored only once.
        seen_links = set()

        for card in cards:
            try:
                holder = card if container_class is None else card.find('div', class_=container_class)
                if holder is None:
                    raise ValueError(f"card has no <div class='{container_class}'>")

                title_anchor = holder.find('a')
                if title_anchor is None:
                    raise ValueError("card has no <a> element")
                headline = title_anchor.get_text(strip=True).replace('\xa0', ' ')

                link_anchor = title_anchor if link_class is None else holder.find('a', class_=link_class)
                if link_anchor is None:
                    raise ValueError(f"card has no <a class='{link_class}'>")
                # urljoin leaves absolute hrefs alone and resolves relative ones.
                link = urljoin(base_url, link_anchor['href'])
                if link in seen_links:
                    continue

                body = self._fetch_soup(link).find(content_tag, class_=content_class)
                if body is None:
                    raise ValueError(f"article has no <{content_tag} class='{content_class}'>")
                content = self._join_paragraphs(
                    para.get_text(strip=True) for para in body.find_all('p')
                )
            except Exception as error:
                # Deliberately broad: one bad article must not stop the scrape.
                print(f"{fail_prefix}: {error}")
                continue

            records['id'].append(self.id)
            records['title'].append(headline)
            records['link'].append(link)
            records['gold_label'].append(label)
            records['content'].append(content)
            seen_links.add(link)

            self.id += 1
            scraped += 1

        return scraped

    def get_express_articles(self, max_pages=7):
        """Scrape express.pk archive pages for the five categories.

        Args:
            max_pages: Number of archive pages to scrape per category.

        Returns:
            DataFrame with columns ``id``, ``title``, ``link``, ``content``
            and ``gold_label``.

        Raises:
            requests.RequestException: if a listing page cannot be fetched.
        """
        records = self._empty_records()
        base_url = 'https://www.express.pk'

        for category, label in self._EXPRESS_LABELS.items():
            for page in range(1, max_pages + 1):
                print(f"Scraping page {page} of category '{category}'...")
                soup = self._fetch_soup(f"{base_url}/{category}/archives?page={page}")

                cards = self._children_of(soup, 'ul', 'tedit-shortnews listing-page', 'li')
                print(f"\t--> Found {len(cards)} articles on page {page} of '{category}'.")

                scraped = self._scrape_cards(
                    cards, records,
                    label=label,
                    fail_prefix=f"\t--> Failed to scrape an article on page {page} of '{category}'",
                    base_url=base_url,
                    container_class='horiz-news3-caption',
                    content_tag='span',
                    content_class='story-text',
                )
                print(f"\t--> Successfully scraped {scraped} articles from page {page} of '{category}'.")
            print('')

        return pd.DataFrame(records)

    def get_geo_urdu_articles(self):
        """Scrape the first listing page of each urdu.geo.tv category.

        Returns:
            DataFrame with columns ``id``, ``title``, ``link``, ``content``
            and ``gold_label``.

        Raises:
            requests.RequestException: if a listing page cannot be fetched.
        """
        records = self._empty_records()
        base_url = 'https://urdu.geo.tv'

        for category in self._GEO_JANG_CATEGORIES:
            print(f"Scraping category '{category}'...")
            soup = self._fetch_soup(f"{base_url}/category/{category}")

            cards = soup.find_all('li', class_='border-box')
            print(f"\t--> Found {len(cards)} of '{category}'.")

            scraped = self._scrape_cards(
                cards, records,
                label=category,
                fail_prefix=f"\t--> Failed to scrape'{category}'",
                base_url=base_url,
                link_class='open-section',
                content_tag='div',
                content_class='content-area',
            )
            print(f"\t--> Successfully scraped {scraped} articles from '{category}'.")
            print('')

        return pd.DataFrame(records)

    def get_jang_articles(self):
        """Scrape the first listing page of each jang.com.pk category.

        Returns:
            DataFrame with columns ``id``, ``title``, ``link``, ``content``
            and ``gold_label``.

        Raises:
            requests.RequestException: if a listing page cannot be fetched.
        """
        records = self._empty_records()
        base_url = 'https://jang.com.pk'

        for category in self._GEO_JANG_CATEGORIES:
            print(f"Scraping category '{category}'...")
            soup = self._fetch_soup(f"{base_url}/category/latest-news/{category}")

            cards = self._children_of(soup, 'ul', 'scrollPaginationNew__', 'li')
            print(f"\t--> Found {len(cards)} of '{category}'.")

            scraped = self._scrape_cards(
                cards, records,
                label=category,
                fail_prefix=f"\t--> Failed to scrape'{category}'",
                base_url=base_url,
                container_class='main-heading',
                content_tag='div',
                content_class='detail_view_content',
            )
            print(f"\t--> Successfully scraped {scraped} articles from '{category}'.")
            print('')

        return pd.DataFrame(records)

    def get_dunya_articles(self):
        """Scrape the first listing page of each urdu.dunyanews.tv category.

        Returns:
            DataFrame with columns ``id``, ``title``, ``link``, ``content``
            and ``gold_label``.

        Raises:
            requests.RequestException: if a listing page cannot be fetched.
        """
        records = self._empty_records()
        base_url = 'https://urdu.dunyanews.tv'

        for category, label in self._DUNYA_LABELS.items():
            print(f"Scraping category '{category}'...")
            soup = self._fetch_soup(f"{base_url}/index.php/ur/{category}")

            # Every <div> below the news box is tried as a card; those without
            # a headline column are reported as failures and skipped.
            cards = self._children_of(soup, 'div', 'newsBox categories', 'div')
            print(f"\t--> Found {len(cards)} of '{category}'.")

            scraped = self._scrape_cards(
                cards, records,
                label=label,
                fail_prefix=f"\t--> Failed to scrape'{category}'",
                base_url=base_url,
                container_class='col-md-8',
                content_tag='div',
                content_class='main-news col-md-12',
            )
            print(f"\t--> Successfully scraped {scraped} articles from'{category}'.")
            print('')

        return pd.DataFrame(records)

    def get_dawn_articles(self):
        """Scrape the first listing page of each dawnnews.tv category.

        Returns:
            DataFrame with columns ``id``, ``title``, ``link``, ``content``
            and ``gold_label``.

        Raises:
            requests.RequestException: if a listing page cannot be fetched.
        """
        records = self._empty_records()
        base_url = 'https://www.dawnnews.tv'

        for category, label in self._DAWN_LABELS.items():
            print(f"Scraping category '{category}'...")
            soup = self._fetch_soup(f"{base_url}/category/{category}")

            cards = soup.find_all('h2', class_='story__title')
            print(f"\t--> Found {len(cards)} of '{category}'.")

            scraped = self._scrape_cards(
                cards, records,
                label=label,
                fail_prefix=f"\t--> Failed to scrape'{category}'",
                base_url=base_url,
                link_class='story__link',
                content_tag='div',
                content_class='story__content',
            )
            print(f"\t--> Successfully scraped {scraped} articles from '{category}'.")
            print('')

        return pd.DataFrame(records)



In [4]:
# A single scraper instance is shared by every site so article ids start at 0
# and keep counting up across all of them.
scraper = NewsScraper(id_=0)

In [5]:
# All five calls use the same scraper, whose running id counter is advanced by
# every stored article; the order of these calls therefore fixes the id ranges.
df_express = scraper.get_express_articles(max_pages=7)
df_dunya = scraper.get_dunya_articles() 
df_jang = scraper.get_jang_articles()
df_geo = scraper.get_geo_urdu_articles()
df_dawn = scraper.get_dawn_articles()

Scraping page 1 of category 'saqafat'...
	--> Found 10 articles on page 1 of 'saqafat'.
	--> Successfully scraped 10 articles from page 1 of 'saqafat'.
Scraping page 2 of category 'saqafat'...
	--> Found 10 articles on page 2 of 'saqafat'.
	--> Successfully scraped 10 articles from page 2 of 'saqafat'.
Scraping page 3 of category 'saqafat'...
	--> Found 10 articles on page 3 of 'saqafat'.
	--> Successfully scraped 10 articles from page 3 of 'saqafat'.
Scraping page 4 of category 'saqafat'...
	--> Found 10 articles on page 4 of 'saqafat'.
	--> Successfully scraped 10 articles from page 4 of 'saqafat'.
Scraping page 5 of category 'saqafat'...
	--> Found 10 articles on page 5 of 'saqafat'.
	--> Successfully scraped 10 articles from page 5 of 'saqafat'.
Scraping page 6 of category 'saqafat'...
	--> Found 10 articles on page 6 of 'saqafat'.
	--> Successfully scraped 10 articles from page 6 of 'saqafat'.
Scraping page 7 of category 'saqafat'...
	--> Found 10 articles on page 7 of 'saqafat'.


# Output
- Save a combined CSV of all 5 sites.

In [6]:
# Stack the per-site frames into one table with a fresh 0..n-1 index,
# then write it out without that index column.
site_frames = [df_express, df_dunya, df_jang, df_geo, df_dawn]
df = pd.concat(site_frames, ignore_index=True)
df.to_csv("raw_data.csv", index=False)

In [7]:
# Report (rows, columns) of each per-site frame, one per line.
for site_frame in (df_express, df_dunya, df_dawn, df_jang, df_geo):
    print(site_frame.shape)

(350, 5)
(180, 5)
(409, 5)
(494, 5)
(300, 5)
