# In [3]:
import requests
from bs4 import BeautifulSoup
import json
import os
import re
import time
import random
from concurrent.futures import ThreadPoolExecutor
from fake_useragent import UserAgent

# Build the request headers once at load time. `ua.random` is read a single
# time here, so every request made through `headers` shares the same
# User-Agent value for the lifetime of this run.
ua = UserAgent()
headers = {'User-Agent': ua.random}

# Define a function to retry requests
def fetch_url(url, retries=5, timeout=10):
    """Fetch *url* with GET, retrying on any ``requests`` error.

    Args:
        url: Address to request.
        retries: Maximum number of attempts before giving up.
        timeout: Seconds to wait for the server before the attempt is
            treated as failed. Without a timeout a stalled connection
            would block the calling worker thread indefinitely.

    Returns:
        The successful response object, or ``None`` when every attempt
        failed (network error, timeout, or a 4xx/5xx status).
    """
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            # Turn 4xx/5xx answers into exceptions so they are retried too.
            response.raise_for_status()
            return response
        except requests.RequestException as e:
            print(f"Error fetching {url}: {e}. Retrying... ({attempt}/{retries})")
            # Back off for a random interval, but only when another attempt
            # will actually follow; sleeping after the last one is wasted time.
            if attempt < retries:
                time.sleep(random.uniform(1, 3))
    return None

# Load scraped links from a file
def load_scraped_links(filename):
    """Return the JSON value stored in *filename*, or ``[]`` if unavailable.

    A missing file and a file whose content is not valid JSON are both
    treated as "nothing stored yet" and yield an empty list.
    """
    try:
        handle = open(filename, 'r', encoding='utf-8')
    except FileNotFoundError:
        return []

    with handle:
        try:
            return json.load(handle)
        except json.JSONDecodeError:
            return []

# Save scraped links to a file
def save_scraped_link(link, filename):
    """Record *link* in the JSON list stored at *filename*.

    The file is re-read on every call; it is rewritten only when *link*
    is not already present, so repeated calls with the same link leave
    the file untouched.
    """
    known_links = load_scraped_links(filename)
    if link in known_links:
        return

    known_links.append(link)
    with open(filename, 'w', encoding='utf-8') as out:
        json.dump(known_links, out, ensure_ascii=False, indent=4)

# Append article to the JSON file
def append_article(article, category):
    """Add *article* to the category's JSON file, keeping the newest five.

    The target file is ``scraped_articles_<category>.json``. After the new
    article is appended, everything but the last five entries is dropped
    before the file is rewritten.
    """
    target_path = f"scraped_articles_{category}.json"
    stored = load_scraped_links(target_path)
    stored.append(article)
    newest = stored[-5:]  # Keep only last 5 articles
    with open(target_path, 'w', encoding='utf-8') as out:
        json.dump(newest, out, ensure_ascii=False, indent=4)

# Scrape article content
def scrape_article_content(article, category):
    """Download the article body, attach it to *article*, and persist it.

    *article* is modified in place: its ``'content'`` key is set to the list
    of non-empty paragraph texts found inside the ``ArticleBodyCont`` div,
    with anything matching ``http\\S+`` removed. A one-element placeholder
    list is stored instead when the page cannot be fetched or the div is
    absent. The article is then handed to ``append_article`` in every case.
    """
    page = fetch_url(article['href'])
    if page:
        parsed = BeautifulSoup(page.content, 'html.parser')
        body = parsed.find('div', class_='ArticleBodyCont')
        if body:
            # Strip each paragraph once, drop the empty ones, then remove URLs.
            stripped = (p.text.strip() for p in body.find_all('p'))
            article['content'] = [
                re.sub(r'http\S+', '', text) for text in stripped if text
            ]
        else:
            article['content'] = ['Content not available']
    else:
        article['content'] = ['Failed to retrieve content']

    append_article(article, category)

# Scrape page and articles
def scrape_page(url, scraped_links_filename, category):
    """Scrape the not-yet-seen articles among a listing page's top figures.

    Fetches *url*, inspects its first five ``<figure>`` elements and, for
    each one that links to an article not already recorded in
    *scraped_links_filename*, records the link and then downloads and
    stores the article via ``scrape_article_content``.

    Args:
        url: Listing page to fetch.
        scraped_links_filename: JSON file listing links already scraped.
        category: Label stored on each article and passed on to
            ``scrape_article_content``.

    Returns:
        The article dicts scraped during this call (at most five). The
        list is empty when the page could not be fetched or contained
        nothing new.
    """
    scraped_links = load_scraped_links(scraped_links_filename)
    response = fetch_url(url)
    if not response:
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    articles = []
    # `scraped_links` is a snapshot taken before the loop, so links handled
    # during this call are tracked separately; otherwise a link that appears
    # in two figures on the same page would be scraped and stored twice.
    seen_on_page = set()

    # Slicing caps the work at the top 5 figures, hence at most 5 articles.
    for block in soup.find_all('figure')[:5]:
        a_tag = block.find('a', href=True)
        if not a_tag:
            continue

        href = a_tag['href']
        if href in scraped_links or href in seen_on_page:
            continue

        # Only figures carrying a card title with an <h3> count as articles.
        card_title_div = a_tag.find('div', class_='card_title')
        title_tag = card_title_div.find('h3') if card_title_div else None
        if not title_tag:
            continue

        article = {
            'title': title_tag.text.strip(),
            'href': href,
            'category': category,
        }
        articles.append(article)
        seen_on_page.add(href)
        save_scraped_link(href, scraped_links_filename)
        scrape_article_content(article, category)

    return articles

# Scrape all pages for a category
def scrape_all_pages(base_url, scraped_links_filename, category):
    """Walk a category's paginated listing until five articles are gathered.

    Pages are requested as ``<base_url>/page/<n>/`` starting at ``n = 1``.
    The walk stops as soon as five or more articles have been collected, or
    when a page yields no articles at all.

    Returns:
        At most five article dicts, in the order they were scraped.
    """
    collected = []
    page_number = 0
    while True:
        if len(collected) >= 5:
            break
        page_number += 1
        batch = scrape_page(
            f"{base_url}/page/{page_number}/", scraped_links_filename, category
        )
        if not batch:
            break
        collected += batch
    return collected[:5]

# Scrape multiple categories concurrently
def scrape_categories(categories):
    """Scrape every category on a pool of up to five worker threads.

    Args:
        categories: Mapping of category label to listing base URL.

    For each category the two JSON files it uses (scraped links and scraped
    articles) are created as empty lists if they do not exist yet, then a
    ``scrape_all_pages`` job is submitted. The function returns once every
    job has been waited on; the jobs' return values are discarded.
    """
    with ThreadPoolExecutor(max_workers=5) as pool:
        pending = []
        for category, base_url in categories.items():
            links_path = f"scraped_links_{category}.json"
            articles_path = f"scraped_articles_{category}.json"

            # Ensure files exist
            for path in (links_path, articles_path):
                if os.path.exists(path):
                    continue
                with open(path, 'w', encoding='utf-8') as fh:
                    json.dump([], fh)

            job = pool.submit(scrape_all_pages, base_url, links_path, category)
            pending.append(job)

        # Block on each job in submission order.
        for job in pending:
            job.result()

# Category label -> listing base URL. The label is also interpolated into the
# per-category JSON file names (scraped_links_<label>.json and
# scraped_articles_<label>.json), so its exact spelling and case matter.
categories = {
    'Entertainment': 'https://tv9telugu.com/entertainment',
    'Andhra-Pradesh': 'https://tv9telugu.com/andhra-pradesh',
    'Telangana': 'https://tv9telugu.com/telangana',
    'Sports': 'https://tv9telugu.com/sports',
    'national': 'https://tv9telugu.com/national',
    'politics': 'https://tv9telugu.com/politics',
    'Crime': 'https://tv9telugu.com/crime',
    'health': 'https://tv9telugu.com/health',
    'Business': 'https://tv9telugu.com/business',
    'Lifestyle': 'https://tv9telugu.com/lifestyle',
    'Technology': 'https://tv9telugu.com/technology',
    'Spiritual': 'https://tv9telugu.com/spiritual',
    'International': 'https://tv9telugu.com/world',
}

# Start scraping. This call sits at top level, so it runs as soon as this
# cell/module is executed.
scrape_categories(categories)
