### Text to Embeddings

In [5]:
import logging
import json
import time
import re
import random
import os
from datetime import datetime
from urllib.parse import quote
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Dict, Optional
from newspaper import Article, Config
import glob
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type

In [2]:

try:
    NEWSPAPER_AVAILABLE = True
except ImportError:
    NEWSPAPER_AVAILABLE = False
    print("Newspaper3k not installed. Running in basic mode.")

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# List of User-Agent strings
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_2_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14.2; rv:121.0) Gecko/20100101 Firefox/121.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_2_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
]

# Configure Newspaper3k if available
if NEWSPAPER_AVAILABLE:
    config = Config()
    config.browser_user_agent = random.choice(USER_AGENTS)
    config.request_timeout = 10
    config.fetch_images = False

# Read keywords from text file
try:
    with open('keywords.txt', 'r', encoding='utf-8') as f:
        keywords = [line.strip() for line in f if line.strip()]
    logger.info(f"Loaded {len(keywords)} keywords: {keywords}")
except Exception as e:
    logger.error(f"Error reading keywords.txt: {e}")

# Function to fetch article URLs for a keyword with pagination
def fetch_article_urls(keyword):
    try:
        # Use the correct URL format for TOI topic pages
        base_url = 'https://timesofindia.indiatimes.com/topic/'
        keyword_url = f"{base_url}{quote(keyword)}"
        logger.info(f"Fetching article URLs for keyword: {keyword} from {keyword_url}")

        article_links = []
        current_url = keyword_url
        headers = {'User-Agent': random.choice(USER_AGENTS)}

        for page_num in range(3):  # Adjust pagination limit here
            # Fetch page
            response = requests.get(current_url, headers=headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract article links - adjust selectors to match current TOI HTML structure
            links = soup.select('a[href*="/articleshow/"]')
            for link in links:
                href = link.get('href')
                if href:
                    if href.startswith('/'):
                        href = 'https://timesofindia.indiatimes.com' + href
                    if href not in article_links and re.match(r'.*/articleshow/\d+\.cms', href):
                        article_links.append(href)

            # Check for "Next" button - might need to adjust selector
            next_link = soup.select_one('a.next, a[rel="next"], a[aria-label*="next"]')
            if not next_link or not next_link.get('href'):
                break
            next_url = next_link.get('href')
            if next_url.startswith('/'):
                next_url = 'https://timesofindia.indiatimes.com' + next_url
            current_url = next_url
            headers['User-Agent'] = random.choice(USER_AGENTS)
            time.sleep(1)  # Be nice to the server

        # If we found too many articles, limit to first 10
        if len(article_links) > 10:
            article_links = article_links[:10]
            
        logger.info(f"Found {len(article_links)} article URLs for keyword: {keyword}")
        return article_links, keyword
    except requests.RequestException as e:
        logger.error(f"Error fetching article URLs for keyword {keyword}: {e}")
        return [], keyword
    except Exception as e:
        logger.error(f"Unexpected error fetching article URLs for keyword {keyword}: {e}")
        return [], keyword

# Function to process an article with or without Newspaper3k
def process_article(article_url, keyword, index):
    if NEWSPAPER_AVAILABLE:
        return process_article_with_newspaper(article_url, keyword, index)
    else:
        return process_article_with_bs4(article_url, keyword, index)

# Process article using Newspaper3k
def process_article_with_newspaper(article_url, keyword, index):
    try:
        # Update User-Agent for this article
        config.browser_user_agent = random.choice(USER_AGENTS)

        # Initialize Newspaper3k article
        article = Article(article_url, config=config)
        article.download()
        article.parse()
        article.nlp()  # For keywords

        # Extract data
        title = article.title or "No title found"
        publish_date = article.publish_date.isoformat() if article.publish_date else None
        authors = article.authors if article.authors else []
        author = authors[0] if authors else None
        full_text = article.text or ""
        article_keywords = article.keywords if article.keywords else []

        # Create article info
        article_info = {
            'title': title.strip(),
            'url': article_url,
            'publish_date': publish_date,
            'keyword': keyword,
            'author': author.strip() if author else None,
            'article_keywords': [kw.strip() for kw in article_keywords],
            'full_text': full_text.strip()
        }
        return article_info, index
    except Exception as e:
        logger.warning(f"Error processing article {article_url} with Newspaper3k: {e}")
        # Fall back to BS4 if Newspaper3k fails
        return process_article_with_bs4(article_url, keyword, index)

# Process article using BeautifulSoup (no Newspaper3k)
def process_article_with_bs4(article_url, keyword, index):
    try:
        headers = {'User-Agent': random.choice(USER_AGENTS)}
        response = requests.get(article_url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract title
        title_elem = soup.select_one('h1, h2')
        title = title_elem.get_text().strip() if title_elem else "No title found"

        # Extract publish date
        date_elem = soup.select_one('div.as_byline > div > span, time, meta[name="publish-date"]')
        publish_date = None
        if date_elem:
            publish_date = date_elem.get_text() or date_elem.get('content') or date_elem.get('datetime')
            if publish_date:
                publish_date = publish_date.strip()

        # Extract full article text
        article_body = soup.select('div._s30W > div._3WlLe, div.articlebody, div.content, div.main, article p')
        full_text = ' '.join([elem.get_text().strip() for elem in article_body]).strip() if article_body else ""

        # Extract author
        author_elem = soup.select_one('div.as_byline > a, span.author, meta[name="author"]')
        author = None
        if author_elem:
            author = author_elem.get_text() or author_elem.get('content')
            if author:
                author = author.strip()

        # Extract article-specific keywords
        article_keywords = []
        meta_keywords = soup.select_one('meta[name="keywords"]')
        if meta_keywords and meta_keywords.get('content'):
            keywords_content = meta_keywords.get('content')
            article_keywords.extend([kw.strip() for kw in keywords_content.split(',')])
        
        # Try visible tags
        tag_elems = soup.select('div.tags a, ul.tags li a')
        if tag_elems:
            article_keywords.extend([elem.get_text().strip() for elem in tag_elems])
        
        # Remove duplicates and empty strings
        article_keywords = list(set([kw for kw in article_keywords if kw]))

        # Create article info
        article_info = {
            'title': title,
            'url': article_url,
            'publish_date': publish_date,
            'keyword': keyword,
            'author': author,
            'article_keywords': article_keywords,
            'full_text': full_text
        }
        return article_info, index
    except Exception as e:
        logger.warning(f"Error processing article {article_url} with BeautifulSoup: {e}")
        return None, index

# Main scraping process
def main():
    max_workers = 3  # Reduced workers to minimize server load
    articles_data = []
    url_data = []  # Store URLs with keywords

    # Phase 1: Collect all article URLs
    try:
        logger.info("Starting Phase 1: Collecting article URLs")
        for keyword in keywords:
            # Fetch article URLs
            article_urls, keyword = fetch_article_urls(keyword)
            
            if not article_urls:
                logger.warning(f"No article URLs found for keyword: {keyword}")
                continue

            # Store URLs with keyword
            url_data.extend([{'url': url, 'keyword': keyword} for url in article_urls])
            logger.info(f"Collected {len(article_urls)} URLs for keyword: {keyword}")

        # Save URLs to file
        url_output_file = 'article_urls.json'
        try:
            with open(url_output_file, 'w', encoding='utf-8') as f:
                json.dump(url_data, f, indent=4, ensure_ascii=False)
            logger.info(f"Saved {len(url_data)} article URLs to {url_output_file}")
        except Exception as e:
            logger.error(f"Failed to save URLs to {url_output_file}: {e}")

    except Exception as e:
        logger.error(f"Error during URL collection: {e}")

    # Phase 2: Scrape article data
    if url_data:
        logger.info("Starting Phase 2: Scraping article data")
        try:
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                # Submit article processing tasks
                future_to_article = {
                    executor.submit(process_article, item['url'], item['keyword'], i): item['url']
                    for i, item in enumerate(url_data, 1)
                }

                # Collect results
                for future in as_completed(future_to_article):
                    article_info, index = future.result()
                    if article_info:
                        articles_data.append(article_info)
                        logger.info(f"Processed article {index} for keyword {article_info['keyword']}: {article_info['title']}")
                    time.sleep(0.2)  # 200ms delay
        except Exception as e:
            logger.error(f"Error during article scraping: {e}")

    # Save article data to JSON file
    output_file = 'toi_articles_by_keyword.json'
    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(articles_data, f, indent=4, ensure_ascii=False)
        logger.info(f"Saved {len(articles_data)} articles to {output_file}")
    except Exception as e:
        logger.error(f"Failed to save JSON file: {e}")

    print(f"Scraping complete. Saved {len(articles_data)} articles to {output_file}")
    return articles_data

if __name__ == "__main__":
    main()

2025-05-05 10:53:00,846 - INFO - Loaded 5 keywords: ['city', 'business', 'sports', 'world', 'india']
2025-05-05 10:53:00,848 - INFO - Starting Phase 1: Collecting article URLs
2025-05-05 10:53:00,850 - INFO - Fetching article URLs for keyword: city from https://timesofindia.indiatimes.com/topic/city
2025-05-05 10:53:01,463 - INFO - Found 10 article URLs for keyword: city
2025-05-05 10:53:01,464 - INFO - Collected 10 URLs for keyword: city
2025-05-05 10:53:01,465 - INFO - Fetching article URLs for keyword: business from https://timesofindia.indiatimes.com/topic/business
2025-05-05 10:53:02,086 - INFO - Found 10 article URLs for keyword: business
2025-05-05 10:53:02,087 - INFO - Collected 10 URLs for keyword: business
2025-05-05 10:53:02,088 - INFO - Fetching article URLs for keyword: sports from https://timesofindia.indiatimes.com/topic/sports
2025-05-05 10:53:02,350 - INFO - Found 10 article URLs for keyword: sports
2025-05-05 10:53:02,354 - INFO - Collected 10 URLs for keyword: sports

Scraping complete. Saved 50 articles to toi_articles_by_keyword.json


### Increased news channels endpoint and articles fetched 

In [2]:
try:
    NEWSPAPER_AVAILABLE = True
except ImportError:
    NEWSPAPER_AVAILABLE = False
    logging.warning("Newspaper3k not installed. Falling back to BeautifulSoup or API snippets.")

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# List of User-Agent strings
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_2_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
]

# NewsAPI configuration
NEWS_API_KEY = '08690429ee754ed4a26bba92b29f2688'  # Replace with your NewsAPI key from https://newsapi.org/
NEWS_API_ENDPOINT = 'https://newsapi.org/v2/everything'

# Newspaper3k configuration
if NEWSPAPER_AVAILABLE:
    newspaper_config = Config()
    newspaper_config.browser_user_agent = random.choice(USER_AGENTS)
    newspaper_config.request_timeout = 10
    newspaper_config.fetch_images = False
    newspaper_config.memoize_articles = False

# Define news sources with BeautifulSoup selectors for fallback
NEWS_SOURCES = {
    "timesofindia": {
        "api_id": "the-times-of-india",
        "content_selector": 'div._s30W > div._3WlLe, div.articlebody, div.content, div.main, article p'
    },
    "ndtv": {
        "api_id": "ndtv",
        "content_selector": 'div.sp-cn, div.content_text, article p'
    },
    "indiatoday": {
        "api_id": "india-today",
        "content_selector": 'div.story-right, div.article-body, div.description'
    },
    "bbc": {
        "api_id": "bbc-news",
        "content_selector": 'article div[data-component="text-block"], div.story-body__inner p'
    },
    "cnn": {
        "api_id": "cnn",
        "content_selector": 'div.pg-rail-tall__body, div.article__content, section.body-text'
    },
    "indianexpress": {
        "api_id": "the-indian-express",
        "content_selector": 'div#article, div.article-body, div.entry-content'
    }
}

# Create output directory
def create_directories():
    os.makedirs('data', exist_ok=True)

# Load keywords from file
def load_keywords() -> List[str]:
    try:
        with open('keywords.txt', 'r', encoding='utf-8') as f:
            keywords = [line.strip() for line in f if line.strip()]
        logger.info(f"Loaded {len(keywords)} keywords: {keywords}")
        return keywords
    except Exception as e:
        logger.error(f"Error reading keywords.txt: {e}")
        return ["covid", "economy", "climate"]  # Fallback keywords

# Scrape full text with BeautifulSoup
def scrape_with_bs4(url: str, source_name: str) -> str:
    try:
        source_config = NEWS_SOURCES[source_name]
        headers = {'User-Agent': random.choice(USER_AGENTS)}
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract full article text
        article_body = soup.select(source_config['content_selector'])
        full_text = ' '.join([elem.get_text().strip() for elem in article_body]).strip() if article_body else ""
        return full_text
    except Exception as e:
        logger.warning(f"Error scraping {url} with BeautifulSoup: {e}")
        return ""

# Fetch articles from NewsAPI and scrape full text
def fetch_articles(keyword: str, source_name: str, page: int = 1) -> Optional[Dict]:
    try:
        source_id = NEWS_SOURCES.get(source_name, {}).get('api_id')
        if not source_id:
            logger.error(f"Source {source_name} not supported by NewsAPI")
            return None

        headers = {'User-Agent': random.choice(USER_AGENTS)}
        params = {
            'q': keyword,
            'apiKey': NEWS_API_KEY,
            'sources': source_id,
            'page': page,
            'pageSize': 20,  # Reduced to manage scraping load
            'language': 'en',
            'sortBy': 'publishedAt'
        }

        logger.info(f"Fetching articles for keyword: {keyword} from {source_name}, page {page}")
        response = requests.get(NEWS_API_ENDPOINT, headers=headers, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()

        if data.get('status') != 'ok':
            logger.error(f"NewsAPI error for {keyword} from {source_name}: {data.get('message')}")
            return None

        articles = data.get('articles', [])
        total_results = data.get('totalResults', 0)

        formatted_articles = []
        for article in articles:
            # Get snippet from NewsAPI
            snippet = article.get('content', '') or article.get('description', '')
            full_text = snippet
            article_url = article.get('url', '')

            # Try Newspaper3k first
            if NEWSPAPER_AVAILABLE:
                try:
                    newspaper_config.browser_user_agent = random.choice(USER_AGENTS)
                    news_article = Article(article_url, config=newspaper_config)
                    news_article.download()
                    news_article.parse()
                    full_text = news_article.text.strip() if news_article.text else snippet
                    logger.info(f"Scraped full text with Newspaper3k for {article_url}")
                except Exception as e:
                    logger.warning(f"Newspaper3k failed for {article_url}: {e}")
                    # Fall back to BeautifulSoup
                    full_text = scrape_with_bs4(article_url, source_name) or snippet
            else:
                # Use BeautifulSoup if Newspaper3k is unavailable
                full_text = scrape_with_bs4(article_url, source_name) or snippet

            formatted_articles.append({
                'title': article.get('title', 'No title found').strip(),
                'url': article_url,
                'publish_date': article.get('publishedAt', None),
                'keyword': keyword,
                'source': source_name,
                'author': article.get('author', None),
                'article_keywords': [],  # Can be enhanced with NLP
                'full_text': full_text
            })

        return {
            'articles': formatted_articles,
            'total_results': total_results,
            'keyword': keyword,
            'source': source_name,
            'page': page
        }
    except requests.RequestException as e:
        logger.error(f"Error fetching articles for {keyword} from {source_name}: {e}")
        return None
    except Exception as e:
        logger.error(f"Unexpected error for {keyword} from {source_name}: {e}")
        return None

# Save data chunk to JSON
def save_chunk(data: List[Dict], file_prefix: str, chunk_num: int) -> bool:
    try:
        chunk_file = f"data/{file_prefix}_chunk_{chunk_num}.json"
        with open(chunk_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=4, ensure_ascii=False)
        logger.info(f"Saved {len(data)} items to {chunk_file}")
        return True
    except Exception as e:
        logger.error(f"Failed to save chunk {chunk_num} to {chunk_file}: {e}")
        return False

# Main function
def main():
    create_directories()
    keywords = load_keywords()
    max_workers = 4  # Reduced workers to manage scraping load
    chunk_size = 50  # Moderate chunk size for full-text data
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    all_articles = []
    chunk_counter = 0
    total_articles = 0

    if not NEWSPAPER_AVAILABLE:
        logger.warning("Newspaper3k is not installed. Using BeautifulSoup for full text extraction.")

    logger.info("Starting article fetching from NewsAPI with full-text scraping")
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = []
        for keyword in keywords:
            for source_name in NEWS_SOURCES.keys():
                # Fetch up to 2 pages per keyword/source to limit load
                for page in range(1, 3):
                    futures.append(executor.submit(fetch_articles, keyword, source_name, page))

        # Process results
        for future in as_completed(futures):
            result = future.result()
            if result and result['articles']:
                all_articles.extend(result['articles'])
                total_articles += len(result['articles'])
                logger.info(
                    f"Fetched {len(result['articles'])} articles for {result['keyword']} "
                    f"from {result['source']} (page {result['page']})"
                )

                # Save in chunks
                if len(all_articles) >= chunk_size:
                    save_chunk(all_articles[:chunk_size], f"articles_{timestamp}", chunk_counter)
                    all_articles = all_articles[chunk_size:]
                    chunk_counter += 1

            time.sleep(0.5)  # Delay to avoid overwhelming servers

    # Save remaining articles
    if all_articles:
        save_chunk(all_articles, f"articles_{timestamp}", chunk_counter)
        chunk_counter += 1

    # Create summary file
    summary = {
        "timestamp": timestamp,
        "total_articles_processed": total_articles,
        "number_of_chunks": chunk_counter,
        "keywords": keywords,
        "sources": list(NEWS_SOURCES.keys()),
        "full_text_scraping": "Newspaper3k and BeautifulSoup" if NEWSPAPER_AVAILABLE else "BeautifulSoup only"
    }
    with open(f"articles.json", 'w', encoding='utf-8') as f:
        json.dump(summary, f, indent=4, ensure_ascii=False)

    logger.info(f"Completed. Processed {total_articles} articles across {len(NEWS_SOURCES)} sources.")

if __name__ == "__main__":
    main()

2025-05-06 17:26:51,913 - INFO - Loaded 17 keywords: ['city', 'business', 'sports', 'world', 'india', 'technology', 'news', 'politics', 'ukraine war', 'us tariffs', 'climate crisis', 'gaza ceasefire', 'global protests', 'economic recession', 'india-pakistan tensions', 'inflation', 'market']
2025-05-06 17:26:51,915 - INFO - Starting article fetching from NewsAPI with full-text scraping
2025-05-06 17:26:51,916 - INFO - Fetching articles for keyword: city from timesofindia, page 1
2025-05-06 17:26:51,916 - INFO - Fetching articles for keyword: city from timesofindia, page 2
2025-05-06 17:26:51,917 - INFO - Fetching articles for keyword: city from ndtv, page 1
2025-05-06 17:26:51,917 - INFO - Fetching articles for keyword: city from ndtv, page 2
2025-05-06 17:26:52,343 - ERROR - Error fetching articles for city from ndtv: 400 Client Error: Bad Request for url: https://newsapi.org/v2/everything?q=city&apiKey=08690429ee754ed4a26bba92b29f2688&sources=ndtv&page=1&pageSize=20&language=en&sortBy

### Combining files

In [8]:
import json
import glob
import os

def combine_json_files(input_dir='data', output_file='articles.json'):
    all_articles = []

    # Find all matching JSON files
    json_files = glob.glob(os.path.join(input_dir, 'articles_*.json'))
    print(f"Found {len(json_files)} files")

    for file in sorted(json_files):
        try:
            with open(file, 'r', encoding='utf-8') as f:
                data = json.load(f)
                if isinstance(data, list):
                    all_articles.extend(data)
                else:
                    print(f"Skipping non-list JSON in file: {file}")
        except Exception as e:
            print(f"Error reading {file}: {e}")

    print(f"Total articles combined: {len(all_articles)}")

    # Save combined output
    with open(os.path.join(input_dir, output_file), 'w', encoding='utf-8') as f_out:
        json.dump(all_articles, f_out, ensure_ascii=False, indent=2)
    print(f"Saved combined articles to {output_file}")

if __name__ == "__main__":
    combine_json_files()


Found 20 files
Total articles combined: 935
Saved combined articles to articles.json
