## Conversation AI

### Imports

In [3]:
import json
import newspaper
from newspaper import Article
from datetime import datetime
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

### Scrape data

In [4]:
# Configure logging: timestamped records at INFO level and above
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Load the search keywords: one per line, blank lines ignored
try:
    with open('keywords.txt', 'r', encoding='utf-8') as keyword_file:
        stripped_lines = (raw_line.strip() for raw_line in keyword_file)
        keywords = [entry for entry in stripped_lines if entry]
    logger.info(f"Loaded {len(keywords)} keywords: {keywords}")
except FileNotFoundError:
    logger.error("keywords.txt not found. Please create the file with one keyword per line.")
    exit(1)
except Exception as e:
    logger.error(f"Error reading keywords.txt: {e}")
    exit(1)

# Build the Times of India source object used by the rest of this cell
build_options = {
    'memoize_articles': False,
    'fetch_images': False,
    'number_threads': 4,
}
try:
    toi = newspaper.build('https://timesofindia.indiatimes.com/', **build_options)
    logger.info(f"Found {len(toi.articles)} articles to process")
except Exception as e:
    logger.error(f"Failed to initialize newspaper: {e}")
    exit(1)

# Function to process a single article
def process_article(article, index):
    """Download and parse one article and report which keywords it mentions.

    Matching is a case-insensitive substring search over the article's title
    and body text, using the module-level ``keywords`` list.

    Args:
        article: Article object exposing ``download()``, ``parse()``,
            ``title``, ``text``, ``url`` and ``publish_date``.
        index: Caller-supplied position of the article; returned unchanged so
            results collected out of order can still be identified.

    Returns:
        ``(article_info, index)``. ``article_info`` is a dict with the keys
        ``title``, ``url``, ``publish_date``, ``keywords`` and ``summary``,
        or ``None`` when no keyword matched or the article could not be
        processed (the error is logged as a warning, never raised).
    """
    try:
        # Download and parse article
        article.download()
        article.parse()

        # Guard against missing fields so the concatenation cannot fail
        title = article.title or ""
        text = article.text or ""

        # The haystack is lower-cased, so each keyword must be lower-cased
        # too; otherwise a keyword such as "Bangalore" could never match.
        haystack = f"{title} {text}".lower()
        matched_keywords = [kw for kw in keywords if kw.lower() in haystack]

        if not matched_keywords:
            return None, index

        # Summary is the first 500 characters, with an ellipsis if truncated
        summary = text[:500] + "..." if len(text) > 500 else text
        article_info = {
            'title': article.title,
            'url': article.url,
            'publish_date': str(article.publish_date) if article.publish_date else None,
            'keywords': matched_keywords,
            'summary': summary
        }
        return article_info, index
    except Exception as e:
        # Best effort: one bad article must not abort the whole crawl
        logger.warning(f"Error processing article {article.url}: {e}")
        return None, index

# Process articles in parallel
max_workers = 8  # Tune to what the machine and the remote server tolerate
articles_data = []
try:
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Queue every article up front; its 1-based position travels with the task
        future_to_article = {
            executor.submit(process_article, article, i): article
            for i, article in enumerate(toi.articles, 1)
        }

        # Handle results in completion order, keeping only keyword matches
        for finished in as_completed(future_to_article):
            article_info, index = finished.result()
            if article_info:
                articles_data.append(article_info)
                logger.info(f"Processed article {index}: {article_info['title']} - Keywords: {article_info['keywords']}")
            # 100ms pause after each collected result
            time.sleep(0.1)
except Exception as e:
    logger.error(f"Error during parallel processing: {e}")

# Write the matched articles to a JSON file
output_file = 'toi_articles_all.json'
try:
    with open(output_file, 'w', encoding='utf-8') as json_file:
        json.dump(articles_data, json_file, indent=4, ensure_ascii=False)
    logger.info(f"Saved {len(articles_data)} articles to {output_file}")
except Exception as e:
    logger.error(f"Failed to save JSON file: {e}")

print(f"Scraping complete. Saved {len(articles_data)} articles to {output_file}")

2025-05-01 18:33:51,884 - INFO - Loaded 2 keywords: ['city', 'bangalore']
2025-05-01 18:33:59,935 - CRITICAL - [REQUEST FAILED] 404 Client Error: Not Found for url: https://timesofindia.indiatimes.com/rss
2025-05-01 18:34:00,099 - CRITICAL - [REQUEST FAILED] 404 Client Error: Not Found for url: https://timesofindia.indiatimes.com/feeds
2025-05-01 18:34:00,201 - CRITICAL - [REQUEST FAILED] 404 Client Error: Not Found for url: https://timesofindia.indiatimes.com/feed
2025-05-01 18:34:01,713 - INFO - Found 329 articles to process
2025-05-01 18:34:04,197 - INFO - Processed article 10: Infosys Eyes Experienced Tech Workers across 40+ Skill Sets - Keywords: ['city']
2025-05-01 18:34:06,349 - INFO - Processed article 20: Instagram’s head Adam Mosseri dances with influencers, gets a Bollywood welcome from Ranveer and Deepika in Mumbai - Keywords: ['city']
2025-05-01 18:34:06,687 - INFO - Processed article 24: Yolanthe OTT Release Date: When to watch Dutch reality show peeking into Yolanthe Cab

Scraping complete. Saved 13 articles to toi_articles_all.json


### Analysis
* Newspaper3k only works for static HTML pages and can't handle pages that are heavily rendered with JavaScript
* Switching to a different library (Playwright) that can handle dynamic, JavaScript-heavy pages

In [1]:
# Imports
import logging
import json
import time
import re
from urllib.parse import quote
import asyncio
from concurrent.futures import ThreadPoolExecutor, as_completed
from playwright.async_api import async_playwright

In [14]:
!pip install playwright

SyntaxError: invalid syntax (4109023801.py, line 2)

#### Sample to test with keywords

In [2]:
# For Jupyter notebooks compatibility: apply nest_asyncio when it is installed
try:
    import nest_asyncio
    nest_asyncio.apply()
except ImportError:
    print("nest_asyncio not found. Install it with: pip install nest_asyncio")

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Read keywords from text file; the defaults below stay in effect when the
# file is missing or cannot be read
keywords = ['city', 'business']  # Default keywords for demonstration
try:
    with open('keywords.txt', 'r', encoding='utf-8') as f:
        keywords = [line.strip() for line in f if line.strip()]
    logger.info(f"Loaded {len(keywords)} keywords: {keywords}")
except FileNotFoundError:
    # The f-prefix matters: without it the literal text "{keywords}" is logged
    logger.info(f"keywords.txt not found. Using default keywords: {keywords}")
except Exception as e:
    logger.error(f"Error reading keywords.txt: {e}")
    logger.info(f"Using default keywords: {keywords}")

# Function to fetch article URLs from a keyword search page using Playwright Async API
async def fetch_article_urls(keyword):
    """Collect article URLs from the Times of India topic page for *keyword*.

    Args:
        keyword: Search term; it is URL-quoted and appended to the
            ``/topic/`` path.

    Returns:
        ``(article_links, keyword)`` where ``article_links`` is a list of
        unique absolute URLs matching ``/articleshow/<digits>.cms``, in the
        order they appear on the page. On timeout or any other error the
        list is empty; the error is logged and never raised.
    """
    # Playwright raises its own TimeoutError, which is unrelated to
    # asyncio.TimeoutError, so it has to be caught explicitly.
    from playwright.async_api import TimeoutError as PlaywrightTimeoutError

    site_root = 'https://timesofindia.indiatimes.com'
    link_selector = 'a[href*="/articleshow/"]'
    article_pattern = re.compile(r'.*/articleshow/\d+\.cms')

    try:
        async with async_playwright() as p:
            # Launch headless browser
            browser = await p.chromium.launch(headless=True)
            try:
                page = await browser.new_page()

                # Construct the keyword search URL
                base_url = 'https://timesofindia.indiatimes.com/topic/'
                keyword_url = f"{base_url}{quote(keyword)}"
                logger.info(f"Fetching articles for keyword: {keyword} from {keyword_url}")

                # Navigate to the keyword page and wait for article links to appear
                await page.goto(keyword_url, wait_until="domcontentloaded")
                await page.wait_for_selector(link_selector, timeout=10000)

                # A dict gives O(1) de-duplication while keeping page order
                unique_links = {}
                for link in await page.query_selector_all(link_selector):
                    href = await link.get_attribute('href')
                    if not href:
                        continue
                    # Convert relative URLs to absolute
                    if href.startswith('/'):
                        href = site_root + href
                    if article_pattern.match(href):
                        unique_links.setdefault(href, None)
                article_links = list(unique_links)
            finally:
                # Close the browser while the Playwright driver is still running
                await browser.close()

        logger.info(f"Found {len(article_links)} articles for keyword: {keyword}")
        return article_links, keyword
    except (PlaywrightTimeoutError, asyncio.TimeoutError):
        logger.error(f"Timeout fetching articles for keyword {keyword}")
        return [], keyword
    except Exception as e:
        logger.error(f"Error fetching articles for keyword {keyword}: {e}")
        return [], keyword

# Function to process a single article using Playwright Async API
async def process_article(article_url, keyword, index):
    """Render one article page and extract its metadata and text.

    Args:
        article_url: Absolute URL of the article to load.
        keyword: Search keyword that led to this article; stored in the result.
        index: Caller-supplied position, returned unchanged.

    Returns:
        ``(article_info, index)``. ``article_info`` is a dict with the keys
        ``title``, ``url``, ``publish_date``, ``keyword``, ``author``,
        ``article_keywords`` and ``full_text``, or ``None`` when the page
        timed out or could not be processed (logged as a warning).
    """
    # Playwright raises its own TimeoutError, which is unrelated to
    # asyncio.TimeoutError, so it has to be caught explicitly.
    from playwright.async_api import TimeoutError as PlaywrightTimeoutError

    try:
        async with async_playwright() as p:
            # Launch headless browser
            browser = await p.chromium.launch(headless=True)
            try:
                page = await browser.new_page()

                # Navigate to article page and wait for a heading to appear
                await page.goto(article_url, wait_until="domcontentloaded")
                await page.wait_for_selector('h1, h2', timeout=10000)

                # Extract title
                title_elem = await page.query_selector('h1, h2')
                title = await title_elem.inner_text() if title_elem else "No title found"
                title = title.strip()

                # Extract publish date: visible text first, then the
                # content/datetime attributes (used by <meta> and <time>)
                date_selector = 'div.as_byline > div > span, time, meta[name="publish-date"]'
                date_elem = await page.query_selector(date_selector)
                publish_date = None
                if date_elem:
                    publish_date = await date_elem.inner_text()
                    if not publish_date:
                        publish_date = (
                            await date_elem.get_attribute('content')
                            or await date_elem.get_attribute('datetime')
                        )
                if publish_date:
                    publish_date = publish_date.strip()

                # Extract full article text
                text_selector = 'div._s30W > div._3WlLe, div.articlebody, div.content'
                article_body = await page.query_selector_all(text_selector)
                article_text = ' '.join([await elem.inner_text() for elem in article_body]).strip()

                # Extract author: visible text first, then the content attribute
                author_selector = 'div.as_byline > a, span.author, meta[name="author"]'
                author_elem = await page.query_selector(author_selector)
                author = None
                if author_elem:
                    author = await author_elem.inner_text()
                    if not author:
                        author = await author_elem.get_attribute('content')
                if author:
                    author = author.strip()

                # Extract article-specific keywords/tags
                raw_keywords = []
                # Try meta keywords
                meta_keywords = await page.query_selector('meta[name="keywords"]')
                if meta_keywords:
                    keywords_content = await meta_keywords.get_attribute('content')
                    if keywords_content:
                        raw_keywords.extend(keywords_content.split(','))
                # Try visible tags
                tag_elems = await page.query_selector_all('div.tags a, ul.tags li a')
                raw_keywords.extend([await elem.inner_text() for elem in tag_elems])
                # Strip before filtering so whitespace-only entries are dropped;
                # dict.fromkeys removes duplicates while keeping page order
                stripped = (kw.strip() for kw in raw_keywords if kw)
                article_keywords = list(dict.fromkeys(kw for kw in stripped if kw))

                # Create article info
                article_info = {
                    'title': title,
                    'url': article_url,
                    'publish_date': publish_date,
                    'keyword': keyword,
                    'author': author,
                    'article_keywords': article_keywords,
                    'full_text': article_text
                }
            finally:
                # Close the browser while the Playwright driver is still running
                await browser.close()

        return article_info, index
    except (PlaywrightTimeoutError, asyncio.TimeoutError):
        logger.warning(f"Timeout processing article {article_url}")
        return None, index
    except Exception as e:
        logger.warning(f"Error processing article {article_url}: {e}")
        return None, index

# Main function to handle async operations
async def main():
    """Scrape up to ten articles per keyword and save them as JSON.

    For every entry in the module-level ``keywords`` list, the topic page is
    crawled for article URLs and the articles are processed in batches of
    ``max_workers`` concurrent tasks.

    Returns:
        The list of article dicts that was written to
        ``toi_articles_by_keyword.json`` (returned so a notebook can analyse it).
    """
    articles_data = []
    max_workers = 3  # Reduced workers to minimize server load
    max_articles_per_keyword = 10

    for keyword in keywords:
        # Fetch article URLs for the keyword
        article_urls, _ = await fetch_article_urls(keyword)

        if not article_urls:
            logger.warning(f"No articles found for keyword: {keyword}")
            continue

        # Limit to the first articles per keyword, numbered from 1
        numbered_urls = list(enumerate(article_urls[:max_articles_per_keyword], 1))

        for start in range(0, len(numbered_urls), max_workers):
            # Pause between batches; a coroutine only starts running once it
            # is awaited, so sleeping while merely creating coroutine objects
            # would not throttle any request.
            if start:
                await asyncio.sleep(0.2)  # 200ms delay

            # Create coroutines per batch so none is left un-awaited if an
            # earlier batch aborts
            batch = [
                process_article(url, keyword, i)
                for i, url in numbered_urls[start:start + max_workers]
            ]
            # return_exceptions=True keeps one failing task from discarding
            # everything collected so far
            results = await asyncio.gather(*batch, return_exceptions=True)
            for result in results:
                if isinstance(result, BaseException):
                    logger.warning(f"Article task failed for keyword {keyword}: {result}")
                    continue
                article_info, index = result
                if article_info:
                    articles_data.append(article_info)
                    logger.info(f"Processed article {index} for keyword {keyword}: {article_info['title']}")

    # Save to JSON file
    output_file = 'toi_articles_by_keyword.json'
    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(articles_data, f, indent=4, ensure_ascii=False)
        logger.info(f"Saved {len(articles_data)} articles to {output_file}")
        print(f"Scraping complete. Saved {len(articles_data)} articles to {output_file}")
    except Exception as e:
        logger.error(f"Failed to save JSON file: {e}")

    return articles_data  # Return data for Jupyter notebook analysis

# For Jupyter/IPython, run this cell
# NOTE: a top-level ``await`` like this is accepted by IPython/Jupyter cells but
# is a syntax error in a plain Python script.
articles_data = await main()

2025-05-02 14:58:58,545 - INFO - Loaded 2 keywords: ['city', 'business']
2025-05-02 14:58:59,515 - ERROR - Error fetching articles for keyword city: BrowserType.launch: Executable doesn't exist at /Users/pranavi/Library/Caches/ms-playwright/chromium_headless_shell-1169/chrome-mac/headless_shell
╔════════════════════════════════════════════════════════════╗
║ Looks like Playwright was just installed or updated.       ║
║ Please run the following command to download new browsers: ║
║                                                            ║
║     playwright install                                     ║
║                                                            ║
║ <3 Playwright Team                                         ║
╚════════════════════════════════════════════════════════════╝
2025-05-02 14:59:00,088 - ERROR - Error fetching articles for keyword business: BrowserType.launch: Executable doesn't exist at /Users/pranavi/Library/Caches/ms-playwright/chromium_headless_shell-1169

Scraping complete. Saved 0 articles to toi_articles_by_keyword.json


### Scrape all articles from the Times of India

In [None]:
import logging
import json
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from playwright.sync_api import sync_playwright
import re

# Configure logging: timestamped records at INFO level and above
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

def _collect_toi_article_links(page, article_urls):
    """Add every article URL linked from the page's current document to *article_urls*."""
    for link in page.query_selector_all('a[href*="/articleshow/"]'):
        href = link.get_attribute('href')
        if not href:
            continue
        # Convert relative URLs to absolute
        if href.startswith('/'):
            href = 'https://timesofindia.indiatimes.com' + href
        if re.match(r'.*/articleshow/\d+\.cms', href):
            article_urls.add(href)


# Function to fetch article URLs from the entire site using Playwright
def fetch_article_urls():
    """Crawl the Times of India homepage and main sections for article URLs.

    The homepage is scraped first, then each section found in the navigation
    bar, following the "Next" link for at most three pages per section.

    Returns:
        A list of unique absolute article URLs matching
        ``/articleshow/<digits>.cms``. An empty list is returned, and the
        error logged, if the crawl fails before any section is visited.
    """
    try:
        with sync_playwright() as p:
            # Launch headless browser
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()

                # Start from the homepage
                homepage_url = 'https://timesofindia.indiatimes.com/'
                logger.info(f"Starting crawl from {homepage_url}")
                page.goto(homepage_url, wait_until="domcontentloaded")

                # Extract main section links (e.g., India, World, Business)
                section_links = []
                # Selector for section navigation (adjust based on page inspection)
                nav_links = page.query_selector_all('nav a[href*="/india"], nav a[href*="/world"], nav a[href*="/business"], nav a[href*="/sports"], nav a[href*="/entertainment"], nav a[href*="/lifestyle"]')
                for link in nav_links:
                    href = link.get_attribute('href')
                    if not href:
                        continue
                    if href.startswith('/'):
                        href = 'https://timesofindia.indiatimes.com' + href
                    if href not in section_links:
                        section_links.append(href)
                logger.info(f"Found {len(section_links)} section links: {section_links}")

                # Collect article URLs from homepage and sections
                article_urls = set()

                # Scrape homepage articles
                logger.info("Fetching articles from homepage")
                page.wait_for_selector('a[href*="/articleshow/"]', timeout=10000)
                _collect_toi_article_links(page, article_urls)

                # Scrape articles from each section
                for section_url in section_links:
                    try:
                        logger.info(f"Fetching articles from section: {section_url}")
                        page.goto(section_url, wait_until="domcontentloaded")
                        page.wait_for_selector('a[href*="/articleshow/"]', timeout=10000)

                        # Handle pagination (fetch up to 3 pages per section to avoid excessive crawling)
                        for _ in range(3):
                            _collect_toi_article_links(page, article_urls)

                            # Check for "Next" button
                            next_button = page.query_selector('a.next, a[rel="next"]')
                            if not next_button:
                                break
                            next_button.click()
                            page.wait_for_load_state('domcontentloaded')
                            time.sleep(1)  # Small delay for page load
                    except Exception as e:
                        # A failing section is skipped; links gathered so far are kept
                        logger.warning(f"Error fetching articles from section {section_url}: {e}")
                        continue
            finally:
                # Always release the browser, including when the crawl fails part-way
                browser.close()

        article_urls = list(article_urls)
        logger.info(f"Found {len(article_urls)} unique article URLs")
        return article_urls
    except Exception as e:
        logger.error(f"Error fetching article URLs: {e}")
        return []

# Function to process a single article using Playwright
def process_article(article_url, index):
    """Render one Times of India article and extract its metadata and text.

    Args:
        article_url: Absolute URL of the article to load.
        index: Caller-supplied position, returned unchanged.

    Returns:
        ``(article_info, index)``. ``article_info`` is a dict with the keys
        ``title``, ``url``, ``publish_date``, ``author``,
        ``article_keywords`` and ``full_text``, or ``None`` when the article
        could not be processed (logged as a warning, never raised).
    """
    try:
        with sync_playwright() as p:
            # Launch headless browser
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()

                # Navigate to article page and wait for content to load
                page.goto(article_url, wait_until="domcontentloaded")
                # Wait for article content to appear (adjust selector as needed)
                page.wait_for_selector('h1', timeout=10000)

                # Extract title
                title_elem = page.query_selector('h1')
                title = title_elem.inner_text().strip() if title_elem else "No title found"

                # Extract publish date (adjust selector based on page structure)
                date_elem = page.query_selector('div.as_byline > div > span')
                publish_date = date_elem.inner_text().strip() if date_elem else None

                # Extract full article text (adjust selector based on page structure)
                article_body = page.query_selector_all('div._s30W > div._3WlLe')
                article_text = ' '.join(elem.inner_text().strip() for elem in article_body)

                # Extract author (adjust selector based on page structure)
                author_elem = page.query_selector('div.as_byline > a')
                author = author_elem.inner_text().strip() if author_elem else None

                # Extract article-specific keywords/tags (e.g., from meta tags or visible tags)
                raw_keywords = []
                # Try meta keywords
                meta_keywords = page.query_selector('meta[name="keywords"]')
                if meta_keywords:
                    keywords_content = meta_keywords.get_attribute('content')
                    if keywords_content:
                        raw_keywords.extend(kw.strip() for kw in keywords_content.split(','))
                # Try visible tags (adjust selector based on page structure)
                raw_keywords.extend(
                    elem.inner_text().strip() for elem in page.query_selector_all('div.tags a')
                )
                # Drop empty strings; dict.fromkeys removes duplicates while
                # keeping a stable, page-ordered result
                article_keywords = list(dict.fromkeys(kw for kw in raw_keywords if kw))

                # Create article info
                article_info = {
                    'title': title,
                    'url': article_url,
                    'publish_date': publish_date,
                    'author': author,
                    'article_keywords': article_keywords,
                    'full_text': article_text
                }
            finally:
                # Always release the browser, including when extraction fails
                browser.close()

        return article_info, index
    except Exception as e:
        logger.warning(f"Error processing article {article_url}: {e}")
        return None, index

# Main scraping process
max_workers = 4  # Reduced workers due to Playwright's resource intensity
articles_data = []
try:
    # Discover every article URL before any article is processed
    article_urls = fetch_article_urls()

    if not article_urls:
        logger.error("No articles found. Exiting.")
        exit(1)

    # Process articles in parallel
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Queue one task per URL, numbered from 1
        future_to_article = {}
        for i, url in enumerate(article_urls, 1):
            future_to_article[executor.submit(process_article, url, i)] = url

        # Handle results in completion order; failed articles come back as None
        for finished in as_completed(future_to_article):
            article_info, index = finished.result()
            if article_info:
                articles_data.append(article_info)
                logger.info(f"Processed article {index}: {article_info['title']}")
            # 200ms pause after each collected result
            time.sleep(0.2)
except Exception as e:
    logger.error(f"Error during parallel processing: {e}")

# Write the collected articles to a JSON file
output_file = 'toi_all_articles.json'
try:
    with open(output_file, 'w', encoding='utf-8') as json_file:
        json.dump(articles_data, json_file, indent=4, ensure_ascii=False)
    logger.info(f"Saved {len(articles_data)} articles to {output_file}")
except Exception as e:
    logger.error(f"Failed to save JSON file: {e}")

print(f"Scraping complete. Saved {len(articles_data)} articles to {output_file}")

### Scrape articles from any site

In [None]:
import logging
import json
import time
import asyncio
from concurrent.futures import ThreadPoolExecutor, as_completed
from playwright.async_api import async_playwright
import re

# Configure logging: timestamped records at INFO level and above
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Read the per-site scraping configuration from sites.json
try:
    with open('sites.json', 'r', encoding='utf-8') as config_file:
        sites = json.load(config_file)
    site_names = [site['name'] for site in sites]
    logger.info(f"Loaded {len(sites)} sites: {site_names}")
except FileNotFoundError:
    logger.error("sites.json not found. Please create the file with site configurations.")
    exit(1)
except Exception as e:
    logger.error(f"Error reading sites.json: {e}")
    exit(1)

def _resolve_href(href, site):
    """Return *href* as an absolute URL, or None when it is not a page link."""
    if not href:
        return None
    href = href.strip()
    # In-page anchors and script/mail/phone links never lead to an article
    if not href or href.startswith(('#', 'javascript:', 'mailto:', 'tel:')):
        return None
    if href.startswith(('http://', 'https://')):
        return href
    if href.startswith('//'):
        # Protocol-relative URL: already carries its own host
        return 'https:' + href
    # Normalise the slashes so a base_url ending in "/" cannot yield "//"
    return site['base_url'].rstrip('/') + '/' + href.lstrip('/')


# Function to fetch article URLs from a site using Playwright Async API
async def fetch_article_urls(site):
    """Crawl one configured site for article URLs.

    Args:
        site: Site configuration dict. Reads ``name``, ``homepage`` and
            ``base_url``, plus the optional ``section_selector``,
            ``article_selector``, ``article_regex`` and ``next_selector``.

    Returns:
        ``(article_urls, site_name)`` where ``article_urls`` is a list of
        unique absolute URLs matching the site's article regex. The list is
        empty, and the error logged, if the crawl fails before any section
        is visited.
    """
    try:
        async with async_playwright() as p:
            # Launch headless browser
            browser = await p.chromium.launch(headless=True)
            try:
                page = await browser.new_page()

                # Start from the homepage
                homepage_url = site['homepage']
                logger.info(f"Starting crawl for {site['name']} from {homepage_url}")
                await page.goto(homepage_url, wait_until="domcontentloaded")

                # Extract section links, keeping only those on the site itself
                section_links = [homepage_url]  # Include homepage
                section_selector = site.get('section_selector', 'nav a[href], header a[href]')
                for link in await page.query_selector_all(section_selector):
                    href = _resolve_href(await link.get_attribute('href'), site)
                    if href and href not in section_links and site['base_url'] in href:
                        section_links.append(href)
                logger.info(f"Found {len(section_links)} section links for {site['name']}")

                # Collect article URLs
                article_urls = set()
                article_selector = site.get('article_selector', 'a[href]')
                article_pattern = re.compile(site.get('article_regex', r'.+/\d+.*'))
                next_selector = site.get('next_selector', 'a.next, a[rel="next"], a[aria-label*="next"]')

                for section_url in section_links:
                    try:
                        logger.info(f"Fetching articles from {section_url}")
                        await page.goto(section_url, wait_until="domcontentloaded")
                        await page.wait_for_selector(article_selector, timeout=10000)

                        # Handle pagination (up to 3 pages)
                        for _ in range(3):
                            for link in await page.query_selector_all(article_selector):
                                href = _resolve_href(await link.get_attribute('href'), site)
                                if href and article_pattern.match(href):
                                    article_urls.add(href)

                            # Check for "Next" button
                            next_button = await page.query_selector(next_selector)
                            if not next_button:
                                break
                            await next_button.click()
                            await page.wait_for_load_state('domcontentloaded')
                            await asyncio.sleep(1)  # Delay for page load
                    except Exception as e:
                        # A failing section is skipped; links gathered so far are kept
                        logger.warning(f"Error fetching articles from {section_url}: {e}")
                        continue
            finally:
                # Always release the browser, including when the crawl fails part-way
                await browser.close()

        article_urls = list(article_urls)
        logger.info(f"Found {len(article_urls)} unique article URLs for {site['name']}")
        return article_urls, site['name']
    except Exception as e:
        logger.error(f"Error fetching article URLs for {site['name']}: {e}")
        return [], site['name']

# Function to process a single article using Playwright Async API
async def process_article(article_url, site, index):
    """Render one article page and extract its metadata and text.

    Args:
        article_url: Absolute URL of the article to load.
        site: Site configuration dict. Reads ``name`` plus the optional
            ``title_selector``, ``date_selector``, ``text_selector``,
            ``author_selector`` and ``tag_selector``.
        index: Caller-supplied position, returned unchanged.

    Returns:
        ``(article_info, index)``. ``article_info`` is a dict with the keys
        ``site``, ``title``, ``url``, ``publish_date``, ``author``,
        ``article_keywords`` and ``full_text``, or ``None`` when the article
        could not be processed (logged as a warning, never raised).
    """
    try:
        async with async_playwright() as p:
            # Launch headless browser
            browser = await p.chromium.launch(headless=True)
            try:
                page = await browser.new_page()

                # Navigate to article page and wait for the title to appear
                title_selector = site.get('title_selector', 'h1, h2, article h1')
                await page.goto(article_url, wait_until="domcontentloaded")
                await page.wait_for_selector(title_selector, timeout=10000)

                # Extract title
                title_elem = await page.query_selector(title_selector)
                title = await title_elem.inner_text() if title_elem else "No title found"
                title = title.strip()

                # Extract publish date: visible text first, then the
                # content/datetime attributes (used by <meta> and <time>)
                date_selector = site.get('date_selector', 'time, meta[name="publish-date"], div.date, span.date')
                date_elem = await page.query_selector(date_selector)
                publish_date = None
                if date_elem:
                    publish_date = await date_elem.inner_text()
                    if not publish_date:
                        publish_date = (
                            await date_elem.get_attribute('content')
                            or await date_elem.get_attribute('datetime')
                        )
                if publish_date:
                    publish_date = publish_date.strip()

                # Extract full article text. " p" must be appended to every
                # comma-separated alternative: appending it once to the whole
                # list only scopes the last alternative and matches the other
                # containers themselves.
                # NOTE: this assumes commas only separate alternatives; a
                # custom selector with commas inside :is(...)/:not(...) would
                # need different handling.
                text_selector = site.get('text_selector', 'article, div.article-content, div.story-content, div.content')
                paragraph_selector = ', '.join(
                    f"{part.strip()} p" for part in text_selector.split(',') if part.strip()
                ) or 'p'
                article_body = await page.query_selector_all(paragraph_selector)
                article_text = ' '.join([await elem.inner_text() for elem in article_body]).strip()

                # Extract author: visible text first, then the content attribute
                author_selector = site.get('author_selector', 'span.author, div.byline, a[rel="author"], meta[name="author"]')
                author_elem = await page.query_selector(author_selector)
                author = None
                if author_elem:
                    author = await author_elem.inner_text()
                    if not author:
                        author = await author_elem.get_attribute('content')
                if author:
                    author = author.strip()

                # Extract article-specific keywords/tags
                raw_keywords = []
                # Meta keywords: read every matching tag, since pages may
                # repeat article:tag once per tag
                meta_elems = await page.query_selector_all('meta[name="keywords"], meta[property="article:tag"]')
                for meta_elem in meta_elems:
                    keywords_content = await meta_elem.get_attribute('content')
                    if keywords_content:
                        raw_keywords.extend(keywords_content.split(','))
                # Visible tags
                tag_selector = site.get('tag_selector', 'div.tags a, ul.tags li, a.tag')
                tag_elems = await page.query_selector_all(tag_selector)
                raw_keywords.extend([await elem.inner_text() for elem in tag_elems])
                # Strip before filtering so whitespace-only entries are dropped;
                # dict.fromkeys removes duplicates while keeping page order
                stripped = (kw.strip() for kw in raw_keywords if kw)
                article_keywords = list(dict.fromkeys(kw for kw in stripped if kw))

                # Create article info
                article_info = {
                    'site': site['name'],
                    'title': title,
                    'url': article_url,
                    'publish_date': publish_date,
                    'author': author,
                    'article_keywords': article_keywords,
                    'full_text': article_text
                }
            finally:
                # Always release the browser, including when extraction fails
                await browser.close()

        return article_info, index
    except Exception as e:
        logger.warning(f"Error processing article {article_url} from {site['name']}: {e}")
        return None, index

# Synchronous wrapper around the async URL crawl
def run_async_fetch(site):
    """Run ``fetch_article_urls(site)`` to completion via ``asyncio.run`` and return its result."""
    crawl = fetch_article_urls(site)
    return asyncio.run(crawl)

def run_async_process(article_url, site, index):
    """Synchronously execute ``process_article`` for a single article.

    Gives thread-pool workers a plain callable: the coroutine is built here
    and handed to ``asyncio.run``, whose result is returned unchanged.
    """
    coroutine = process_article(article_url, site, index)
    return asyncio.run(coroutine)

# Main scraping process
# For every configured site: collect its article URLs, process the articles
# on a thread pool, accumulate the successful results, then dump them to JSON.
max_workers = 4  # Adjust based on system resources
articles_data = []
for site in sites:
    # The try lives inside the loop so that a failure on one site is logged
    # and the remaining sites are still scraped.
    try:
        # Fetch article URLs for the site
        article_urls, site_name = run_async_fetch(site)

        if not article_urls:
            logger.warning(f"No articles found for {site_name}")
            continue

        # Process articles in parallel
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Map each future back to its URL so failures can be attributed.
            future_to_article = {
                executor.submit(run_async_process, url, site, i): url
                for i, url in enumerate(article_urls, 1)
            }

            for future in as_completed(future_to_article):
                try:
                    article_info, index = future.result()
                except Exception as e:
                    # A worker that raised costs only its own article.
                    logger.warning(f"Error processing article {future_to_article[future]}: {e}")
                    continue
                if article_info:
                    articles_data.append(article_info)
                    logger.info(f"Processed article {index} from {site_name}: {article_info['title']}")
                # NOTE(review): every task is already submitted at this point,
                # so this pause only paces result handling, not the requests.
                time.sleep(0.2)  # Delay to avoid overwhelming servers
    except Exception as e:
        logger.error(f"Error during parallel processing: {e}")

# Save to JSON file
output_file = 'news_articles.json'
try:
    with open(output_file, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(articles_data, f, indent=4, ensure_ascii=False)
    logger.info(f"Saved {len(articles_data)} articles to {output_file}")
except Exception as e:
    logger.error(f"Failed to save JSON file: {e}")

print(f"Scraping complete. Saved {len(articles_data)} articles to {output_file}")

### To convert to embeddings

In [7]:
import json
import logging
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
!pip install chromadb sentence-transformers

Defaulting to user installation because normal site-packages is not writeable
Collecting chromadb
  Downloading chromadb-1.0.7-cp39-abi3-macosx_10_12_x86_64.whl.metadata (6.9 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting pydantic>=1.9 (from chromadb)
  Downloading pydantic-2.11.4-py3-none-any.whl.metadata (66 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp39-cp39-macosx_10_9_x86_64.whl.metadata (252 bytes)
Collecting fastapi==0.115.9 (from chromadb)
  Downloading fastapi-0.115.9-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-4.0.1-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.19.2-cp39-cp39-macosx_11_0_universal2.whl.met

In [8]:
# Embed the scraped articles and store them in a persistent Chroma collection.
# Steps: load the JSON dump, embed "title + summary" for each article, write
# documents/embeddings/metadata to the collection, then run a sample query.

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Load the JSON file
input_file = 'toi_articles_all.json'
try:
    with open(input_file, 'r', encoding='utf-8') as f:
        articles_data = json.load(f)
    logger.info(f"Loaded {len(articles_data)} articles from {input_file}")
except FileNotFoundError:
    logger.error(f"{input_file} not found. Please ensure the file exists.")
    exit(1)
except Exception as e:
    logger.error(f"Error reading {input_file}: {e}")
    exit(1)

# Initialize SentenceTransformer model
try:
    embed_model = SentenceTransformer('all-MiniLM-L6-v2')
    logger.info("Initialized SentenceTransformer model: all-MiniLM-L6-v2")
except Exception as e:
    logger.error(f"Failed to initialize SentenceTransformer model: {e}")
    exit(1)

# Initialize Chroma DB client
try:
    chroma_client = chromadb.PersistentClient(
        path="./chroma_db",
        settings=Settings()
    )
    collection = chroma_client.get_or_create_collection(
        name="toi_articles",
        metadata={"hnsw:space": "cosine"}
    )
    logger.info("Initialized Chroma DB collection: toi_articles")
except Exception as e:
    logger.error(f"Failed to initialize Chroma DB: {e}")
    exit(1)

# Prepare data for Chroma DB
# The four lists are kept index-aligned: entry k of each describes one article.
documents = []
embeddings = []
metadatas = []
ids = []

for i, article in enumerate(articles_data, 1):
    try:
        # Prepare text for embedding
        document = f"{article['title']} {article['summary']}"

        # Generate embedding
        embedding = embed_model.encode(document, show_progress_bar=False).tolist()

        # Prepare metadata (the keyword list is flattened to one string)
        metadata = {
            'title': article['title'],
            'url': article['url'],
            'publish_date': article['publish_date'],
            'keywords': ",".join(article['keywords'])
        }
        article_id = f"article_{i}"

        # Append only after every step succeeded so the lists stay aligned.
        documents.append(document)
        embeddings.append(embedding)
        metadatas.append(metadata)
        ids.append(article_id)

        logger.info(f"Prepared article {i}: {article['title']}")
    except Exception as e:
        logger.warning(f"Error preparing article {article.get('title', 'unknown')}: {e}")
        continue

# Add data to Chroma DB
stored_count = 0  # stays 0 unless the write below actually succeeds
try:
    if documents:
        # upsert rather than add: ids are positional ("article_<n>"), so
        # re-running this cell against the persistent collection refreshes
        # existing entries instead of colliding with them.
        collection.upsert(
            documents=documents,
            embeddings=embeddings,
            metadatas=metadatas,
            ids=ids
        )
        stored_count = len(ids)
        logger.info(f"Added {len(ids)} articles to Chroma DB")
    else:
        logger.warning("No articles to add to Chroma DB")
except Exception as e:
    logger.error(f"Failed to add articles to Chroma DB: {e}")

# Example query
try:
    query_text = "education policy in India"
    query_embedding = embed_model.encode(query_text).tolist()
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=3
    )
    logger.info("Query results:")
    # One query was sent, so index [0] selects its result lists.
    for i, (id_, dist, meta, doc) in enumerate(zip(
        results['ids'][0], results['distances'][0], results['metadatas'][0], results['documents'][0]
    ), 1):
        logger.info(f"Result {i}: {meta['title']} (Distance: {dist:.4f})")
        logger.info(f"URL: {meta['url']}")
        logger.info(f"Keywords: {meta['keywords']}")
        logger.info(f"Summary: {doc[:200]}...\n")
except Exception as e:
    logger.error(f"Error querying Chroma DB: {e}")

# Report what was written, not what was merely prepared.
print(f"Completed. Stored {stored_count} articles in Chroma DB.")

2025-05-01 18:39:05,421 - INFO - Loaded 13 articles from toi_articles_all.json
2025-05-01 18:39:05,449 - INFO - Use pytorch device_name: cpu
2025-05-01 18:39:05,451 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2025-05-01 18:39:11,370 - INFO - Initialized SentenceTransformer model: all-MiniLM-L6-v2
2025-05-01 18:39:11,413 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2025-05-01 18:39:12,689 - INFO - Initialized Chroma DB collection: toi_articles
2025-05-01 18:39:12,995 - INFO - Prepared article 1: Infosys Eyes Experienced Tech Workers across 40+ Skill Sets
2025-05-01 18:39:13,045 - INFO - Prepared article 2: Instagram’s head Adam Mosseri dances with influencers, gets a Bollywood welcome from Ranveer and Deepika in Mumbai
2025-05-01 18:39:13,085 - INFO - Prepared article 3: Yolanthe OTT Release Date: When to watch Dutch reality show peeking into Yolanthe Cabau’s life
2025-05-01 18:39:13,124 - IN

Completed. Stored 13 articles in Chroma DB.


### Check if data is stored

In [10]:
import logging
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer

# Inspect the persistent Chroma collection: count its items, list them in
# batches, fetch one by id, run an interactive similarity query, and list
# every collection known to the client.

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Initialize Chroma DB client
try:
    chroma_client = chromadb.PersistentClient(
        path="./chroma_db",
        settings=Settings()
    )
    logger.info("Connected to Chroma DB")
except Exception as e:
    logger.error(f"Failed to connect to Chroma DB: {e}")
    exit(1)

# Access the collection
collection_name = "toi_articles"
try:
    collection = chroma_client.get_collection(name=collection_name)
    logger.info(f"Accessed collection: {collection_name}")
except Exception as e:
    logger.error(f"Failed to access collection {collection_name}: {e}")
    exit(1)

# 1. Check total number of items
# Bind count up front so step 2 still has a defined value if counting fails
# (otherwise it would hit a NameError or reuse a stale notebook variable).
count = 0
try:
    count = collection.count()
    logger.info(f"Total articles in collection: {count}")
except Exception as e:
    logger.error(f"Failed to count items: {e}")

# 2. List all articles (with pagination to avoid memory issues)
try:
    if count > 0:
        logger.info("Listing all articles:")
        batch_size = 100  # Adjust based on memory constraints
        for offset in range(0, count, batch_size):
            items = collection.get(
                include=['documents', 'metadatas'],
                limit=batch_size,
                offset=offset
            )
            # Start numbering at offset + 1 so it continues across batches.
            for i, (id_, doc, meta) in enumerate(zip(items['ids'], items['documents'], items['metadatas']), offset + 1):
                logger.info(f"Article {i}:")
                logger.info(f"  ID: {id_}")
                logger.info(f"  Title: {meta['title']}")
                logger.info(f"  URL: {meta['url']}")
                logger.info(f"  Keywords: {meta['keywords']}")
                logger.info(f"  Summary: {doc[:200]}...")
                logger.info(f"  Publish Date: {meta['publish_date']}\n")
    else:
        logger.warning("No articles found in the collection")
except Exception as e:
    logger.error(f"Failed to list articles: {e}")

# 3. Retrieve specific article by ID (example)
try:
    sample_id = "article_1"  # Replace with a known ID
    item = collection.get(ids=[sample_id], include=['documents', 'metadatas'])
    if item['ids']:
        logger.info(f"Retrieved article with ID: {sample_id}")
        logger.info(f"  Title: {item['metadatas'][0]['title']}")
        logger.info(f"  URL: {item['metadatas'][0]['url']}")
        logger.info(f"  Keywords: {item['metadatas'][0]['keywords']}")
        logger.info(f"  Summary: {item['documents'][0][:200]}...")
        logger.info(f"  Publish Date: {item['metadatas'][0]['publish_date']}\n")
    else:
        logger.warning(f"No article found with ID: {sample_id}")
except Exception as e:
    logger.error(f"Failed to retrieve article by ID: {e}")

# 4. Perform a custom query
try:
    # Initialize SentenceTransformer for query embedding
    embed_model = SentenceTransformer('all-MiniLM-L6-v2')
    logger.info("Initialized SentenceTransformer model: all-MiniLM-L6-v2")

    # Custom query (modify this to search for different topics)
    # An empty answer falls back to the default query text.
    query_text = input("Enter a query (e.g., 'education policy in India') or press Enter for default: ") or "education policy in India"
    query_embedding = embed_model.encode(query_text).tolist()

    # Query the collection
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=5  # Return up to 5 results
    )

    logger.info(f"Query results for: '{query_text}'")
    # One query was sent, so index [0] selects its result lists.
    if not results['ids'][0]:
        # Make an empty result explicit instead of logging a bare header.
        logger.warning("Query returned no results")
    for i, (id_, dist, meta, doc) in enumerate(zip(
        results['ids'][0], results['distances'][0], results['metadatas'][0], results['documents'][0]
    ), 1):
        logger.info(f"Result {i}:")
        logger.info(f"  ID: {id_}")
        logger.info(f"  Title: {meta['title']}")
        logger.info(f"  URL: {meta['url']}")
        logger.info(f"  Keywords: {meta['keywords']}")
        logger.info(f"  Summary: {doc[:200]}...")
        logger.info(f"  Distance: {dist:.4f}\n")
except Exception as e:
    logger.error(f"Failed to perform query: {e}")

# 5. List all collections (for context)
try:
    collections = chroma_client.list_collections()
    logger.info("All collections in Chroma DB:")
    for coll in collections:
        logger.info(f"  Name: {coll.name}, Metadata: {coll.metadata}")
except Exception as e:
    logger.error(f"Failed to list collections: {e}")

2025-05-01 18:43:18,345 - INFO - Connected to Chroma DB
2025-05-01 18:43:18,347 - INFO - Accessed collection: toi_articles
2025-05-01 18:43:18,349 - INFO - Total articles in collection: 0
2025-05-01 18:43:18,356 - INFO - Use pytorch device_name: cpu
2025-05-01 18:43:18,357 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2025-05-01 18:43:21,440 - INFO - Initialized SentenceTransformer model: all-MiniLM-L6-v2
Batches: 100%|██████████| 1/1 [00:00<00:00, 44.29it/s]
2025-05-01 18:43:27,174 - INFO - Query results for: 'Instagram’s head Adam Mosseri'
2025-05-01 18:43:27,176 - INFO - All collections in Chroma DB:
2025-05-01 18:43:27,178 - INFO -   Name: toi_articles, Metadata: {'hnsw:space': 'cosine'}
