In [2]:
import logging
import urllib.parse
from crewai.tools import BaseTool
from gnews import GNews
from newspaper import Article

# Configure logging for better feedback
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class EnhancedSearchTool(BaseTool):
    """A tool to search the internet for news articles and extract their content."""
    
    name: str = "Enhanced Search"
    description: str = (
        "Searches the internet for recent information and extracts the full content of relevant articles. "
        "Input should be a simple search query string."
    )

    def _run(self, query: str) -> str:
        """
        Runs the search query, fetches articles, and returns their concatenated content.
        
        Args:
            query (str): The search query string.
            
        Returns:
            str: The combined content of the fetched articles, or a message if no content is found.
        """
        try:
            logger.info(f"Initiating enhanced search for query: '{query}'")
            
            gn = GNews(language='en', country='us', period='7d')
            articles = gn.get_news(query)

            if not articles:
                return f"No news articles found for the query: '{query}'."

            full_content_list = []
            max_articles_to_process = 5
            processed_count = 0

            for article_meta in articles:
                if processed_count >= max_articles_to_process:
                    logger.info("Reached maximum number of articles to process. Stopping.")
                    break

                article_url_to_process = article_meta.get('url')
                if not article_url_to_process:
                    continue

                if "news.google.com/rss" in article_url_to_process:
                    try:
                        parsed_url = urllib.parse.urlparse(article_url_to_process)
                        query_params = urllib.parse.parse_qs(parsed_url.query)
                        if 'oc' in query_params:
                            potential_url = query_params['oc'][0]
                            # New, more specific validation: check for a recognizable domain
                            # A simple check for a period followed by 2-4 letters (or more)
                            if any(tld in potential_url for tld in ['.com', '.org', '.net', '.io', '.co', '.ai']):
                                article_url_to_process = potential_url
                            else:
                                logger.warning(f"Extracted string is not a valid URL: {potential_url}")
                                continue
                        else:
                            logger.warning(f"Could not extract original URL from Google redirect link: {article_url_to_process}")
                            continue
                    except Exception as e:
                        logger.error(f"Error parsing Google redirect URL: {e}")
                        continue
                
                # Final check: Ensure the URL has a valid scheme
                if not article_url_to_process.startswith(('http://', 'https://')):
                    article_url_to_process = f"https://{article_url_to_process}"
                    logger.info(f"Adding 'https://' scheme to URL: {article_url_to_process}")

                try:
                    news_article = Article(article_url_to_process)
                    news_article.download()
                    news_article.parse()
                    
                    content_text = news_article.text
                    title = news_article.title
                    
                    if len(content_text) > 200: 
                        full_content_list.append(f"Title: {title}\nURL: {article_url_to_process}\nContent:\n{content_text}")
                        processed_count += 1
                        logger.info(f"Successfully processed article: '{title}' from {article_url_to_process}")
                    else:
                        logger.warning(f"Skipping article at {article_url_to_process} due to insufficient content.")

                except Exception as e:
                    logger.error(f"Error fetching article at {article_url_to_process}: {e}")
                
                logger.info("=" * 80)
            
            if not full_content_list:
                return "No useful content found for the query after filtering. The tool might have encountered paywalls or non-parsable content."

            return "\n\n---\n\n".join(full_content_list)

        except Exception as e:
            logger.error(f"Search failed: {e}")
            return f"Search failed: {e}"

# --- Example Usage ---
if __name__ == "__main__":
    search_tool_instance = EnhancedSearchTool()
    
    query = "latest technology news"
    print(f"Executing search for query: '{query}'")
    search_results = search_tool_instance._run(query)
    
    print("\n--- Search Results ---")
    print(search_results)

08/25/2025 04:36:03 PM - Initiating enhanced search for query: 'the impact of generative AI on software development'


Executing search for query: 'the impact of generative AI on software development'


08/25/2025 04:36:23 PM - Extracted string is not a valid URL: 5
08/25/2025 04:36:23 PM - Extracted string is not a valid URL: 5
08/25/2025 04:36:23 PM - Extracted string is not a valid URL: 5
08/25/2025 04:36:23 PM - Extracted string is not a valid URL: 5
08/25/2025 04:36:23 PM - Extracted string is not a valid URL: 5
08/25/2025 04:36:23 PM - Extracted string is not a valid URL: 5
08/25/2025 04:36:23 PM - Extracted string is not a valid URL: 5
08/25/2025 04:36:23 PM - Extracted string is not a valid URL: 5
08/25/2025 04:36:23 PM - Extracted string is not a valid URL: 5
08/25/2025 04:36:23 PM - Extracted string is not a valid URL: 5
08/25/2025 04:36:23 PM - Extracted string is not a valid URL: 5
08/25/2025 04:36:23 PM - Extracted string is not a valid URL: 5
08/25/2025 04:36:23 PM - Extracted string is not a valid URL: 5
08/25/2025 04:36:23 PM - Extracted string is not a valid URL: 5
08/25/2025 04:36:23 PM - Extracted string is not a valid URL: 5
08/25/2025 04:36:23 PM - Extracted strin


--- Search Results ---
No useful content found for the query after filtering. The tool might have encountered paywalls or non-parsable content.
