In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
def fetch_nytimes_search_results(company_name):
    """
    Fetches search results from the New York Times website for a given company name.

    Args:
        company_name (str): The name of the company to search for.

    Returns:
        requests.Response: The HTTP response from the NYTimes search URL.
    """
    # Define headers to mimic a real browser request
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }

    # Construct the search URL
    search_url = f"https://www.nytimes.com/search?dropmab=false&lang=en&query={company_name}&sections=Business%7Cnyt%3A%2F%2Fsection%2F0415b2b0-513a-5e78-80da-21ab770cb753&sort=best&types=article"

    try:
        # Make the HTTP GET request
        response = requests.get(search_url, headers=headers, timeout=15)
        response.raise_for_status()  # Raise an exception for HTTP errors
        return response
    except requests.exceptions.RequestException as e:
        print(f"An error occurred while fetching search results: {e}")
        return None

In [3]:
def extract_article_info(url_response):
    """Extracts all the relevant information about the article from the URL response.
    Returns a list of dictionaries containing "title", "summary", "metadata", etc.
    """
    articles = []
    soup = BeautifulSoup(url_response.text, "html.parser")

    # Find all <a> tags that contain article information
    for a_tag in soup.find_all("a", href=True):
        try:
            # Extract the link
            link = a_tag["href"]

            # Extract the title
            title_tag = a_tag.find("h4", class_="css-nsjm9t")
            title = title_tag.get_text(strip=True) if title_tag else None

            # Only proceed if the title is available (assume it's an article)
            if title:
                # Extract summary
                summary_tag = a_tag.find("p", class_="css-e5tzus")
                summary = summary_tag.get_text(strip=True) if summary_tag else None

                # Extract source
                source_tag = a_tag.find("span", class_="css-chk81a")
                source = source_tag.get_text(strip=True) if source_tag else None

                # Extract author
                author_tag = a_tag.find("p", class_="css-1engk30")
                author = author_tag.get_text(strip=True) if author_tag else None

                # Extract and format timestamp
                timestamp_span = a_tag.find("span", class_="css-1t2tqhf")
                timestamp = None
                if timestamp_span and timestamp_span.next_sibling:
                    timestamp = timestamp_span.next_sibling.strip()
                    if timestamp:
                        timestamp = ", ".join(timestamp.split(",")[:2])  # Format timestamp

                # Add article info to the list
                articles.append({
                    'link': link,
                    'title': title,
                    'source': source,
                    'author': author,
                    'timestamp': timestamp,
                    'summary': summary
                })
        except Exception as e:
            print(f"Error processing an article: {e}")

    return articles

In [4]:
###
url_response = fetch_nytimes_search_results("Tesla")
articles = extract_article_info(url_response)

### Analyze

In [5]:
import nltk
nltk.download("vader_lexicon")
from nltk.sentiment import SentimentIntensityAnalyzer

class SentimentAnalyzer:
    """
    A class to perform sentiment analysis on a list of articles using VADER.
    """

    def __init__(self, articles):
        """
        Initializes the SentimentAnalyzer with a list of articles.

        Args:
            articles (list): A list of dictionaries containing article details.
        """
        self.articles = articles
        self.sia = SentimentIntensityAnalyzer()  # Initialize VADER sentiment analyzer

    def analyze_sentiment(self, text):
        """
        Analyzes the sentiment of the given text using VADER.

        Args:
            text (str): The text to analyze.

        Returns:
            str: The sentiment label ("positive", "negative", or "neutral").
        """
        if not text:
            return "neutral"  # Return neutral if text is empty

        # Get sentiment scores
        sentiment_scores = self.sia.polarity_scores(text)

        # Determine sentiment based on compound score
        if sentiment_scores["compound"] >= 0.25:
            return "positive"
        elif sentiment_scores["compound"] <= -0.25:
            return "negative"
        else:
            return "neutral"

    def analyze_articles(self):
        """
        Performs sentiment analysis on all articles in the list.

        Returns:
            list: A list of dictionaries with added sentiment analysis results.
        """
        for article in self.articles:
            sentiment = self.analyze_sentiment(article.get("summary"))
            article["sentiment"] = sentiment  # Add sentiment to the article dictionary

        return self.articles

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [6]:
# Initialize and analyze articles
sentiment_analyzer = SentimentAnalyzer(articles)
results = sentiment_analyzer.analyze_articles()

In [7]:
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Download necessary NLTK resources
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

class NewsTopicExtractor:
    def __init__(self):
        # Load spaCy English model
        try:
            self.nlp = spacy.load('en_core_web_sm')
        except OSError:
            print("Downloading spaCy English model...")
            spacy.cli.download("en_core_web_sm")
            self.nlp = spacy.load('en_core_web_sm')
        
        # Stop words to filter out
        self.stop_words = set(stopwords.words('english'))
    
    def extract_topics(self, summary, num_topics=3):
        """
        Extract topics from a news summary
        
        Args:
            summary (str): News summary text
            num_topics (int): Number of topics to extract
        
        Returns:
            list: Extracted topics
        """
        # Process the summary with spaCy
        doc = self.nlp(summary)
        
        # Extract named entities and nouns as potential topics
        potential_topics = []
        
        # Add named entities
        potential_topics.extend([ent.text for ent in doc.ents 
                                 if ent.label_ in ['ORG', 'PERSON', 'GPE', 'PRODUCT']])
        
        # Add important nouns and proper nouns
        potential_topics.extend([token.text for token in doc 
                                 if token.pos_ in ['PROPN', 'NOUN'] 
                                 and token.text.lower() not in self.stop_words
                                 and len(token.text) > 2])
        
        # Remove duplicates while preserving order
        topics = list(dict.fromkeys(potential_topics))
        
        # If not enough topics, use TF-IDF to extract more
        if len(topics) < num_topics:
            vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
            tfidf_matrix = vectorizer.fit_transform([summary])
            feature_names = vectorizer.get_feature_names_out()
            tfidf_scores = tfidf_matrix.toarray()[0]
            
            # Get top TF-IDF terms
            top_indices = tfidf_scores.argsort()[-num_topics:][::-1]
            tfidf_topics = [feature_names[i] for i in top_indices]
            
            topics.extend(tfidf_topics)
        
        # Ensure unique topics and limit to num_topics
        topics = list(dict.fromkeys(topics))[:num_topics]
        
        # Capitalize topics
        topics = [topic.capitalize() for topic in topics]
        
        return topics


In [9]:
for article in articles:
    try:
        if isinstance(article, dict) and 'summary' in article:
            extractor = NewsTopicExtractor()
            topic_list = extractor.extract_topics(article["summary"])
            article["topics"] = topic_list
    except Exception as e:
        print(f"Error processing article: {e}")

In [10]:
for article in articles:
    print(article["topics"])

['Autopilot', 'Regulators', 'Automaker']
['Twitter', 'Maker', 'Cars']
['Head', 'Accounting', 'Weeks']
['Disparity', 'Performance', 'Car']
['California', 'Autopilot', 'Family']
['Chicago', 'Temperatures', 'Batteries']
['Florida', 'Tesla', 'Crash']
['Elon musk', 'Tesla', 'Elon']
['Tesla', 'Furor', 'Markets']
['Trump', 'Teslas', 'President']


In [20]:
def analyze_article_topics_pairs(articles):
    """
    Analyzes topics across pairs of articles to find common and unique topics,
    with case-insensitive comparison while preserving original case in results.
    Returns a single list of common words across pairs.

    Args:
        articles: List of dictionaries, each containing a 'topics' key with a list of words

    Returns:
        A dictionary with:
        - 'common_words_across_pairs': List of words common to pairs of articles
        - 'unique_words_in_article_X': List of words unique to each article (X is index, original case preserved)
    """
    if not articles:
        return {}

    processed_articles = []
    for article in articles:
        if 'topics' not in article or not isinstance(article['topics'], list):
            processed_articles.append({'original': [], 'normalized': set(), 'case_mapping': {}})
            continue

        case_mapping = {}
        normalized_topics = set()

        for word in article['topics']:
            normalized = word.lower()
            if normalized not in case_mapping:
                case_mapping[normalized] = word
            normalized_topics.add(normalized)

        processed_articles.append({
            'original': article['topics'],
            'normalized': normalized_topics,
            'case_mapping': case_mapping
        })

    result = {
        'common_words_across_pairs': []
    }

    # Find common words for each pair of articles and flatten into a single list
    for i in range(0, len(processed_articles) - 1, 2):
        article1 = processed_articles[i]
        article2 = processed_articles[i + 1]

        common_normalized = article1['normalized'].intersection(article2['normalized'])

        if common_normalized:
            common_words = [article1['case_mapping'].get(word, word) for word in common_normalized]
            result['common_words_across_pairs'].extend(common_words)

    # Find unique words for each article
    for i, article in enumerate(processed_articles):
        other_sets = [processed_articles[j]['normalized'] for j in range(len(processed_articles)) if j != i]

        unique_normalized = article['normalized'] - set.union(*other_sets) if other_sets else article['normalized']

        unique_words = []
        if unique_normalized:
            original_topics = article['original']
            unique_words = [word for word in original_topics if word.lower() in unique_normalized]

        result[f'unique_words_in_article_{i+1}'] = unique_words

    return result

In [21]:
top_ans = analyze_article_topics_pairs(articles)

In [22]:
print(top_ans)

{'common_words_across_pairs': ['Tesla'], 'unique_words_in_article_1': ['Regulators', 'Automaker'], 'unique_words_in_article_2': ['Twitter', 'Maker', 'Cars'], 'unique_words_in_article_3': ['Head', 'Accounting', 'Weeks'], 'unique_words_in_article_4': ['Disparity', 'Performance', 'Car'], 'unique_words_in_article_5': ['California', 'Family'], 'unique_words_in_article_6': ['Chicago', 'Temperatures', 'Batteries'], 'unique_words_in_article_7': ['Florida', 'Crash'], 'unique_words_in_article_8': ['Elon musk', 'Elon'], 'unique_words_in_article_9': ['Furor', 'Markets'], 'unique_words_in_article_10': ['Trump', 'Teslas', 'President']}


## Coverage comparision


In [14]:
import json
import os
from dotenv import load_dotenv
from google import genai

load_dotenv()
gemini_api_key = os.getenv("GEMINI_API_KEY")

class CoverageComparison:

    def __init__(self, api_key):
        """
        Initialize the CoverageComparison class.
        
        Args:
            api_key (str, optional): Gemini API key. If not provided, will try to load from environment.
        """
        self.api_key = api_key

    def compare_two_articles(self, i, article1, article2, gemini_api_key):
        """
        Generates a precise one-line comparison between two articles using Google's Gemini Flash model.
        
        Args:
            article1 (dict): Dictionary with 'title' and 'summary' keys
            article2 (dict): Dictionary with 'title' and 'summary' keys
            gemini_api_key (str): Your Gemini API key
        
        Returns:
            dict: {"Comparison": "one-line", "Impact": "one-line"}
        """
        client = genai.Client(api_key=self.api_key)
        
        prompt = f"""Compare these articles and respond in JSON format :

        Article {i} - Title: {article1.get('title','')}
        Summary: {article1.get('summary','')}

        Article {i+1} - Title: {article2.get('title','')}
        Summary: {article2.get('summary','')}

        Provide:
        1. "Comparison": One sentence highlighting key difference
        2. "Impact": One sentence on practical consequence

        Note: It should strictly avoid any other extra words like "JSON", etc.

        Format exactly like this:
        {{
            "Comparison": "Your one-line comparison here",
            "Impact": "Your one-line impact here"
        }}"""
        
        try:
            response = client.models.generate_content(
                model="gemini-2.0-flash",  # Using the faster Flash model
                contents=[prompt],
                
            )
            
            
            # Extract the text and remove any markdown code block formatting
            response_text = response.text.strip('`json\n').strip('`').strip()

            # Parse the JSON response
            result = json.loads(response_text) 

            return {
                "Comparison": " ".join(result["Comparison"].split()),
                "Impact": " ".join(result["Impact"].split())
            }
        except json.JSONDecodeError as je:
            print(f"JSON Parsing Error: {je}")
            print(f"Received response: {response.text}")
            return {
                "Comparison": "Parsing error",
                "Impact": "Unable to parse response"
            }
        except Exception as e:
            print(f"API Error: {e}")
            return {
                "Comparison": "Comparison unavailable",
                "Impact": "Impact analysis failed"
            }

    def get_analysis_across_all(self, articles):
        """
        Generates article comparisons across all the articles in pairs.
        Args:
            articles (list): List of dictionaries of articles with 'title' and 'summary' keys
            gemini_api_key (str): Your Gemini API key

        Returns:
            list: List of dictionaries, each with "Comparison" and "Impact" keys.
        """
        comparison_list = []
        if len(articles) < 2:
            return comparison_list # return empty list if less than 2 articles.

        for i in range(0, len(articles) - 1, 2):
            try:
                ans_dict = self.compare_two_articles(i + 1, articles[i], articles[i + 1], self.api_key)
                comparison_list.append(ans_dict)
            except IndexError: # handle the case where there is an odd number of articles.
                print("Odd number of articles, last one will be ignored")
                break
            except Exception as e:
                print(f"An unexpected error occurred: {e}")
        return comparison_list
    
    def get_final_sentiment_analysis(self, comparisons):
        """
        Generates a final sentiment analysis based on the impact of all article comparisons.
        
        Args:
            comparisons (list): List of dictionaries, each with "Comparison" and "Impact" keys.
        
        Returns:
            dict: {"Final Sentiment Analysis": "Two-line sentiment analysis"}
        """
        client = genai.Client(api_key=self.api_key)
        
        impacts = [comp["Impact"] for comp in comparisons]
        impacts_str = "\n".join(impacts)

        prompt = f"""Analyze the following impacts from news coverage and provide a final sentiment analysis in two lines:

        {impacts_str}

        Specifically address:
        1. Whether the overall news coverage is positive or negative.
        2. The overall impact on the company's market growth.

        Respond in JSON format:
        {{
            "Final Sentiment Analysis": "Your two-line analysis here"
        }}

        Note: It should strictly avoid any other extra words like "JSON", etc."""

        try:
            response = client.models.generate_content(
                model="gemini-2.0-flash",
                contents=[prompt],
            )
            response_text = response.text.strip('`json\n').strip('`').strip()
            result = json.loads(response_text)
            return {"Final Sentiment Analysis": " ".join(result["Final Sentiment Analysis"].split())}
        except json.JSONDecodeError as je:
            print(f"JSON Parsing Error: {je}")
            print(f"Received response: {response.text}")
            return {"Final Sentiment Analysis": "Parsing error"}
        except Exception as e:
            print(f"API Error: {e}")
            return {"Final Sentiment Analysis": "Analysis failed"}
        
    def get_all_analysis(self, articles):
        """
        Generates all comparison analysis and the final sentiment analysis.
        
        Args:
            articles (list): List of dictionaries of articles with 'title' and 'summary' keys.
        
        Returns:
            tuple: (comparison_list, final_sentiment)
        """
        comparison_list = self.get_analysis_across_all(articles)
        final_sentiment = self.get_final_sentiment_analysis(comparison_list)
        return comparison_list, final_sentiment

In [15]:
compare = CoverageComparison(gemini_api_key)
c_list, final = compare.get_all_analysis(articles)

In [16]:
final

{'Final Sentiment Analysis': 'The news coverage leans negative due to concerns about recalls, profitability, leadership, and increased scrutiny. This could hinder market growth by impacting investor confidence, consumer decisions, and overall brand perception.'}

In [24]:
!pip install deep_translator

Collecting deep_translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
   ---------------------------------------- 0.0/42.3 kB ? eta -:--:--
   -------------------------------------- - 41.0/42.3 kB 1.9 MB/s eta 0:00:01
   ---------------------------------------- 42.3/42.3 kB 1.0 MB/s eta 0:00:00
Installing collected packages: deep_translator
Successfully installed deep_translator-1.11.4



[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [31]:
import os
import shutil
from gtts import gTTS
from deep_translator import GoogleTranslator

class TextToSpeechConverter:
    def __init__(self, output_directory="audio_outputs"):
        self.output_directory = output_directory
        
        # Remove existing directory and create a fresh one
        if os.path.exists(self.output_directory):
            shutil.rmtree(self.output_directory)
        
        # Create a new, empty directory
        os.makedirs(self.output_directory)

    def convert_english_to_hindi_audio(self, english_text, filename=None):
        """
        Translates the given English text to Hindi, converts it to audio, and saves it as an MP3 file.

        Args:
            english_text (str): The English text to convert to audio.
            filename (str, optional): The desired filename for the audio output. 
                                      If None, generates a default filename.
        
        Returns:
            str or None: Path to the saved audio file, or None if an error occurs.
        """
        try:
            # Generate a default filename if not provided
            if filename is None:
                filename = f"news_coverage.mp3"
            
            # Translate English to Hindi using deep_translator
            hindi_text = GoogleTranslator(source='auto', target='hi').translate(english_text)

            # Convert Hindi text to audio
            tts = gTTS(text=hindi_text, lang='hi')
            filepath = os.path.join(self.output_directory, filename)
            tts.save(filepath)
            
            print(f"English text: {english_text}")
            print(f"Hindi translation: {hindi_text}")
            print(f"Audio saved to: {filepath}")
            return filepath
        except Exception as e:
            print(f"Error during audio conversion: {e}")
            return None

In [32]:
# Example Usage
converter = TextToSpeechConverter()
audio_file_path = converter.convert_english_to_hindi_audio(final["Final Sentiment Analysis"])

English text: The news coverage leans negative due to concerns about recalls, profitability, leadership, and increased scrutiny. This could hinder market growth by impacting investor confidence, consumer decisions, and overall brand perception.
Hindi translation: समाचार कवरेज रिकॉल, लाभप्रदता, नेतृत्व और बढ़ी हुई जांच के बारे में चिंताओं के कारण नकारात्मक झुकता है। यह निवेशक के विश्वास, उपभोक्ता निर्णयों और समग्र ब्रांड धारणा को प्रभावित करके बाजार की वृद्धि में बाधा डाल सकता है।
Audio saved to: audio_outputs\news_coverage.mp3
