In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
def fetch_nytimes_search_results(company_name):
    """
    Fetches search results from the New York Times website for a given company name.

    The search is restricted to the Business section, sorted by "best" and
    limited to articles (see the query string below).

    Args:
        company_name (str): The name of the company to search for. It is
            percent-encoded before being placed in the URL, so names containing
            spaces or reserved characters (e.g. "Johnson & Johnson") are safe.

    Returns:
        requests.Response | None: The HTTP response from the NYTimes search URL,
        or None if the request failed (network error, timeout, or HTTP error
        status). The error is printed in that case.
    """
    # Imported locally so this cell does not depend on another cell's imports.
    from urllib.parse import quote_plus

    # Define headers to mimic a real browser request
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }

    # Encode the user-supplied term: a raw "&" or "#" would otherwise terminate
    # the `query` parameter and silently change the search.
    encoded_query = quote_plus(str(company_name))

    # Construct the search URL
    search_url = f"https://www.nytimes.com/search?dropmab=false&lang=en&query={encoded_query}&sections=Business%7Cnyt%3A%2F%2Fsection%2F0415b2b0-513a-5e78-80da-21ab770cb753&sort=best&types=article"

    try:
        # Make the HTTP GET request
        response = requests.get(search_url, headers=headers, timeout=15)
        response.raise_for_status()  # Raise an exception for HTTP errors
        return response
    except requests.exceptions.RequestException as e:
        print(f"An error occurred while fetching search results: {e}")
        return None

In [8]:
def extract_article_info(url_response):
    """Extracts article details from a NYTimes search results response.

    Every <a href=...> element that contains a title heading is treated as one
    article. Elements are located by the site's generated CSS class names, so a
    change to the page markup yields an empty result rather than an error.

    Args:
        url_response: A response object exposing the page HTML as ``.text``,
            or None (e.g. when the fetch failed).

    Returns:
        list[dict]: One dictionary per article with the keys "link", "title",
        "source", "author", "timestamp" and "summary". Fields that are not
        present in the markup are None. An empty list is returned when
        ``url_response`` is None.
    """
    articles = []

    # A failed fetch is reported as None; there is nothing to parse.
    if url_response is None:
        return articles

    soup = BeautifulSoup(url_response.text, "html.parser")

    # Find all <a> tags that contain article information
    for a_tag in soup.find_all("a", href=True):
        try:
            # Extract the link
            link = a_tag["href"]

            # Extract the title
            title_tag = a_tag.find("h4", class_="css-nsjm9t")
            title = title_tag.get_text(strip=True) if title_tag else None

            # Only proceed if the title is available (assume it's an article)
            if not title:
                continue

            # Extract summary
            summary_tag = a_tag.find("p", class_="css-e5tzus")
            summary = summary_tag.get_text(strip=True) if summary_tag else None

            # Extract source
            source_tag = a_tag.find("span", class_="css-chk81a")
            source = source_tag.get_text(strip=True) if source_tag else None

            # Extract author
            author_tag = a_tag.find("p", class_="css-1engk30")
            author = author_tag.get_text(strip=True) if author_tag else None

            # The timestamp is the text node that follows the marker span.
            timestamp_span = a_tag.find("span", class_="css-1t2tqhf")
            sibling = timestamp_span.next_sibling if timestamp_span else None
            timestamp = None
            # Only text nodes (str subclasses) carry a timestamp; a sibling
            # element must not cause the whole article to be discarded.
            if isinstance(sibling, str):
                timestamp = sibling.strip()
                if timestamp:
                    # Keep the first two comma-separated parts and strip each
                    # so the result does not contain a double space.
                    timestamp = ", ".join(
                        part.strip() for part in timestamp.split(",")[:2]
                    )

            # Add article info to the list
            articles.append({
                'link': link,
                'title': title,
                'source': source,
                'author': author,
                'timestamp': timestamp,
                'summary': summary
            })
        except Exception as e:
            # Best effort: one malformed entry should not abort the whole page.
            print(f"Error processing an article: {e}")

    return articles

In [23]:
# Fetch the NYTimes Business search results for the company and parse them.
url_response = fetch_nytimes_search_results("Tesla")
# fetch_nytimes_search_results returns None when the request fails; fall back
# to an empty article list so the following cells still run.
articles = extract_article_info(url_response) if url_response is not None else []

### Analyze

In [24]:
import nltk
nltk.download("vader_lexicon")
from nltk.sentiment import SentimentIntensityAnalyzer

class SentimentAnalyzer:
    """
    A class to perform sentiment analysis on a list of articles using VADER.

    Each article's "summary" text is scored with VADER's compound score and
    mapped to a "positive" / "negative" / "neutral" label using two
    configurable thresholds.
    """

    def __init__(self, articles, positive_threshold=0.25, negative_threshold=-0.25):
        """
        Initializes the SentimentAnalyzer with a list of articles.

        Args:
            articles (list): A list of dictionaries containing article details.
            positive_threshold (float): Compound scores greater than or equal
                to this value are labelled "positive". Defaults to 0.25.
            negative_threshold (float): Compound scores less than or equal to
                this value are labelled "negative". Defaults to -0.25.

        Raises:
            ValueError: If ``negative_threshold`` is greater than
                ``positive_threshold`` (the ranges would overlap).
        """
        if negative_threshold > positive_threshold:
            raise ValueError(
                "negative_threshold must not be greater than positive_threshold"
            )

        self.articles = articles
        self.positive_threshold = positive_threshold
        self.negative_threshold = negative_threshold
        self.sia = SentimentIntensityAnalyzer()  # Initialize VADER sentiment analyzer

    def analyze_sentiment(self, text):
        """
        Analyzes the sentiment of the given text using VADER.

        Args:
            text (str): The text to analyze.

        Returns:
            str: The sentiment label ("positive", "negative", or "neutral").
            Empty or missing text is labelled "neutral".
        """
        if not text:
            return "neutral"  # Return neutral if text is empty

        # Only the compound score is used for the label.
        compound = self.sia.polarity_scores(text)["compound"]

        # Determine sentiment based on compound score
        if compound >= self.positive_threshold:
            return "positive"
        if compound <= self.negative_threshold:
            return "negative"
        return "neutral"

    def analyze_articles(self):
        """
        Performs sentiment analysis on all articles in the list.

        The label is computed from each article's "summary" value and stored
        under the "sentiment" key of the same dictionary, i.e. the articles
        passed to the constructor are modified in place.

        Returns:
            list: The same list of dictionaries, with "sentiment" added.
        """
        for article in self.articles:
            # Add sentiment to the article dictionary
            article["sentiment"] = self.analyze_sentiment(article.get("summary"))

        return self.articles

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [29]:
# Initialize the analyzer and label every article's summary.
# NOTE: analyze_articles() writes a "sentiment" key into each dict of `articles`
# in place and returns that same list, so `results` and `articles` refer to the
# same object.
sentiment_analyzer = SentimentAnalyzer(articles)
results = sentiment_analyzer.analyze_articles()

In [26]:
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Download necessary NLTK resources
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

class NewsTopicExtractor:
    """Extracts a short list of topic keywords from a news summary.

    Candidates are taken from spaCy named entities and nouns; when too few are
    found, the highest-scoring TF-IDF terms of the summary are added.
    """

    # Named-entity labels accepted as topics.
    _ENTITY_LABELS = ('ORG', 'PERSON', 'GPE', 'PRODUCT')
    # Part-of-speech tags accepted as topics.
    _NOUN_TAGS = ('PROPN', 'NOUN')

    def __init__(self):
        """Loads the spaCy English model (downloading it if missing) and the
        NLTK English stop-word list."""
        # Load spaCy English model
        try:
            self.nlp = spacy.load('en_core_web_sm')
        except OSError:
            print("Downloading spaCy English model...")
            spacy.cli.download("en_core_web_sm")
            self.nlp = spacy.load('en_core_web_sm')

        # Stop words to filter out
        self.stop_words = set(stopwords.words('english'))

    @staticmethod
    def _unique_capitalized(candidates):
        """Capitalizes each candidate and drops repeats, keeping first-seen order."""
        return list(dict.fromkeys(text.capitalize() for text in candidates))

    def extract_topics(self, summary, num_topics=3):
        """
        Extract topics from a news summary

        Args:
            summary (str): News summary text. None, non-string or blank input
                yields an empty list.
            num_topics (int): Number of topics to extract

        Returns:
            list: Up to ``num_topics`` capitalized topics without duplicates,
            named entities first, then nouns, then TF-IDF terms.
        """
        # Nothing to analyse: avoid passing None/blank text to spaCy or TF-IDF.
        if not isinstance(summary, str) or not summary.strip():
            return []

        # Process the summary with spaCy
        doc = self.nlp(summary)

        # Named entities come first so they take priority over plain nouns.
        candidates = [ent.text for ent in doc.ents
                      if ent.label_ in self._ENTITY_LABELS]

        # Add important nouns and proper nouns
        candidates.extend(token.text for token in doc
                          if token.pos_ in self._NOUN_TAGS
                          and token.text.lower() not in self.stop_words
                          and len(token.text) > 2)

        # De-duplicate AFTER capitalizing: otherwise "Tesla" and "tesla" both
        # survive and show up as two identical topics in the result.
        topics = self._unique_capitalized(candidates)

        # If not enough topics, use TF-IDF to extract more
        if len(topics) < num_topics:
            try:
                vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
                tfidf_matrix = vectorizer.fit_transform([summary])
            except ValueError:
                # Raised for an empty vocabulary (e.g. the summary contains
                # only stop words); keep whatever spaCy found.
                tfidf_topics = []
            else:
                feature_names = vectorizer.get_feature_names_out()
                tfidf_scores = tfidf_matrix.toarray()[0]

                # Get top TF-IDF terms
                top_indices = tfidf_scores.argsort()[-num_topics:][::-1]
                tfidf_topics = [feature_names[i] for i in top_indices]

            topics = self._unique_capitalized(topics + tfidf_topics)

        # Limit to num_topics
        return topics[:num_topics]


In [27]:
# Build the extractor once: constructing it loads the spaCy model, which is
# expensive and does not depend on the article being processed.
extractor = NewsTopicExtractor()

for article in articles:
    if not isinstance(article, dict):
        continue

    # Always provide a "topics" key so later cells can read it even when the
    # summary is missing or extraction fails.
    article["topics"] = []

    summary = article.get("summary")
    if not summary:
        continue

    try:
        article["topics"] = extractor.extract_topics(summary)
    except Exception as e:
        print(f"Error processing article: {e}")

In [28]:
# Show the extracted topics; articles without a "topics" key (extraction
# skipped or failed) print an empty list instead of raising KeyError.
for article in articles:
    print(article.get("topics", []))

['Autopilot', 'Regulators', 'Automaker']
['Twitter', 'Maker', 'Cars']
['Head', 'Accounting', 'Weeks']
['Disparity', 'Performance', 'Car']
['California', 'Autopilot', 'Family']
['Chicago', 'Temperatures', 'Batteries']
['Florida', 'Tesla', 'Crash']
['Elon musk', 'Tesla', 'Elon']
['Tesla', 'Furor', 'Markets']
['Trump', 'Teslas', 'President']
