In [1]:
import requests
import json
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
import os
from datetime import datetime
import yfinance as yf
from datetime import datetime, timedelta

In [2]:
# Download NLTK resources (only need to run once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [3]:
# NewsAPI parameters
api_key = ""  # Replace with your actual NewsAPI key
companies = [
    "Apple", "Microsoft", "Nvidia", "TSMC", "Tesla",
    "Walmart", "Visa", "JPMorgan", "Tencent", "United Health",
    "Costco", "Netflix", "Johnson & Johnson", "Novo Nordisk",
    "Alibaba", "Hermes", "Nestle", "Cisco", "Palantir"
]
start_date = "2025-03-22"
end_date = "2025-03-23"

# Create a directory for results if it doesn't exist
results_dir = "company_sentiment_results"
os.makedirs(results_dir, exist_ok=True)

# Summary DataFrame to store aggregate results for all companies
summary_results = pd.DataFrame(columns=[
    'company', 'articles_count', 'avg_sentiment_raw',
    'avg_sentiment_processed', 'combined_sentiment', 'predicted_movement',
    'positive_count', 'neutral_count', 'negative_count'
])

In [4]:
# Preprocessing function
def preprocess_text(text):
    if pd.isna(text):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove special characters and numbers
    text = re.sub(r'\@\w+|\#|\d+', '', text)

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Join tokens back into text
    processed_text = ' '.join(tokens)

    return processed_text

# Function to get sentiment scores using TextBlob
def analyze_sentiment(text):
    if pd.isna(text) or text == "":
        return 0  # neutral score for empty text

    # Get sentiment using TextBlob (polarity ranges from -1 to 1)
    blob = TextBlob(text)
    return blob.sentiment.polarity


##########################################0.15 -> 0.1
# Categorize sentiment
def categorize_sentiment(score):
    if score >= 0.1:
        return 'Positive'
    elif score <= -0.1:
        return 'Negative'
    else:
        return 'Neutral'

# Function to predict market movement
def predict_movement(score):
    if score > 0.1:
        return "UP"
    elif score < -0.1:
        return "DOWN"
    else:
        return "NEUTRAL"

# Function to convert company name to ticker symbol
def company_to_ticker(company_name):
    # This is a simple mapping - in a production environment, you might want a more robust solution
    ticker_mapping = {
        "Apple": "AAPL",
        "Microsoft": "MSFT",
        "Nvidia": "NVDA",
        "TSMC": "TSM",
        "Tesla": "TSLA",
        "Walmart": "WMT",
        "Visa": "V",
        "JPMorgan": "JPM",
        "Tencent": "TCEHY",
        "United Health": "UNH",
        "Costco": "COST",
        "Netflix": "NFLX",
        "Johnson & Johnson": "JNJ",
        "Novo Nordisk": "NVO",
        "Alibaba": "BABA",
        "SAP": "SAP",
        "Hermes": "RMS.PA",
        "Nestle": "NSRGY",
        "Cisco": "CSCO",
        "Palantir": "PLTR"
    }

    return ticker_mapping.get(company_name)

In [5]:
# Process each company
for company in companies:
    print(f"\n{'='*50}")
    print(f"Processing {company}...")

    # NewsAPI endpoint
    url = f"https://newsapi.org/v2/everything?q={company}&from={start_date}&to={end_date}&language=en&sortBy=publishedAt&apiKey={api_key}"

    # Make the request
    response = requests.get(url)
    data = response.json()

    # Process the results
    if data['status'] == 'ok' and len(data.get('articles', [])) > 0:
        # Create a list to store article data
        articles_data = []

        for article in data['articles']:
            article_info = {
                'title': article.get('title', ''),
                'description': article.get('description', ''),
                'content': article.get('content', ''),
                'url': article.get('url', ''),
                'publishedAt': article.get('publishedAt', '')
            }
            articles_data.append(article_info)

        # Convert to DataFrame
        df = pd.DataFrame(articles_data)

        # Apply preprocessing to text columns
        df['processed_title'] = df['title'].apply(preprocess_text)
        df['processed_description'] = df['description'].apply(preprocess_text)
        df['processed_content'] = df['content'].apply(preprocess_text)

        # Apply sentiment analysis on both raw and processed text
        # Raw text sentiment analysis
        df['title_sentiment_raw'] = df['title'].apply(analyze_sentiment)
        df['description_sentiment_raw'] = df['description'].apply(analyze_sentiment)
        df['content_sentiment_raw'] = df['content'].apply(analyze_sentiment)

        # Processed text sentiment analysis
        df['title_sentiment_processed'] = df['processed_title'].apply(analyze_sentiment)
        df['description_sentiment_processed'] = df['processed_description'].apply(analyze_sentiment)
        df['content_sentiment_processed'] = df['processed_content'].apply(analyze_sentiment)

        # Calculate weighted average sentiment
        df['overall_sentiment_raw'] = (
            df['title_sentiment_raw'] * 0.5 +
            df['description_sentiment_raw'] * 0.4 +
            df['content_sentiment_raw'] * 0.1
        )

        df['overall_sentiment_processed'] = (
            df['title_sentiment_processed'] * 0.5 +
            df['description_sentiment_processed'] * 0.4 +
            df['content_sentiment_processed'] * 0.1
        )

        # Categorize sentiment
        df['sentiment_category_raw'] = df['overall_sentiment_raw'].apply(categorize_sentiment)
        df['sentiment_category_processed'] = df['overall_sentiment_processed'].apply(categorize_sentiment)

        # Save detailed results for this company
        company_filename = f"{company.replace(' ', '_').lower()}_sentiment_{start_date}_to_{end_date}.csv"
        company_filepath = os.path.join(results_dir, company_filename)
        df.to_csv(company_filepath, index=False)

        # Calculate summary metrics
        raw_avg = df['overall_sentiment_raw'].mean()
        processed_avg = df['overall_sentiment_processed'].mean()
        combined_avg = (raw_avg + processed_avg) / 2
        sentiment_prediction = predict_movement(combined_avg)

        # Count sentiment categories
        sentiment_counts = df['sentiment_category_processed'].value_counts()
        positive_count = sentiment_counts.get('Positive', 0)
        neutral_count = sentiment_counts.get('Neutral', 0)
        negative_count = sentiment_counts.get('Negative', 0)

        # Add to summary results
        summary_results = pd.concat([summary_results, pd.DataFrame([{
            'company': company,
            'articles_count': len(df),
            'avg_sentiment_raw': raw_avg,
            'avg_sentiment_processed': processed_avg,
            'combined_sentiment': combined_avg,
            'predicted_movement': sentiment_prediction,
            'positive_count': positive_count,
            'neutral_count': neutral_count,
            'negative_count': negative_count
        }])], ignore_index=True)

        # Display summary for this company
        print(f"\nSentiment Analysis Summary for {company}:")
        print(f"Articles analyzed: {len(df)}")
        print(f"Average Raw Sentiment: {raw_avg:.4f}")
        print(f"Average Processed Sentiment: {processed_avg:.4f}")
        print(f"Combined Average Sentiment: {combined_avg:.4f}")
        print(f"Predicted Market Movement: {sentiment_prediction}")

        # Display sentiment distribution
        print("\nSentiment Distribution:")
        print(df['sentiment_category_processed'].value_counts())

        # Display most positive/negative headlines
        print("\nMost Positive Headlines:")
        print(df.nlargest(3, 'title_sentiment_raw')[['title', 'title_sentiment_raw']])

        print("\nMost Negative Headlines:")
        print(df.nsmallest(3, 'title_sentiment_raw')[['title', 'title_sentiment_raw']])

    else:
        print(f"Error retrieving data for {company}: {data.get('message', 'Unknown error')}")
        # Add empty row to summary with error indication
        summary_results = pd.concat([summary_results, pd.DataFrame([{
            'company': company,
            'articles_count': 0,
            'avg_sentiment_raw': None,
            'avg_sentiment_processed': None,
            'combined_sentiment': None,
            'predicted_movement': 'ERROR',
            'positive_count': 0,
            'neutral_count': 0,
            'negative_count': 0
        }])], ignore_index=True)

# Save summary results
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
summary_filepath = os.path.join(results_dir, f"all_companies_summary_{timestamp}.csv")
summary_results.to_csv(summary_filepath, index=False)



Processing Apple...


  summary_results = pd.concat([summary_results, pd.DataFrame([{



Sentiment Analysis Summary for Apple:
Articles analyzed: 97
Average Raw Sentiment: 0.0859
Average Processed Sentiment: 0.0680
Combined Average Sentiment: 0.0769
Predicted Market Movement: NEUTRAL

Sentiment Distribution:
sentiment_category_processed
Neutral     50
Positive    36
Negative    11
Name: count, dtype: int64

Most Positive Headlines:
                                                title  title_sentiment_raw
27  Adam Scott Describes Fan Commitment to ‘Severa...                  0.7
64          7 Must-See Shows If You Loved The Expanse                  0.7
5   Is This Why So Many People Are Switching To Wi...                  0.5

Most Negative Headlines:
                                                title  title_sentiment_raw
2   Euphoria star Austin Abrams eyed for Resident ...                 -1.0
34  Apple Maps bug randomly showing airport baggag...                 -0.5
77  Gel’s Streaming Accounts Got Hacked With Some ...                 -0.5

Processing Microsoft...



In [6]:
# Display final comparison
print("\n\n" + "="*70)
print("FINAL COMPANY SENTIMENT COMPARISON")
print("="*70)

# Sort by combined sentiment for better visualization
sorted_summary = summary_results.sort_values(by='combined_sentiment', ascending=False)

# Display company comparison
print(sorted_summary[['company', 'articles_count', 'combined_sentiment', 'predicted_movement']])

# Calculate market-wide sentiment
valid_results = summary_results.dropna(subset=['combined_sentiment'])
if len(valid_results) > 0:
    market_sentiment = valid_results['combined_sentiment'].mean()
    print(f"\nOverall Market Sentiment: {market_sentiment:.4f}")
    print(f"Market Direction Prediction: {predict_movement(market_sentiment)}")

print(f"\nDetailed results saved to: {results_dir}")
print(f"Summary comparison saved to: {summary_filepath}")



FINAL COMPANY SENTIMENT COMPARISON
              company articles_count  combined_sentiment predicted_movement
16             Nestle              8            0.173193                 UP
11            Netflix             94            0.120195                 UP
18           Palantir             11            0.110805                 UP
5             Walmart             97            0.108415                 UP
1           Microsoft             98            0.094148            NEUTRAL
3                TSMC             16            0.080293            NEUTRAL
17              Cisco             12            0.079959            NEUTRAL
13       Novo Nordisk              8            0.078614            NEUTRAL
10             Costco             48            0.077948            NEUTRAL
0               Apple             97            0.076911            NEUTRAL
7            JPMorgan            100            0.074719            NEUTRAL
12  Johnson & Johnson             93            0.0

In [7]:
# Parse dates
start_date_dt = datetime.strptime(start_date, "%Y-%m-%d")
end_date_dt = datetime.strptime(end_date, "%Y-%m-%d")

# Get the day before start_date and the day after end_date
day_before_start = (start_date_dt - timedelta(days=1)).strftime("%Y-%m-%d")
day_after_end = (end_date_dt + timedelta(days=1)).strftime("%Y-%m-%d")
end = (end_date_dt + timedelta(days=2)).strftime("%Y-%m-%d")

In [8]:
print("\n\n" + "="*70)
print("PREDICTION VALIDATION AGAINST ACTUAL STOCK MOVEMENT")
print("="*70)

# Filter companies that are not predicted as NEUTRAL and have at least 30 articles
companies_to_check = summary_results[(summary_results['predicted_movement'] != 'NEUTRAL') &
                                    (summary_results['articles_count'] >= 30)]

# Create a DataFrame to store validation results
validation_results = pd.DataFrame(columns=[
    'company', 'ticker', 'prediction', 'before_close',
    'comparison_price', 'actual_movement', 'result'
])

# If no companies match the criteria
if len(companies_to_check) == 0:
    print("No companies match the criteria (non-NEUTRAL prediction with at least 30 articles)")
else:
    # Initialize counters for hit rate calculation
    total_checks = 0
    total_hits = 0

    # Check each qualifying company
    for index, row in companies_to_check.iterrows():
        company_name = row['company']
        prediction = row['predicted_movement']

        # Get ticker for the company
        ticker = company_to_ticker(company_name)
        if ticker:
            try:
                # Get stock data for the day before start_date
                before_data = yf.download(ticker, start=day_before_start, end=start_date, progress=False)
                # Get stock data for the day after end_date
                after_data = yf.download(ticker, start=end_date, end=end, progress=False)

                if not before_data.empty and not after_data.empty:
                    # Get close price from day before start_date
                    close_price = before_data.iloc[0]['Close'].item()

                    # For UP predictions, use the high price
                    # For DOWN predictions, use the low price
                    if prediction == "UP":
                        comparison_price = after_data.iloc[0]['High'].item()
                        actual_movement = "UP" if comparison_price > close_price else "DOWN"
                    else:  # prediction == "DOWN"
                        comparison_price = after_data.iloc[0]['Low'].item()
                        actual_movement = "DOWN" if comparison_price < close_price else "UP"

                    # Compare with prediction
                    result = "HIT" if prediction == actual_movement else "MISS"

                    # Update counters
                    total_checks += 1
                    if result == "HIT":
                        total_hits += 1

                    # Add to validation results
                    new_row = {
                        'company': company_name,
                        'ticker': ticker,
                        'prediction': prediction,
                        'before_close': close_price,
                        'comparison_price': comparison_price,
                        'actual_movement': actual_movement,
                        'result': result
                    }
                    validation_results = pd.concat([validation_results, pd.DataFrame([new_row])], ignore_index=True)
                else:
                    print(f"{company_name} ({ticker}): Insufficient stock data available")

            except Exception as e:
                print(f"{company_name} ({ticker}): Error retrieving stock data - {str(e)}")
        else:
            print(f"{company_name}: No ticker symbol mapping available")

    # Calculate and print hit rate
    if total_checks > 0:
        hit_rate = (total_hits / total_checks) * 100
        validation_results.attrs['hit_rate'] = hit_rate
    else:
        print("\nNo valid stock data was available for comparison")

# Save the validation results
print(validation_results)
print(f"Total predictions checked: {total_checks}")
print(f"Total hits: {total_hits}")
if total_checks > 0:
    hit_rate = (total_hits / total_checks) * 100
    print(f"Hit rate: {hit_rate:.2f}%")



PREDICTION VALIDATION AGAINST ACTUAL STOCK MOVEMENT
YF.download() has changed argument auto_adjust default to True
   company ticker prediction  before_close  comparison_price actual_movement  \
0  Walmart    WMT         UP     85.980003         87.650002              UP   
1  Netflix   NFLX         UP    960.289978        977.000000              UP   

  result  
0    HIT  
1    HIT  
Total predictions checked: 2
Total hits: 2
Hit rate: 100.00%


  validation_results = pd.concat([validation_results, pd.DataFrame([new_row])], ignore_index=True)
