In [15]:
import pandas as pd
from newspaper import Article
import requests
import numpy as np

In [4]:
# Import the appropriate functions from the vader library 
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [77]:
class SentimentScoreAnalyzer:
    """Score news articles for one stock with VADER and write the result to CSV.

    An article counts as relevant to ``stock_symbol`` when any entry of
    ``keywords`` occurs, case-insensitively and as a plain substring, in the
    article title or body. Relevant articles receive VADER's ``compound``
    score; every other article receives 0.
    """

    # Seconds to wait on a server before abandoning an article download.
    # Without a timeout, requests may block indefinitely on a stalled host.
    REQUEST_TIMEOUT = 15

    # Browser-like User-Agent sent with every download request.
    _REQUEST_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
    }

    # Messages printed for specific HTTP status codes; other codes fall back
    # to a generic message that includes the exception text.
    _HTTP_ERROR_MESSAGES = {
        403: "Access forbidden for URL: {url}",
        401: "Unauthorized access for URL: {url}",
        404: "Article not found at URL: {url}",
        422: "Unprocessable content at URL: {url}",
        502: "Bad gateway error for URL: {url}. Retrying might help.",
    }

    def __init__(self, keywords, stock_symbol):
        """Store the keyword list and ticker and create the VADER analyzer.

        Args:
            keywords: Iterable of strings used for substring matching.
            stock_symbol: Ticker returned when an article matches.
        """
        self.keywords = keywords
        self.stock_symbol = stock_symbol
        self.analyzer = SentimentIntensityAnalyzer()

    def map_article_to_stock(self, article_title, article_text):
        """Return ``stock_symbol`` if any keyword occurs in the article, else ``None``.

        Args:
            article_title: Headline; ignored when it is not a string.
            article_text: Body text; ignored when it is not a string
                (for example ``None`` after a failed download).

        Returns:
            ``self.stock_symbol`` on a match, otherwise ``None``.
        """
        # Only real strings take part in matching. Formatting a missing body
        # into the text would add the literal word "none" to every article
        # whose download failed.
        parts = [part for part in (article_title, article_text) if isinstance(part, str)]
        # Pad with spaces so space-delimited keywords such as " ai " can also
        # match at the very start or end of the text.
        combined_text = f" {' '.join(parts)} ".lower()
        for keyword in self.keywords:
            if keyword.lower() in combined_text:
                return self.stock_symbol
        return None

    def fetch_article_text(self, url):
        """Download ``url`` and return the article body extracted by newspaper.

        Args:
            url: Address of the article.

        Returns:
            The extracted text, or ``None`` when the download or parsing
            fails. Failures are reported with ``print`` and never raised.
        """
        try:
            response = requests.get(
                url, headers=self._REQUEST_HEADERS, timeout=self.REQUEST_TIMEOUT
            )
            response.raise_for_status()  # Raises HTTP errors if they occur

            # Feed the already-downloaded HTML to newspaper instead of letting
            # it download the page again with its own settings.
            article = Article(url)
            article.set_html(response.text)
            article.parse()
            return article.text
        except requests.exceptions.HTTPError as e:
            template = self._HTTP_ERROR_MESSAGES.get(e.response.status_code)
            if template is not None:
                print(template.format(url=url))
            else:
                print(f"HTTP error occurred for URL {url}: {e}")
            return None
        except Exception as e:
            # Best effort: timeouts, connection errors and parse failures all
            # end up here so one bad article does not abort the whole run.
            print(f"Error fetching article text for URL {url}: {e}")
            return None

    def add_sentiment_scores_to_df(self, df, filename):
        """Add a ``'compound sentiment score'`` column to ``df`` and save it as CSV.

        Each row's ``'url'`` is downloaded; rows that match a keyword are
        scored on title plus body (title only if the download failed) and all
        other rows get 0.

        Args:
            df: Frame with ``'title'`` and ``'url'`` columns. The score column
                is added to this frame in place.
            filename: Destination CSV path.

        Returns:
            ``df`` itself, now carrying the score column.
        """
        sentiment_scores = []

        for title, url in zip(df['title'], df['url']):
            # A missing title is read back from CSV as NaN (a float); treat it
            # as empty so concatenation and matching keep working.
            if not isinstance(title, str):
                title = ""
            article_text = self.fetch_article_text(url)

            if self.map_article_to_stock(title, article_text):
                content = f"{title} {article_text}" if article_text else title
                sentiment_scores.append(self.analyzer.polarity_scores(content)['compound'])
            else:
                sentiment_scores.append(0)

        # Every row receives a numeric score (0 when unmatched), so there is
        # nothing to drop before saving.
        df['compound sentiment score'] = sentiment_scores
        df.to_csv(filename, index=False, encoding='utf-8')
        print(f"Sentiment scores saved to {filename}")
        return df

In [68]:
# Read the IT-sector news export and wrap it in the working frame.
data_it = pd.read_csv('/Users/rishabhbhardwaj/Desktop/Bootcamp project/NewsAPI/IT_all_news.csv')
df_it = pd.DataFrame(data_it)

# Parse the publication timestamps so they sort chronologically.
df_it['publishedAt'] = pd.to_datetime(df_it['publishedAt'])

# Oldest first, then renumber the rows, then remove exact duplicate rows.
df_it = df_it.sort_values(by='publishedAt')
df_it = df_it.reset_index(drop=True)
df_it = df_it.drop_duplicates()

In [75]:
# Read the consumer-sector news export and wrap it in the working frame.
data_cons = pd.read_csv('/Users/rishabhbhardwaj/Desktop/Bootcamp project/NewsAPI/consumers_all_news.csv')
df_cons = pd.DataFrame(data_cons)

# Parse the publication timestamps so they sort chronologically.
df_cons['publishedAt'] = pd.to_datetime(df_cons['publishedAt'])

# Oldest first, then renumber the rows, then remove exact duplicate rows.
df_cons = df_cons.sort_values(by='publishedAt')
df_cons = df_cons.reset_index(drop=True)
df_cons = df_cons.drop_duplicates()

In [80]:
# Read the communication-sector news export and wrap it in the working frame.
data_comm = pd.read_csv('/Users/rishabhbhardwaj/Desktop/Bootcamp project/NewsAPI/communication_all_news.csv')
df_comm = pd.DataFrame(data_comm)

# Convert this frame's own publishedAt column to datetime. The timestamps must
# come from df_comm: taking them from another frame would align that frame's
# dates onto these rows by index.
df_comm['publishedAt'] = pd.to_datetime(df_comm['publishedAt'])

# Sort by publishedAt in ascending order (past to future)
df_comm = df_comm.sort_values(by='publishedAt').reset_index(drop=True).drop_duplicates()

In [62]:
# Microsoft: company names plus broader IT-sector terms.
keywords = ["microsoft","msft","bill gates","gates","information","technology","cybersecurity"
            ," ai ","artificial","windows","intel","software","silicon","chips"]
analyzer = SentimentScoreAnalyzer(keywords, 'MSFT')
# Score the IT news frame, as the other IT tickers in this notebook do.
# The previously referenced df_msft is not defined anywhere in the notebook,
# so the cell failed with NameError on a fresh kernel.
result_df = analyzer.add_sentiment_scores_to_df(df_it, "msft_sentiment_scores.csv")

Error fetching article text: 403 Client Error: Forbidden for url: https://www.business-standard.com/world-news/us-president-biden-discourages-israel-from-attacking-iran-s-oil-fields-124100500089_1.html


In [69]:
# Apple: company and product names plus broader IT-sector terms.
keywords = [
    "apple", "aapl", "tim cook", "cook",
    "information", "technology", "cybersecurity",
    " ai ", "artificial",
    "macos", "macbook", "software", "iphone", "chips",
]
analyzer = SentimentScoreAnalyzer(keywords=keywords, stock_symbol='AAPL')
# Score the IT news frame and write the per-article results to CSV.
result_df = analyzer.add_sentiment_scores_to_df(
    df=df_it,
    filename="aapl_sentiment_scores.csv",
)

Error fetching article text: 403 Client Error: Forbidden for url: https://www.business-standard.com/world-news/us-president-biden-discourages-israel-from-attacking-iran-s-oil-fields-124100500089_1.html


In [70]:
# NVIDIA: company names plus broader IT-sector and compute-related terms.
keywords = [
    "nvidia", "nvda", "jensen huang",
    "information", "technology", "supercomputing",
    " ai ", "artificial", "macos",
    "graphics processing unit (GPU)", "software",
    "deep learning", "autonomous vehicles",
    "gaming and esports", "edge computing",
]
analyzer = SentimentScoreAnalyzer(keywords=keywords, stock_symbol='NVDA')
# Score the IT news frame and write the per-article results to CSV.
result_df = analyzer.add_sentiment_scores_to_df(
    df=df_it,
    filename="nvda_sentiment_scores.csv",
)

Error fetching article text: 403 Client Error: Forbidden for url: https://www.business-standard.com/world-news/us-president-biden-discourages-israel-from-attacking-iran-s-oil-fields-124100500089_1.html


In [78]:
# Amazon: company names plus retail, logistics and streaming terms.
keywords = [
    "amazon", "amzn", "jeff bezos", "bezos",
    "retail", "supply chain", "household",
    "e-commerce", "aws", " ai ",
    "delivery", "streaming", "digital",
]
analyzer = SentimentScoreAnalyzer(keywords=keywords, stock_symbol='AMZN')
# Score the consumer news frame and write the per-article results to CSV.
result_df = analyzer.add_sentiment_scores_to_df(
    df=df_cons,
    filename="amzn_sentiment_scores.csv",
)

Access forbidden for URL: https://undark.org/2024/09/23/h5n1-old-drug-protect-against-new-pandemic/
Unauthorized access for URL: https://consent.yahoo.com/v2/collectConsent?sessionId=1_cc-session_a5bcc80c-318c-43dd-b61d-0f75e161cc92
Unprocessable content at URL: https://phys.org/news/2024-09-cement-coastline-ecosystems.html
Unprocessable content at URL: https://phys.org/news/2024-09-chief-sea-threatens-tide-misery.html
Article not found at URL: https://abcnews.go.com/US/wireStory/hurricane-helene-unusual-fujiwhara-effect-114208917
Unprocessable content at URL: https://phys.org/news/2024-09-regional-climate-essential-effective-policy.html
Bad gateway error for URL: https://springwise.com/innovation/sustainability/modular-floating-land-tackles-climate-change/. Retrying might help.
Access forbidden for URL: https://parade.com/health/top-vaccine-myth-to-stop-believing-according-to-infectious-disease-experts
Sentiment scores saved to amzn_sentiment_scores.csv


In [82]:
# Alphabet/Google: company names plus communication and data-related terms.
keywords = [
    "alphabet", "goog", "broadband", "5g", "media", " ai ",
    "sundar pichai", "pichai", "cloud computing",
    "data privacy", "data", "youtube", "android os",
    "quantum computing", "search engine optimization",
]
analyzer = SentimentScoreAnalyzer(keywords=keywords, stock_symbol='GOOG')
# Score the communication news frame and write the per-article results to CSV.
result_df = analyzer.add_sentiment_scores_to_df(
    df=df_comm,
    filename="goog_sentiment_scores.csv",
)

Bad gateway error for URL: https://springwise.com/innovation/sustainability/modular-floating-land-tackles-climate-change/. Retrying might help.
Unprocessable content at URL: https://phys.org/news/2024-10-climate-escalating-crisis-urges-action.html
Sentiment scores saved to goog_sentiment_scores.csv


In [36]:
class SentimentScoreAnalyzerCSV:
    """Score daily headline rows (columns ``Top1`` .. ``Top25``) for one stock.

    The VADER lexicon is extended with general-news terms whose scores are
    jittered randomly at construction time. A headline is scored only when it
    contains one of ``keywords`` (case-insensitive substring match); a row's
    score is the mean compound score of its matching headlines.
    """

    # Base scores for general-news terms, before jitter is applied.
    # NOTE(review): VADER appears to look words up one token at a time, so
    # multi-word keys such as "climate change" may never match - confirm
    # against the vaderSentiment implementation.
    NEWS_TERMS = {
        "war": -0.9,
        "conflict": -0.85,
        "peace": 0.85,
        "treaty": 0.8,
        "climate change": -0.75,
        "global warming": -0.8,
        "carbon emission": -0.7,
        "sustainability": 0.75,
        "renewable energy": 0.85,
        "pandemic": -0.95,
        "virus outbreak": -0.9,
        "quarantine": -0.85,
        "vaccine": 0.85,
        "public health": 0.75,
        "lockdown": -0.8,
        "election": 0.4,
        "vote": 0.5,
        "government policy": 0.6,
        "reform": 0.65,
        "tax cut": 0.7,
        "stimulus": 0.85,
        "interest rate hike": -0.7,
        "inflation": -0.75,
        "economic growth": 0.85,
        "recession": -0.85,
        "unemployment": -0.8,
        "job growth": 0.75,
        "natural disaster": -0.9,
        "hurricane": -0.85,
        "earthquake": -0.9,
        "wildfire": -0.85,
        "pollution": -0.75,
        "renewable resources": 0.85,
        "healthcare reform": 0.7,
        "border security": -0.6,
        "trade agreement": 0.7,
        "import tariffs": -0.65,
        "export": 0.65,
        "sanctions": -0.75,
        "diplomacy": 0.7
    }

    # Standard deviation of the jitter, as a fraction of each base score's
    # magnitude (25%).
    JITTER_FRACTION = 0.25

    # Headline columns read from every row.
    HEADLINE_COLUMNS = [f"Top{i}" for i in range(1, 26)]

    def __init__(self, keywords, stock_symbol, seed=None):
        """Create the analyzer and extend its lexicon with jittered news terms.

        Args:
            keywords: Iterable of strings used for substring matching.
            stock_symbol: Ticker returned when a headline matches.
            seed: Optional seed for the jitter. When ``None`` (the default)
                NumPy's global random state is used, so scores differ between
                runs; pass an integer for reproducible scores.
        """
        self.keywords = keywords
        self.stock_symbol = stock_symbol
        self.analyzer = SentimentIntensityAnalyzer()

        rng = np.random if seed is None else np.random.default_rng(seed)
        # Draw each score from a normal distribution centred on its base value
        # with a standard deviation of 25% of the base magnitude.
        randomized_news_terms = {
            term: round(rng.normal(loc=score, scale=abs(score) * self.JITTER_FRACTION), 3)
            for term, score in self.NEWS_TERMS.items()
        }

        self.analyzer.lexicon.update(randomized_news_terms)

    def map_article_to_stock(self, text):
        """Return ``stock_symbol`` if any keyword occurs in ``text``, else ``None``.

        Non-string input (NaN, numbers, ``None``) never matches.
        """
        if not isinstance(text, str):
            return None
        text = text.lower()
        for keyword in self.keywords:
            if keyword.lower() in text:
                return self.stock_symbol
        return None

    def add_sentiment_scores_to_df(self, df, start_date, end_date, filename):
        """Score the rows of ``df`` dated within the range and save them as CSV.

        Args:
            df: Frame with a ``'Date'`` column and headline columns
                ``Top1`` .. ``Top25``. Its ``'Date'`` column is converted to
                datetime in place; unparseable dates become NaT.
            start_date: First date to include (inclusive).
            end_date: Last date to include (inclusive).
            filename: Destination CSV path.

        Returns:
            A new frame with the in-range rows that had at least one matching
            headline, plus a ``'compound sentiment score'`` column.
        """
        # Convert 'Date' column to datetime and filter the date range
        df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
        in_range = df['Date'].between(pd.to_datetime(start_date), pd.to_datetime(end_date))
        selected = df[in_range].copy()

        sentiment_scores = []

        # Average the compound score over each row's matching headlines.
        for _, row in selected.iterrows():
            compounds = [
                self.analyzer.polarity_scores(headline)['compound']
                for headline in (row.get(col, "") for col in self.HEADLINE_COLUMNS)
                if self.map_article_to_stock(headline)
            ]
            # None marks a row without any matching headline; it is dropped below.
            sentiment_scores.append(sum(compounds) / len(compounds) if compounds else None)

        # Add the calculated sentiment scores and keep only the scored rows.
        selected['compound sentiment score'] = sentiment_scores
        selected = selected.dropna(subset=['compound sentiment score'])
        selected.to_csv(filename, index=False, encoding='utf-8')
        print(f"Sentiment scores saved to {filename}")
        return selected

In [32]:
# Keyword lists per ticker, used for case-insensitive substring matching of
# headlines. Each list holds unique entries; the key order determines the
# order in which tickers are processed when the dict is iterated.
buzzwords_dict = {
    "GOOG": [
        # Google-specific
        "Google", "Search Engine", "Google Maps", "Gmail", "YouTube", "Android", "Chrome", "Google Drive", "Alphabet Inc.",
        "Google Cloud", "Google Assistant", "Google Ads", "Google Analytics", "Pixel phone", "Google Home",
        "DeepMind", "Waymo", "Nest", "DoubleClick", "Google Now", "Google Glass", "YouTube Red", "Google Earth",
        "AdSense", "Google Play", "AI research", "Machine Learning", "TensorFlow", "Google Photos", "Google Ventures",

        # Industry & Market-related
        "search engine optimization", "online advertising", "cloud services", "data privacy", "personalization",
        "self-driving cars", "AI and ML", "deep learning", "technology investments", "digital marketing",
        "streaming video", "smart home", "cloud computing", "digital transformation", "market expansion",
        "user engagement", "mobile ecosystem", "R&D investment", "technology disruption", "quarterly earnings",

        # Broader technology & trends
        "AI revolution", "tech giant", "digital media", "advertising revenue", "online content", "search trends",
        "data-driven decisions", "smartphones", "app ecosystem", "innovation", "consumer tech", "VR", "IoT devices",
        "big data", "privacy policy", "voice search", "computing power", "data security", "scalability"
    ],
    "MSFT": [
        # Microsoft-specific
        "Microsoft", "Windows OS", "Microsoft Office", "Azure", "Surface", "Xbox", "Bing", "LinkedIn acquisition",
        "Cortana", "OneDrive", "Visual Studio", "SharePoint", "Outlook", "Teams", "Internet Explorer", "MSN", "HoloLens",
        "Skype", "Dynamics", "Microsoft Edge", "SQL Server", "Power BI", "Active Directory", "Silverlight", "DirectX",

        # Industry & Market-related
        "cloud computing", "SaaS", "cloud services", "software licensing", "operating system", "business productivity",
        "gaming market", "data analytics", "cybersecurity", "enterprise solutions", "technology adoption",
        "data center", "AI and machine learning", "collaboration tools", "digital transformation", "partnership",
        "acquisition", "quarterly earnings", "market share", "R&D investment", "stock growth", "enterprise software",

        # Broader technology & trends
        "technology innovation", "tech giant", "productivity tools", "smartphones", "cross-platform", "software developer",
        "personal computing", "enterprise adoption", "user experience", "mobile apps",
        "cloud storage", "cyber resilience", "digital workplace", "computing power", "future of work"
    ],
    "AMZN": [
        # Amazon-specific
        "Amazon", "e-commerce", "AWS", "Prime", "Kindle", "Amazon Echo", "Alexa", "Amazon Fresh", "Amazon Web Services",
        "Fire TV", "Prime Video", "Whole Foods", "Fulfillment Centers", "Prime Day", "marketplace", "third-party sellers",
        "logistics", "supply chain", "Amazon Go", "Amazon Basics", "subscription service", "Prime Membership",
        "Amazon Studios", "self-publishing", "Kindle Unlimited", "Amazon Music", "AWS Lambda", "Redshift", "S3 Storage",

        # Industry & Market-related
        "cloud computing", "digital commerce", "online retail", "last-mile delivery", "subscription economy",
        "streaming services", "logistics innovation", "digital transformation", "consumer behavior", "data analytics",
        "retail innovation", "cloud services", "media streaming", "warehousing", "supply chain management",
        "technology disruption", "AI and machine learning", "smart home", "customer experience", "fulfillment services",

        # Broader technology & trends
        "e-commerce growth", "consumer electronics", "tech giant", "retail sales", "online shopping",
        "home automation", "digital assistants", "voice-activated", "data privacy", "digital marketing",
        "smart devices", "IoT", "big data", "consumer insights", "logistics automation", "digital subscriptions",
        "cyber Monday", "Black Friday", "technology innovation", "R&D investment"
    ],
    "NVDA": [
        # NVIDIA-specific
        "NVIDIA", "GPU", "GeForce", "Quadro", "Tesla GPUs", "Tegra", "CUDA", "NVIDIA Shield", "SLI technology",
        "GameWorks", "Deep Learning", "NVIDIA Drive", "G-Sync", "AI research", "Tensor Cores", "Pascal architecture",
        "Maxwell architecture", "Kepler architecture", "Graphics card", "NVIDIA Jetson", "NVLink", "DLSS technology",
        "NVIDIA GRID", "PhysX", "CUDA cores", "3D Vision", "Omniverse", "Founders Edition",

        # Industry & Market-related
        "semiconductors", "AI chips", "data center", "gaming industry", "machine learning", "VR", "virtual reality",
        "augmented reality", "cloud computing", "supercomputing", "autonomous vehicles", "AI hardware", "stock performance",
        "AI market", "deep learning acceleration", "technology innovation", "partnership", "acquisition", "R&D investment",
        "market expansion", "product launch", "shareholder", "stock growth", "revenue growth", "quarterly earnings",
        "competition", "cloud services", "infrastructure", "tech giant", "software stack", "energy efficiency",
        "compute power", "processing power", "machine vision", "graphical processing", "gaming performance",

        # Broader technology & trends
        "innovation", "AI revolution", "tech innovation", "big data", "machine intelligence", "inference",
        "performance boost", "edge computing", "IoT devices", "next-gen technology", "consumer electronics",
        "visual computing", "developer ecosystem", "parallel processing", "scalability", "hardware upgrade"
    ],
    "AAPL": [
        # Apple-specific
        "iPhone", "iPad", "MacBook", "iPod", "Apple Watch", "AirPods", "iMac", "Mac Pro", "Mac Mini", "Apple TV",
        "App Store", "Apple Music", "iTunes", "iCloud", "Siri", "Touch ID", "Face ID", "Apple Pay", "Apple Pencil",
        "Retina Display", "iOS", "macOS", "OS X", "iLife", "iWork", "Safari", "Lightning Connector",
        "A-series chips", "M-series chips", "HealthKit", "HomeKit", "CarPlay", "iMessage", "AppleCare",
        "Final Cut Pro", "Logic Pro", "Pro Display XDR", "iBooks", "iPhoto", "GarageBand", "iMovie",
        "Time Capsule", "AirPort", "Apple Store", "Apple Campus", "Apple Silicon", "Digital Hub",

        # Common Apple-related buzzwords
        "Apple event", "product launch", "Apple stock", "Apple earnings", "Steve Jobs", "Tim Cook",
        "product innovation", "brand loyalty", "premium devices", "ecosystem", "user experience",
        "tech giant", "market leader", "tech innovation", "stock performance", "R&D investment",
        "software update", "security features", "quarterly results", "brand reputation", "environmental responsibility",
        "corporate responsibility", "customer privacy", "data encryption", "consumer electronics", "wearable tech",
        "smartphone market", "tablet market", "tech stock", "cloud services", "digital marketplace", "app development",
        "global expansion", "supply chain", "retail strategy", "customer experience", "device upgrade"
    ]
}


In [35]:
# Read the daily headline table; the file is decoded as ISO-8859-1.
df = pd.read_csv(
    '/Users/rishabhbhardwaj/Desktop/Bootcamp project/news_headlines.csv',
    encoding='ISO-8859-1',
)

# Analysis window; both end points are included by the analyzer.
start_date = "2002-07-01"
end_date = "2016-06-01"

# One analyzer and one output CSV per ticker in buzzwords_dict.
for stock_symbol, keywords in buzzwords_dict.items():
    analyzer = SentimentScoreAnalyzerCSV(keywords, stock_symbol)
    analyzer.add_sentiment_scores_to_df(
        df,
        start_date,
        end_date,
        f'sentiment_analysis_results_{stock_symbol}.csv',
    )

Sentiment scores saved to sentiment_analysis_results_GOOG.csv
Sentiment scores saved to sentiment_analysis_results_MSFT.csv
Sentiment scores saved to sentiment_analysis_results_AMZN.csv
Sentiment scores saved to sentiment_analysis_results_NVDA.csv
Sentiment scores saved to sentiment_analysis_results_AAPL.csv


In [65]:
class SentimentScoreAnalyzerStockNews:
    """Aggregate VADER headline scores per date for one ticker.

    The VADER lexicon is extended with finance terms whose scores are jittered
    randomly at construction time. Headlines for the ticker are scored
    individually and combined per date with a magnitude-weighted average.
    """

    # Base scores for finance terms, before jitter is applied.
    # NOTE(review): VADER appears to look words up one token at a time, so
    # multi-word keys such as "EPS beat" may never match - confirm against
    # the vaderSentiment implementation.
    TOP_FINANCE_TERMS = {
        "EPS beat": 0.95,
        "revenue miss": -0.92,
        "downgrade": -0.93,
        "FDA approval": 0.96,
        "positive earnings": 0.96,
        "negative earnings": -0.96,
        "profit warning": -0.92,
        "unexpected loss": -0.97,
        "stock surge": 0.95,
        "profit": 0.95,
        "loss": -0.95,
        "revenue": 0.9,
        "growth": 0.8,
        "decline": -0.8,
        "bullish": 0.9,
        "bearish": -0.9,
        "uptrend": 0.7,
        "downtrend": -0.7,
        "debt": -0.85,
        "earnings": 0.85,
        "bankruptcy": -1.0,
        "expenses": -0.75,
        "investment": 0.75,
        "dividend": 0.9,
        "recession": -0.95,
        "inflation": -0.85,
        "recovery": 0.8,
        "unemployment": -0.9,
        "market share": 0.7
    }

    # Standard deviation of the jitter, as a fraction of each base score's
    # magnitude (10%).
    JITTER_FRACTION = 0.10

    # Columns of the result frame, in output order.
    RESULT_COLUMNS = ['date', 'ticker', 'weighted compound sentiment score']

    def __init__(self, stock_symbol, seed=None):
        """Create the analyzer and extend its lexicon with jittered finance terms.

        Args:
            stock_symbol: Ticker whose rows are selected from the input frame.
            seed: Optional seed for the jitter. When ``None`` (the default)
                NumPy's global random state is used, so scores differ between
                runs; pass an integer for reproducible scores.
        """
        self.stock_symbol = stock_symbol
        self.analyzer = SentimentIntensityAnalyzer()

        rng = np.random if seed is None else np.random.default_rng(seed)
        # Draw each score from a normal distribution centred on its base value
        # with a standard deviation of 10% of the base magnitude.
        randomized_finance_terms = {
            term: round(rng.normal(loc=score, scale=abs(score) * self.JITTER_FRACTION), 3)
            for term, score in self.TOP_FINANCE_TERMS.items()
        }

        self.analyzer.lexicon.update(randomized_finance_terms)

    def calculate_weighted_average_sentiment(self, scores):
        """Average ``scores`` with each score weighted by its own magnitude.

        Returns:
            ``None`` for an empty list, 0 when every score is zero, otherwise
            ``sum(s * |s|) / sum(|s|)``.
        """
        if not scores:
            return None
        sum_score = sum(abs(score) for score in scores)
        if sum_score != 0:
            return sum(score * abs(score) for score in scores) / sum_score
        return 0

    def add_sentiment_scores_to_df(self, df, start_date, end_date, filename):
        """Compute per-date weighted sentiment for this ticker and save it as CSV.

        Args:
            df: Frame with ``'date'``, ``'ticker'`` and ``'headline'`` columns.
                Its ``'date'`` column is converted to datetime in place;
                unparseable dates become NaT.
            start_date: First date to include (inclusive).
            end_date: Last date to include (inclusive).
            filename: Destination CSV path.

        Returns:
            A frame with columns ``'date'``, ``'ticker'`` and
            ``'weighted compound sentiment score'``. It is empty, but still
            has those columns, when no row matches the ticker and date range.
        """
        # Convert 'date' column to datetime and filter the date range and stock symbol
        df['date'] = pd.to_datetime(df['date'], errors='coerce')
        mask = (
            df['date'].between(pd.to_datetime(start_date), pd.to_datetime(end_date))
            & (df['ticker'] == self.stock_symbol)
        )
        selected = df[mask]

        # Compound scores collected per distinct 'date' value.
        # NOTE(review): values are grouped by the exact timestamp; if 'date'
        # carries a time of day, same-day headlines land in separate groups.
        daily_sentiment_scores = {}
        for date, headline in zip(selected['date'], selected['headline']):
            # Skip missing or non-text headlines.
            if not isinstance(headline, str):
                continue
            compound_score = self.analyzer.polarity_scores(headline)['compound']
            daily_sentiment_scores.setdefault(date, []).append(compound_score)

        # Compute weighted average sentiment for each date
        results = [
            {
                'date': date,
                'ticker': self.stock_symbol,
                'weighted compound sentiment score': self.calculate_weighted_average_sentiment(scores),
            }
            for date, scores in daily_sentiment_scores.items()
        ]

        # Naming the columns explicitly keeps them present when there are no
        # results; otherwise dropna(subset=...) fails on a column-less frame.
        sentiment_df = pd.DataFrame(results, columns=self.RESULT_COLUMNS)
        sentiment_df = sentiment_df.dropna(subset=['weighted compound sentiment score'])
        sentiment_df.to_csv(filename, index=False, encoding='utf-8')
        print(f"Sentiment scores saved to {filename}")
        return sentiment_df


In [66]:
# Read the per-ticker stock news table; the file is decoded as ISO-8859-1.
df = pd.read_csv(
    '/Users/rishabhbhardwaj/Desktop/Bootcamp project/stock_news.csv',
    encoding='ISO-8859-1',
)

# Analysis window; both end points are included by the analyzer.
start_date = "2011-05-02"
end_date = "2019-12-23"

# Tickers to process, one output CSV each.
stock_symbols = ['GOOG', 'AAPL', 'MSFT', 'NVDA', 'AMZN']
for stock_symbol in stock_symbols:
    analyzer = SentimentScoreAnalyzerStockNews(stock_symbol)
    analyzer.add_sentiment_scores_to_df(
        df,
        start_date,
        end_date,
        f'stock_news_sentiment_analysis_results_{stock_symbol}.csv',
    )

Sentiment scores saved to stock_news_sentiment_analysis_results_GOOG.csv
Sentiment scores saved to stock_news_sentiment_analysis_results_AAPL.csv
Sentiment scores saved to stock_news_sentiment_analysis_results_MSFT.csv
Sentiment scores saved to stock_news_sentiment_analysis_results_NVDA.csv
Sentiment scores saved to stock_news_sentiment_analysis_results_AMZN.csv
