In [31]:
import pandas as pd
from newspaper import Article
import requests

In [2]:
# Import the appropriate functions from the vader library 
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [77]:
class SentimentScoreAnalyzer:
    """Attach VADER compound sentiment scores to news articles for one stock.

    An article counts as relevant to ``stock_symbol`` when any of ``keywords``
    occurs (case-insensitively, as a plain substring) in its title or body.
    Relevant articles get the VADER ``compound`` score of title + body;
    everything else is scored 0.

    Args:
        keywords: Iterable of substrings that mark an article as relevant.
        stock_symbol: Symbol returned by ``map_article_to_stock`` on a match.
        request_timeout: Seconds to wait for each HTTP request before giving
            up. Without a timeout a single stalled host blocks the whole run.
    """

    # Console messages for the HTTP statuses that get a dedicated explanation.
    _HTTP_ERROR_MESSAGES = {
        403: "Access forbidden for URL: {url}",
        401: "Unauthorized access for URL: {url}",
        404: "Article not found at URL: {url}",
        422: "Unprocessable content at URL: {url}",
        502: "Bad gateway error for URL: {url}. Retrying might help.",
    }

    # Class-level fallback so instances created without __init__ still work.
    request_timeout = 10

    def __init__(self, keywords, stock_symbol, request_timeout=10):
        self.keywords = keywords
        self.stock_symbol = stock_symbol
        self.request_timeout = request_timeout
        self.analyzer = SentimentIntensityAnalyzer()

    @staticmethod
    def _as_text(value):
        """Return ``value`` as a string, mapping None/NaN to an empty string."""
        # `value != value` is only true for NaN, which is how pandas
        # represents a missing cell in an object column.
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    def map_article_to_stock(self, article_title, article_text):
        """Return ``self.stock_symbol`` if any keyword occurs in the article.

        Args:
            article_title: Article headline (None/NaN is treated as empty).
            article_text: Article body (None/NaN is treated as empty).

        Returns:
            The stock symbol on a keyword match, otherwise None.
        """
        # Missing parts are blanked first: formatting None directly would
        # inject the literal word "none" into the searchable text.
        title = self._as_text(article_title)
        body = self._as_text(article_text)
        combined_text = f"{title} {body}".lower()
        if any(keyword.lower() in combined_text for keyword in self.keywords):
            return self.stock_symbol
        return None

    def fetch_article_text(self, url):
        """Download ``url`` and return the article body extracted by newspaper.

        Failures are reported on stdout and never raised, so one bad URL does
        not abort a batch run.

        Returns:
            The extracted article text, or None if the request or parse failed.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
        }
        try:
            response = requests.get(url, headers=headers, timeout=self.request_timeout)
            response.raise_for_status()  # Raises HTTP errors if they occur

            # Feed the already-downloaded HTML to newspaper so the page is
            # fetched only once (and with our browser-like User-Agent).
            article = Article(url)
            article.set_html(response.text)
            article.parse()
            return article.text
        except requests.exceptions.HTTPError as e:
            status_code = getattr(e.response, "status_code", None)
            template = self._HTTP_ERROR_MESSAGES.get(status_code)
            if template is not None:
                print(template.format(url=url))
            else:
                print(f"HTTP error occurred for URL {url}: {e}")
            return None
        except Exception as e:
            # Deliberate best-effort: timeouts, connection errors and parser
            # failures are all reported and turned into a missing body.
            print(f"Error fetching article text for URL {url}: {e}")
            return None

    def add_sentiment_scores_to_df(self, df, filename):
        """Score every row of ``df`` and save the result as a CSV file.

        Adds (or overwrites) the ``'compound sentiment score'`` column on
        ``df`` in place, writes the frame to ``filename`` and returns it.

        Args:
            df: DataFrame with ``'title'`` and ``'url'`` columns.
            filename: Destination path of the CSV file.

        Returns:
            The same ``df`` object, now carrying the score column.
        """
        sentiment_scores = []

        for raw_title, url in zip(df['title'], df['url']):
            title = self._as_text(raw_title)
            article_text = self.fetch_article_text(url)

            # Fall back to the title alone when the body could not be fetched.
            content = f"{title} {article_text}" if article_text else title
            if self.map_article_to_stock(title, article_text):
                compound = self.analyzer.polarity_scores(content)['compound']
            else:
                compound = 0  # irrelevant articles are scored as neutral
            sentiment_scores.append(compound)

        # Every row receives a numeric score, so there is nothing to drop here.
        df['compound sentiment score'] = sentiment_scores
        df.to_csv(filename, index=False, encoding='utf-8')
        print(f"Sentiment scores saved to {filename}")
        return df

In [68]:
# Load the IT-sector news export.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
data_it = pd.read_csv('/Users/rishabhbhardwaj/Desktop/Bootcamp project/NewsAPI/IT_all_news.csv')

# Parse 'publishedAt' into datetimes, order the articles oldest -> newest and
# drop exact duplicate rows. Duplicates are removed *before* the index is
# rebuilt so the final index is contiguous (0..n-1) rather than full of gaps.
df_it = (
    data_it
    .assign(publishedAt=pd.to_datetime(data_it['publishedAt']))
    .sort_values(by='publishedAt')
    .drop_duplicates()
    .reset_index(drop=True)
)

In [75]:
# Load the consumer-sector news export.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
data_cons = pd.read_csv('/Users/rishabhbhardwaj/Desktop/Bootcamp project/NewsAPI/consumers_all_news.csv')

# Parse 'publishedAt' into datetimes, order the articles oldest -> newest and
# drop exact duplicate rows. Duplicates are removed *before* the index is
# rebuilt so the final index is contiguous (0..n-1) rather than full of gaps.
df_cons = (
    data_cons
    .assign(publishedAt=pd.to_datetime(data_cons['publishedAt']))
    .sort_values(by='publishedAt')
    .drop_duplicates()
    .reset_index(drop=True)
)

In [80]:
# Load the communication-sector news export.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
data_comm = pd.read_csv('/Users/rishabhbhardwaj/Desktop/Bootcamp project/NewsAPI/communication_all_news.csv')

# Parse this frame's own 'publishedAt' column (the consumer frame's dates were
# previously assigned here by mistake), order the articles oldest -> newest and
# drop exact duplicate rows. Duplicates are removed *before* the index is
# rebuilt so the final index is contiguous (0..n-1) rather than full of gaps.
df_comm = (
    data_comm
    .assign(publishedAt=pd.to_datetime(data_comm['publishedAt']))
    .sort_values(by='publishedAt')
    .drop_duplicates()
    .reset_index(drop=True)
)

In [62]:
# Keywords that mark an article as relevant to Microsoft (matched as
# case-insensitive substrings of title + body).
keywords = [
    "microsoft", "msft", "bill gates", "gates",
    "information", "technology", "cybersecurity",
    " ai ", "artificial", "windows", "intel",
    "software", "silicon", "chips",
]
analyzer = SentimentScoreAnalyzer(keywords, 'MSFT')

# Score the IT-sector articles and write msft_sentiment_scores.csv.
# `df_msft` is not defined anywhere in this notebook, so the cell failed on a
# fresh kernel; the IT news frame is used instead, as for the other IT tickers.
# NOTE(review): confirm df_it is the intended input for MSFT.
result_df = analyzer.add_sentiment_scores_to_df(df_it, "msft_sentiment_scores.csv")

Error fetching article text: 403 Client Error: Forbidden for url: https://www.business-standard.com/world-news/us-president-biden-discourages-israel-from-attacking-iran-s-oil-fields-124100500089_1.html


In [69]:
# Keywords that mark an article as relevant to Apple (matched as
# case-insensitive substrings of title + body).
keywords = [
    "apple", "aapl", "tim cook", "cook",
    "information", "technology", "cybersecurity",
    " ai ", "artificial", "macos", "macbook",
    "software", "iphone", "chips",
]
analyzer = SentimentScoreAnalyzer(keywords=keywords, stock_symbol='AAPL')

# Score the IT-sector articles and write aapl_sentiment_scores.csv.
result_df = analyzer.add_sentiment_scores_to_df(
    df=df_it,
    filename="aapl_sentiment_scores.csv",
)

Error fetching article text: 403 Client Error: Forbidden for url: https://www.business-standard.com/world-news/us-president-biden-discourages-israel-from-attacking-iran-s-oil-fields-124100500089_1.html


In [70]:
# Keywords that mark an article as relevant to NVIDIA (matched as
# case-insensitive substrings of title + body).
# NOTE(review): "macos" looks carried over from the Apple list — confirm it belongs here.
keywords = [
    "nvidia", "nvda", "jensen huang",
    "information", "technology", "supercomputing",
    " ai ", "artificial", "macos",
    "graphics processing unit (GPU)", "software",
    "deep learning", "autonomous vehicles",
    "gaming and esports", "edge computing",
]
analyzer = SentimentScoreAnalyzer(keywords=keywords, stock_symbol='NVDA')

# Score the IT-sector articles and write nvda_sentiment_scores.csv.
result_df = analyzer.add_sentiment_scores_to_df(
    df=df_it,
    filename="nvda_sentiment_scores.csv",
)

Error fetching article text: 403 Client Error: Forbidden for url: https://www.business-standard.com/world-news/us-president-biden-discourages-israel-from-attacking-iran-s-oil-fields-124100500089_1.html


In [78]:
# Keywords that mark an article as relevant to Amazon (matched as
# case-insensitive substrings of title + body).
keywords = [
    "amazon", "amzn", "jeff bezos", "bezos",
    "retail", "supply chain", "household",
    "e-commerce", "aws", " ai ",
    "delivery", "streaming", "digital",
]
analyzer = SentimentScoreAnalyzer(keywords=keywords, stock_symbol='AMZN')

# Score the consumer-sector articles and write amzn_sentiment_scores.csv.
result_df = analyzer.add_sentiment_scores_to_df(
    df=df_cons,
    filename="amzn_sentiment_scores.csv",
)

Access forbidden for URL: https://undark.org/2024/09/23/h5n1-old-drug-protect-against-new-pandemic/
Unauthorized access for URL: https://consent.yahoo.com/v2/collectConsent?sessionId=1_cc-session_a5bcc80c-318c-43dd-b61d-0f75e161cc92
Unprocessable content at URL: https://phys.org/news/2024-09-cement-coastline-ecosystems.html
Unprocessable content at URL: https://phys.org/news/2024-09-chief-sea-threatens-tide-misery.html
Article not found at URL: https://abcnews.go.com/US/wireStory/hurricane-helene-unusual-fujiwhara-effect-114208917
Unprocessable content at URL: https://phys.org/news/2024-09-regional-climate-essential-effective-policy.html
Bad gateway error for URL: https://springwise.com/innovation/sustainability/modular-floating-land-tackles-climate-change/. Retrying might help.
Access forbidden for URL: https://parade.com/health/top-vaccine-myth-to-stop-believing-according-to-infectious-disease-experts
Sentiment scores saved to amzn_sentiment_scores.csv


In [82]:
# Keywords that mark an article as relevant to Alphabet/Google (matched as
# case-insensitive substrings of title + body).
keywords = [
    "alphabet", "goog", "broadband", "5g", "media",
    " ai ", "sundar pichai", "pichai", "cloud computing",
    "data privacy", "data", "youtube", "android os",
    "quantum computing", "search engine optimization",
]
analyzer = SentimentScoreAnalyzer(keywords=keywords, stock_symbol='GOOG')

# Score the communication-sector articles and write goog_sentiment_scores.csv.
result_df = analyzer.add_sentiment_scores_to_df(
    df=df_comm,
    filename="goog_sentiment_scores.csv",
)

Bad gateway error for URL: https://springwise.com/innovation/sustainability/modular-floating-land-tackles-climate-change/. Retrying might help.
Unprocessable content at URL: https://phys.org/news/2024-10-climate-escalating-crisis-urges-action.html
Sentiment scores saved to goog_sentiment_scores.csv
