In [1]:
# Import libraries
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import pandas as pd
import plotly.express as px
# NLTK VADER for sentiment analysis.
# The download call fetches the 'vader_lexicon' package that
# SentimentIntensityAnalyzer needs; NLTK reports "already up-to-date" when the
# package is present locally.
import nltk
nltk.downloader.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from matplotlib import pyplot as plt
%matplotlib inline


# Base URL of the Finviz quote page; get_news appends the ticker symbol to it.
finviz_url = 'https://finviz.com/quote.ashx?t='

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\nermi\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
def get_news(ticker, timeout=30):
    """Download the Finviz quote page for ``ticker`` and return its news table.

    Parameters
    ----------
    ticker : str
        Ticker symbol appended to ``finviz_url`` (e.g. ``'AMZN'``). It is
        percent-encoded so unexpected characters cannot corrupt the URL.
    timeout : float, optional
        Seconds to wait on the HTTP request before ``urlopen`` gives up.
        Without a timeout a stalled connection would block the cell forever.

    Returns
    -------
    bs4.element.Tag or None
        The element whose id is ``'news-table'``, or ``None`` when the page
        contains no such element.

    Notes
    -----
    Network and HTTP errors raised by ``urlopen`` propagate to the caller.
    """
    from urllib.parse import quote

    url = finviz_url + quote(ticker)
    # A browser-like User-Agent is sent with the request.
    req = Request(url=url, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.4 Safari/605.1.15'})
    # The context manager closes the HTTP response even if parsing fails.
    with urlopen(req, timeout=timeout) as response:
        # Name the parser explicitly: otherwise BeautifulSoup picks whichever
        # parser happens to be installed and emits a warning.
        html = BeautifulSoup(response, 'html.parser')
    # Find 'news-table' in the soup and hand it back to the caller.
    return html.find(id='news-table')

# Fetch the news table for a single ticker; get_news requests the Finviz page.
ticker = 'AMZN'
news_table = get_news(ticker)
#news_table 

In [3]:
# parse news into dataframe
def parse_news(news_table):
    """Convert a scraped news table into a DataFrame of headlines.

    Parameters
    ----------
    news_table : bs4.element.Tag or None
        Object exposing ``find_all('tr')``; each row is expected to provide an
        ``a`` element (headline link) and a ``td`` element (timestamp cell).
        ``None`` is treated as an empty table.

    Returns
    -------
    pandas.DataFrame
        One row per headline with columns ``date``, ``time`` and ``headline``.
        The frame is empty (but still has these columns) when no usable rows
        are found.

    Notes
    -----
    A timestamp cell holding a single token is treated as time-only; such a
    row reuses the date of the most recent row that carried one (``None`` if
    no earlier row did). Rows without a link or without a cell are skipped.
    """
    columns = ['date', 'time', 'headline']
    parsed_news = []

    if news_table is None:
        return pd.DataFrame(parsed_news, columns=columns)

    # Carried across iterations so time-only rows inherit the last seen date.
    date = None

    for row in news_table.find_all('tr'):
        # Skip rows that carry no headline link or no timestamp cell.
        if row.a is None or row.td is None:
            continue

        # Headline text comes from the <a> tag only.
        headline = row.a.get_text()
        # Split the text of the first <td> into whitespace-separated tokens.
        date_scrape = row.td.text.split()

        if len(date_scrape) == 1:
            # Only a time is present; keep the previously seen date.
            time = date_scrape[0]
        elif len(date_scrape) >= 2:
            # First token is the date, second is the time.
            date, time = date_scrape[0], date_scrape[1]
        else:
            # Empty cell: no time information for this headline.
            time = None

        parsed_news.append([date, time, headline])

    # Build the DataFrame once, after all rows have been collected.
    return pd.DataFrame(parsed_news, columns=columns)
        
# Parse the scraped table into a DataFrame and preview its first rows.
parsed_news_df = parse_news(news_table)
parsed_news_df.head()

Unnamed: 0,date,time,headline
0,Today,08:43AM,DOJs Google breakup remedy puts tech world on ...
1,Today,07:17AM,Amazon.com Inc (AMZN): A Promising AI Stock Ac...
2,Today,07:00AM,Amazon airline sells excess cargo capacity to ...
3,Today,06:50AM,Could Oracle Be Worth $1 Trillion by 2030?
4,Today,05:55AM,1 Warren Buffett Stock That Could Go Parabolic...


In [4]:
def score_news(parsed_news_df):
    """Append VADER polarity scores to a DataFrame of headlines.

    Parameters
    ----------
    parsed_news_df : pandas.DataFrame
        Must contain a ``headline`` column of strings. Any index is accepted.

    Returns
    -------
    pandas.DataFrame
        A new frame with the input columns followed by ``neg``, ``neu``,
        ``pos`` and ``sentiment_score`` (VADER's ``compound`` value, renamed).
        The input frame is not modified.
    """
    # Instantiate the sentiment intensity analyzer.
    vader = SentimentIntensityAnalyzer()

    # One dict of polarity scores per headline.
    scores = parsed_news_df['headline'].apply(vader.polarity_scores).tolist()

    # Reuse the input's index: join() aligns on index, so a fresh RangeIndex
    # would produce NaN scores for any frame that is filtered or re-indexed.
    # Naming the columns keeps them present even when there are no headlines.
    scores_df = pd.DataFrame(
        scores,
        index=parsed_news_df.index,
        columns=['neg', 'neu', 'pos', 'compound'],
    )

    # rsuffix only matters if the input already carries score columns.
    parsed_and_scored_news = parsed_news_df.join(scores_df, rsuffix='_right')
    return parsed_and_scored_news.rename(columns={"compound": "sentiment_score"})

# Score every parsed headline and preview the first rows of the result.
parsed_and_scored_news = score_news(parsed_news_df)
parsed_and_scored_news.head()

Unnamed: 0,date,time,headline,neg,neu,pos,sentiment_score
0,Today,08:43AM,DOJs Google breakup remedy puts tech world on ...,0.0,1.0,0.0,0.0
1,Today,07:17AM,Amazon.com Inc (AMZN): A Promising AI Stock Ac...,0.0,0.748,0.252,0.4019
2,Today,07:00AM,Amazon airline sells excess cargo capacity to ...,0.0,0.825,0.175,0.1779
3,Today,06:50AM,Could Oracle Be Worth $1 Trillion by 2030?,0.0,0.787,0.213,0.2263
4,Today,05:55AM,1 Warren Buffett Stock That Could Go Parabolic...,0.0,1.0,0.0,0.0


In [5]:

def plot_hourly_sentiment(parsed_and_scored_news):
    """Plot the mean sentiment score for each hour of the day.

    Parameters
    ----------
    parsed_and_scored_news : pandas.DataFrame
        Must contain a ``datetime`` column (datetime-like, or strings that
        ``pd.to_datetime`` can parse) and a ``sentiment_score`` column.
        The frame is not modified.

    Raises
    ------
    KeyError
        If the ``datetime`` column is missing.
    """
    if 'datetime' not in parsed_and_scored_news.columns:
        raise KeyError(
            "'datetime' column is required; build it from the 'date' and "
            "'time' columns before plotting"
        )

    # Derive the grouping key as a standalone Series rather than writing an
    # 'hour' column into the caller's frame.
    timestamps = pd.to_datetime(parsed_and_scored_news['datetime'])
    hours = timestamps.dt.hour.rename('hour')
    hourly_sentiment = parsed_and_scored_news.groupby(hours)['sentiment_score'].mean()

    print(hourly_sentiment)  # Check the aggregated values

    plt.figure(figsize=(10, 5))
    plt.plot(hourly_sentiment.index, hourly_sentiment.values, marker='o')
    plt.title('Average Hourly Sentiment Score')
    plt.xlabel('Hour of the Day')
    plt.ylabel('Average Sentiment Score')
    plt.xticks(hourly_sentiment.index)
    plt.grid()
    plt.show()


In [6]:
def plot_daily_sentiment(parsed_and_scored_news):
    """Plot the mean sentiment score for each calendar date.

    Parameters
    ----------
    parsed_and_scored_news : pandas.DataFrame
        Must contain a ``datetime`` column (datetime-like, or strings that
        ``pd.to_datetime`` can parse) and a ``sentiment_score`` column.
        The frame is not modified; in particular an existing ``date`` column
        is left as it is.

    Raises
    ------
    KeyError
        If the ``datetime`` column is missing.
    """
    if 'datetime' not in parsed_and_scored_news.columns:
        raise KeyError(
            "'datetime' column is required; build it from the 'date' and "
            "'time' columns before plotting"
        )

    # Group by a standalone Series of dates rather than overwriting the
    # caller's 'date' column.
    timestamps = pd.to_datetime(parsed_and_scored_news['datetime'])
    dates = timestamps.dt.date.rename('date')
    daily_sentiment = parsed_and_scored_news.groupby(dates)['sentiment_score'].mean()

    print(daily_sentiment)  # Check the aggregated values

    plt.figure(figsize=(10, 5))
    plt.plot(daily_sentiment.index, daily_sentiment.values, marker='o', color='orange')
    plt.title('Average Daily Sentiment Score')
    plt.xlabel('Date')
    plt.ylabel('Average Sentiment Score')
    plt.xticks(rotation=45)
    plt.grid()
    plt.show()