In [1]:
# Install the notebook's dependencies with the %pip magic so the packages are
# installed into the environment of the running kernel; a "!pip" shell call
# uses whichever pip happens to be first on PATH, which may be a different one.
# NOTE(review): the visible cells import urllib.request (standard library) and
# never import flask or urllib3 -- confirm whether those two are still needed.
%pip install flask
%pip install nltk
%pip install urllib3
%pip install bs4
%pip install plotly



In [None]:
# Import libraries
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import pandas as pd
import plotly.express as px
# NLTK VADER for sentiment analysis
import nltk
nltk.downloader.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Prefix of a FinViz quote-page URL; get_news() appends the ticker symbol to
# this string to build the address of the page it downloads.
finviz_url = 'https://finviz.com/quote.ashx?t='

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## Get HTML News Headlines Table from FinViz

In [3]:
def get_news(ticker):
    """Download the FinViz quote page for ``ticker`` and return its news table.

    Args:
        ticker: Ticker symbol (string) appended to ``finviz_url`` to form the
            address of the page to download.

    Returns:
        The parsed element whose ``id`` attribute is ``'news-table'``, or
        ``None`` when the downloaded page contains no such element.

    Raises:
        urllib.error.URLError: If the page cannot be fetched (this includes
            ``HTTPError`` for non-success HTTP status codes).
    """
    url = finviz_url + ticker
    # The request is sent with a browser-like User-Agent header.
    req = Request(url=url,headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) Gecko/20100101 Firefox/20.0'})
    # The context manager closes the HTTP response once the page has been
    # parsed, instead of leaving the connection open until garbage collection.
    with urlopen(req) as response:
        # Name the parser explicitly: without it BeautifulSoup picks whichever
        # parser is installed (and warns), so results can differ per machine.
        html = BeautifulSoup(response, 'html.parser')
    # Look up the news table by its element id.
    return html.find(id='news-table')

# Download the news table for one example ticker and display it.
ticker = 'AMZN'
news_table = get_news(ticker)
news_table # raw HTML element as returned by get_news(); rows are not parsed yet

<table border="0" cellpadding="1" cellspacing="0" class="fullview-news-outer" id="news-table" width="100%">
<tr><td align="right" style="white-space:nowrap" width="130">Apr-16-22 12:00AM  </td><td align="left"><div class="news-link-container"><div class="news-link-left"><a class="tab-link-news" href="https://www.wsj.com/articles/the-pandemic-was-supposed-to-push-all-shopping-online-it-didnt-11650081652?mod=itp_wsj" target="_blank">The Pandemic Was Supposed to Push All Shopping Online. It Didnt.</a></div><div class="news-link-right"><span style="color:#aa6dc0;font-size:9px"> The Wall Street Journal</span></div></div></td></tr>
<tr><td align="right" style="white-space:nowrap" width="130">Apr-15-22 07:23PM  </td><td align="left"><div class="news-link-container"><div class="news-link-left"><a class="tab-link-news" href="https://www.barrons.com/articles/costco-thrives-by-mastering-traditional-retailing-now-its-challenging-amazon-51650065013?siteid=yhoof2" target="_blank">Costco Thrives by M

## Parse News into DataFrame

In [4]:
# parse news into dataframe
def parse_news(news_table):
    """Convert the rows of a news table into a DataFrame of headlines.

    Each ``<tr>`` row contributes one record: the text of the row's first
    link is the headline, and the whitespace-separated text of the row's first
    ``<td>`` cell supplies the timestamp. A cell with two tokens is read as
    ``date time``; a cell with a single token is read as a time that belongs
    to the date of the closest preceding row that carried one.

    Rows that have no link, no cell, an empty cell, or no date known yet
    cannot be turned into a record and are skipped.

    Args:
        news_table: Parsed table element exposing ``find_all('tr')``, such as
            the value returned by ``get_news``.

    Returns:
        pandas.DataFrame with the columns ``date``, ``time``, ``headline``
        (strings as scraped) and ``datetime`` (``date`` and ``time`` combined
        and converted with ``pandas.to_datetime``). The frame is empty, with
        the same columns, when the table yields no usable rows.
    """
    columns = ['date', 'time', 'headline']
    parsed_news = []
    # Carried across iterations so that time-only rows inherit the last date.
    date = None

    for row in news_table.find_all('tr'):
        # Rows without a link or a timestamp cell hold no headline to record.
        if row.a is None or row.td is None:
            continue

        # Headline text comes from the link only.
        text = row.a.get_text()
        # Split the text of the td tag into whitespace-separated tokens.
        date_scrape = row.td.text.split()

        if len(date_scrape) == 1:
            # Only a time is shown; keep the date of the previous row.
            time = date_scrape[0]
        elif len(date_scrape) >= 2:
            date = date_scrape[0]
            time = date_scrape[1]
        else:
            # Empty timestamp cell: nothing to date this row with.
            continue

        if date is None:
            # A time-only row seen before any dated row cannot be timestamped.
            continue

        parsed_news.append([date, time, text])

    # Build the frame once, after the loop, so the work is linear in the
    # number of rows and a result exists even when no row was collected.
    parsed_news_df = pd.DataFrame(parsed_news, columns=columns)
    # Create pandas datetimes from the strings in the 'date' and 'time' columns.
    parsed_news_df['datetime'] = pd.to_datetime(parsed_news_df['date'] + ' ' + parsed_news_df['time'])

    return parsed_news_df
        
# Parse the downloaded table and preview the first rows of the result.
parsed_news_df = parse_news(news_table)
parsed_news_df.head()

Unnamed: 0,date,time,headline,datetime
0,Apr-16-22,12:00AM,The Pandemic Was Supposed to Push All Shopping...,2022-04-16 00:00:00
1,Apr-15-22,07:23PM,Costco Thrives by Mastering Traditional Retail...,2022-04-15 19:23:00
2,Apr-15-22,05:10PM,"Amazon CEO Has Some Thoughts About Bitcoin, NFTs",2022-04-15 17:10:00
3,Apr-15-22,03:45PM,Plant-based foods should not be 'niche': Drew ...,2022-04-15 15:45:00
4,Apr-15-22,02:01PM,Houston firm buys Amazon's first distribution ...,2022-04-15 14:01:00


## Score News Sentiment and Save Results into DataFrame

In [5]:
def score_news(parsed_news_df):
    """Score every headline with VADER and return the scores next to the text.

    Args:
        parsed_news_df: DataFrame with at least the columns ``headline``,
            ``date``, ``time`` and ``datetime``, as produced by
            ``parse_news``. The frame is not modified.

    Returns:
        pandas.DataFrame indexed by ``datetime`` that keeps ``headline`` and
        adds one column per key of the analyzer's score dictionary, with the
        ``compound`` column renamed to ``sentiment_score``. The ``date`` and
        ``time`` columns are removed.
    """
    # Instantiate the sentiment intensity analyzer.
    vader = SentimentIntensityAnalyzer()

    # One dictionary of polarity scores per headline.
    scores = parsed_news_df['headline'].apply(vader.polarity_scores).tolist()

    # Give the scores the same index as the input so the join below pairs each
    # headline with its own scores even when the index is not 0..n-1.
    scores_df = pd.DataFrame(scores, index=parsed_news_df.index)

    parsed_and_scored_news = (
        parsed_news_df
        .join(scores_df, rsuffix='_right')
        .set_index('datetime')
        # Keyword form: passing the axis positionally is deprecated and is
        # rejected by newer pandas releases.
        .drop(columns=['date', 'time'])
        .rename(columns={"compound": "sentiment_score"})
    )

    return parsed_and_scored_news

# Score the parsed headlines and preview the first rows of the result.
parsed_and_scored_news = score_news(parsed_news_df)
parsed_and_scored_news.head()

  parsed_and_scored_news = parsed_and_scored_news.drop(['date', 'time'], 1)


Unnamed: 0_level_0,headline,neg,neu,pos,sentiment_score
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-04-16 00:00:00,The Pandemic Was Supposed to Push All Shopping...,0.0,1.0,0.0,0.0
2022-04-15 19:23:00,Costco Thrives by Mastering Traditional Retail...,0.0,0.708,0.292,0.3182
2022-04-15 17:10:00,"Amazon CEO Has Some Thoughts About Bitcoin, NFTs",0.0,0.805,0.195,0.1779
2022-04-15 15:45:00,Plant-based foods should not be 'niche': Drew ...,0.0,1.0,0.0,0.0
2022-04-15 14:01:00,Houston firm buys Amazon's first distribution ...,0.0,1.0,0.0,0.0


## Resample Sentiment by Hour and by Day and Plot It with Plotly

In [7]:
def plot_hourly_sentiment(parsed_and_scored_news, ticker):
    """Show a bar chart of the mean sentiment score per hour.

    Args:
        parsed_and_scored_news: DataFrame with a datetime index and a
            ``sentiment_score`` column, as returned by ``score_news``.
        ticker: Ticker symbol (string) used as the prefix of the chart title.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    # Resample the datetime index into hourly bins and average each bin.
    # numeric_only=True leaves out the text 'headline' column, which cannot be
    # averaged and makes a plain .mean() fail on newer pandas releases.
    mean_scores = parsed_and_scored_news.resample('h').mean(numeric_only=True)

    # Plot a bar chart with plotly
    fig = px.bar(mean_scores, x=mean_scores.index, y='sentiment_score', title = ticker + ' Hourly Sentiment Scores')
    fig.show()
    
# Show the hourly chart for the ticker scored above.
plot_hourly_sentiment(parsed_and_scored_news, ticker)

In [8]:
def plot_daily_sentiment(parsed_and_scored_news, ticker):
    """Show a bar chart of the mean sentiment score per calendar day.

    Args:
        parsed_and_scored_news: DataFrame with a datetime index and a
            ``sentiment_score`` column, as returned by ``score_news``.
        ticker: Ticker symbol (string) used as the prefix of the chart title.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    # Resample the datetime index into daily bins and average each bin.
    # numeric_only=True leaves out the text 'headline' column, which cannot be
    # averaged and makes a plain .mean() fail on newer pandas releases.
    mean_scores = parsed_and_scored_news.resample('D').mean(numeric_only=True)

    # Plot a bar chart with plotly
    fig = px.bar(mean_scores, x=mean_scores.index, y='sentiment_score', title = ticker + ' Daily Sentiment Scores')
    fig.show()
    
# Show the daily chart for the ticker scored above.
plot_daily_sentiment(parsed_and_scored_news, ticker)

## Putting It All Together

In [9]:
# Full pipeline for one ticker: download the news table, parse it, score the
# headlines, show the hourly and daily charts, then preview the scored rows.
ticker = 'AMZN'
news_table = get_news(ticker)
parsed_news_df = parse_news(news_table)
parsed_and_scored_news = score_news(parsed_news_df)
plot_hourly_sentiment(parsed_and_scored_news, ticker)
plot_daily_sentiment(parsed_and_scored_news, ticker)
parsed_and_scored_news.head()


In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only



Unnamed: 0_level_0,headline,neg,neu,pos,sentiment_score
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-04-16 00:00:00,The Pandemic Was Supposed to Push All Shopping...,0.0,1.0,0.0,0.0
2022-04-15 19:23:00,Costco Thrives by Mastering Traditional Retail...,0.0,0.708,0.292,0.3182
2022-04-15 17:10:00,"Amazon CEO Has Some Thoughts About Bitcoin, NFTs",0.0,0.805,0.195,0.1779
2022-04-15 15:45:00,Plant-based foods should not be 'niche': Drew ...,0.0,1.0,0.0,0.0
2022-04-15 14:01:00,Houston firm buys Amazon's first distribution ...,0.0,1.0,0.0,0.0


In [10]:
# Same pipeline for AAPL: download, parse, score, plot hourly and daily means,
# then preview the scored rows. Rebinds the notebook-level variables.
ticker = 'AAPL'
news_table = get_news(ticker)
parsed_news_df = parse_news(news_table)
parsed_and_scored_news = score_news(parsed_news_df)
plot_hourly_sentiment(parsed_and_scored_news, ticker)
plot_daily_sentiment(parsed_and_scored_news, ticker)
parsed_and_scored_news.head()


In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only



Unnamed: 0_level_0,headline,neg,neu,pos,sentiment_score
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-04-16 00:00:00,The Chips That Rebooted the Mac,0.0,1.0,0.0,0.0
2022-04-15 12:48:00,How streaming platforms like Spotify have crea...,0.0,0.71,0.29,0.5423
2022-04-15 12:45:00,Is Apple Stock A Buy Before iPhone Maker's Ear...,0.0,1.0,0.0,0.0
2022-04-15 11:37:00,3 Things About Snap That Smart Investors Know,0.0,0.69,0.31,0.4019
2022-04-15 10:16:00,"2 Reasons to Buy ExxonMobil, and 1 Reason to H...",0.231,0.769,0.0,-0.2732


In [11]:
# Same pipeline for TSLA: download, parse, score, plot hourly and daily means,
# then preview the scored rows. Rebinds the notebook-level variables.
ticker = 'TSLA'
news_table = get_news(ticker)
parsed_news_df = parse_news(news_table)
parsed_and_scored_news = score_news(parsed_news_df)
plot_hourly_sentiment(parsed_and_scored_news, ticker)
plot_daily_sentiment(parsed_and_scored_news, ticker)
parsed_and_scored_news.head()


In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only



Unnamed: 0_level_0,headline,neg,neu,pos,sentiment_score
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-04-15 20:00:00,Elon Musk looking to bring in partners for Twi...,0.0,1.0,0.0,0.0
2022-04-15 19:01:00,Dow Jones Futures: Market Rally Keeps Sliding;...,0.179,0.821,0.0,-0.5423
2022-04-15 18:57:00,Twitter Adopts Poison Pill to Ward Off Musk Ta...,0.287,0.574,0.139,-0.4215
2022-04-15 17:48:00,Is Tesla Stock Vulnerable to the Key-Man Risk?...,0.222,0.778,0.0,-0.4588
2022-04-15 17:27:00,Used Teslas Are Selling at Discounts in This City,0.0,1.0,0.0,0.0
