In [11]:
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import os
import pandas as pd
import datetime
import matplotlib.pyplot as plt
%matplotlib inline
# NLTK VADER for sentiment analysis
import nltk 
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\rriley\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [29]:
# Get Tickers from database or yahoo_fin
import psycopg2

conn = psycopg2.connect("dbname=StonksGoUp user=postgres host=localhost password=admin")
cur = conn.cursor()

SQL_tickers = """SELECT ticker FROM tickers ORDER BY ticker ASC"""
cur.execute(SQL_tickers)

tickers = ['AMZN', 'TSLA', 'GOOG', 'SQ']
tickers = list([i[0] for i in cur.fetchall()])
print(tickers)

['A', 'AACG', 'AACQ', 'AACQU', 'AACQW', 'AAL', 'AAME', 'AAOI', 'AAON', 'AAP', 'AAPL', 'AAWW', 'AAXJ', 'AAXN', 'ABBV', 'ABC', 'ABCB', 'ABEO', 'ABIO', 'ABMD', 'ABT', 'ABTX', 'ABUS', 'ACAD', 'ACAM', 'ACAMU', 'ACAMW', 'ACBI', 'ACCD', 'ACER', 'ACET', 'ACEV', 'ACEVU', 'ACEVW', 'ACGL', 'ACGLO', 'ACGLP', 'ACHC', 'ACHV', 'ACIA', 'ACIU', 'ACIW', 'ACLS', 'ACMR', 'ACN', 'ACNB', 'ACOR', 'ACRS', 'ACRX', 'ACST', 'ACT', 'ACTCU', 'ACTG', 'ACWI', 'ACWX', 'ADAP', 'ADBE', 'ADES', 'ADI', 'ADIL', 'ADILW', 'ADM', 'ADMA', 'ADMP', 'ADMS', 'ADP', 'ADPT', 'ADRE', 'ADRO', 'ADSK', 'ADTN', 'ADTX', 'ADUS', 'ADVM', 'ADXN', 'ADXS', 'AEE', 'AEGN', 'AEHR', 'AEIS', 'AEMD', 'AEP', 'AEPPL', 'AEPPZ', 'AERI', 'AES', 'AESE', 'AEY', 'AEYE', 'AEZS', 'AFIB', 'AFIN', 'AFINP', 'AFL', 'AFMD', 'AFYA', 'AGBA', 'AGBAR', 'AGBAU', 'AGBAW', 'AGCUU', 'AGEN', 'AGFS', 'AGIO', 'AGLE', 'AGMH', 'AGNC', 'AGNCM', 'AGNCN', 'AGNCO', 'AGNCP', 'AGRX', 'AGTC', 'AGYS', 'AGZD', 'AHACU', 'AHCO', 'AHPI', 'AIA', 'AIG', 'AIH', 'AIHS', 'AIKI', 'AIMC', 'AIMT

In [3]:
news_tables = {}

for ticker in tickers:
    finviz_url = f'https://finviz.com/quote.ashx?t={ticker}'

    req = Request(url=finviz_url,headers={'user-agent': 'my-app/0.0.1'}) 
    response = urlopen(req)    

    # Read the contents of the file into 'html'
    html = BeautifulSoup(response)

    # Find 'news-table' in the Soup and load it into 'news_table'
    news_table = html.find(id='news-table')
    
    # Add the table to our dictionary
    news_tables[ticker] = news_table
print(news_tables)

{'AMZN': <table border="0" cellpadding="1" cellspacing="0" class="fullview-news-outer" id="news-table" width="100%">
<tr><td align="right" style="white-space:nowrap" width="130">Oct-10-20 03:00PM  </td><td align="left"><div class="news-link-container"><div class="news-link-left"><a class="tab-link-news" href="https://finance.yahoo.com/news/stocks-made-rich-today-090000826.html" target="_blank">Stocks That Would Have Made You Rich Today</a></div><div class="news-link-right"><span style="color:#aa6dc0;font-size:9px"> GOBankingRates</span></div></div></td></tr>
<tr><td align="right" width="130">12:06PM  </td><td align="left"><div class="news-link-container"><div class="news-link-left"><a class="tab-link-news" href="https://www.fool.com/investing/2020/10/10/3-top-tech-stocks-to-buy-right-now/?source=eptyholnk0000202&amp;utm_source=yahoo-host&amp;utm_medium=feed&amp;utm_campaign=article" target="_blank">3 Top Tech Stocks to Buy Right Now</a></div><div class="news-link-right"><span style="co

In [6]:
parsed_news = []

# Iterate through the news
for file_name, news_table in news_tables.items():
    # Iterate through all tr tags in 'news_table'
    for x in news_table.findAll('tr'):
        # read the text from each tr tag into text
        # get text from a only
        text = x.a.get_text() 
        # splite text in the td tag into a list 
        date_scrape = x.td.text.split()
        # if the length of 'date_scrape' is 1, load 'time' as the only element

        if len(date_scrape) == 1:
            time = date_scrape[0]
            
        # else load 'date' as the 1st element and 'time' as the second    
        else:
            date = date_scrape[0]
            time = date_scrape[1]
        # Extract the ticker from the file name, get the string up to the 1st '_'  
        ticker = file_name.split('_')[0]
        
        # Append ticker, date, time and headline as a list to the 'parsed_news' list
        parsed_news.append([ticker, date, time, text])
print(parsed_news[0])

['AMZN', 'Oct-10-20', '03:00PM', 'Stocks That Would Have Made You Rich Today']


In [23]:
vader = SentimentIntensityAnalyzer()

parsed_and_scored_news = pd.DataFrame(parsed_news, columns = ['ticker', 'date', 'time', 'headline'])

scores = parsed_and_scored_news['headline'].apply(vader.polarity_scores).tolist()
df_scores = pd.DataFrame(scores)

parsed_and_scored_news = parsed_and_scored_news.join(df_scores, rsuffix='_right')
#parsed_and_scored_news['date'] = pd.to_datetime(parsed_and_scored_news.date).dt.date

parsed_and_scored_news

Unnamed: 0,ticker,date,time,headline,neg,neu,pos,compound
0,AMZN,Oct-10-20,03:00PM,Stocks That Would Have Made You Rich Today,0.0,0.660,0.340,0.5574
1,AMZN,Oct-10-20,12:06PM,3 Top Tech Stocks to Buy Right Now,0.0,0.769,0.231,0.2023
2,AMZN,Oct-10-20,11:30AM,Is Arlo Leaving the Home Security Field to Ama...,0.0,0.687,0.313,0.4767
3,AMZN,Oct-10-20,10:08AM,4 Reasons Levi's New Online Secondhand Store I...,0.0,1.000,0.000,0.0000
4,AMZN,Oct-10-20,10:02AM,2 Important Investing Tips From Peter Lynch,0.0,0.735,0.265,0.2023
...,...,...,...,...,...,...,...,...
395,SQ,Sep-03-20,12:03PM,Expect Square's stock to continue its upward t...,0.0,1.000,0.000,0.0000
396,SQ,Sep-03-20,11:31AM,Why Is Square (SQ) Up 11.1% Since Last Earning...,0.0,1.000,0.000,0.0000
397,SQ,Sep-03-20,06:08AM,3 Game-Changing Stocks That Are Better Buys Th...,0.0,0.707,0.293,0.4404
398,SQ,Sep-03-20,06:00AM,This Stock Could Be 2020's Best Profit Opportu...,0.0,0.336,0.664,0.8720


In [24]:
# Combine date and time columns and prep for db
parsed_and_scored_news['timestamp'] = pd.to_datetime(parsed_and_scored_news['date'] + ' ' + parsed_and_scored_news['time'])
del parsed_and_scored_news['date'], parsed_and_scored_news['time'], parsed_and_scored_news['neg'],  parsed_and_scored_news['neu'],  parsed_and_scored_news['pos'] 

# Add qualitative scale to scores?
score_name = {'very positive', 'positive', 'neutral', 'negative', 'very negative'}

print(parsed_and_scored_news.head())

  ticker                                           headline  compound  \
0   AMZN         Stocks That Would Have Made You Rich Today    0.5574   
1   AMZN                 3 Top Tech Stocks to Buy Right Now    0.2023   
2   AMZN  Is Arlo Leaving the Home Security Field to Ama...    0.4767   
3   AMZN  4 Reasons Levi's New Online Secondhand Store I...    0.0000   
4   AMZN        2 Important Investing Tips From Peter Lynch    0.2023   

            timestamp  
0 2020-10-10 15:00:00  
1 2020-10-10 12:06:00  
2 2020-10-10 11:30:00  
3 2020-10-10 10:08:00  
4 2020-10-10 10:02:00  


In [27]:
# Insert in to DB
# import psycopg2

# conn = psycopg2.connect("dbname=StonksGoUp user=postgres host=localhost password=admin")
# cur = conn.cursor()
SQL_sentiment = """ 
    DROP TABLE sentiment;
    CREATE TABLE sentiment (
        sentimentid SERIAL,
        ticker varchar(5) NOT NULL,
        timestamp timestamp with time zone NOT NULL,
        headline text NOT NULL,
        score numeric,
        CONSTRAINT pk_sentiment PRIMARY KEY (sentimentid),
        UNIQUE (ticker, timestamp, headline, score)
);
"""
cur.execute(SQL_sentiment, conn)
conn.commit()

insert = [list(row) for row in parsed_and_scored_news.itertuples(index=False)]

SQL_sentiment_insert= """ INSERT INTO public.sentiment(ticker, headline, score, timestamp) 
    VALUES (%s, %s, %s, %s) ON CONFLICT DO NOTHING"""
cur.executemany(SQL_sentiment_insert, insert)
conn.commit()

print(f'{cur.rowcount} rows inserted.')
cur.close()
conn.close()


400 rows inserted.
