In [1]:
# Optional one-time setup: uncomment the lines below to install packages into
# the current environment before the first run.
# NOTE(review): the scraping cell uses the standard-library urllib.request, not urllib3.
#!pip install yfinance
#!pip install bs4
#!pip install urllib3
#!pip install plotly
#!pip install nltk
#!pip install yahooquery  # imported by the last cell of the notebook

In [2]:
# libraries for webscraping, parsing and getting stock data
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import yfinance as yf

# for plotting and data manipulation
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import plotly
import plotly.express as px

# NLTK VADER for sentiment analysis
import nltk
nltk.downloader.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Gabriel\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# Tickers and Number of Shares in Portfolio

List all tickers in the portfolio and the corresponding number of shares held for each of them.

In [3]:
# Portfolio holdings: ticker symbol -> number of shares held.
# Insertion order is the order in which tickers are scraped and displayed.
tickers_dict = {
    'AMZN': 5,    # amazon
    'TSLA': 1,    # tesla
    'GOOG': 3,    # google
    'META': 3,    # meta
    'KO': 10,     # coke
    'PEP': 5,     # pepsi
    'BA': 5,      # boeing
    'XOM': 5,     # exxon mobil
    'CVX': 4,     # chevron
    'UNH': 1,     # united health
    'JNJ': 3,     # johnson&johnson
    'JPM': 3,     # jp morgan
    'BAC': 5,     # bank of america
    'C': 5,       # citigroup
    'SPG': 10,    # simon property group
    'AAPL': 6,    # apple
    'MSFT': 5,    # microsoft
    'WMT': 6,     # walmart
    'LMT': 2,     # lockheed martin
    'PFE': 10,    # pfizer
    'MMM': 3,     # 3M
    'CRWD': 3,    # crowdstrike
    'WBD': 20,    # warner bros
    'DIS': 8,     # disney
    'AIG': 5,     # american international group
    'BRK-B': 4,   # berkshire hathaway
    'DDOG': 3,    # datadog
    'SLB': 16,    # schlumberger
    'SONY': 5,    # sony
    'PLD': 5,     # prologis
    'INT': 16,    # world fuel services
    'AMD': 5,     # advanced micro devices
    'ISRG': 3,    # intuitive surgical
    'INTC': 5,    # intel
}

In [4]:
# Materialise the dict views as lists: later cells use these as a DataFrame
# index and column, and lists are stable, indexable snapshots in portfolio
# order (dict views are live and cannot be indexed).
tickers = list(tickers_dict)
number_of_shares = list(tickers_dict.values())

# Scrape the Date, Time and News Headlines Data

In [5]:
# Scrape the Date, Time and News Headlines Data
# For every ticker, download its Finviz quote page and keep the element whose
# id is 'news-table' in `news_tables`, keyed by ticker symbol.
finwiz_url = 'https://finviz.com/quote.ashx?t='
news_tables = {}

for ticker in tickers:
    print(ticker)
    url = finwiz_url + ticker
    # The request carries a browser-like User-Agent header
    req = Request(url=url,headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) Gecko/20100101 Firefox/20.0'})
    # The context manager closes the HTTP response once the page has been parsed
    with urlopen(req) as response:
        # Name the parser explicitly so the parse result does not depend on
        # which optional parsers happen to be installed
        html = BeautifulSoup(response, 'html.parser')
    # Find 'news-table' in the Soup and load it into 'news_table'
    news_table = html.find(id='news-table')
    if news_table is None:
        # Storing None would only fail later, far from the cause, when the
        # table rows are read; report it here and leave the ticker out
        print(f'  no news table found for {ticker}; skipping')
        continue
    # Add the table to our dictionary
    news_tables[ticker] = news_table

AMZN
TSLA
GOOG
META
KO
PEP
BA
XOM
CVX
UNH
JNJ
JPM
BAC
C
SPG
AAPL
MSFT
WMT
LMT
PFE
MMM
CRWD
WBD
DIS
AIG
BRK-B
DDOG
SLB
SONY
PLD
INT
AMD
ISRG
INTC


# Print the Data from news_table (optional)

In [6]:
# Print the Data from news_table (optional)
# Example: show the first four headlines scraped for 'AMZN' with their timestamps
amzn = news_tables['AMZN']
# Get all the table rows tagged in HTML with <tr> into 'amzn_tr'
amzn_tr = amzn.find_all('tr')
printed = 0
for table_row in amzn_tr:
    # Rows without a link or without a cell carry no headline; reading
    # `.text` from them would raise AttributeError, so skip them
    if table_row.a is None or table_row.td is None:
        continue
    # Read the text of the element 'a' into 'a_text'
    a_text = table_row.a.text
    # Read the text of the element 'td' into 'td_text'
    td_text = table_row.td.text
    # Print the headline followed by its date/time cell
    print(a_text)
    print(td_text)
    # Exit after printing 4 rows of data
    printed += 1
    if printed == 4:
        break

Why Jeff Bezos and other billionaires want NFL teams
Feb-27-23 05:31PM
Palantir joins growing list of tech companies announcing layoffs
04:33PM
37 Mistakes We Make When Shopping at Costco, Amazon, Target and Walmart
12:00PM
Microsoft Pushes AI-Powered Tools for Telecoms. Its Targeting Amazon.
11:34AM


# Parse the Date, Time and News Headlines into a Python List

In [7]:
# Parse the Date, Time and News Headlines into a Python List
# Each entry of 'parsed_news' is [ticker, date, time, headline].
parsed_news = []
# Iterate through the news; the dictionary keys are the ticker symbols
for ticker, news_table in news_tables.items():
    if news_table is None:
        # Nothing was scraped for this ticker
        continue
    # Reset per ticker so that a date is never carried over from the previous
    # ticker's table when a table's first row holds only a time
    date = None
    # Iterate through all tr tags in 'news_table'
    for row in news_table.find_all('tr'):
        # Skip rows that have no headline link or no date/time cell
        if row.a is None or row.td is None:
            continue
        # get the headline text from the 'a' tag only
        text = row.a.get_text()
        # split the text in the td tag into a list
        date_scrape = row.td.text.split()
        if not date_scrape:
            # Empty date/time cell: there is nothing to record for this row
            continue
        # if the length of 'date_scrape' is 1, load 'time' as the only element
        # and keep the date seen on an earlier row of the same table
        if len(date_scrape) == 1:
            time = date_scrape[0]
        # else load 'date' as the 1st element and 'time' as the second
        else:
            date = date_scrape[0]
            time = date_scrape[1]

        # Append ticker, date, time and headline as a list to the 'parsed_news' list
        parsed_news.append([ticker, date, time, text])

parsed_news[:5] # print first 5 rows of news

[['AMZN',
  'Feb-27-23',
  '05:31PM',
  'Why Jeff Bezos and other billionaires want NFL teams'],
 ['AMZN',
  'Feb-27-23',
  '04:33PM',
  'Palantir joins growing list of tech companies announcing layoffs'],
 ['AMZN',
  'Feb-27-23',
  '12:00PM',
  '37 Mistakes We Make When Shopping at Costco, Amazon, Target and Walmart'],
 ['AMZN',
  'Feb-27-23',
  '11:34AM',
  'Microsoft Pushes AI-Powered Tools for Telecoms. Its Targeting Amazon.'],
 ['AMZN',
  'Feb-27-23',
  '09:55AM',
  'If You Invested $10,000 In Amazon Stock 10 Years Ago, This Is How Much You Would Have Today']]

# Perform Sentiment Analysis with Vader

In [8]:
# Perform Sentiment Analysis with Vader
# Build the analyzer that scores every headline
vader = SentimentIntensityAnalyzer()

# One row per scraped headline
columns = ['ticker', 'date', 'time', 'headline']
headlines_df = pd.DataFrame(parsed_news, columns=columns)

# Score each headline with vader; every result is a dict of polarity scores
scores = [vader.polarity_scores(headline) for headline in headlines_df['headline']]
# One column per score key, one row per headline
scores_df = pd.DataFrame(scores)

# Put the scores next to the headlines they belong to (row-aligned join)
parsed_and_scored_news = headlines_df.join(scores_df, rsuffix='_right')
# Replace the scraped date strings with datetime.date objects
parsed_and_scored_news['date'] = pd.to_datetime(parsed_and_scored_news['date']).dt.date
parsed_and_scored_news.head()

Unnamed: 0,ticker,date,time,headline,neg,neu,pos,compound
0,AMZN,2023-02-27,05:31PM,Why Jeff Bezos and other billionaires want NFL...,0.0,0.86,0.14,0.0772
1,AMZN,2023-02-27,04:33PM,Palantir joins growing list of tech companies ...,0.0,0.825,0.175,0.1779
2,AMZN,2023-02-27,12:00PM,"37 Mistakes We Make When Shopping at Costco, A...",0.176,0.704,0.12,-0.2023
3,AMZN,2023-02-27,11:34AM,Microsoft Pushes AI-Powered Tools for Telecoms...,0.0,0.825,0.175,0.1779
4,AMZN,2023-02-27,09:55AM,"If You Invested $10,000 In Amazon Stock 10 Yea...",0.0,0.909,0.091,0.1779


In [9]:
# Display the scored headlines table (the notebook shows only its first and last rows)
parsed_and_scored_news

Unnamed: 0,ticker,date,time,headline,neg,neu,pos,compound
0,AMZN,2023-02-27,05:31PM,Why Jeff Bezos and other billionaires want NFL...,0.000,0.860,0.140,0.0772
1,AMZN,2023-02-27,04:33PM,Palantir joins growing list of tech companies ...,0.000,0.825,0.175,0.1779
2,AMZN,2023-02-27,12:00PM,"37 Mistakes We Make When Shopping at Costco, A...",0.176,0.704,0.120,-0.2023
3,AMZN,2023-02-27,11:34AM,Microsoft Pushes AI-Powered Tools for Telecoms...,0.000,0.825,0.175,0.1779
4,AMZN,2023-02-27,09:55AM,"If You Invested $10,000 In Amazon Stock 10 Yea...",0.000,0.909,0.091,0.1779
...,...,...,...,...,...,...,...,...
3395,INTC,2023-02-10,06:06PM,Arm Holdings Is Destined to Go Public. It Coul...,0.000,0.870,0.130,0.2716
3396,INTC,2023-02-10,03:54PM,10 Least Innovative Companies That Are Still I...,0.211,0.789,0.000,-0.3412
3397,INTC,2023-02-10,12:38PM,Inside Intel: A Look at the Mega Chip Maker,0.000,1.000,0.000,0.0000
3398,INTC,2023-02-10,10:48AM,"US STOCKS-Nasdaq edges lower, Lyft sinks on do...",0.216,0.784,0.000,-0.2960


# Calculate Mean Sentiment for Each Ticker

In [10]:
# Group by each ticker and get the mean of all sentiment scores.
# numeric_only=True restricts the mean to the numeric score columns; the frame
# also holds date, time and headline columns, and pandas >= 2.0 raises a
# TypeError when asked to average those.
mean_scores = parsed_and_scored_news.groupby('ticker').mean(numeric_only=True)
mean_scores

Unnamed: 0_level_0,neg,neu,pos,compound
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AAPL,0.05279,0.83471,0.11248,0.07114
AIG,0.06065,0.87135,0.06801,0.023673
AMD,0.03991,0.81652,0.14358,0.177748
AMZN,0.03688,0.77489,0.18824,0.21578
BA,0.05276,0.84917,0.09809,0.093787
BAC,0.04321,0.86881,0.08798,0.074852
BRK-B,0.04007,0.85613,0.1038,0.108367
C,0.05126,0.85874,0.09002,0.067364
CRWD,0.02046,0.82724,0.1523,0.228804
CVX,0.04363,0.83551,0.12083,0.102599


# Get Current Price, Sector and Industry of each Ticker

In [11]:
# get the price, sector and industry of each ticker using the yahoo finance api
# as an example this is the information that the api returns for TSLA
# NOTE(review): the recorded run of this cell raised
# "yfinance failed to decrypt Yahoo data response"; the last cell of the
# notebook reads price, sector and industry through yahooquery instead.
tickerdata = yf.Ticker('TSLA')
tickerdata.info

Exception: yfinance failed to decrypt Yahoo data response

In [None]:
# Collect the market price, sector and industry of every ticker.
# The three lists are filled in the same order as 'tickers'.
sectors = []
industries = []
prices = []
for ticker in tickers:
    print(ticker)
    tickerdata = yf.Ticker(ticker)
    # Read the info payload once per ticker instead of once per field
    info = tickerdata.info
    prices.append(info['regularMarketPrice'])
    sectors.append(info['sector'])
    industries.append(info['industry'])

AMZN


Exception: yfinance failed to decrypt Yahoo data response

In [None]:
# Display the sector collected for each ticker, in the order the tickers were processed
sectors

['Consumer Cyclical',
 'Consumer Cyclical',
 'Communication Services',
 'Communication Services',
 'Consumer Defensive',
 'Consumer Defensive',
 'Industrials',
 'Energy',
 'Energy',
 'Healthcare',
 'Healthcare',
 'Financial Services',
 'Financial Services',
 'Financial Services',
 'Real Estate',
 'Technology',
 'Technology',
 'Consumer Defensive',
 'Industrials',
 'Healthcare',
 'Industrials',
 'Technology',
 'Communication Services',
 'Communication Services',
 'Financial Services',
 'Financial Services',
 'Technology',
 'Energy',
 'Technology',
 'Real Estate',
 'Energy',
 'Technology',
 'Healthcare',
 'Technology']

In [None]:
# Display the industry collected for each ticker, in the order the tickers were processed
industries

['Internet Retail',
 'Auto Manufacturers',
 'Internet Content & Information',
 'Internet Content & Information',
 'Beverages—Non-Alcoholic',
 'Beverages—Non-Alcoholic',
 'Aerospace & Defense',
 'Oil & Gas Integrated',
 'Oil & Gas Integrated',
 'Healthcare Plans',
 'Drug Manufacturers—General',
 'Banks—Diversified',
 'Banks—Diversified',
 'Banks—Diversified',
 'REIT—Retail',
 'Consumer Electronics',
 'Software—Infrastructure',
 'Discount Stores',
 'Aerospace & Defense',
 'Drug Manufacturers—General',
 'Conglomerates',
 'Software—Infrastructure',
 'Entertainment',
 'Entertainment',
 'Insurance—Diversified',
 'Insurance—Diversified',
 'Software—Application',
 'Oil & Gas Equipment & Services',
 'Consumer Electronics',
 'REIT—Industrial',
 'Oil & Gas Refining & Marketing',
 'Semiconductors',
 'Medical Instruments & Supplies',
 'Semiconductors']

# Combine the Information Above and the Corresponding Tickers into a DataFrame

In [None]:
# Build one row per ticker (the index) holding its sector, industry, price and
# number of shares; each dictionary key becomes a column name
df_info = pd.DataFrame(
    data={
        'Sector': sectors,
        'Industry': industries,
        'Price': prices,
        'No. of Shares': number_of_shares,
    },
    index=tickers,
)
df_info

NameError: name 'sectors' is not defined

# Calculate the Total Value of Each Ticker in the Portfolio

In [None]:
# Value of each position = price per share x number of shares held
position_value = df_info['Price'].mul(df_info['No. of Shares'])
df_info['Total Stock Value in Portfolio'] = position_value
df_info

Unnamed: 0,Sector,Industry,Price,No. of Shares,Total Stock Value in Portfolio
AMZN,Consumer Cyclical,Internet Retail,142.3,5,711.5
TSLA,Consumer Cyclical,Auto Manufacturers,908.61,1,908.61
GOOG,Communication Services,Internet Content & Information,120.86,3,362.58
META,Communication Services,Internet Content & Information,174.66,3,523.98
KO,Consumer Defensive,Beverages—Non-Alcoholic,65.22,10,652.2
PEP,Consumer Defensive,Beverages—Non-Alcoholic,180.4,5,902.0
BA,Industrials,Aerospace & Defense,168.69,5,843.45
XOM,Energy,Oil & Gas Integrated,94.38,5,471.9
CVX,Energy,Oil & Gas Integrated,159.02,4,636.08
UNH,Healthcare,Healthcare Plans,545.22,1,545.22


# Join all the Information into a Single DataFrame

In [None]:
# Attach the portfolio information to the per-ticker mean scores, give the
# score columns readable names and turn the ticker index back into a column
score_labels = {
    "compound": "Sentiment Score",
    "neg": "Negative",
    "neu": "Neutral",
    "pos": "Positive",
}
df = (
    mean_scores
    .join(df_info)
    .rename(columns=score_labels)
    .reset_index()
)
df

Unnamed: 0,ticker,Negative,Neutral,Positive,Sentiment Score,Sector,Industry,Price,No. of Shares,Total Stock Value in Portfolio
0,AAPL,0.046,0.88786,0.06614,0.045105,Technology,Consumer Electronics,174.15,6,1044.9
1,AIG,0.0339,0.85817,0.10794,0.116029,Financial Services,Insurance—Diversified,57.33,5,286.65
2,AMD,0.08752,0.81381,0.09868,0.040556,Technology,Semiconductors,100.44,5,502.2
3,AMZN,0.07667,0.78021,0.1431,0.084955,Consumer Cyclical,Internet Retail,142.3,5,711.5
4,BA,0.06676,0.80862,0.12465,0.089244,Industrials,Aerospace & Defense,168.69,5,843.45
5,BAC,0.06079,0.87055,0.06865,0.020812,Financial Services,Banks—Diversified,36.28,5,181.4
6,BRK-B,0.04808,0.85515,0.09676,0.077493,Financial Services,Insurance—Diversified,304.27,4,1217.08
7,C,0.08289,0.85726,0.05985,-0.042347,Financial Services,Banks—Diversified,53.55,5,267.75
8,CRWD,0.03986,0.82033,0.13979,0.158167,Technology,Software—Infrastructure,195.76,3,587.28
9,CVX,0.04751,0.82877,0.12371,0.12453,Energy,Oil & Gas Integrated,159.02,4,636.08


# Generate the Treemap Plot!

In [None]:
# group data into sectors at the highest level, breaks it down into industry, and then ticker, specified in the 'path' parameter
# the 'values' parameter uses the value of the column to determine the relative size of each box in the chart
# the color of the chart follows the sentiment score
# when the mouse is hovered over each box in the chart, the negative, neutral, positive and overall sentiment scores will all be shown
# the color is red (#ff0000) for negative sentiment scores, black (#000000) for 0 sentiment score and green (#00FF00) for positive sentiment scores
hover_columns = ['Price', 'Negative', 'Neutral', 'Positive', 'Sentiment Score']
fig = px.treemap(df, path=[px.Constant("Sectors"), 'Sector', 'Industry', 'ticker'], values='Total Stock Value in Portfolio',
                  color='Sentiment Score',
                  # a format string per column rounds the hover values to 3 decimal places
                  hover_data={column: ':.3f' for column in hover_columns},
                  color_continuous_scale=['#FF0000', "#000000", '#00FF00'],
                  color_continuous_midpoint=0)

# The customdata generated by plotly express is kept as is: it holds one row per
# box of the chart (tickers as well as their industry and sector parents).
# Replacing it with the per-ticker rows of 'df' would leave the parent boxes
# without values and would depend on the internal ordering of the boxes.
# customdata[4] is the 5th hover column, 'Sentiment Score', rounded to 3 decimal places.
fig.data[0].texttemplate = "%{label}<br>%{customdata[4]:.3f}"

fig.update_traces(textposition="middle center")
fig.update_layout(margin = dict(t=30, l=10, r=10, b=10), font_size=20)

plotly.offline.plot(fig, filename='stock_sentiment.html') # this writes the plot into a html file and opens it
fig.show()

In [None]:
# Alternative data source: read the market price, sector and industry of a
# single symbol with yahooquery.
from yahooquery import Ticker

# Define the stock symbol you want to get information for
symbol = 'AAPL'

# Create a Ticker object for the symbol
tkr = Ticker(symbol)

# Get the regular market price for the symbol
regular_market_price = tkr.price[symbol]['regularMarketPrice']

# Look the asset profile up once and reuse it for both fields below
profile = tkr.asset_profile[symbol]

# Get the sector for the symbol
sector = profile['sector']

# Get the industry for the symbol
industry = profile['industry']

# Print the results
print(f'Symbol: {symbol}')
print(f'Regular Market Price: {regular_market_price}')
print(f'Sector: {sector}')
print(f'Industry: {industry}')
