Stock Profile Heatmap based on Sentiment and Returns

In [59]:
# libraries for webscraping, parsing and getting stock data
from urllib.request import urlopen
from urllib.request import Request
from bs4 import BeautifulSoup
import yfinance as yf
import pandas as pd

In [60]:
# Portfolio used throughout the project: ticker symbol -> number of shares held
tickers_dict = {
    'AMZN': 5,    # amazon
    'TSLA': 1,    # tesla
    'GOOG': 3,    # google
    'META': 3,    # meta
    'KO': 10,     # coke
    'PEP': 5,     # pepsi
    'BA': 5,      # boeing
    'XOM': 5,     # exxon mobil
    'CVX': 4,     # chevron
    'UNH': 1,     # united health
    'JNJ': 3,     # johnson&johnson
    'JPM': 3,     # jp morgan
    'BAC': 5,     # bank of america
    'C': 5,       # citigroup
    'SPG': 10,    # simon property group
    'AAPL': 6,    # apple
    'MSFT': 5,    # microsoft
    'WMT': 6,     # walmart
    'LMT': 2,     # lockheed martin
    'PFE': 10,    # pfizer
    'MMM': 3,     # 3M
    'CRWD': 3,    # crowdstrike
    'WBD': 20,    # warner bros
    'DIS': 8,     # disney
    'AIG': 5,     # american international group
    'BRK-B': 4,   # berkshire hathaway
    'DDOG': 3,    # datadog
    'SLB': 16,    # schlumberger
    'SONY': 5,    # sony
    'PLD': 5,     # prologis
    'INT': 16,    # world fuel services
    'AMD': 5,     # advanced micro devices
    'ISRG': 3,    # intuitive surgical
    'INTC': 5,    # intel
}

# Live views over the mapping; both iterate in the insertion order above
tickers = tickers_dict.keys()
n_shares = tickers_dict.values()

In [61]:
# This part is copied from the crawler notebook I made.

# Scrape data: download the Finviz quote page of every ticker and keep the
# element with id 'news-table' from each page, keyed by ticker symbol.
url_1 = "https://finviz.com/quote.ashx?t="
# Browser-like User-Agent sent with every request
# (presumably the site refuses urllib's default one -- TODO confirm).
request_headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15'}
news_tables = {}

for ticker in tickers:
    url_2 = url_1 + ticker
    req = Request(url=url_2, headers=request_headers)
    # The context manager closes the HTTP response once the page is parsed,
    # so connections are not leaked across the loop.
    with urlopen(req) as response:
        # Name the parser explicitly: without it BeautifulSoup picks whichever
        # parser happens to be installed (and warns), so results could differ
        # between machines.
        html = BeautifulSoup(response, 'html.parser')
    # find and load news_table
    news_table = html.find(id='news-table')
    if news_table is None:
        # Page has no news table (e.g. unknown symbol or changed layout);
        # skip it so the parsing step never receives None.
        print(f"No news table found for {ticker}; skipping")
        continue
    # add table to dictionary
    news_tables[ticker] = news_table

# Flatten every scraped news table into rows of [ticker, date, time, headline]
parsed_news = []
for file_name, news_table in news_tables.items():
    if news_table is None:
        # Nothing was scraped for this ticker
        continue
    ticker = file_name.split('_')[0]
    # Reset per ticker so a date can never carry over from the previous table
    date = None
    # iterate through all tr tags
    for x in news_table.find_all('tr'):
        # Rows without a link or without a cell carry no headline; skip them
        # instead of failing with AttributeError on None.
        if x.a is None or x.td is None:
            continue
        text = x.a.get_text()  # headline text of the first <a> under this row
        date_scrape = x.td.text.split()
        if not date_scrape:
            # Empty timestamp cell: nothing to record for this row
            continue
        if len(date_scrape) == 1:
            # Only a time is given: the row belongs to the most recent date seen
            time = date_scrape[0]
        else:
            date = date_scrape[0]
            time = date_scrape[1]
        parsed_news.append([ticker, date, time, text])


parsed_news[:5]

[['AMZN',
  'Jan-22-23',
  '11:51AM',
  "1 Tech Stock You'll Be Glad You Bought When the Bull Market Starts"],
 ['AMZN',
  'Jan-22-23',
  '10:30AM',
  'Target, Amazon and 4 More Retailers That Will Reward You for Turning in Your Old Stuff'],
 ['AMZN',
  'Jan-22-23',
  '08:37AM',
  'Got $2,500? 2 Top Stocks That You Can Buy and Hold for a Lifetime'],
 ['AMZN',
  'Jan-22-23',
  '08:00AM',
  'Tech Layoffs Unwind Recent Head-Count Growth, Torpedo Long-Shot Projects'],
 ['AMZN',
  'Jan-22-23',
  '07:25AM',
  "My Top ETF to Buy for 2023 (and It's Not Even Close)"]]

In [62]:
# NLTK VADER for sentiment analysis
import nltk
# Make sure the 'vader_lexicon' package is available locally before the analyzer is used
nltk.downloader.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/rajarshibhattacharjee/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [63]:
# Scorer used for every headline
vader = SentimentIntensityAnalyzer()

# One row per scraped headline, in the order the rows were parsed
columns = ['ticker', 'date', 'time', 'headline']
news_df = pd.DataFrame(parsed_news, columns=columns)

# Score each headline; every result is a dict of polarity values
scores = [vader.polarity_scores(headline) for headline in news_df['headline']]
# One column per polarity key, row-aligned with news_df
scores_df = pd.DataFrame(scores)

# Attach the score columns by index, then replace the date strings with date objects
news_df = news_df.join(scores_df, rsuffix='_right').assign(
    date=lambda frame: pd.to_datetime(frame['date']).dt.date
)
news_df.tail()

Unnamed: 0,ticker,date,time,headline,neg,neu,pos,compound
3395,INTC,2023-01-08,07:30AM,Should You Buy the 3 Highest-Paying Dividend S...,0.0,1.0,0.0,0.0
3396,INTC,2023-01-08,05:52AM,"The Dogs of the Dow Worked Last Year, But This...",0.0,1.0,0.0,0.0
3397,INTC,2023-01-06,07:15PM,Analyst Report: Intel Corporation,0.0,1.0,0.0,0.0
3398,INTC,2023-01-06,08:58AM,The Smartest Dividend Stocks to Buy With $400 ...,0.0,0.692,0.308,0.6124
3399,INTC,2023-01-06,08:35AM,AMD Challenges Intel's Gaming Dominance With N...,0.0,0.721,0.279,0.2732


In [64]:
# Group by each ticker and get the mean of all sentiment scores.
# numeric_only=True restricts the mean to the numeric score columns; the
# date/time/headline columns cannot be averaged, and relying on pandas to drop
# them silently is deprecated (it becomes an error once the default changes).
mean_scores = news_df.groupby(['ticker']).mean(numeric_only=True)
mean_scores.head()


The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



Unnamed: 0_level_0,neg,neu,pos,compound
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AAPL,0.06571,0.84033,0.09398,0.062473
AIG,0.04078,0.88955,0.06966,0.051789
AMD,0.03832,0.80791,0.15377,0.184061
AMZN,0.03676,0.80122,0.16203,0.174336
BA,0.05809,0.84908,0.09283,0.069331


In [65]:
# Get Current Price, Sector and Industry of each Ticker.
# As an example, this is the information that the API returns for Google.
# Only the 'regularMarketPrice', 'sector' and 'industry' entries of this dict
# are read later in the notebook.
tickerdata = yf.Ticker('GOOG')
tickerdata.info

{'zip': '94043',
 'sector': 'Communication Services',
 'fullTimeEmployees': 186779,
 'longBusinessSummary': 'Alphabet Inc. provides various products and platforms in the United States, Europe, the Middle East, Africa, the Asia-Pacific, Canada, and Latin America. It operates through Google Services, Google Cloud, and Other Bets segments. The Google Services segment offers products and services, including ads, Android, Chrome, hardware, Gmail, Google Drive, Google Maps, Google Photos, Google Play, Search, and YouTube. It is also involved in the sale of apps and in-app purchases and digital content in the Google Play store; and Fitbit wearable devices, Google Nest home products, Pixel phones, and other devices, as well as in the provision of YouTube non-advertising services. The Google Cloud segment offers infrastructure, platform, and other services; Google Workspace that include cloud-based collaboration tools for enterprises, such as Gmail, Docs, Drive, Calendar, and Meet; and other se

In [66]:
# Now, we do it for all the companies and extract the necessary information:
# current price, sector and industry of every ticker in the portfolio.

sectors = []
industries = []
prices = []
for ticker in tickers:
    print(ticker)
    tickerdata = yf.Ticker(ticker)
    # Read the info dict once per ticker instead of once per field
    info = tickerdata.info or {}
    # .get() records a missing field as None (NaN in the frame) so one
    # incomplete ticker does not abort the whole loop with a KeyError;
    # such rows can then be inspected in df_info.
    prices.append(info.get('regularMarketPrice'))
    sectors.append(info.get('sector'))
    industries.append(info.get('industry'))

# Materialise the dict views so pandas receives plain sequences for the
# share counts and the index.
d = {'Sector': sectors, 'Industry': industries, 'Price': prices, 'No. of Shares': list(n_shares)}
df_info = pd.DataFrame(data=d, index=list(tickers))
df_info

AMZN
TSLA
GOOG
META
KO
PEP
BA
XOM
CVX
UNH
JNJ
JPM
BAC
C
SPG
AAPL
MSFT
WMT
LMT
PFE
MMM
CRWD
WBD
DIS
AIG
BRK-B
DDOG
SLB
SONY
PLD
INT
AMD
ISRG
INTC


Unnamed: 0,Sector,Industry,Price,No. of Shares
AMZN,Consumer Cyclical,Internet Retail,97.25,5
TSLA,Consumer Cyclical,Auto Manufacturers,133.42,1
GOOG,Communication Services,Internet Content & Information,99.28,3
META,Communication Services,Internet Content & Information,139.37,3
KO,Consumer Defensive,Beverages—Non-Alcoholic,60.08,10
PEP,Consumer Defensive,Beverages—Non-Alcoholic,169.88,5
BA,Industrials,Aerospace & Defense,206.76,5
XOM,Energy,Oil & Gas Integrated,113.35,5
CVX,Energy,Oil & Gas Integrated,180.9,4
UNH,Healthcare,Healthcare Plans,486.72,1


In [67]:
# we  create a column of the total value of each share
df_info['Total Stock Value in Portfolio'] = df_info['Price']*df_info['No. of Shares']
df_info.head()

Unnamed: 0,Sector,Industry,Price,No. of Shares,Total Stock Value in Portfolio
AMZN,Consumer Cyclical,Internet Retail,97.25,5,486.25
TSLA,Consumer Cyclical,Auto Manufacturers,133.42,1,133.42
GOOG,Communication Services,Internet Content & Information,99.28,3,297.84
META,Communication Services,Internet Content & Information,139.37,3,418.11
KO,Consumer Defensive,Beverages—Non-Alcoholic,60.08,10,600.8


In [68]:
# Combine the per-ticker sentiment means with the company information,
# give the score columns readable names and turn the ticker index into a column.
df = (
    mean_scores
    .join(df_info)
    .rename(columns={
        "compound": "Sentiment Score",
        "neg": "Negative",
        "neu": "Neutral",
        "pos": "Positive",
    })
    .reset_index()
)
df.head()

Unnamed: 0,ticker,Negative,Neutral,Positive,Sentiment Score,Sector,Industry,Price,No. of Shares,Total Stock Value in Portfolio
0,AAPL,0.06571,0.84033,0.09398,0.062473,Technology,Consumer Electronics,137.87,6,827.22
1,AIG,0.04078,0.88955,0.06966,0.051789,Financial Services,Insurance—Diversified,62.47,5,312.35
2,AMD,0.03832,0.80791,0.15377,0.184061,Technology,Semiconductors,70.07,5,350.35
3,AMZN,0.03676,0.80122,0.16203,0.174336,Consumer Cyclical,Internet Retail,97.25,5,486.25
4,BA,0.05809,0.84908,0.09283,0.069331,Industrials,Aerospace & Defense,206.76,5,1033.8


In [69]:
#Now we generate the required treemap
import matplotlib.pyplot as plt
%matplotlib inline
import plotly
import plotly.express as px

In [70]:
# Treemap of the portfolio: tile size = value of the holding, tile colour = mean
# sentiment score (red negative, black neutral, green positive).

# Round the hover-only columns to 3 decimal places before plotting, so Plotly
# Express itself attaches them to the right tiles.
hover_columns = ['Price', 'Negative', 'Neutral', 'Positive']
treemap_df = df.round({column: 3 for column in hover_columns})

fig = px.treemap(
    treemap_df,
    path=[px.Constant("Sectors"), 'Sector', 'Industry', 'ticker'],
    values='Total Stock Value in Portfolio',
    color='Sentiment Score',
    # custom_data columns come first in the trace's customdata, so the score is
    # customdata[0] for every node (ticker, industry, sector and root alike).
    # Overwriting fig.data[0].customdata with raw dataframe rows instead would
    # only line up if the frame's row order matched Plotly's internal node
    # order, and would leave the parent nodes without any data.
    custom_data=['Sentiment Score'],
    hover_data={**dict.fromkeys(hover_columns, True), 'Sentiment Score': ':.3f'},
    color_continuous_scale=['#FF0000', "#000000", '#39FF14'],
    color_continuous_midpoint=0,
)

# Tile label: node name plus its sentiment score formatted to 3 decimal places
fig.update_traces(
    texttemplate="%{label}<br>%{customdata[0]:.3f}",
    textposition="middle center",
)
fig.update_layout(margin=dict(t=30, l=10, r=10, b=10), font_size=20)

plotly.offline.plot(fig, filename='stock_sentiment.html')  # this writes the plot into a html file and opens it
fig.show()