<h1>Yahoo Finance Ticker Statistics Page Scraper</h1>
<h3>This script takes a list of tickers and scraps its info from the yahoo finance statistics page.</h3>

<a href = "https://finance.yahoo.com/quote/TSLA/key-statistics?p=TSLA"><img src="sample.JPG"></a>

In [1]:
import pandas as pd
import numpy as np
from datetime import date
from bs4 import BeautifulSoup
import requests
import time
from tabulate import tabulate

<h2>Reading portfolio tickers from csv file</h2>

In [2]:
input_tickers = pd.read_csv('portfolio_tickers.csv')
tickers = input_tickers['Tickers'].tolist()
tickers

['TSLA',
 'TTD',
 'NVDA',
 'GOOG',
 'CRWD',
 'AMZN',
 'IDXX',
 'MELI',
 'ABNB',
 'ZS',
 'ASML',
 'SE',
 'AXON',
 'TEAM',
 'LULU',
 'SNOW',
 'NET',
 'HUBS',
 'DDOG',
 'DOCN',
 'U',
 'SEMR',
 'LMND',
 'ZM',
 'CDNS',
 'PINS',
 'MDB',
 'MRNA',
 'ROKU',
 'GLBE',
 'ASAN',
 'RBLX',
 'UPST',
 'TWLO',
 'CRSP',
 'APPN',
 'NNOX',
 'SHOP']

In [3]:
headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9'}
soups=[]

for ticker in tickers:
    url = f'https://finance.yahoo.com/quote/{ticker}/key-statistics?p={ticker}'
    resp = requests.get(url, headers = headers)
    print(ticker, resp.status_code)
    #time.sleep(1)
    soup = BeautifulSoup(resp.text, "html.parser")
    soups.append(soup)
print('Extraction complete!')

TSLA 200
TTD 200
NVDA 200
GOOG 200
CRWD 200
AMZN 200
IDXX 200
MELI 200
ABNB 200
ZS 200
ASML 200
SE 200
AXON 200
TEAM 200
LULU 200
SNOW 200
NET 200
HUBS 200
DDOG 200
DOCN 200
U 200
SEMR 200
LMND 200
ZM 200
CDNS 200
PINS 200
MDB 200
MRNA 200
ROKU 200
GLBE 200
ASAN 200
RBLX 200
UPST 200
TWLO 200
CRSP 200
APPN 200
NNOX 200
SHOP 200
Extraction complete!


<h2>Getting the headers and cleaning dates/annotations</h2>

In [4]:
tds = soup.find_all("td")

n= 0
titles= []

for td in tds:
    n+=1
    if n%2 == 0:
        continue
    else:
        text = td.text.strip()
        titles.append(text)
    
all_stats_df = pd.DataFrame({"Metrics":titles})
all_stats_df.head()

Unnamed: 0,Metrics
0,Market Cap (intraday)
1,Enterprise Value
2,Trailing P/E
3,Forward P/E
4,PEG Ratio (5 yr expected)


In [5]:
all_stats_df['Metrics'].replace(regex={r'[0-9]$': ''}, inplace = True) 
#removes the annotations appearing at the end of rows

display(all_stats_df.iloc[23:28])
all_stats_df.iloc[23:28,0].replace(regex={r'(\(.+\))': ''}, inplace = True) 
#remove the dates under rows 23-27

all_stats_df

Unnamed: 0,Metrics
23,"Shares Short (Sep 14, 2022)"
24,"Short Ratio (Sep 14, 2022)"
25,"Short % of Float (Sep 14, 2022)"
26,"Short % of Shares Outstanding (Sep 14, 2022)"
27,"Shares Short (prior month Aug 14, 2022)"


Unnamed: 0,Metrics
0,Market Cap (intraday)
1,Enterprise Value
2,Trailing P/E
3,Forward P/E
4,PEG Ratio (5 yr expected)
5,Price/Sales (ttm)
6,Price/Book (mrq)
7,Enterprise Value/Revenue
8,Enterprise Value/EBITDA
9,Beta (5Y Monthly)


<h2>Retrieving the stats for each ticker</h2>

In [6]:
for soup, ticker in zip(soups, tickers):
    tds = soup.find_all("td")
    n= 0
    stats= []

    for td in tds:
        n+=1
        if n%2 != 0:
            continue
        else:
            text = td.text.strip()
            stats.append(text)
    all_stats_df[f'{ticker}'] = stats
    
all_stats_df

Unnamed: 0,Metrics,TSLA,TTD,NVDA,GOOG,CRWD,AMZN,IDXX,MELI,ABNB,...,ROKU,GLBE,ASAN,RBLX,UPST,TWLO,CRSP,APPN,NNOX,SHOP
0,Market Cap (intraday),831.15B,29.69B,302.26B,1.25T,38.46B,1.15T,27.62B,41.67B,68.21B,...,7.78B,4.19B,4.69B,21.11B,1.71B,12.66B,5.03B,2.96B,598.90M,33.95B
1,Enterprise Value,818.90B,28.74B,296.92B,1.15T,36.92B,1.22T,29.01B,43.48B,60.69B,...,6.45B,3.92B,4.70B,19.48B,,9.53B,3.21B,2.88B,517.43M,28.19B
2,Trailing P/E,95.76,868.57,39.67,17.91,,101.76,42.92,175.75,53.06,...,,,,,22.11,,124.34,,,
3,Forward P/E,45.05,52.36,36.10,15.92,136.99,46.95,35.59,72.99,37.04,...,,,,270.27,12.80,588.24,,,,5.00k
4,PEG Ratio (5 yr expected),1.26,2.18,3.13,1.19,3.40,4.50,4.34,0.98,,...,,,,11.55,,44.96,,,,
5,Price/Sales (ttm),13.55,21.56,10.34,4.63,20.71,2.38,8.60,4.69,9.46,...,2.63,13.33,8.95,9.57,1.87,3.66,390.61,6.95,109.33,6.79
6,Price/Book (mrq),22.85,16.68,12.67,4.91,31.39,8.76,60.45,26.37,13.00,...,2.78,4.52,50.93,38.59,2.26,1.17,2.41,14.43,2.28,3.90
7,Enterprise Value/Revenue,12.19,20.69,9.98,4.15,20.13,2.50,8.77,4.92,8.22,...,2.12,12.84,10.05,8.83,,2.80,246.52,6.82,97.41,5.64
8,Enterprise Value/EBITDA,57.15,335.75,31.73,11.52,-950.08,23.97,30.15,48.52,41.09,...,133.12,-73.32,-13.32,-43.82,,-12.39,-5.27,-24.51,-7.37,-14.61
9,Beta (5Y Monthly),2.19,1.93,1.69,1.09,1.27,1.33,1.17,1.63,,...,1.68,,,,,1.57,1.96,1.60,1.72,1.82


<h2>Transposing and cleaning the data to numericals</h2>

In [7]:
col_list = all_stats_df['Metrics']
t_all_stats_df = all_stats_df.transpose()
t_all_stats_df.columns = col_list
t_all_stats_df.drop(index='Metrics', inplace= True)
t_all_stats_df.reset_index(inplace=True)
t_all_stats_df.rename({'index' : 'Ticker'}, axis = 1, inplace=True)

t_all_stats_df

Metrics,Ticker,Market Cap (intraday),Enterprise Value,Trailing P/E,Forward P/E,PEG Ratio (5 yr expected),Price/Sales (ttm),Price/Book (mrq),Enterprise Value/Revenue,Enterprise Value/EBITDA,...,Diluted EPS (ttm),Quarterly Earnings Growth (yoy),Total Cash (mrq),Total Cash Per Share (mrq),Total Debt (mrq),Total Debt/Equity (mrq),Current Ratio (mrq),Book Value Per Share (mrq),Operating Cash Flow (ttm),Levered Free Cash Flow (ttm)
0,TSLA,831.15B,818.90B,95.76,45.05,1.26,13.55,22.85,12.19,57.15,...,2.74,97.80%,18.92B,6.04,6.66B,17.7,1.43,11.65,14.08B,5.96B
1,TTD,29.69B,28.74B,868.57,52.36,2.18,21.56,16.68,20.69,335.75,...,0.07,,1.21B,2.48,267.63M,15.04,1.92,3.64,531.03M,492.31M
2,NVDA,302.26B,296.92B,39.67,36.10,3.13,10.34,12.67,9.98,31.73,...,3.73,-72.40%,17.04B,6.84,11.84B,49.63,3.62,9.58,7.55B,6.69B
3,GOOG,1.25T,1.15T,17.91,15.92,1.19,4.63,4.91,4.15,11.52,...,5.26,-13.60%,125B,9.58,28.81B,11.28,2.81,19.53,95B,51.07B
4,CRWD,38.46B,36.92B,,136.99,3.4,20.71,31.39,20.13,-950.08,...,-0.79,,2.32B,9.94,771.92M,62.08,1.8,5.25,743.64M,656.29M
5,AMZN,1.15T,1.22T,101.76,46.95,4.5,2.38,8.76,2.5,23.97,...,1.14,,60.71B,5.96,157.56B,119.91,0.95,12.9,35.57B,-13.44B
6,IDXX,27.62B,29.01B,42.92,35.59,4.34,8.6,60.45,8.77,30.15,...,7.99,-34.90%,114.36M,1.37,1.5B,327.52,0.89,5.48,577.72M,385.85M
7,MELI,41.67B,43.48B,175.75,72.99,0.98,4.69,26.37,4.92,48.52,...,4.7,80.90%,2.23B,44.26,4.97B,314.56,1.3,31.39,1.64B,-1.71B
8,ABNB,68.21B,60.69B,53.06,37.04,,9.46,13.0,8.22,41.09,...,1.91,,9.9B,15.47,2.38B,45.4,1.58,8.2,2.8B,2.3B
9,ZS,23.51B,22.83B,,140.85,3.03,21.23,41.01,20.92,-81.42,...,-2.67,,1.73B,12.1,1.05B,182.4,1.99,4.01,321.91M,480.98M


Reformatting large sums (Billion, million, thousand) and removing % values

In [8]:
import re

def cleaning(x):
    cleaned = x
    if re.match('(^\S+[T])', x):
        cleaned = re.sub("[^\-0-9]", "", x)
        cleaned = cleaned + "000000000000"
    if re.match('(^\S+[B])', x):
        cleaned = re.sub("[^\-0-9]", "", x)
        cleaned = cleaned + "000000000"
    if re.match('(^\S+[M])', x):
        cleaned = re.sub("[^\-0-9]", "", x)
        cleaned = cleaned + "000000"
    if re.match('(^\S+[k])', x):
        cleaned = re.sub("[^\-0-9]", "", x)
        cleaned = cleaned + "000"
    cleaned = re.sub("[%,]", "", cleaned)
    if x == "N/A":
        cleaned = 0
    return cleaned


cleaned_df = t_all_stats_df.applymap(cleaning)
cleaned_df

Metrics,Ticker,Market Cap (intraday),Enterprise Value,Trailing P/E,Forward P/E,PEG Ratio (5 yr expected),Price/Sales (ttm),Price/Book (mrq),Enterprise Value/Revenue,Enterprise Value/EBITDA,...,Diluted EPS (ttm),Quarterly Earnings Growth (yoy),Total Cash (mrq),Total Cash Per Share (mrq),Total Debt (mrq),Total Debt/Equity (mrq),Current Ratio (mrq),Book Value Per Share (mrq),Operating Cash Flow (ttm),Levered Free Cash Flow (ttm)
0,TSLA,83115000000000,81890000000000,95.76,45.05,1.26,13.55,22.85,12.19,57.15,...,2.74,97.8,1892000000000,6.04,666000000000,17.7,1.43,11.65,1408000000000,596000000000
1,000000000000,2969000000000,2874000000000,868.57,52.36,2.18,21.56,16.68,20.69,335.75,...,0.07,0.0,121000000000,2.48,26763000000,15.04,1.92,3.64,53103000000,49231000000
2,NVDA,30226000000000,29692000000000,39.67,36.1,3.13,10.34,12.67,9.98,31.73,...,3.73,-72.4,1704000000000,6.84,1184000000000,49.63,3.62,9.58,755000000000,669000000000
3,GOOG,125000000000000,115000000000000,17.91,15.92,1.19,4.63,4.91,4.15,11.52,...,5.26,-13.6,125000000000,9.58,2881000000000,11.28,2.81,19.53,95000000000,5107000000000
4,CRWD,3846000000000,3692000000000,0.0,136.99,3.4,20.71,31.39,20.13,-950.08,...,-0.79,0.0,232000000000,9.94,77192000000,62.08,1.8,5.25,74364000000,65629000000
5,000000,115000000000000,122000000000000,101.76,46.95,4.5,2.38,8.76,2.5,23.97,...,1.14,0.0,6071000000000,5.96,15756000000000,119.91,0.95,12.9,3557000000000,-1344000000000
6,IDXX,2762000000000,2901000000000,42.92,35.59,4.34,8.6,60.45,8.77,30.15,...,7.99,-34.9,11436000000,1.37,15000000000,327.52,0.89,5.48,57772000000,38585000000
7,MELI,4167000000000,4348000000000,175.75,72.99,0.98,4.69,26.37,4.92,48.52,...,4.7,80.9,223000000000,44.26,497000000000,314.56,1.3,31.39,164000000000,-171000000000
8,000000000,6821000000000,6069000000000,53.06,37.04,0.0,9.46,13.0,8.22,41.09,...,1.91,0.0,99000000000,15.47,238000000000,45.4,1.58,8.2,28000000000,23000000000
9,ZS,2351000000000,2283000000000,0.0,140.85,3.03,21.23,41.01,20.92,-81.42,...,-2.67,0.0,173000000000,12.1,105000000000,182.4,1.99,4.01,32191000000,48098000000


Removing unwanted columns and columns with non-numerical values

In [9]:
print(cleaned_df.columns[[0,35,36,37,38,39,40]])
cleaned_df = cleaned_df.drop(cleaned_df.columns[[0,35,36,37,38,39,40]],axis = 1)


Index(['Ticker', 'Dividend Date ', 'Ex-Dividend Date ', 'Last Split Factor ',
       'Last Split Date ', 'Fiscal Year Ends', 'Most Recent Quarter (mrq)'],
      dtype='object', name='Metrics')


In [10]:
#print(tabulate(numerical, numerical.columns))
final_df = cleaned_df.iloc[:,1:].astype('float64')
final_df.insert(0,'Ticker',tickers)
final_df

Metrics,Ticker,Enterprise Value,Trailing P/E,Forward P/E,PEG Ratio (5 yr expected),Price/Sales (ttm),Price/Book (mrq),Enterprise Value/Revenue,Enterprise Value/EBITDA,Beta (5Y Monthly),...,Diluted EPS (ttm),Quarterly Earnings Growth (yoy),Total Cash (mrq),Total Cash Per Share (mrq),Total Debt (mrq),Total Debt/Equity (mrq),Current Ratio (mrq),Book Value Per Share (mrq),Operating Cash Flow (ttm),Levered Free Cash Flow (ttm)
0,TSLA,81890000000000.0,95.76,45.05,1.26,13.55,22.85,12.19,57.15,2.19,...,2.74,97.8,1892000000000.0,6.04,666000000000.0,17.7,1.43,11.65,1408000000000.0,596000000000.0
1,TTD,2874000000000.0,868.57,52.36,2.18,21.56,16.68,20.69,335.75,1.93,...,0.07,0.0,121000000000.0,2.48,26763000000.0,15.04,1.92,3.64,53103000000.0,49231000000.0
2,NVDA,29692000000000.0,39.67,36.1,3.13,10.34,12.67,9.98,31.73,1.69,...,3.73,-72.4,1704000000000.0,6.84,1184000000000.0,49.63,3.62,9.58,755000000000.0,669000000000.0
3,GOOG,115000000000000.0,17.91,15.92,1.19,4.63,4.91,4.15,11.52,1.09,...,5.26,-13.6,125000000000.0,9.58,2881000000000.0,11.28,2.81,19.53,95000000000.0,5107000000000.0
4,CRWD,3692000000000.0,0.0,136.99,3.4,20.71,31.39,20.13,-950.08,1.27,...,-0.79,0.0,232000000000.0,9.94,77192000000.0,62.08,1.8,5.25,74364000000.0,65629000000.0
5,AMZN,122000000000000.0,101.76,46.95,4.5,2.38,8.76,2.5,23.97,1.33,...,1.14,0.0,6071000000000.0,5.96,15756000000000.0,119.91,0.95,12.9,3557000000000.0,-1344000000000.0
6,IDXX,2901000000000.0,42.92,35.59,4.34,8.6,60.45,8.77,30.15,1.17,...,7.99,-34.9,11436000000.0,1.37,15000000000.0,327.52,0.89,5.48,57772000000.0,38585000000.0
7,MELI,4348000000000.0,175.75,72.99,0.98,4.69,26.37,4.92,48.52,1.63,...,4.7,80.9,223000000000.0,44.26,497000000000.0,314.56,1.3,31.39,164000000000.0,-171000000000.0
8,ABNB,6069000000000.0,53.06,37.04,0.0,9.46,13.0,8.22,41.09,0.0,...,1.91,0.0,99000000000.0,15.47,238000000000.0,45.4,1.58,8.2,28000000000.0,23000000000.0
9,ZS,2283000000000.0,0.0,140.85,3.03,21.23,41.01,20.92,-81.42,1.02,...,-2.67,0.0,173000000000.0,12.1,105000000000.0,182.4,1.99,4.01,32191000000.0,48098000000.0


In [11]:
current_datetime = date.today().isoformat()
final_df.to_csv(f'data/portfolio_{current_datetime}.csv')
print(f'Saved to file: portfolio_{current_datetime}.csv')

Saved to file: portfolio_2022-10-04.csv
