In [1]:
import re
import time
from datetime import date
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# HTTP headers passed to every requests.get call in this notebook; the User-Agent
# string identifies the client as desktop Safari on macOS.
headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9'}

<h1>Part 1 - S&P and portfolio web scraper</h1>
<h2>Introduction</h2>
<p>The Standard and Poor's 500, or simply the S&P 500, is a stock market index tracking the stock performance of 500 large companies listed on stock exchanges in the United States. Consisting of 11 different sectors and over 500 different companies, the index can be used as a benchmark for basic stock portfolio diversification in the US stock market. It is one of the most commonly followed equity indices.</p>
    
<p>This script scrapes data from the Yahoo Finance statistics page.</p>
<img src="sample.JPG">
<p>Two sets of data are in focus:</p>
<ol>
    <li>The S&P index constituents.</li>
    <li>One's portfolio</li>
    </ol>

## Item 1 - Function Definitions
<h3>Getting updated S&P tickers from slickcharts website</h3>

In [2]:
def scrap_SnP_tickers(url='https://www.slickcharts.com/sp500', tableclass="table-responsive"):
    '''
    Function to scrape the latest S&P data from a website containing S&P data.

    Input:
        url = website url
        tableclass = CSS class of the element that contains the data table

    Returns:
        sp_df = DataFrame built from the table, without the '#' column
        tickers = list of the 'Symbol' column values with '.' replaced by '-'

    Raises:
        requests.HTTPError = the page request returned an error status
        ValueError = no element with `tableclass`, or no thead/tbody inside it

    Uses the module-level `headers` dict for the request and IPython's
    `display` to show the first rows.
    '''
    resp = requests.get(url, headers=headers)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    table = soup.find(class_=tableclass)
    if table is None:
        raise ValueError(f'No element with class "{tableclass}" found at {url}')

    table_head = table.find('thead')
    table_body = table.find('tbody')
    if table_head is None or table_body is None:
        raise ValueError(f'Table with class "{tableclass}" at {url} has no thead/tbody')

    header_list = [th.text.strip() for th in table_head.find_all('th')]

    sp_data = []
    for row in table_body.find_all('tr'):
        cells = [td.text.strip() for td in row.find_all('td')]
        cells = [cell for cell in cells if cell]  # Get rid of empty values
        if cells:  # rows without any data cell would make the table ragged
            sp_data.append(cells)

    sp_df = pd.DataFrame(sp_data, columns=header_list)
    sp_df = sp_df.drop('#', axis=1)
    # Tickers need to have - instead of . for proper search on yahoo.
    # Assign the result back: an inplace replace on the selected column is not
    # guaranteed to modify the frame itself.
    sp_df['Symbol'] = sp_df['Symbol'].replace(regex={r'[\.]': '-'})
    tickers = sp_df['Symbol'].tolist()
    print(f'Number of S&P constituent tickers = {len(tickers)}')
    display(sp_df.head())
    return sp_df, tickers


<h3>Retrieving headers of yahoo finance stats page (Optional)</h3>

In [3]:
def get_yf_headers(ticker_sample): #ticker_sample is any ticker(str) for the purpose of retrieving the headers
    '''
    Retrieve the row labels of the Yahoo Finance key-statistics page.

    Input:
        ticker_sample = any ticker (str); only used to load a page to read labels from

    Returns:
        list of labels, starting with 'Ticker', followed by the text of the
        first cell of every table row that has at least one <td>.
        A single trailing digit is removed from every label, and the
        parenthesised text is removed from the labels at positions 24-28.

    Uses the module-level `headers` dict for the request.
    '''
    #Getting headers
    url = f'https://finance.yahoo.com/quote/{ticker_sample}/key-statistics?p={ticker_sample}'
    resp = requests.get(url, headers = headers)
    print(f'Using {ticker_sample} for headers, status - {resp.status_code}')
    soup = BeautifulSoup(resp.text, "html.parser")

    titles = ['Ticker']
    for row in soup.find_all("tr"):
        cols = row.find_all("td")
        if not cols:  # rows without <td> cells carry no label
            continue
        titles.append(cols[0].text.strip())

    # Removes the annotations appearing at the end of rows
    metrics = [re.sub(r'[0-9]$', '', title) for title in titles]
    # Remove the dates under rows 24-28; written back by index so the change
    # really lands in the returned list (bounded in case fewer rows were found).
    for position in range(24, min(29, len(metrics))):
        metrics[position] = re.sub(r'(\(.+\))', '', metrics[position])

    print(f'Extraction of headers complete! Total metric columns = {len(metrics)}')
    return metrics

<h3>Renaming the headers manually to indicate units clearly</h3>

In [4]:
# Column names for the scraped table, one per value collected per ticker
# (ticker title first). scrap_ticker rejects a page whose number of values
# differs from len(metrics), and clean_df assigns this list as the column
# header, so order and length must match the scraped rows.
# NOTE(review): clean_df's _num_reformat scales every T/B/M/k-suffixed value the
# same way (B unchanged, M * 0.001, ...), i.e. to billions, in all columns -
# confirm the "(M)" labels below reflect the units actually stored.
metrics = ['Ticker',
 'Market Cap (B)',
 'Enterprise Value (B)',
 'Trailing P/E',
 'Forward P/E',
 'PEG Ratio (5 yr expected)',
 'Price/Sales (ttm)',
 'Price/Book (mrq)',
 'Enterprise Value/Revenue',
 'Enterprise Value/EBITDA',
 'Beta (5Y Monthly)',
 '52 Week Change (%)',
 'S&P500 52-Week Change (%)',
 '52 Week High',
 '52 Week Low',
 '50-Day Moving Average',
 '200-Day Moving Average',
 'Avg Vol 3 month (M)',
 'Avg Vol 10 day (M)',
 'Shares Outstanding',
 'Implied Shares Outstanding',
 'Float',
 '% Held by Insiders',
 '% Held by Institutions',
 'Shares Short (M)',
 'Short Ratio (M)',
 'Short % of Float',
 'Short % of Shares Outstanding',
 'Shares Short',
 'Forward Annual Dividend Rate',
 'Forward Annual Dividend Yield (%)',
 'Trailing Annual Dividend Rate',
 'Trailing Annual Dividend Yield (%)',
 '5 Year Average Dividend Yield',
 'Payout Ratio (%)',
 'Dividend Date',
 'Ex-Dividend Date',
 'Last Split Factor (x:1)',
 'Last Split Date',
 'Fiscal Year Ends',
 'Most Recent Quarter (mrq)',
 'Profit Margin (%)',
 'Operating Margin (ttm) (%)',
 'Return on Assets (ttm) (%)',
 'Return on Equity (ttm) (%)',
 'Revenue (ttm) (B)',
 'Revenue Per Share (ttm)',
 'Quarterly Revenue Growth (yoy) (%)',
 'Gross Profit (ttm) (B)',
 'EBITDA (B)',
 'Net Income Avi to Common (ttm) (B)',
 'Diluted EPS (ttm)',
 'Quarterly Earnings Growth (yoy) (%)',
 'Total Cash (mrq) (B)',
 'Total Cash Per Share (mrq)',
 'Total Debt (mrq) (B)',
 'Total Debt/Equity (mrq)',
 'Current Ratio (mrq)',
 'Book Value Per Share (mrq)',
 'Operating Cash Flow (ttm) (B)',
 'Levered Free Cash Flow (ttm) (B)']

#for visually checking if the list referencing is correct
#temp = get_yf_headers('AAPL')
#test = pd.DataFrame({'Old':temp, 'new':metrics})
#test.iloc[0:60,:]

### Scraping and casting ticker stats into 2D list

In [5]:
def scrap_ticker(batch, start=0, stop=None, sleeptime=5, batch_interval=50):
    '''
    Function that takes in a list of tickers and scrapes the yahoo stats into a dictionary.

    For every ticker in batch[start:stop] the Yahoo Finance key-statistics page
    is requested and the second cell of each table row is collected, preceded by
    the page's <h1> text. A row of values is accepted only if it has
    len(metrics) entries and its first statistic is not 'N/A'; otherwise the
    ticker is reported in `missed_tickers`.

    Accepted rows are stored as DataFrames in the module-level dictionary
    `all_data` (which must exist before the call) under consecutive integer
    batch numbers, `batch_interval` rows per entry. The module-level `metrics`
    list and `headers` dict are read as well.

    Input:
        batch : list of tickers
        start : start point of list, default to 0
        stop : end point of list, default to None, will initialize in function to length of batch
        sleeptime : time (seconds) to sleep between scrapes
        batch_interval : number of successful scrapes before a longer rest

    Returns:
        all_data : the module-level dictionary {batch number: DataFrame}
        missed_tickers : list of tickers that failed validation or could not be requested
    '''

    start_time = datetime.now()

    # Continue numbering after the batches that are already stored
    if all_data:
        batch_number = max(all_data.keys())+1
        print(f'Batch number: {batch_number}')
    else:
        batch_number = 1

    batch_n_count = 0
    batch_data = []
    current_try = 1   # grows with every failure to lengthen the back-off sleep
    missed_tickers = []
    # A falsy stop means "to the end"; a stop past the end is clamped so that
    # batch[n] cannot go out of range.
    stop = len(batch) if not stop else min(stop, len(batch))

    for n in range(start, stop):
        ticker = batch[n]
        url = f'https://finance.yahoo.com/quote/{ticker}/key-statistics?p={ticker}'
        try:
            resp = requests.get(url, headers = headers)
        except requests.RequestException as err:
            # Treat a failed request like an invalid page instead of aborting
            # the whole run and losing the rows collected so far.
            print(f'{ticker} request failed: {err}')
            data = []
        else:
            print(f'{ticker} status - {resp.status_code}, {n+1}/{len(batch)}')
            soup = BeautifulSoup(resp.text, "html.parser")

            title = soup.find("h1")
            # Second cell of each row holds the value; rows with fewer than two
            # <td> cells (e.g. header rows) carry none.
            values = []
            for row in soup.find_all("tr"):
                cols = row.find_all("td")
                if len(cols) > 1:
                    values.append(cols[1].text.strip())
            # Without an <h1> the page is not a usable stats page: leave data
            # empty so the length check below rejects it.
            data = [title.text] + values if title is not None else []

        if len(data)!=len(metrics) or data[1] == 'N/A':
            if len(data)!=len(metrics):
                print(f'length error({len(data)} instead of {len(metrics)}) in {ticker}')
            elif data[1] == 'N/A':
                print(f'N/A found in Marketcap of {ticker}')
            missed_tickers.append(ticker)
            print(f'Sleeping for {sleeptime*6*current_try}s')
            time.sleep(sleeptime*6*current_try)
            current_try +=1
        else:
            batch_data.append(data)
            print(f'{ticker} complete!')
            batch_n_count +=1
            time.sleep(sleeptime)
        if batch_n_count == batch_interval:
            all_data[batch_number] = pd.DataFrame(np.array(batch_data))
            print(f'\nLength of info extracted is {len(batch_data)} in batch {batch_number} \n')
            batch_n_count = 0
            batch_number +=1
            batch_data = []
            print(f'Sleeping for {sleeptime*4}s')
            time.sleep(sleeptime*4)

    # The final appending for the last, partially filled batch. Nothing is
    # stored when no rows are pending, so all_data never gains empty frames.
    if batch_data:
        all_data[batch_number] = pd.DataFrame(np.array(batch_data))
    print(f'Length of info extracted is {len(batch_data)} in batch {batch_number}')
    end_time = datetime.now()
    print('Elapsed time was', (end_time - start_time))
    print()
    return all_data, missed_tickers

In [6]:
def rescrap_missed(missed_tickers):
    '''
    Extracts tickers that had errors/incomplete info.

    Calls scrap_ticker on the missed tickers repeatedly until none are left or
    the user answers 'n' to the prompt shown after an unsuccessful round.

    Input:
        missed_tickers : list of tickers to retry

    Returns:
        all_data : the dictionary returned by the last scrap_ticker call, or the
                   module-level `all_data` if there was nothing to retry
        missed_tickers : tickers that are still missing
    '''
    # Start from the module-level store so the return value is defined even
    # when the loop body never runs (empty missed_tickers).
    collected = all_data
    count = 0
    while missed_tickers:
        print('Extracting missed tickers:')
        collected, missed_tickers = scrap_ticker(list(missed_tickers), sleeptime=2)
        count +=1
        if missed_tickers:
            cont = input('There are still missed tickers, continue? Y/N')
            if cont.strip().lower() == 'n':
                break
    print('Ran', count ,'times')
    return collected, missed_tickers

<h3>Casting the dataframe and cleaning all the data</h3>

In [7]:
def clean_df(all_data, metrics, display_progress=False):
    '''
    Function to cast and clean the dataframe via the following:
    1. Cast dictionary(all_data) into a single pandas dataframe (fresh 0..n-1 index)
    2. Format strings into numbers (large number format, expressed in billions)
    3. Clean stocksplit ratios
    4. Recast dates into datetime.date objects, moved to the end of the table

    Inputs:
    all_data : scraped data, {batch number: DataFrame of strings} [dictionary]
    metrics : list of column headers; position 0 must be 'Ticker', positions
              35, 36, 38, 39, 40 the date columns and 'Last Split Factor (x:1)'
              the split ratio column [list]
    display_progress: boolean to show processes (uses IPython's display)

    Returns:
    DataFrame with 'Ticker' first, the numeric columns as float64, then the
    date columns.

    Raises:
    ValueError : all_data holds no non-empty frame, the number of columns does
                 not match metrics, or a value cannot be parsed as number/date
    '''
    # Positions of the columns that are not plain numbers
    split_col = 37
    date_cols = [35, 36, 38, 39, 40]

    def _elementwise(frame, func):
        '''Apply func to every cell; DataFrame.map supersedes the deprecated applymap.'''
        if hasattr(pd.DataFrame, 'map'):
            return frame.map(func)
        return frame.applymap(func)

    #Casting the dataframe
    # Empty batches carry no rows; ignore_index avoids duplicate row labels
    # (every batch frame starts counting at 0 again).
    frames = [frame for frame in all_data.values() if not frame.empty]
    all_data_df = pd.concat(frames, ignore_index=True)
    all_data_df.columns = metrics
    all_data_tickers = all_data_df['Ticker']
    if display_progress:
        print('Casting data, index and columns to build the dataframe')
        display(all_data_df.head())

    def _num_reformat(x):
        '''
        Reformats large sums (Trillion, billion, million, thousand) into billions
        and removes (%,) characters. Suffixed values become floats, "N/A" and
        empty strings become 0, anything else is returned as a string.
        '''
        x = re.sub("[,]", "", x)
        if not x or x == "N/A":
            return 0
        suffix, body = x[-1], x[:-1]
        if suffix == 'T':
            return round(float(body)*1000,2)
        if suffix == 'B':
            return round(float(body),2)
        if suffix == 'M':
            return round(float(body)*0.001,2)
        if suffix == 'k':
            return round(float(body)*0.000001,2)
        if suffix == '%':
            return round(float(body),0)
        return x

    cleaned_df = _elementwise(all_data_df.iloc[:,1:], _num_reformat)
    cleaned_df.insert(0, 'Ticker', all_data_tickers)
    if display_progress:
        print('Reformatting large sums (Billion, million, thousand) and removing (%,) values')
        display(cleaned_df.head())

    def _stocksplits(x):
        '''
        Changes split factors "a:b" into the ratio a/b; non-strings pass through.
        '''
        if isinstance(x, str):
            numerator, denominator = x.split(':')[:2]
            # float() also accepts fractional parts such as "1.5:1"
            return round(float(numerator)/ float(denominator),2)
        return x

    cleaned_df['Last Split Factor (x:1)'] = cleaned_df['Last Split Factor (x:1)'].apply(_stocksplits)
    if display_progress:
        print('Handling the stock split factor column')
        display(cleaned_df.iloc[:,[0,split_col]].head())

    def _date_conversion(x):
        '''
        Converts date strings ("Mon DD YYYY", comma already removed) into
        datetime.date; 0 (former "N/A") becomes NaN.
        '''
        if x == 0:
            return np.nan
        return datetime.strptime(str(x), '%b %d %Y').date()

    dates = _elementwise(cleaned_df.iloc[:,date_cols], _date_conversion)

    non_numeric_labels = cleaned_df.columns[[0] + date_cols]
    final_df = cleaned_df.drop(non_numeric_labels, axis = 1).astype('float64')
    final_df = pd.concat([final_df, dates], axis = 1)
    final_df.insert(0, 'Ticker', all_data_tickers)
    if display_progress:
        print('Converting dates to datetime format at the end of the table')
        display(final_df.head())

    #return the completed pandas dataframe
    return final_df

## Option 1 - S&P data

In [8]:
#sp_df, tickers = scrap_SnP_tickers()

## Option 2 - Personal portfolio data

In [10]:
# Read the portfolio file and take its 'Tickers' column as the list to scrape
input_tickers = pd.read_csv('portfolio_tickers.csv') #input file with a list of portfolio tickers
tickers = input_tickers['Tickers'].tolist()

### Run code

In [11]:
#Test run
# NOTE: this replaces any `tickers` list defined by an earlier cell with a small sample
tickers = ['wing','mcd','dpz','sbux','yum']

# scrap_ticker reads and fills the module-level `all_data`, so it has to exist
# (and be empty for a fresh run) before the call
all_data={}
all_data, missed_tickers = scrap_ticker(tickers)
# Retry the tickers whose page failed validation
if missed_tickers:
    all_data, missed_tickers = rescrap_missed(missed_tickers)
# Combine all stored batches into one cleaned table and show it
final_df = clean_df(all_data, metrics)
final_df

wing status - 200, 1/5
wing complete!
mcd status - 200, 2/5
mcd complete!
dpz status - 200, 3/5
dpz complete!
sbux status - 200, 4/5
N/A found in Marketcap of sbux
Sleeping for 30s
yum status - 200, 5/5
yum complete!
Length of info extracted is 4 in batch 1
Elapsed time was 0:00:55.757085

Extracting missed tickers:
Batch number: 2
sbux status - 200, 1/1
N/A found in Marketcap of sbux
Sleeping for 12s
Length of info extracted is 0 in batch 2
Elapsed time was 0:00:13.011642

Ran 1 times


Unnamed: 0,Ticker,Market Cap (B),Enterprise Value (B),Trailing P/E,Forward P/E,PEG Ratio (5 yr expected),Price/Sales (ttm),Price/Book (mrq),Enterprise Value/Revenue,Enterprise Value/EBITDA,...,Total Debt/Equity (mrq),Current Ratio (mrq),Book Value Per Share (mrq),Operating Cash Flow (ttm) (B),Levered Free Cash Flow (ttm) (B),Dividend Date,Ex-Dividend Date,Last Split Date,Fiscal Year Ends,Most Recent Quarter (mrq)
0,Wingstop Inc. (WING),4.43,4.97,105.08,76.92,6.43,13.67,0.0,15.32,57.59,...,0.0,3.68,-13.59,0.05,0.01,2022-12-01,2022-11-08,,2021-12-24,2022-09-23
1,McDonald's Corporation (MCD),190.12,235.84,31.83,24.51,3.4,8.2,0.0,10.0,21.82,...,0.0,1.65,-8.97,7.85,4.13,2022-12-14,2022-11-29,1999-03-07,2021-12-30,2022-09-29
2,"Domino's Pizza, Inc. (DPZ)",12.01,17.28,27.48,23.98,2.35,2.74,0.0,3.85,20.19,...,0.0,1.43,-121.94,0.5,0.36,2022-12-29,2022-12-13,,2022-01-01,2022-09-10
3,"Yum! Brands, Inc. (YUM)",36.2,47.38,29.27,25.13,2.58,5.59,0.0,7.06,20.5,...,0.0,1.28,-30.08,1.39,0.78,2022-12-08,2022-11-22,2016-10-31,2021-12-30,2022-09-29


In [12]:
# Save the cleaned table as data/<filename>_<today's ISO date>.csv
filename = input("Enter filename to save as: ")
if filename == '':
    filename = 'portfolio'
current_datetime = date.today().isoformat()
output_path = Path('data') / f'{filename}_{current_datetime}.csv'
# to_csv does not create missing folders, so make sure data/ exists first
output_path.parent.mkdir(parents=True, exist_ok=True)
final_df.to_csv(output_path)
# Report the path that was actually written, including the folder
print(f'Saved to file: {output_path}')

Saved to file: testrun_2023-01-29.csv
