In [1]:
import re
import time
from datetime import date
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import requests
import requests_cache
from bs4 import BeautifulSoup

#headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9'}
# Browser-like request headers passed to every requests.get call in this notebook.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    'Accept-Language': 'en-US,en;q=0.5',
    'DNT': '1',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1'
}

# Cache responses on disk so re-running cells does not hit the sites again, but
# let entries expire: without an expiry a run on a later day would silently
# reuse old pages while the saved file is stamped with today's date.
CACHE_EXPIRY_SECONDS = 6 * 60 * 60
requests_cache.install_cache('yahoo_cache', expire_after=CACHE_EXPIRY_SECONDS)

<h1>Part 1 - S&P and portfolio web scraper</h1>
<h2>Introduction</h2>
<p>The Standard and Poor's 500, or simply the S&P 500, is a stock market index tracking the stock performance of 500 large companies listed on stock exchanges in the United States. Consisting of 11 different sectors and over 500 different companies, the index can be used as a benchmark for basic stock portfolio diversification in the US stock market. It is one of the most commonly followed equity indices.</p>
    
<p>This script scrapes data from the Yahoo Finance statistics page.</p>
<img src="sample.JPG">
<p>Two sets of data are in focus</p>
<ol>
    <li>The S&P index constituents.</li>
    <li>One's portfolio</li>
    </ol>

## Item 1 - Function Definitions
<h3>Getting updated S&P tickers from slickcharts website</h3>

In [2]:
def scrap_SnP_tickers(url='https://www.slickcharts.com/sp500', tableclass="table-responsive"):
    '''
    Scrape the current S&P 500 constituents table from a web page.

    Input:
        url = website url
        tableclass = CSS class of the element containing the constituents table
    Returns:
        sp_df : DataFrame of the scraped table (all values are strings) without the '#' column
        tickers : list of the values in the 'Symbol' column, with '.' replaced by '-'
    Raises:
        requests.HTTPError if the page request fails
        ValueError if the expected table cannot be found or contains no usable rows
    '''
    resp = requests.get(url, headers=headers, timeout=30)
    resp.raise_for_status()  # fail early instead of parsing an error page
    soup = BeautifulSoup(resp.text, "html.parser")

    table = soup.find(class_=tableclass)
    if table is None:
        raise ValueError(f'No element with class "{tableclass}" found at {url}')
    table_head = table.find('thead')
    table_body = table.find('tbody')
    if table_head is None or table_body is None:
        raise ValueError(f'Table at {url} has no <thead>/<tbody> section')

    header_list = [th.text.strip() for th in table_head.find_all('th')]

    sp_data = []
    skipped_rows = 0
    for row in table_body.find_all('tr'):
        cells = [td.text.strip() for td in row.find_all('td')]
        cells = [cell for cell in cells if cell]  # Get rid of empty values
        # Only keep rows that line up with the header; a ragged row cannot be
        # placed into the table reliably.
        if len(cells) == len(header_list):
            sp_data.append(cells)
        else:
            skipped_rows += 1
    if skipped_rows:
        print(f'Skipped {skipped_rows} row(s) that did not match the {len(header_list)} header columns')
    if not sp_data:
        raise ValueError(f'No data rows could be read from the table at {url}')

    sp_df = pd.DataFrame(sp_data, columns=header_list)
    sp_df = sp_df.drop(columns='#')
    # Tickers need to have - instead of . for proper search on yahoo.
    # Assign the result back: an inplace replace on sp_df['Symbol'] acts on a
    # temporary object and is not guaranteed to update sp_df.
    sp_df['Symbol'] = sp_df['Symbol'].str.replace('.', '-', regex=False)
    tickers = sp_df['Symbol'].tolist()
    print(f'Number of S&P constituent tickers = {len(tickers)}')
    display(sp_df.head())
    return sp_df, tickers


<h3>Retrieving headers of yahoo finance stats page (Optional)</h3>

In [3]:
def get_yf_headers(ticker_sample): #ticker_sample is any ticker(str) for the purpose of retrieving the headers
    '''
    Read the row labels of the yahoo finance key-statistics page for one ticker.

    Input:
        ticker_sample : any ticker (str); only used to fetch a page to read labels from
    Returns:
        list of labels, starting with 'Ticker' followed by the first cell of each table row
    '''
    #Getting headers
    url = f'https://finance.yahoo.com/quote/{ticker_sample}/key-statistics?p={ticker_sample}'
    resp = requests.get(url, headers=headers, timeout=30)
    print(f'Using {ticker_sample} for headers, status - {resp.status_code}')
    soup = BeautifulSoup(resp.text, "html.parser")

    titles = ['Ticker']
    for row in soup.find_all("tr"):
        cols = row.find_all("td")
        if not cols:
            continue  # a row without <td> cells has no label to read
        titles.append(cols[0].text.strip())

    # Removes the annotations (single trailing digit) appearing at the end of rows
    metrics = [re.sub(r'[0-9]$', '', title) for title in titles]
    # Remove the dates in parentheses under rows 24-28. The previous inplace
    # replace on an iloc slice acted on a temporary object, so the result is
    # written back explicitly here.
    for position in range(24, min(29, len(metrics))):
        metrics[position] = re.sub(r'(\(.+\))', '', metrics[position]).strip()

    print(f'Extraction of headers complete! Total metric columns = {len(metrics)}')
    return metrics

<h3>Renaming the headers manually to indicate units clearly</h3>

In [4]:
# Column names for the scraped table, in scrape order: 'Name' comes first,
# followed by one entry per value that scrap_ticker reads from the page.
# scrap_ticker rejects a page whose number of values differs from len(metrics),
# and clean_df relies on positions: 35, 36, 38, 39 and 40 are parsed as dates and
# 37 is the split factor. Keep the order and length in sync with those functions.
metrics = ['Name',
 'Market Cap (B)',
 'Enterprise Value (B)',
 'Trailing P/E',
 'Forward P/E',
 'PEG Ratio (5 yr expected)',
 'Price/Sales (ttm)',
 'Price/Book (mrq)',
 'Enterprise Value/Revenue',
 'Enterprise Value/EBITDA',
 'Beta (5Y Monthly)',
 '52 Week Change (%)',
 'S&P500 52-Week Change (%)',
 '52 Week High',
 '52 Week Low',
 '50-Day Moving Average',
 '200-Day Moving Average',
 'Avg Vol 3 month (M)',
 'Avg Vol 10 day (M)',
 'Shares Outstanding',
 'Implied Shares Outstanding',
 'Float',
 '% Held by Insiders',
 '% Held by Institutions',
 'Shares Short (M)',
 'Short Ratio (M)',
 'Short % of Float',
 'Short % of Shares Outstanding',
 'Shares Short',
 'Forward Annual Dividend Rate',
 'Forward Annual Dividend Yield (%)',
 'Trailing Annual Dividend Rate',
 'Trailing Annual Dividend Yield (%)',
 '5 Year Average Dividend Yield',
 'Payout Ratio (%)',
 'Dividend Date',
 'Ex-Dividend Date',
 'Last Split Factor (x:1)',
 'Last Split Date',
 'Fiscal Year Ends',
 'Most Recent Quarter (mrq)',
 'Profit Margin (%)',
 'Operating Margin (ttm) (%)',
 'Return on Assets (ttm) (%)',
 'Return on Equity (ttm) (%)',
 'Revenue (ttm) (B)',
 'Revenue Per Share (ttm)',
 'Quarterly Revenue Growth (yoy) (%)',
 'Gross Profit (ttm) (B)',
 'EBITDA (B)',
 'Net Income Avi to Common (ttm) (B)',
 'Diluted EPS (ttm)',
 'Quarterly Earnings Growth (yoy) (%)',
 'Total Cash (mrq) (B)',
 'Total Cash Per Share (mrq)',
 'Total Debt (mrq) (B)',
 'Total Debt/Equity (mrq)',
 'Current Ratio (mrq)',
 'Book Value Per Share (mrq)',
 'Operating Cash Flow (ttm) (B)',
 'Levered Free Cash Flow (ttm) (B)']

#for visually checking if the list referencing is correct
#temp = get_yf_headers('AAPL')
#test = pd.DataFrame({'Old':temp, 'new':metrics})
#test.iloc[0:60,:]

### Scraping and casting ticker stats into 2D list

In [16]:
def scrap_ticker(tickers, data_dict, sleeptime=2, batch_interval=50):
    '''
    Scrape the yahoo finance key-statistics page of each ticker into data_dict.

    Input:
        tickers : list of tickers
        data_dict : dictionary to update with the scraped data, should be created
                    outside the function. Keys are integer batch numbers and
                    values are DataFrames holding one row per scraped ticker.
        sleeptime : seconds to sleep between scrapes
        batch_interval : number of successful scrapes stored per batch, followed
                         by a longer rest
    Returns:
        list of tickers that could not be scraped (request/parsing error, wrong
        number of values, or 'N/A' market cap)

    A failure on one ticker no longer stops the run: that ticker is added to the
    returned list and the loop continues. If the run is interrupted (for example
    with KeyboardInterrupt) the rows collected so far are still stored in
    data_dict before the interruption propagates to the caller.
    '''
    start_time = datetime.now()

    #check if there is any existing data in data_dict, if yes, assigns this run to the next batch number
    if data_dict:
        batch_number = max(data_dict.keys()) + 1
        print(f'Batch number: {batch_number}')
    else:
        batch_number = 1

    #wait_counter grows with every failed scrape so the waits get longer
    wait_counter = 1

    #list for collecting batch data and list for collecting tickers with errors
    batch_data = []
    missed_tickers = []

    try:
        for count, ticker in enumerate(tickers):
            url = f'https://finance.yahoo.com/quote/{ticker}/key-statistics?p={ticker}'
            problem = None
            data = None
            try:
                # NOTE(review): a requests_cache cache is installed at the top of
                # the notebook, so this call may be answered from the cache —
                # confirm that is wanted when re-scraping failed tickers.
                resp = requests.get(url, headers=headers, timeout=30)
                print(f'{ticker} status - {resp.status_code}, {count+1}/{len(tickers)}', end=' ')
                soup = BeautifulSoup(resp.text, "html.parser")

                title = soup.find("h1")
                if title is None:
                    raise ValueError('page has no <h1> title')
                data = [title.text]

                for row in soup.find_all("tr"):
                    cols = row.find_all("td")
                    # Rows without a value cell are skipped; a page missing
                    # values is then caught by the length check below.
                    if len(cols) > 1:
                        data.append(cols[1].text.strip())

                #to account for errors in the extraction length or data
                if len(data) != len(metrics):
                    problem = f'length error({len(data)} instead of {len(metrics)}) in {ticker}'
                elif data[1] == 'N/A':
                    problem = f'N/A found in Marketcap of {ticker}'
            except Exception as e:
                # Keep going: one bad page must not drop every remaining ticker.
                problem = f'error while scraping {ticker}: {e!r}'

            if problem is not None:
                print(problem)
                missed_tickers.append(ticker)
                backoff = sleeptime * 2 * wait_counter
                print(f'Sleeping for {backoff}s')
                time.sleep(backoff)
                wait_counter += 1
            else:
                batch_data.append(data)
                print('complete!')
                time.sleep(sleeptime)

            #store a full batch and take a longer rest
            if batch_data and len(batch_data) >= batch_interval:
                data_dict[batch_number] = pd.DataFrame(batch_data)
                print(f'\nLength of info extracted is {len(batch_data)} in batch {batch_number} \n')
                batch_number += 1
                batch_data = []
                print(f'Sleeping for {sleeptime*2}s')
                time.sleep(sleeptime * 2)
    finally:
        #the final appending for the last (partial) batch; empty batches are not stored
        if batch_data:
            data_dict[batch_number] = pd.DataFrame(batch_data)
        print(f'Length of info extracted is {len(batch_data)} in batch {batch_number}')
        end_time = datetime.now()
        print('Elapsed time was', (end_time - start_time))
        print()

    # Returning here instead of inside `finally` lets interruptions propagate
    # instead of being silently discarded.
    return missed_tickers

In [18]:
def rescrap_missed(missed_tickers, data_dict, max_tries=5):
    '''
    Re-scrapes tickers that had errors/incomplete info.

    Input:
        missed_tickers : list of tickers to retry
        data_dict : dictionary that scrap_ticker updates with the scraped data
        max_tries (int) default 5 : maximum number of retry rounds; 0 or less
                                    means no retry is attempted
    Returns:
        list of tickers that are still unresolved after the retries
    '''
    remaining = list(missed_tickers)
    for attempt in range(1, max_tries + 1):
        if not remaining:
            break
        print(f'Extracting missed tickers: Attempt {attempt}')
        # Bypass the installed requests cache for retries: a bad page that was
        # cached on the first attempt would otherwise be replayed unchanged.
        with requests_cache.disabled():
            remaining = scrap_ticker(remaining, data_dict, sleeptime=2)
    print('Unresolved tickers:', remaining)
    return remaining

<h3>Casting the dataframe and cleaning all the data</h3>

In [7]:
def clean_df(all_data, metrics, display_progress=False):
    '''
    Function to cast and clean the dataframe via the following:
    1. Cast dictionary(all_data) into a pandas dataframe
    2. Format strings into numbers (large number format)
    3. Clean stocksplit ratios
    4. Recast dates into date objects

    Inputs:
    all_data : scraped data, a dictionary of DataFrames whose rows are lists of
               strings with the company title first [dictionary]
    metrics : list of column headers, one per scraped value [list]
    display_progress: boolean to show intermediate frames

    Returns:
    DataFrame with 'Name' and 'Ticker' first, then the numeric columns as
    float64, then the date columns. Rows are numbered 0..n-1 across all batches.

    Raises:
    ValueError if all_data holds no rows.
    '''
    # Empty frames carry no rows and would only disturb the column layout.
    frames = [frame for frame in all_data.values() if not frame.empty]
    if not frames:
        raise ValueError('No scraped data to clean: all_data contains no rows')

    # Each batch frame is numbered from 0, so renumber the combined rows to keep
    # the index unique for the column-wise concat further down.
    all_data_df = pd.concat(frames, ignore_index=True)
    all_data_df.columns = metrics

    # Save the name list first to ensure correct order of tickers scraped
    # Split the name into company name and ticker
    def extract_tickers(name):
        ticker_symbol = re.findall(r'\(([a-zA-Z\-]+)\)', name)
        if not ticker_symbol:
            print(f'No ticker found for company: {name}')
            return np.nan
        # With several parenthesised groups the last one is taken as the ticker.
        return ticker_symbol[-1]

    all_data_tickers = all_data_df['Name'].apply(extract_tickers)
    # strip() removes the space left in front of the "(TICKER)" part
    all_data_name = all_data_df['Name'].apply(lambda x: x.split('(')[0].strip())

    if display_progress:
        print('Casting data, index and columns to build the dataframe')
        display(all_data_df.head())

    def _num_reformat(x):
        '''
        Reformats large sums (Trillion, billion, million, thousand) into billions
        and removes (%,) characters. 'N/A' and empty strings become 0; any other
        string is returned with its commas removed.
        '''
        x = x.replace(',', '')
        if not x or x == 'N/A':
            return 0
        suffix, number = x[-1], x[:-1]
        if suffix == 'T':
            return round(float(number) * 1000, 2)
        if suffix == 'B':
            return round(float(number), 2)
        if suffix == 'M':
            return round(float(number) * 0.001, 2)
        if suffix == 'k':
            return round(float(number) * 0.000001, 2)
        if suffix == '%':
            return round(float(number), 0)
        return x

    # Column-wise Series.map is the element-wise equivalent of the deprecated
    # DataFrame.applymap and works on old and new pandas alike.
    cleaned_df = all_data_df.iloc[:, 1:].apply(lambda col: col.map(_num_reformat))

    # Insert back columns for progress display
    cleaned_df.insert(0, 'Name', all_data_name)
    if display_progress:
        print('Reformatting large sums (Billion, million, thousand) and removing (%,) values')
        display(cleaned_df.head())

    def _stocksplits(x):
        '''
        Changes split factors "a:b" into a single a/b ratio; non-strings pass through
        '''
        if isinstance(x, str):
            parts = x.split(':')
            # float instead of int so fractional sides such as "1.5:1" also parse
            return round(float(parts[0]) / float(parts[1]), 2)
        return x

    cleaned_df['Last Split Factor (x:1)'] = cleaned_df['Last Split Factor (x:1)'].apply(_stocksplits)
    if display_progress:
        print('Handling the stock split factor column')
        display(cleaned_df.iloc[:, [0, 37]].head())

    def _date_conversion(x):
        '''
        Converts date strings such as "Aug 24 2022" into date objects; 0 (a
        former 'N/A') becomes NaN
        '''
        if x == 0:
            return np.nan
        return datetime.strptime(str(x), '%b %d %Y').date()

    # Positions of the columns which should be dates
    date_positions = [35, 36, 38, 39, 40]
    dates = cleaned_df.iloc[:, date_positions].apply(lambda col: col.map(_date_conversion))

    if display_progress:
        print('Converting dates to datetime format')
        display(dates.head())

    # Drop the name and old date columns, add date columns and add back all ticker & company names
    numeric_df = cleaned_df.drop(cleaned_df.columns[[0] + date_positions], axis=1).astype('float64')
    final_df = pd.concat([numeric_df, dates], axis=1)
    final_df.insert(0, 'Name', all_data_name)
    final_df.insert(1, 'Ticker', all_data_tickers)

    if display_progress:
        print('Final_df cleaned')
        display(final_df.head())

    #return the completed pandas dataframe
    return final_df

## Option 1 - S&P data

In [8]:
# Option 1: use the S&P constituents as the ticker list for the run cell below.
sp_df, tickers = scrap_SnP_tickers()

Number of S&P constituent tickers = 503


Unnamed: 0,Company,Symbol,Weight,Price,Chg,% Chg
0,Apple Inc.,AAPL,6.960996,161.36,1.26,(0.79%)
1,Microsoft Corporation,MSFT,6.169148,283.63,0.14,(0.05%)
2,Amazon.com Inc.,AMZN,2.549714,98.44,0.61,(0.62%)
3,NVIDIA Corporation,NVDA,1.90542,265.98,1.03,(0.39%)
4,Alphabet Inc. Class A,GOOGL,1.821983,105.29,0.65,(0.62%)


## Option 2 - Personal portfolio data

In [19]:
# Option 2: use the tickers listed in a CSV file as the ticker list for the run cell below.
input_tickers = pd.read_csv('portfolio_tickers.csv') #input file with a list of portfolio tickers
# Drop blank rows and surrounding whitespace so every entry forms a valid quote URL.
tickers = (
    input_tickers['Tickers']
    .dropna()
    .astype(str)
    .str.strip()
    .loc[lambda symbols: symbols != '']
    .tolist()
)

### Run code

In [20]:
# Scrape every ticker, retry the ones that failed, then clean whatever was collected.
scrapped_data = {}
missed_tickers = []  # defined up front so later cells can rely on it even after an early failure

try:
    missed_tickers = scrap_ticker(tickers, data_dict=scrapped_data, sleeptime=2, batch_interval=50)
    if missed_tickers:
        missed_tickers = rescrap_missed(missed_tickers, data_dict=scrapped_data, max_tries=2)
except Exception as e:
    # repr() keeps the exception type visible; str(e) can be an empty string
    print(f'Scraping stopped early: {e!r}')
finally:
    # Best effort: build the table from whatever was scraped, even after a failure.
    final_df = clean_df(scrapped_data, metrics, display_progress=False).reset_index(drop=True)
final_df

TSLA status - 200, 1/37 complete!
TTD status - 200, 2/37 complete!
NVDA status - 200, 3/37 complete!
GOOG status - 200, 4/37 complete!
CRWD status - 200, 5/37 complete!
AMZN status - 200, 6/37 complete!
IDXX status - 200, 7/37 complete!
MELI status - 200, 8/37 complete!
ABNB status - 200, 9/37 complete!
ZS status - 200, 10/37 complete!
ASML status - 200, 11/37 complete!
SE status - 200, 12/37 complete!
AXON status - 200, 13/37 complete!
TEAM status - 200, 14/37 complete!
LULU status - 200, 15/37 complete!
SNOW status - 200, 16/37 complete!
NET status - 200, 17/37 complete!
HUBS status - 200, 18/37 complete!
DDOG status - 200, 19/37 complete!
DOCN status - 200, 20/37 complete!
U status - 200, 21/37 complete!
SEMR status - 200, 22/37 complete!
LMND status - 200, 23/37 complete!
ZM status - 200, 24/37 complete!
CDNS status - 200, 25/37 complete!
PINS status - 200, 26/37 complete!
MDB status - 200, 27/37 complete!
MRNA status - 200, 28/37 complete!
ROKU status - 200, 29/37 complete!
GLBE s

Unnamed: 0,Name,Ticker,Market Cap (B),Enterprise Value (B),Trailing P/E,Forward P/E,PEG Ratio (5 yr expected),Price/Sales (ttm),Price/Book (mrq),Enterprise Value/Revenue,...,Total Debt/Equity (mrq),Current Ratio (mrq),Book Value Per Share (mrq),Operating Cash Flow (ttm) (B),Levered Free Cash Flow (ttm) (B),Dividend Date,Ex-Dividend Date,Last Split Date,Fiscal Year Ends,Most Recent Quarter (mrq)
0,"Tesla, Inc.",TSLA,585.55,569.11,51.12,47.39,1.65,7.89,13.1,6.99,...,12.52,1.53,14.13,14.72,4.21,,,2022-08-24,2022-12-30,2022-12-30
1,"The Trade Desk, Inc.",TTD,29.86,28.68,553.18,57.47,3.19,19.28,14.12,18.18,...,12.34,1.89,4.31,0.55,0.49,,,2021-06-16,2022-12-30,2022-12-30
2,NVIDIA Corporation,NVDA,667.81,666.55,155.39,59.88,3.36,25.13,30.22,24.71,...,54.44,3.52,8.96,5.64,4.53,2023-03-28,2023-03-06,2021-07-19,2023-01-28,2023-01-28
3,Alphabet Inc.,GOOG,1370.0,1280.0,23.45,19.8,1.17,4.98,5.35,4.53,...,11.7,2.38,19.93,91.49,52.53,,,2022-07-17,2022-12-30,2022-12-30
4,"CrowdStrike Holdings, Inc.",CRWD,30.15,28.23,0.0,55.56,1.72,13.3,20.6,12.59,...,52.68,1.73,6.21,0.94,0.73,,,,2023-01-30,2023-01-30
5,"Amazon.com, Inc.",AMZN,1050.0,1120.0,0.0,63.69,2.3,2.02,7.16,2.17,...,116.36,0.94,14.26,46.75,0.01,,,2022-06-05,2022-12-30,2022-12-30
6,"IDEXX Laboratories, Inc.",IDXX,39.21,40.56,58.84,54.35,5.32,11.87,64.41,12.05,...,241.49,0.89,7.34,0.54,0.33,,,2015-06-15,2022-12-30,2022-12-30
7,"MercadoLibre, Inc.",MELI,62.57,64.95,130.64,88.5,2.18,6.07,34.25,6.16,...,297.15,1.28,36.35,2.94,-0.23,2018-01-15,2017-12-27,,2022-12-30,2022-12-30
8,"Airbnb, Inc.",ABNB,69.25,61.97,39.32,30.49,1.53,8.88,12.46,7.38,...,42.1,1.86,8.81,3.43,2.6,,,,2022-12-30,2022-12-30
9,"Zscaler, Inc.",ZS,15.05,14.37,0.0,53.19,1.11,11.0,28.48,10.66,...,230.0,2.01,3.64,0.4,0.49,,,,2022-07-30,2023-01-30


In [21]:
# Save the cleaned table as data/<filename>_<today>.csv
filename = input("Enter filename to save as: ").strip()
if filename == '':
    filename = 'portfolio'
current_date = date.today().isoformat()
output_path = Path('data') / f'{filename}_{current_date}.csv'
# to_csv does not create missing folders, so make sure data/ exists first
output_path.parent.mkdir(parents=True, exist_ok=True)
final_df.to_csv(output_path)
print(f'Saved to file: {output_path.as_posix()}')

Saved to file: portfolio_2023-04-14.csv
