# Web Scraping Stock Data

In [None]:
#import the basics
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
import datetime

In [None]:
#import web scraping packages
from bs4 import BeautifulSoup
import requests
import urllib.request as ur
import pandas_datareader.data as web

In [None]:
# Yahoo! Finance ticker symbol used to build the page URLs in the cells below
ticker = "ASML.AS"

In [None]:
# Summary page of the selected ticker; the bare expression echoes the URL in the notebook
url_summary = f"https://finance.yahoo.com/quote/{ticker}?p={ticker}&.tsrc=fin-srch"
url_summary

In [None]:
# Key-statistics page of the selected ticker
url_stat = f"https://finance.yahoo.com/quote/{ticker}/key-statistics?p={ticker}"

In [None]:
# Historical-data page of the selected ticker
url_hist = f"https://finance.yahoo.com/quote/{ticker}/history?p={ticker}"

In [None]:
# Financials and balance-sheet pages of the selected ticker
url_fin = f"https://finance.yahoo.com/quote/{ticker}/financials?p={ticker}"
url_bs = f"https://finance.yahoo.com/quote/{ticker}/balance-sheet?p={ticker}"

In [None]:
# Analyst-analysis page of the selected ticker
url_analists = f"https://finance.yahoo.com/quote/{ticker}/analysis?p={ticker}"

In [None]:
# Sustainability page of the selected ticker
url_sust = f"https://finance.yahoo.com/quote/{ticker}/sustainability?p={ticker}"

## Financials

In [None]:
def income_statement(ticker, type='balance-sheet'):
    """
    Webscrape financial data from Yahoo! Finance.

    Parameters
    ----------
    ticker : str
        Ticker of the company you want to consult.
    type : str
        Type of financial information you would like.
        -Options for type: financials, balance-sheet & cash-flow. Default is balance-sheet.

    Returns
    -------
    pandas DataFrame
        The scraped table, transposed: the index holds the column headers of
        the page's table (sorted), the columns hold its row labels. All values
        are integers; cells that cannot be parsed as a number become 0.

    Raises
    ------
    ValueError
        If the 'Expand All' marker that precedes the table cannot be found
        in the downloaded page.

    Caution
    --------
    Numbers are in thousands.
    """

    #scraping
    url_fin = "https://finance.yahoo.com/quote/" + ticker + "/" + type + "?p=" + ticker

    response = requests.get(
        url_fin,
        headers={
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.111 Safari/537.36",
        },
        timeout=30,  # do not hang forever on an unresponsive server
    )
    soup = BeautifulSoup(response.text, 'html.parser')

    ls = []

    #find all HTML code that is div
    for i in soup.find_all('div'):
        ls.append(i.string)

        #fill up missing titles
        if i.string != i.get('title'):
            ls.append(i.get('title'))

    #filter out all the 'none'-values
    new_ls = list(filter(None, ls))

    #remove the first elements of new_ls until we find the starting point of the table
    try:
        start = new_ls.index('Expand All')
    except ValueError:
        # same exception type as before, but with enough context to debug
        raise ValueError(
            "Could not find the 'Expand All' table marker in {} "
            "(HTTP status {})".format(url_fin, response.status_code)
        ) from None
    new_ls = new_ls[start:]

    #zip per 6 for inc-st & cash flow and per 5 for balance sheet
    width = 5 if type == 'balance-sheet' else 6
    zipped_ls = list(zip(*[iter(new_ls)] * width))

    #turn list into dataframe
    df = pd.DataFrame(zipped_ls)

    #cleaning up the dataframe: first row is the header, then transpose
    df.columns = df.iloc[0]
    df = (
        df.iloc[1:]
        .rename(columns={'Expand All': 'Breakdown'})
        .set_index('Breakdown')
        .T
    )
    df.index.name = None #remove zero in index title
    df.sort_index(inplace=True)

    #convert values to int
    def _to_number(column):
        # Clean every cell directly instead of slicing the printed
        # representation of the column, which depended on the index width.
        cleaned = column.map(lambda value: str(value).replace(',', '').strip())
        return pd.to_numeric(cleaned, errors='coerce')

    df = df.apply(_to_number).fillna(0).astype(int)

    return df

In [None]:
# Scrape the cash-flow statement of ABI.BR
df = income_statement(ticker='ABI.BR', type='cash-flow')

In [None]:
# Display the DataFrame returned by income_statement
df

In [None]:
# Plot the 'Free Cash Flow' column of the scraped table
df['Free Cash Flow'].plot()

## Statistics

In [None]:
# Download the key-statistics page and collect every <table> element on it
response = requests.get(
    url_stat,
    headers={
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.111 Safari/537.36",
    },
    timeout=30,  # do not hang forever on an unresponsive server
)
# Fail loudly on 4xx/5xx instead of silently parsing an error page
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
stock_data = soup.find_all("table")

In [None]:
# Print every measure/value pair found in the scraped tables
for table in stock_data:
    # Scrape all table rows into variable trs
    trs = table.find_all('tr')
    for tr in trs:
        # Scrape all table data tags into variable tds
        tds = tr.find_all('td')
        # Rows with fewer than two <td> cells (e.g. header rows) carry no
        # measure/value pair; skip them instead of raising IndexError
        if len(tds) < 2:
            continue
        # Index 0 of tds will contain the measurement
        print("Measure: {}".format(tds[0].get_text()))
        # Index 1 of tds will contain the value
        print("Value: {}".format(tds[1].get_text()))
        print("")


def get_measurement(table_array, measurement):
    """
    Look up the value of a single measurement in scraped HTML tables.

    Parameters
    ----------
    table_array : iterable
        Table elements; each must offer find_all('tr'), whose rows offer
        find_all('td'), whose cells offer get_text().
    measurement : str
        Text to search for; matched case-insensitively as a substring of the
        first cell of each row.

    Returns
    -------
    str or None
        Text of the second cell of the first matching row, or None when no
        row matches.
    """
    wanted = measurement.lower()
    for table in table_array:
        for tr in table.find_all('tr'):
            tds = tr.find_all('td')
            # Rows with fewer than two <td> cells (e.g. header rows) cannot
            # hold a measure/value pair; skip them instead of raising IndexError
            if len(tds) < 2:
                continue
            if wanted in tds[0].get_text().lower():
                return tds[1].get_text()
    return None


# print only one measurement, e.g. operating cash flow
# (the lookup is a case-insensitive substring match on the row's first cell)
print(get_measurement(stock_data, "operating cash flow"))

In [None]:
#scraping
# NOTE: this rebinds url_fin, which an earlier cell set to the financials URL
url_fin = "https://finance.yahoo.com/quote/" + ticker + "/key-statistics?p=" + ticker

response = requests.get(
    url_fin,
    headers={
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.111 Safari/537.36",
    },
    timeout=30,  # do not hang forever on an unresponsive server
)
# Fail loudly on 4xx/5xx instead of silently parsing an error page
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')


ls = []

#find all HTML code that is a table row (tr)
for i in soup.find_all('tr'):
    ls.append(i.string)

    #fill up missing titles
    if i.string != i.get('title'):
        ls.append(i.get('title'))

#drop None and other empty entries
new_ls = list(filter(None,ls))

## Historical Data