# Value Investing with Python - Part 02 (updated version)

In the updated part two of the **Value Investing with Python** series, the `BeautifulSoup` module is used to look up a stock's Morningstar identifier on https://morningstar.com; the stock/company data itself is then requested from Morningstar's JSON API. The scraped data consists of useful metrics (like revenue, return on equity, current ratio and more) for several years (up to 10 years). Moreover, the function converts all data points to floats rounded to two decimals and restructures the dataset into a single table with one row per year and one column per metric.

Examples of scraped web pages:
* https://www.morningstar.com/stocks/xnas/aapl/valuation
* https://www.morningstar.com/stocks/xnas/aapl/financials
* https://www.morningstar.com/stocks/xnas/aapl/dividends

The function has two inputs:
* *stock_ticker*: stock ticker symbol as indicated on Morningstar
* *exchange_ticker*: exchange ticker symbol as indicated on Morningstar

The function returns the scraped data as a pandas `DataFrame`.

In [2]:
def _backcast_from_growth(latest_value, growth_rates_pct):
    """Rebuild a yearly series from its latest value and year-over-year growth rates.

    Args:
        latest_value: Value of the most recent year covered by ``growth_rates_pct``.
        growth_rates_pct: Growth rates in percent, ordered oldest to newest.

    Returns:
        A list with one value per growth rate, ordered oldest to newest. The last
        entry is ``latest_value`` itself (unrounded); earlier entries are rounded
        to two decimals after each step.
    """
    values = [latest_value]
    for rate in reversed(list(growth_rates_pct)):
        # previous year = current year / (1 + growth), working backwards in time
        values.append(round(values[-1] / (1 + (rate / 100)), 2))
    # The final computed value belongs to the year *before* the first growth
    # rate, for which no column exists, so it is discarded.
    return list(reversed(values[:-1]))


def get_modify_morningstar_data(stock_ticker, exchange_ticker, *, request_delay=10.0, timeout=30.0):
    """Scrape multi-year key metrics for one stock from Morningstar.

    The stock's Morningstar identifier is extracted from the public valuation
    page; six JSON API endpoints (growth, efficiency, financial health, cash
    flow, dividends, financial summary) are then queried and combined.
    Absolute figures for past years (revenue, income, cash flows, EPS) are not
    read directly but reconstructed from the latest reported value and the
    year-over-year growth rates.

    Args:
        stock_ticker: Stock ticker symbol as indicated on Morningstar.
        exchange_ticker: Exchange ticker symbol as indicated on Morningstar.
        request_delay: Seconds to wait between two consecutive API requests.
        timeout: Timeout in seconds applied to every HTTP request.

    Returns:
        A pandas DataFrame with one row per fiscal year and one column per
        metric; all values are floats rounded to two decimals.

    Raises:
        requests.HTTPError: If Morningstar answers with an error status code.
        requests.Timeout: If a request exceeds ``timeout``.
        IndexError: If the identifier cannot be located in the valuation page.
    """
    # Import relevant modules (function-local so the notebook cell is self-contained)
    from bs4 import BeautifulSoup
    import requests
    import pandas as pd
    import re
    import time

    def frame_from_items(items, year_of, metric_names, values_of):
        # One column per year, one row per metric.
        per_year = {year_of(item): values_of(item) for item in items}
        return pd.DataFrame(per_year, index=metric_names)

    def fields(*keys):
        # Build an extractor that reads the given top-level keys from an item.
        return lambda item: [item[key] for key in keys]

    def year_from_month(item):
        # Only the first four characters of 'fiscalPeriodYearMonth' (the year) are used.
        return item["fiscalPeriodYearMonth"][:4]

    def row(frame, metric):
        return list(frame.loc[metric, :])

    # Get Morningstar 'identifier' for stock
    url_complete = ('https://www.morningstar.com/stocks/' + exchange_ticker + '/'
                    + stock_ticker + '/valuation')
    page = requests.get(url_complete, timeout=timeout)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'lxml')
    # NOTE(review): the identifier is read from the fifth <script> tag, from the
    # first comma-separated fragment starting with 'byId'. This is position
    # dependent and will break if the page layout changes.
    script_parts = soup.find_all("script")[4].string.split(",")
    identifier_part = [part for part in script_parts if re.match('byId', part)][0]
    morningstar_stock_identifier = re.findall(r'\"(.*)\"', identifier_part)[0]

    # Scrape data from Morningstar
    ## Endpoints: category -> (link, component name sent as request parameter)
    api_root = 'https://api-global.morningstar.com/sal-service/v1/stock/'
    endpoints = {
        'growth': (api_root + 'keyStats/growthTable/' + morningstar_stock_identifier,
                   'sal-components-key-stats-growth-table'),
        'efficiency': (api_root + 'keyStats/OperatingAndEfficiency/' + morningstar_stock_identifier,
                       'sal-components-key-stats-oper-efficiency'),
        'finHealth': (api_root + 'keyStats/financialHealth/' + morningstar_stock_identifier,
                      'sal-components-key-stats-financial-health'),
        'cashflow': (api_root + 'keyStats/cashFlow/' + morningstar_stock_identifier,
                     'sal-components-key-stats-cash-flow'),
        'dividends': (api_root + 'dividends/v4/' + morningstar_stock_identifier + '/data',
                      'sal-components-dividends'),
        'financials': (api_root + 'newfinancials/' + morningstar_stock_identifier + '/annual/summary',
                       'sal-components-equity-financials-summary'),
    }

    ## Define headers for web scrape process (including public API key)
    headers = {
        'apikey': 'lstzFDEOhfFNMLikKa0am9mgEKLBl49T',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' \
        + '(KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'}

    ## Start the scraping: fetch every endpoint first, parse afterwards, so the
    ## parsing below does not depend on the order in which categories are requested
    containers = {}
    with requests.Session() as sess:
        sess.headers.update(headers)
        for position, (category, (link, component)) in enumerate(endpoints.items()):
            # Pause between requests only (not after the last one)
            if position > 0 and request_delay > 0:
                time.sleep(request_delay)

            # Only the financial summary takes the additional 'reportType' parameter
            payload = {'reportType': 'A'} if category == 'financials' else {}
            payload.update([('languageId', 'en'), ('locale', 'en'), ('clientId', 'MDC'),
                            ('component', component), ('version', '3.71.0')])

            resp = sess.get(link, params=payload, timeout=timeout)
            resp.raise_for_status()
            containers[category] = resp.json()

    # Collect data
    ## The trailing entries of each 'dataList' are skipped (one, or three for efficiency)
    growth_df = frame_from_items(
        containers['growth']["dataList"][:-1], year_from_month,
        ['revenue_growth', 'operating_income_growth', 'net_income_growth', 'eps_growth'],
        lambda item: [item['revenuePer']['yearOverYear'], item['operatingIncome']['yearOverYear'],
                      item['netIncomePer']['yearOverYear'], item['epsPer']['yearOverYear']])

    efficiency_df = frame_from_items(
        containers['efficiency']["dataList"][:-3], lambda item: item["fiscalPeriodYear"],
        ['gross_margin_pct', 'operating_margin_pct', 'net_margin_pct', 'tax_rate_pct',
         'roa', 'roe', 'roic', 'interest_coverage_ratio', 'assets_turnover'],
        fields('grossMargin', 'operatingMargin', 'netMargin', 'taxRate',
               'roa', 'roe', 'roic', 'interestCoverage', 'assetsTurnover'))

    finHealth_df = frame_from_items(
        containers['finHealth']["dataList"][:-1], year_from_month,
        ['current_ratio', 'debt_to_equity_ratio', 'bvps'],
        fields('currentRatio', 'debtEquityRatio', 'bookValuePerShare'))

    cashflow_df = frame_from_items(
        containers['cashflow']["dataList"][:-1], year_from_month,
        ['operating_cf_growth', 'free_cf_growth', 'free_cf_to_revenue', 'free_cf_to_shares',
         'capex_as_pct_of_sales'],
        fields('operatingCFGrowthPer', 'freeCashFlowGrowthPer', 'freeCFPerSales',
               'freeCashFlowPerShare', 'capExAsPerOfSales'))

    dividend_rows = containers['dividends']['rows']
    dividends_list = dividend_rows[0]['datum'][0:-3]
    payout_ratio_list = dividend_rows[4]['datum'][0:-3]

    ## Latest values: the second-to-last 'datum' entry of each statement row
    income_rows = containers['financials']['incomeStatement']['rows']
    cashflow_rows = containers['financials']['cashFlow']['rows']
    latest_revenue = income_rows[0]['datum'][-2]
    latest_operating_income = income_rows[1]['datum'][-2]
    latest_net_income = income_rows[2]['datum'][-2]
    latest_eps = income_rows[5]['datum'][-2]
    latest_operating_cf = cashflow_rows[0]['datum'][-2]
    latest_free_cf = cashflow_rows[4]['datum'][-2]

    # Determine historical values via latest values and growth rates.
    # Statement values are multiplied by 1000 (presumably billions -> millions to
    # match the '*_mil' column names -- TODO confirm); EPS is used unscaled.
    revenue_list = _backcast_from_growth(latest_revenue * 1000, growth_df.loc['revenue_growth', :])
    operating_income_list = _backcast_from_growth(latest_operating_income * 1000,
                                                  growth_df.loc['operating_income_growth', :])
    net_income_list = _backcast_from_growth(latest_net_income * 1000,
                                            growth_df.loc['net_income_growth', :])
    eps_list = _backcast_from_growth(latest_eps, growth_df.loc['eps_growth', :])
    operating_cf_list = _backcast_from_growth(latest_operating_cf * 1000,
                                              cashflow_df.loc['operating_cf_growth', :])
    free_cf_list = _backcast_from_growth(latest_free_cf * 1000,
                                         cashflow_df.loc['free_cf_growth', :])

    ## CapEx: derived from revenue and the CapEx share of sales, reported as a negative number
    capex_list = [-(rev * capex_sales / 100)
                  for rev, capex_sales in zip(revenue_list, row(cashflow_df, 'capex_as_pct_of_sales'))]

    ## Shares: free cash flow divided by free cash flow per share
    shares_list = [free_cf / fcf_to_shares
                   for free_cf, fcf_to_shares in zip(free_cf_list, row(cashflow_df, 'free_cf_to_shares'))]

    ## Equity Ratio
    ### First: approximation of total equity since no data is available
    bvps_list = row(finHealth_df, 'bvps')
    total_equity_list = [bvps * shares for bvps, shares in zip(bvps_list, shares_list)]
    ### Second: approximation of total assets since no data is available
    total_assets_list = [net_income / (roa / 100)
                         for net_income, roa in zip(net_income_list, row(efficiency_df, 'roa'))]
    ### Third: divide total equity by total assets to determine equity ratio
    ### NOTE(review): the result is a fraction (not multiplied by 100) although
    ### the column is named 'equity_ratio_pct'.
    equity_ratio_list = [equity / assets for equity, assets in zip(total_equity_list, total_assets_list)]

    # Combine all data in one DataFrame object; each metric name is paired with
    # its row so that names and data cannot drift out of alignment
    metric_rows = [
        ('shares_mil', shares_list),
        ('revenue_mil', revenue_list),
        ('operating_income_mil', operating_income_list),
        ('net_income_mil', net_income_list),
        ('eps', eps_list),
        ('dividends', dividends_list),
        ('payout_ratio_pct', payout_ratio_list),
        ('bvps', bvps_list),
        ('operating_margin_pct', row(efficiency_df, 'operating_margin_pct')),
        ('net_margin_pct', row(efficiency_df, 'net_margin_pct')),
        ('gross_margin_pct', row(efficiency_df, 'gross_margin_pct')),
        ('interest_coverage_ratio', row(efficiency_df, 'interest_coverage_ratio')),
        ('tax_rate_pct', row(efficiency_df, 'tax_rate_pct')),
        ('return_on_assets_pct', row(efficiency_df, 'roa')),
        ('return_on_equity_pct', row(efficiency_df, 'roe')),
        ('return_on_invested_capital_pct', row(efficiency_df, 'roic')),
        ('operating_cashflow_mil', operating_cf_list),
        ('free_cashflow_mil', free_cf_list),
        ('free_cashflow_to_revenue_pct', row(cashflow_df, 'free_cf_to_revenue')),
        ('cap_ex_mil', capex_list),
        ('current_ratio', row(finHealth_df, 'current_ratio')),
        ('debt_to_equity_ratio', row(finHealth_df, 'debt_to_equity_ratio')),
        ('equity_ratio_pct', equity_ratio_list),
        ('asset_turnover', row(efficiency_df, 'assets_turnover')),
    ]
    all_data_df = pd.DataFrame([values for _, values in metric_rows],
                               columns=growth_df.columns,
                               index=[name for name, _ in metric_rows])

    # Change type of values to float, round to two decimals, and transpose so
    # that each row is a year and each column a metric
    return all_data_df.astype('float').round(2).T

Example:

In [4]:
# Fetch the metrics for Apple (ticker "AAPL" on exchange "XNAS"). This performs live
# HTTP requests and pauses between API calls, so the cell takes a while to finish.
apple_data = get_modify_morningstar_data("AAPL", "XNAS")

In [5]:
# Display the resulting DataFrame (one row per fiscal year, one column per metric)
apple_data

Unnamed: 0,shares_mil,revenue_mil,operating_income_mil,net_income_mil,eps,dividends,payout_ratio_pct,bvps,operating_margin_pct,net_margin_pct,...,return_on_equity_pct,return_on_invested_capital_pct,operating_cashflow_mil,free_cashflow_mil,free_cashflow_to_revenue_pct,cap_ex_mil,current_ratio,debt_to_equity_ratio,equity_ratio_pct,asset_turnover
2012,26263.42,156516.38,55241.25,41728.57,1.59,0.19,12.02,4.25,35.3,26.67,...,42.84,42.01,50853.59,41446.3,26.49,-9402.5,1.5,,0.76,1.07
2013,27571.92,170915.89,48998.99,37034.11,1.43,0.42,28.68,4.9,28.67,21.67,...,30.64,26.08,53665.79,44583.79,26.09,-9076.31,1.68,0.14,0.71,0.89
2014,25808.88,182794.54,52502.42,39507.99,1.62,0.46,28.08,5.15,28.72,21.61,...,33.61,26.2,59713.92,49893.72,27.3,-9812.98,1.08,0.26,0.61,0.83
2015,23601.71,233721.1,71230.03,53391.1,2.31,0.51,21.48,5.63,30.48,22.85,...,46.25,31.32,81264.68,69771.38,29.86,-11488.3,1.11,0.45,0.51,0.89
2016,23300.67,215654.46,60025.55,45686.76,2.08,0.56,26.23,5.93,27.84,21.19,...,36.9,21.95,65824.39,52272.72,24.24,-13548.97,1.35,0.59,0.45,0.7
2017,21035.5,229240.69,61346.11,48350.3,2.3,0.62,26.06,6.46,26.76,21.09,...,36.87,19.86,63599.53,50798.63,22.16,-12795.37,1.28,0.73,0.39,0.66
2018,22269.39,265598.26,70897.7,59528.89,2.98,0.7,22.84,6.04,26.69,22.41,...,49.36,24.41,77438.79,64118.03,24.14,-13313.16,1.12,0.87,0.36,0.72
2019,19133.34,260180.06,63928.46,55254.72,2.97,0.76,25.23,5.43,24.57,21.24,...,55.92,25.75,69392.9,58892.41,22.64,-10495.24,1.54,1.01,0.29,0.74
2020,18144.16,274515.98,66287.42,57409.65,3.28,0.81,24.24,4.26,24.15,20.91,...,73.69,30.11,80676.18,73362.27,26.73,-7309.03,1.36,1.51,0.23,0.83
2021,16693.91,365820.0,108950.0,94680.0,5.61,0.86,15.15,3.91,29.78,25.88,...,147.44,51.7,104040.0,92950.0,25.41,-11085.09,1.07,1.73,0.19,1.08
