In [1]:
import pandas as pd
import numpy as np
from io import StringIO
#import benford as bf
#import matplotlib.pyplot as plt
from urllib.error import HTTPError
import altair as alt
import yfinance as yf

from os import environ

# Resolve the FRED API key. A key stored in a local file is used as the
# fallback; the FRED_API_KEY environment variable (set in CI) takes precedence.
try:
    # for local execution
    # `with` guarantees the file handle is closed once the key has been read
    with open("/Users/kyledunn/fredApiKey.txt", "r") as keyFile:
        apiKeyFromFile = keyFile.read().strip()
except FileNotFoundError:
    # No local key file (e.g. on CI): rely on the environment variable alone
    apiKeyFromFile = None
# for CI
apiKey = environ.get("FRED_API_KEY", apiKeyFromFile)

from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen
    
def getSeries(series="", apiKey=None, description=None):
    """Download one FRED series and return it as a date-indexed DataFrame.

    Parameters
    ----------
    series : str
        FRED series id substituted into the ``series_id`` query parameter.
    apiKey : str or None
        Value substituted into the ``api_key`` query parameter.
    description : str or None
        Name given to the value column; defaults to ``series``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``date`` (parsed with ``pd.to_datetime``), with columns
        ``[description, "rt_start", "rt_end"]``. A value of ``"."`` is read
        as NaN.

    Raises
    ------
    Whatever ``urlopen`` raises for a failed request (for example
    ``urllib.error.HTTPError``), and ``IndexError`` if the returned archive
    has fewer than two members.
    """
    fetchCommand = "https://api.stlouisfed.org/fred/series/observations?series_id={s}&realtime_end=9999-12-31&api_key={k}&file_type=txt"

    # Read the whole payload up front so the HTTP response can be closed
    # immediately instead of being left for the garbage collector.
    with urlopen(fetchCommand.format(s=series, k=apiKey)) as resp:
        payload = resp.read()

    if description is None:
        description = series

    # The response body is a zip archive; the observations are read from its
    # second member. Both the archive and the member handle are closed on exit.
    with ZipFile(BytesIO(payload)) as archive:
        filesInZip = archive.namelist()
        with archive.open(filesInZip[1]) as data:
            # Tab-separated; the file's own header row is skipped and replaced
            # by the column names below.
            df = pd.read_csv(data, sep="\t", header=None, skiprows=1,
                             names=["date", description, "rt_start", "rt_end"], na_values=".")

    df['date'] = pd.to_datetime(df.date)

    return df.set_index("date")



In [2]:
# Yahoo Finance handle for the "^GSPC" ticker
sp500 = yf.Ticker("^GSPC")

# Request the longest history the API offers (period="max")
df_sp500 = sp500.history(period="max")

# Preview the first rows of the downloaded price history
df_sp500.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1927-12-30,17.66,17.66,17.66,17.66,0,0,0
1928-01-03,17.76,17.76,17.76,17.76,0,0,0
1928-01-04,17.72,17.72,17.72,17.72,0,0,0
1928-01-05,17.55,17.55,17.55,17.55,0,0,0
1928-01-06,17.66,17.66,17.66,17.66,0,0,0


In [3]:
# Fetch the FRED series "CPIAUCSL" and name its value column "CPI"
df_cpi = getSeries("CPIAUCSL", apiKey=apiKey, description="CPI")

## What does the S&P 500 look like when adjusted by "inflation" (CPI)?

In [4]:
# Month-end S&P 500 close divided by the month-end CPI value.
# An explicit MonthEnd offset is used instead of the "1M" string alias, which
# recent pandas releases deprecate (renamed to "ME"); the offset object is
# accepted by both old and new versions and bins identically.
month_end = pd.offsets.MonthEnd()
sp500_monthly = df_sp500['Close'].resample(month_end).last()
cpi_monthly = df_cpi.CPI.resample(month_end).last()

# Months missing from either series produce NaN after alignment and are dropped
returns = (sp500_monthly / cpi_monthly).dropna().reset_index()
returns.columns = ['Date', 'Adj-close']

# `returns` already has a flat integer index, so it is charted directly;
# resetting the index a second time only added an unused `index` column.
alt.Chart(returns).mark_line().encode(
    alt.X('Date:T'),
    alt.Y('Adj-close:Q')
).properties(
    title='CPI Adjusted SP500',
    width=700,
    height=450
)

In [5]:
# Compute CAGR for each period

# Holding period, expressed in years and in rows of the monthly `returns` frame
years = 40
months = years * 12

# One record per start month: the start date (column 0) and the annualised
# percentage growth of the adjusted close (column 1) over the holding period.
totals = [
    {
        "start": returns.iloc[first, 0],
        "return": ((returns.iloc[months + first, 1] / returns.iloc[first, 1]) ** (1 / years) - 1) * 100,
    }
    for first in range(0, int(returns.shape[0] - months - 1))
]

In [6]:
# One row per start date with its 40-year CAGR
df_totals = pd.DataFrame(totals)

# CAGR as a function of the month the investment started
(
    alt.Chart(df_totals)
    .mark_line()
    .encode(
        x=alt.X('start:T', axis=alt.Axis(title='')),
        y=alt.Y('return:Q', axis=alt.Axis(title='Total 40yr Return [CAGR]')),
    )
    .properties(
        title='Inflation Adjusted S&P 500 Returns (CAGR) vs Start Date of Investment',
        width=700,
        height=450,
    )
)

## How likely is it for a 40-year investment in the S&P 500 to achieve a 7% CAGR?

In [7]:
# Histogram of 40-year CAGR: every row contributes 1/total, so the bar heights
# sum to one.
hist = (
    alt.Chart(df_totals)
    .transform_joinaggregate(total='count(*)')
    .transform_calculate(
        pct='1 / datum.total',
        decade='toString(floor(year(datum.start) / 10)) + "0\'s"',
    )
    .mark_bar(opacity=0.6)
    .encode(
        x=alt.X("return:Q", bin=True, axis=alt.Axis(title='40yr CAGR Return [%]')),
        y=alt.Y('sum(pct):Q', axis=alt.Axis(title='Likelihood')),
        #alt.Row("decade:N")
    )
    .properties(
        title='Distribution of Inflation Adjusted S&P 500 Returns (CAGR)',
        width=700,
        height=450,
    )
)

# Running sum of the same 1/total weights, ordered by return, drawn as a step line
cumu = (
    alt.Chart(df_totals)
    .mark_line(color='black', interpolate='step-after')
    .transform_joinaggregate(total='count(*)')
    .transform_calculate(pct='1 / datum.total')
    .transform_window(
        frame=[None, 0],
        sort=[{"field": "return"}],
        cumu='sum(pct)',
    )
    .encode(
        x=alt.X("return:Q"),
        y=alt.Y('cumu:Q', axis=alt.Axis(title='Cumulative Likelihood')),
    )
    .properties(
        title='Distribution of Inflation Adjusted S&P 500 Returns (CAGR)',
        width=700,
        height=450,
    )
)

# Overlay both layers; each keeps its own y scale
(hist + cumu).resolve_scale(y='independent').properties(background='white')

## How do the S&P 500 returns vary with the decade the investment started?

In [8]:
# Same likelihood histogram as before, now coloured and split into one row per
# decade derived from the investment start date.
histf = (
    alt.Chart(df_totals)
    .transform_joinaggregate(total='count(*)')
    .transform_calculate(
        pct='1 / datum.total',
        decade='toString(floor(year(datum.start) / 10)) + "0\'s"',
    )
    .mark_bar(opacity=0.6)
    .encode(
        x=alt.X("return:Q", bin=True, axis=alt.Axis(title='40yr CAGR Return [%]')),
        y=alt.Y('sum(pct):Q', axis=alt.Axis(title='Likelihood')),
        color=alt.Color("decade:N"),
        row=alt.Row("decade:N"),
    )
    .properties(
        title='Distribution of Inflation Adjusted S&P 500 Returns (CAGR)',
        width=700,
        height=450,
    )
)

histf.display()

## How healthy are the balance sheets of S&P 500 companies, in aggregate (ex tech companies)?

In [9]:
# Daily holdings page for the ProShares "spxt" fund (presumably the S&P 500
# ex-technology fund referred to in the heading above -- confirm).
holdings_url = "http://www.proshares.com/funds/spxt_daily_holdings.html"

# read_html returns every table found on the page; the first one is used
holdings_tables = pd.read_html(holdings_url)
df_spex = holdings_tables[0]

#df_spex.head()

In [10]:
%%time

def urlFor(t):
    """Return the stockrow quarterly balance-sheet export URL for ticker ``t``."""
    template = "https://stockrow.com/api/companies/{}/financials.xlsx?dimension=Q&section=Balance%20Sheet&sort=desc"
    return template.format(t)


def dfFor(t):
    """Download the balance sheet for ticker ``t``, one row per reporting date.

    The spreadsheet is read with line items as rows and dates as columns, then
    transposed so each reporting date becomes a row labelled "year-month-day".
    Rows of the raw sheet containing any missing value are dropped before the
    transpose. A ``Ticker`` column holding ``t`` is appended.

    Returns ``None`` (after printing "<ticker> failed") when the download
    raises ``HTTPError``.
    """
    def asLabel(header):
        # String headers pass through untouched; anything else is expected to
        # expose year/month/day attributes and is rendered as "Y-M-D".
        if isinstance(header, str):
            return header
        return "{}-{}-{}".format(header.year, header.month, header.day)

    try:
        sheet = pd.read_excel(urlFor(t))
        sheet.columns = [asLabel(header) for header in sheet.columns.to_list()]

        # "Unnamed: 0" holds the line-item names; they become column names
        # after the transpose.
        byDate = sheet.dropna().set_index("Unnamed: 0").T

        byDate['Ticker'] = [t] * byDate.shape[0]
        return byDate
    except HTTPError:
        print("{} failed".format(t))
        return None

# Lazily fetch one balance-sheet frame per ticker in the holdings table.
# dfFor returns None for tickers whose download failed.
map_sp500 = (dfFor(ticker) for ticker in df_spex['Ticker Symbol'].values)

# Stack all per-ticker frames (pandas.concat is documented to skip None entries)
all_df = pd.concat(map_sp500)

#all_df.head()

# The index holds "year-month-day" strings; expose them as a datetime column
all_df['dt'] = pd.to_datetime(all_df.index, format="%Y-%m-%d")

#all_df['dt'].head()

#all_df.dtypes

HCN failed
CBG failed
JEC failed
BHGE failed
- failed
CPU times: user 18.4 s, sys: 421 ms, total: 18.8 s
Wall time: 3min 50s


In [11]:
# Turn off Altair's row-count limit before charting the full `all_df` frame
alt.data_transformers.disable_max_rows()

def violinFor(df, metric='Total Debt'):
    """Build one centred density ("violin") panel per year for ``metric``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``dt`` column and a column named ``metric``.
    metric : str
        Column whose distribution is plotted.

    Returns
    -------
    An Altair chart faceted into columns by the year of ``dt``. The density is
    evaluated only between the 10th and 90th percentile of ``df[metric]``.
    """
    # Output field name for the density transform: the metric with spaces
    # replaced by underscores.
    densityField = metric.replace(" ", "_")

    # Rows lacking either the date or the metric are excluded from the chart data
    observed = df[['dt', metric]].dropna()

    # Percentiles are taken from the unfiltered column
    lowerBound = df[metric].quantile(0.1)
    upperBound = df[metric].quantile(0.9)

    chart = alt.Chart(observed)
    chart = chart.transform_calculate(yr='year(datum.dt)')
    chart = chart.transform_density(
        metric,
        as_=[densityField, 'density'],
        extent=[lowerBound, upperBound],
        maxsteps=1000,
        groupby=['yr'],
    )
    chart = chart.mark_area(orient='horizontal')
    chart = chart.encode(
        y=alt.Y(f'{densityField}:Q'),
        color=alt.Color('yr:N', title='Year'),
        # stack='center' mirrors the density around the axis, giving the violin shape
        x=alt.X(
            'density:Q',
            stack='center',
            impute=None,
            title=None,
            axis=alt.Axis(labels=False, values=[0], grid=False, ticks=True),
        ),
        column=alt.Column(
            'yr:N',
            header=alt.Header(
                titleOrient='bottom',
                labelOrient='bottom',
                labelPadding=0,
            ),
        ),
    )
    return chart.properties(
        title=f"Changing {metric} Profile of S&P 500 (ex tech)",
        width=100,
    )

# Per-year violins for the default metric, 'Total Debt'
violinFor(all_df)

In [12]:
# Per-year violins for the 'Treasury Stock' column
violinFor(all_df, metric='Treasury Stock')

In [13]:
# One violin chart per balance-sheet column; 'dt' and 'Ticker' are bookkeeping
# columns rather than metrics, so they are skipped.
charts = [
    violinFor(all_df, metric=column)
    for column in all_df.columns
    if column not in ['dt', 'Ticker']
]

#alt.vconcat(*charts)

In [14]:
def chartFor(df, metric='Long Term Debt (Total)'):
    """Line chart of ``metric`` summed over all rows of ``df``, per quarter.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime ``dt`` column and a column named ``metric``.
    metric : str
        Column to aggregate and plot.

    Returns
    -------
    An Altair line chart whose data has exactly two columns, ``dt`` and
    ``metric``. Values are summed per distinct ``dt``, then averaged within
    each calendar quarter. The most recent quarter is left out of the plot.
    """
    # Aggregate only the requested metric. Summing the whole frame pulled in
    # the string `Ticker` column and the reset index as well: recent pandas
    # versions no longer drop such columns silently, and a quarterly mean over
    # them cannot be computed. It also repeated the work for every column on
    # every call.
    totals = df[['dt', metric]].groupby("dt").sum()

    # Quarter-end bins (quarters ending Mar/Jun/Sep/Dec). An offset object is
    # used because the '1Q' string alias is deprecated in recent pandas.
    quarterly = totals.resample(pd.offsets.QuarterEnd(startingMonth=12)).mean()
    #yoy = summed.sort_index().dropna().pct_change(12).dropna().apply(lambda v: v * 100)

    # [:-1] drops the last (most recent) quarter from the plotted data
    return alt.Chart(quarterly.sort_index().reset_index()[:-1]).mark_line().encode(
        alt.X("dt:T", axis=alt.Axis(title='')),
        alt.Y("{}:Q".format(metric), axis=alt.Axis(title="{} [USD]".format(metric))),
        tooltip=[alt.Tooltip("dt:T", format="%b %Y"), alt.Tooltip("{}:Q".format(metric))]
    ).properties(
        title="{} trend for S&P 500 (Ex Technology)".format(metric),
        width=700,
        height=450
    )
    
# Quarterly trend of the default metric, 'Long Term Debt (Total)'
chartFor(all_df)

In [15]:
# The row filter does not depend on the column being charted, so it is applied
# once here rather than once per column inside the loop.
all_df_populated = all_df.dropna(how='all')

# One trend chart per balance-sheet column; 'dt' and 'Ticker' are bookkeeping
# columns rather than metrics, so they are skipped.
lcharts = [
    chartFor(all_df_populated, metric=column)
    for column in all_df.columns
    if column not in ('dt', 'Ticker')
]

# Stack every chart vertically in a single output
alt.vconcat(*lcharts)

In [16]:
def yoyChartFor(df, metric='Long Term Debt (Total)'):
    """Bar chart of the year-over-year percentage change of ``metric``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime ``dt`` column and a column named ``metric``.
    metric : str
        Column to aggregate and plot.

    Returns
    -------
    An Altair bar chart whose data has exactly two columns, ``dt`` and
    ``metric``. Values are summed per distinct ``dt``, averaged within each
    calendar quarter, then compared with the value four quarters earlier and
    expressed in percent. The most recent quarter is left out of the plot.
    """
    # Aggregate only the requested metric. Summing the whole frame pulled in
    # the string `Ticker` column and the reset index as well: recent pandas
    # versions no longer drop such columns silently, and a quarterly mean over
    # them cannot be computed.
    totals = df[['dt', metric]].groupby("dt").sum()

    # Quarter-end bins (quarters ending Mar/Jun/Sep/Dec). An offset object is
    # used because the '1Q' string alias is deprecated in recent pandas.
    quarterly = totals.resample(pd.offsets.QuarterEnd(startingMonth=12)).mean()

    # Empty quarters are removed first so that a shift of 4 rows spans a year
    # of observed quarters; the leading rows without a comparison are dropped.
    yoy = quarterly.sort_index().dropna().pct_change(4).dropna() * 100

    # [:-1] drops the last (most recent) quarter from the plotted data
    return alt.Chart(yoy.sort_index().reset_index()[:-1]).mark_bar().encode(
        alt.X("dt:T", axis=alt.Axis(title='')),
        alt.Y("{}:Q".format(metric), axis=alt.Axis(title="{} Growth [year-over-year %]".format(metric))),
        tooltip=[alt.Tooltip("dt:T", format="%b %Y"), alt.Tooltip("{}:Q".format(metric))]
    ).properties(
        title="{} growth for S&P 500 (Ex Technology)".format(metric),
        width=700,
        height=450
    )
    
# Year-over-year growth of the default metric, 'Long Term Debt (Total)'
yoyChartFor(all_df)

In [17]:
# The row filter does not depend on the column being charted, so it is applied
# once here rather than once per column inside the loop.
all_df_populated = all_df.dropna(how='all')

# One year-over-year chart per balance-sheet column; 'dt' and 'Ticker' are
# bookkeeping columns rather than metrics, so they are skipped.
ycharts = [
    yoyChartFor(all_df_populated, metric=column)
    for column in all_df.columns
    if column not in ('dt', 'Ticker')
]

# Stack every chart vertically in a single output
alt.vconcat(*ycharts)

In [18]:
# Year-over-year growth of the 'Pension and Post-Retirement Liabilities' column
yoyChartFor(all_df, metric='Pension and Post-Retirement Liabilities')

In [19]:
# Year-over-year growth of the 'Liabilities (Preferred Stock)' column
yoyChartFor(all_df, metric='Liabilities (Preferred Stock)')

In [20]:
# Year-over-year growth of the 'Shares (Common)' column
yoyChartFor(all_df, metric='Shares (Common)')

In [21]:
# Quarterly trend of the 'Shares (Common)' column
chartFor(all_df, metric='Shares (Common)')

In [22]:
# Quarterly trend of the 'Goodwill and Intangible Assets (Total)' column
chartFor(all_df, metric='Goodwill and Intangible Assets (Total)')

In [23]:
# Quarterly trend of the 'Dividends Payable' column
chartFor(all_df, metric='Dividends Payable')

In [24]:
# Quarterly trend of net property, plant and equipment.
# NOTE(review): the column name is spelled "Equpment" here; presumably that
# matches the downloaded data -- confirm before correcting it.
chartFor(all_df, metric='Property, Plant, Equpment (Net)')

In [25]:
# Quarterly trend of the 'Receivables' column
chartFor(all_df, metric='Receivables')

In [26]:
# Quarterly trend of the 'Inventory' column
chartFor(all_df, metric='Inventory')

In [27]:
# Overlay the 'Receivables' and 'Accounts Payable' trends in one layered chart
chartFor(all_df, metric='Receivables') + chartFor(all_df, metric='Accounts Payable')

In [28]:
# Overlay the 'Total Assets' and 'Total liabilities' trends in one layered chart
chartFor(all_df, metric='Total Assets') + chartFor(all_df, metric='Total liabilities')

In [29]:
# Quarterly trend of the 'Treasury Stock' column
chartFor(all_df, metric='Treasury Stock')

In [30]:
# Quarterly trend of the 'Cash and Short Term Investments' column
chartFor(all_df, metric='Cash and Short Term Investments')