In [6]:
import yfinance as yf
import pandas as pd
import numpy as np

import plotly.express as px


# wont give exact match; going out 40 years will be off by a few weeks, but it doesn't matter
# for our purposes
num_trading_days_per_year = 252

period = "100y"
symbol_sp500 = "^GSPC"  # S&P 500
symbol_dow_jones = "DJI"  # Dow Jones Index
symbol_vanguard_total_stock_etf = "VTI" # Vanguard Total Stock Market ETF

def create_title(title: str, subtitle: str = None) -> str:
    title = f"<b>{title}</b>"
    if subtitle:
        title += f"<br><sup>{subtitle}</sup>"
    return title

def annualized_return(initial: float, final: float, num_years) -> float:
    return (final / initial) ** (1 / num_years) - 1

def percent_increase(initial: float, final: float) -> float:
    return (final - initial) / initial

def format_percent(x: float, round_by=2) -> str:
    return f"{round(x * 100, round_by)}%"

In [19]:
def get_percent_increases(df: pd.DataFrame, num_years: int) -> pd.DataFrame:
    """
    Given a DataFrame of daily close prices with a datetime index and a corresponding `Close`
    column, this function will return a DataFrame with three added columns. One column, `Target`
    will contain the value of the stock/etf/etc *approximately* `num_years` from the coresponding 
    date for each row of the DataFrame. Note that the daily prices only contain rows for weekdays
    excluding holidays so it will be close, but not exactly `num_years` from the date. The second
    column added will be a `Percent Increase` column, and the third will be a `Annualized
    Return` column
    
    Args:
        df: DataFrame containing daily close prices
        num_years: the number of years in the future to grab the close price.
    """
    # X years in this data is not X*10, because it only shows weekdays excluding holidays
    # so we are just getting an approximate number of rows to skip that represents a X-year span 
    num_rows_horizon = num_years * num_trading_days_per_year

    df = df[['Close']].copy()
    df['Target Date'] = pd.DataFrame(pd.to_datetime(df.index)).shift(num_rows_horizon * -1)['Date'].values
    df['Target'] = df['Close'].shift(num_rows_horizon * -1)
    df['Percent Increase'] = percent_increase(initial=df['Close'], final=df['Target'])
    df['Annualized Return'] = annualized_return(
        initial=df['Close'],
        final=df['Target'],
        num_years=num_years
    )
    return df

# get_percent_increases(daily_prices, num_years=10)

# Dow Jones

In [9]:
symbol = symbol_dow_jones
daily_prices = yf.Ticker(symbol).history(period=period)
daily_prices.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1970-01-02 00:00:00-05:00,809.200012,809.200012,809.200012,809.200012,0,0.0,0.0
1970-01-05 00:00:00-05:00,811.309998,811.309998,811.309998,811.309998,0,0.0,0.0
1970-01-06 00:00:00-05:00,803.659973,803.659973,803.659973,803.659973,0,0.0,0.0
1970-01-07 00:00:00-05:00,801.809998,801.809998,801.809998,801.809998,0,0.0,0.0
1970-01-08 00:00:00-05:00,802.070007,802.070007,802.070007,802.070007,0,0.0,0.0


In [4]:
fig = px.line(
    daily_prices,
    x=daily_prices.index,
    y='Close',
    title=create_title(f"Daily Close Price of `{symbol}`"),
    labels={'Close': "Close Price"},
)
fig.show()

In [5]:
# create a scatter plot so that we can get extract the regression line
# (can only do regression line with scatter)
subset = daily_prices#.loc['1980-01-01':'2020-01-01']
fig_scatter = px.scatter(
    subset,
    x=subset.index,
    y=np.log(subset['Close']),
    trendline="ols",
)
fig = px.line(
    daily_prices,
    x=daily_prices.index,
    y=np.log(daily_prices['Close']),
    title=create_title(
        title=f"Log of Daily Close Prices of `{symbol}`",
        subtitle="On a logarithmic scale, each increment represents a multiplicative increase " \
        "rather than an additive increase."
    ),
    labels={'y': "Log Close Price"},
)
fig.add_trace(fig_scatter.data[1])
fig.show()

- need to consider various time windows (am i investing for 10 years, 20 years?)

# How much would I make in 10 years, if I invested one amount at a single point in time?

In [10]:
prices_after_10_years = get_percent_increases(df=daily_prices, num_years=10)
prices_after_10_years

Unnamed: 0_level_0,Close,Target Date,Target,Percent Increase,Annualized Return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1970-01-02 00:00:00-05:00,809.200012,1980-01-02 05:00:00,824.570007,0.018994,0.001883
1970-01-05 00:00:00-05:00,811.309998,1980-01-03 05:00:00,820.309998,0.011093,0.001104
1970-01-06 00:00:00-05:00,803.659973,1980-01-04 05:00:00,828.840027,0.031332,0.003090
1970-01-07 00:00:00-05:00,801.809998,1980-01-07 05:00:00,832.000000,0.037652,0.003703
1970-01-08 00:00:00-05:00,802.070007,1980-01-08 05:00:00,851.710022,0.061890,0.006023
...,...,...,...,...,...
2022-02-24 00:00:00-05:00,33223.800781,NaT,,,
2022-02-25 00:00:00-05:00,34058.800781,NaT,,,
2022-02-28 00:00:00-05:00,33892.601562,NaT,,,
2022-03-01 00:00:00-05:00,33294.898438,NaT,,,


In [11]:
fig = px.box(
    data_frame=prices_after_10_years,
    y='Percent Increase',
    title=create_title(
        title=f"Percent Increase over 10 Years (`{symbol}`)",
        subtitle="Each point in the data represents investing at a particular point in time.<br>" \
            "This graph shows the percent increase of investing a single sum at a specific " \
            "point and then waiting 10 years."
    ),
)
fig.layout.yaxis.tickformat = ',.0%'
fig.show()

In [12]:
fig = px.box(
    data_frame=prices_after_10_years,
    y='Annualized Return',
    title=create_title(
        title=f"Annualized Rate of Return over 10 Years (`{symbol}`)",
        subtitle="Each point in the data represents investing at a particular point in time.<br>" \
            "This graph shows the annualized rate of investing a single sum at a specific " \
            "point and then waiting 10 years."
    ),
)
fig.layout.yaxis.tickformat = ',.0%'
fig.show()

In [13]:
# create a scatter plot so that we can get extract the regression line (can only do regression line with scatter)
fig_scatter = px.scatter(
    prices_after_10_years,
    x=prices_after_10_years.index,
    y='Percent Increase',
    trendline="ols",
)
fig = px.line(
    prices_after_10_years,
    x=prices_after_10_years.index,
    y='Percent Increase',
    title=create_title(
        title=f"Percent Increase in Investment Over 10 Years (`{symbol}`)",
        subtitle="Assuming a single investment at investment date," \
            "<br>This graph shows the percent increase of investing at that " \
            "point and then waiting 10 years."
    ),
    labels={'x', "Investment Date"}
)
fig.add_hline(y=0, line_color='red')
fig.add_trace(fig_scatter.data[1])
fig.layout.yaxis.tickformat = ',.0%'
fig.show()

In [14]:
# create a scatter plot so that we can get extract the regression line (can only do regression line with scatter)
fig_scatter = px.scatter(
    prices_after_10_years,
    x=prices_after_10_years.index,
    y='Annualized Return',
    trendline="ols",
)
fig = px.line(
    prices_after_10_years,
    x=prices_after_10_years.index,
    y='Annualized Return',
    title=create_title(
        title=f"Annualized Return in Investment Over 10 Years (`{symbol}`)",
        subtitle="Assuming a single investment at investment date," \
            "<br>This graph shows the annualized return of investing at that " \
            "point and then waiting 10 years."
    ),
    labels={'x', "Investment Date"}
)
fig.add_hline(y=0, line_color='red')
fig.add_trace(fig_scatter.data[1])
fig.layout.yaxis.tickformat = ',.0%'
fig.show()

In [15]:
temp = pd.melt(
    prices_after_10_years.drop(columns='Percent Increase'),
    value_vars=['Close', 'Target'],
    ignore_index=False
)
fig = px.line(
    temp,
    x=temp.index,
    y='value',
    color='variable',
    labels={'x', "Date"}
)
fig.show()

---

# If I invested $`X` in each month over `Y` years with `Z` index(es), how much money will I have made?

In [21]:
monthly_investment = 100
year_start = 1970
num_years = 10

prices_after_10_years.head()

Unnamed: 0_level_0,Close,Target Date,Target,Percent Increase,Annualized Return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1970-01-02 00:00:00-05:00,809.200012,1980-01-02 05:00:00,824.570007,0.018994,0.001883
1970-01-05 00:00:00-05:00,811.309998,1980-01-03 05:00:00,820.309998,0.011093,0.001104
1970-01-06 00:00:00-05:00,803.659973,1980-01-04 05:00:00,828.840027,0.031332,0.00309
1970-01-07 00:00:00-05:00,801.809998,1980-01-07 05:00:00,832.0,0.037652,0.003703
1970-01-08 00:00:00-05:00,802.070007,1980-01-08 05:00:00,851.710022,0.06189,0.006023


- For now, assume I start investing on the first of each month

In [22]:
mask = (prices_after_10_years.index.year == 1970) & (prices_after_10_years.index.month == 12)
first_row = prices_after_10_years.loc[mask].iloc[1]
first_row.name

Timestamp('1970-12-02 00:00:00-0500', tz='America/New_York')

In [23]:
investments = np.empty(num_years * 12)
investments[:] = np.nan

payouts = np.empty(num_years * 12)
payouts[:] = np.nan

index = 0
for year_index in range(num_years):
    # print(year_start + year_index)
    for month in range(1, 13):
        mask = (prices_after_10_years.index.year == year_start) & (prices_after_10_years.index.month == month)
        row = prices_after_10_years.loc[mask].iloc[1]
        investments[index] = 100
        assert ~np.isnan(row['Percent Increase'])
        payouts[index] = 100 * (1 + row['Percent Increase'])
        index += 1

assert (~np.isnan(payouts)).all()
assert (~np.isnan(investments)).all()
assert sum(investments) == 10*12*100

print(investments)
print(payouts)

[100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100.
 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100.
 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100.
 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100.
 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100.
 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100.
 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100.
 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100. 100.
 100. 100. 100. 100. 100. 100. 100. 100.]
[101.10931703 116.37313586 109.61621922  99.16453233 113.18713345
 119.41066122 127.11785717 128.26284147 125.97271552 122.98214776
 122.0201288  121.134008   101.10931703 116.37313586 109.61621922
  99.16453233 113.18713345 119.41066122 127.11785717 128.26284147
 125.97271552 122.98214776 122.0201288  121.134008   101.10931703
 116.37313586 109.61621922  99.16453233 113.18713345 119.410

In [24]:
sum(investments)

12000.0

In [25]:
sum(payouts)

14063.50697833597

In [26]:
f"{round(percent_increase(initial=sum(investments), final=sum(payouts)) * 100, 2)}%"

'17.2%'

In [27]:
f"{round(annualized_return(initial=sum(investments), final=sum(payouts), num_years=num_years) * 100, 2)}%"

'1.6%'

In [30]:
from typing import Tuple


#This assumes e.g. that i hold all investments for 10 years; i.e. on day 1 i sell it 10 years from
# that date so ~1980 and the last investment is also held for 10 years so it is sold e.g. ~1990

# Alternatively I could hold them until a specific date

# Alternatively i could start selling off X years from a target date

# Need to measure success not only on returns but also risk; e.g. best/worst case scenario

def monthly_investments(
        daily_prices: pd.DataFrame,
        start_year: int,
        # start_month: int,
        monthly_investment: float,
        investment_years: int,
        horizon_years: int, 
        ) -> Tuple[float, float, float, float]:
    """
    Description
    
    Args:
        daily_prices:
            dataframe that has `Close` and `Target` columns.
            `Close` represents the closing price at the end of the correpsonding day
            `Target` represents the closing price X years from that date where X is represented by
                `horizon_years`
        start_year: the year to start investing (current assumes the first month of that year)
        start_month: not implemented yet
        monthly_investment: the amount to invest each month
        investment_years:
            the number of years to invest the monthly sum of `monthly_investment`
        horizon_years:
            the number of years to hold the investment
    """

    investments = np.empty(investment_years * 12)
    investments[:] = np.nan

    payouts = np.empty(investment_years * 12)
    payouts[:] = np.nan

    # get the close price X years into the future
    df = get_percent_increases(df=daily_prices, num_years=horizon_years)

    index = 0
    for year_index in range(investment_years):
        # print(year_start + year_index)
        for month in range(1, 13):
            mask = (df.index.year == start_year) & (df.index.month == month)
            row = df.loc[mask].iloc[1]
            investments[index] = monthly_investment
            assert ~np.isnan(row['Percent Increase'])
            payouts[index] = monthly_investment * (1 + row['Percent Increase'])
            index += 1

    assert (~np.isnan(payouts)).all()
    assert (~np.isnan(investments)).all()
    assert sum(investments) == investment_years*12*monthly_investment

    perc_increase = percent_increase(initial=sum(investments), final=sum(payouts))
    ann_return = annualized_return(
        initial=sum(investments),
        final=sum(payouts),
        num_years=horizon_years
    )
    return investments, payouts, perc_increase, ann_return

12000.0
14063.50697833597
17.2%
1.6%


In [36]:
investments, payouts, perc_increase, ann_return = monthly_investments(
    daily_prices=daily_prices,
    start_year=1970,
    # start_month=1,
    monthly_investment=100,
    investment_years=10,
    horizon_years=10,
)
print(sum(investments))
print(sum(payouts))
print(format_percent(perc_increase))
print(format_percent(ann_return))

12000.0
14063.50697833597
17.2%
1.6%


In [37]:
print(investments[-1])
print(payouts[-1])

100.0
121.13400799698626


In [38]:
prices_after_10_years.iloc[2526]

Close                         824.570007
Target Date          1989-12-28 05:00:00
Target                       2732.300049
Percent Increase                2.313606
Annualized Return               0.127276
Name: 1980-01-02 00:00:00-05:00, dtype: object

In [None]:
investments

In [33]:
investments, payouts, perc_increase, ann_return = monthly_investments(
    daily_prices=daily_prices,
    start_year=1980,
    # start_month=1,
    monthly_investment=100,
    investment_years=10,
    horizon_years=10,
)
print(sum(investments))
print(sum(payouts))
print(format_percent(perc_increase))
print(format_percent(ann_return))

12000.0
36501.26336767385
204.18%
11.77%


In [35]:
prices_after_10_years.iloc[2526]

Close                         824.570007
Target Date          1989-12-28 05:00:00
Target                       2732.300049
Percent Increase                2.313606
Annualized Return               0.127276
Name: 1980-01-02 00:00:00-05:00, dtype: object

# Archive

## Show how log is multiplicative increase 

In [None]:
# Starting at 100, create an array where each subsequent value is an increase of 10%
temp = np.zeros(20)
temp[0] = 100

percent_increase = 0.10
previous = temp[0]

for index, value in enumerate(temp[1:]):
    temp[index + 1] = previous * (1 + percent_increase)
    previous = temp[index + 1]

temp

In [None]:
import matplotlib.pyplot as plt
import numpy as np
fig, ax = plt.subplots()
ax.plot(temp)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
fig, ax = plt.subplots()
ax.plot(np.log(temp))
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
fig, ax = plt.subplots()
ax.plot(np.log2(temp))
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
fig, ax = plt.subplots()
ax.plot(np.log10(temp))
plt.show()

## Figure out close price X years from given date

In [None]:
# number of rows for 5 years
np.argmax(daily_prices.index >= '1975-01-01')

In [None]:
1263 * 2

In [None]:
# number of rows for 10 years
np.argmax(daily_prices.index >= '1980-01-01')

In [None]:
2526 * 2

In [None]:
# number of rows for 20 years
np.argmax(daily_prices.index >= '1990-01-01')

In [None]:
# 10 years in this data is not 365*10, because it only shows weekdays excluding holidays
# so we are just getting an approximate number of rows to skip that represents a 10-year span 
num_rows_in_10_years = np.argmax(daily_prices.index >= '1980-01-01')

df = daily_prices[['Close']].copy()
df['Target'] = df['Close'].shift(num_rows_in_10_years * -1)
df['Percent Increase'] = percent_increase(initial=df['Close'], final=df['Target'])
df['Annualized Return'] = annualized_return(
    initial=df['Close'],
    final=df['Target'],
    num_years=10
)
print(num_rows_in_10_years)
df

In [None]:
def get_percent_increases(df: pd.DataFrame, num_years: int) -> pd.DataFrame:
    """
    Given a DataFrame of daily close prices with a datetime index and a corresponding `Close`
    column, this function will return a DataFrame with three added columns. One column, `Target`
    will contain the value of the stock/etf/etc *approximately* `num_years` from the coresponding 
    date for each row of the DataFrame. Note that the daily prices only contain rows for weekdays
    excluding holidays so it will be close, but not exactly `num_years` from the date. The second
    column added will be a `Percent Increase` column, and the third will be a `Annualized
    Return` column
    
    Args:
        df: DataFrame containing daily close prices
        num_years: the number of years in the future to grab the close price.
    """
    # X years in this data is not X*10, because it only shows weekdays excluding holidays
    # so we are just getting an approximate number of rows to skip that represents a X-year span 
    if num_years == 5:
        num_rows_horizon = 1263
    elif num_years == 10:
        num_rows_horizon = 2526
    elif num_years == 20:
        num_rows_horizon = 5054
    else:
        raise ValueError("This horizon is not supported.")

    df = df[['Close']].copy()
    df['Target Date'] = pd.DataFrame(pd.to_datetime(df.index)).shift(num_rows_horizon * -1)['Date'].values
    df['Target'] = df['Close'].shift(num_rows_horizon * -1)
    df['Percent Increase'] = percent_increase(initial=df['Close'], final=df['Target'])
    return df

get_percent_increases(daily_prices, num_years=10)

In [None]:
daily_prices.head()  # should remain unchanged