# Import Libraries

In [2]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import time

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn; seaborn.set(style='white')

# Import Data

Yahoo Finance ^GSPC
* Select
    * Time Period: Jan 03, 1950 - Jul 05, 2018
    * Show: Historical Prices
    * Frequency: Daily 
    * Click -> "Apply"
    * Click -> "Download Data"
* URL: https://finance.yahoo.com/quote/%5EGSPC/history/

In [2]:
# Location of the downloaded price history, relative to the notebook's working directory
sp_data_csv_path = 'sp500_yahoo_data_gspc_daily.csv'
# Read with pandas defaults; no date parsing is requested here
df = pd.read_csv(sp_data_csv_path)

In [3]:
# Check column dtypes and non-null counts of the raw price table
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17237 entries, 0 to 17236
Data columns (total 7 columns):
Date         17237 non-null object
Open         17237 non-null float64
High         17237 non-null float64
Low          17237 non-null float64
Close        17237 non-null float64
Adj Close    17237 non-null float64
Volume       17237 non-null int64
dtypes: float64(5), int64(1), object(1)
memory usage: 942.7+ KB


In [4]:
# Peek at the first two rows to confirm the expected columns are present
df.head(2)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1950-01-03,16.66,16.66,16.66,16.66,16.66,1260000
1,1950-01-04,16.85,16.85,16.85,16.85,16.85,1890000


In [5]:
# (rows, columns) of the raw price table
df.shape

(17237, 7)

In [6]:
# Record the wall-clock start time; the runtime cell further down subtracts it from the end time
time_start = time.time()

# Time Period Calculations

In [7]:
# Range of holding-period lengths to enumerate, measured in rows of the price table.
# NOTE(review): 252*30 looks like ~30 years at ~252 trading days per year — confirm.
start_range = 1
end_range = 252*30

# Every row of the price table is a candidate period start; rename once, outside the loop.
df_starts = df[['Date', 'Close']].rename(columns={'Date': 'start_date', 'Close': 'start_price'})

list_of_dataframes = []
for n_days in np.arange(start_range, end_range + 1):
    df_period = df_starts.copy(deep=True)
    df_period['trading_days'] = n_days
    # A negative shift pulls the row n_days positions later up onto the start row.
    df_period['end_date'] = df_starts['start_date'].shift(periods=-n_days)
    df_period['end_price'] = df_starts['start_price'].shift(periods=-n_days)
    # The last n_days rows have no matching end row (NaN after the shift) and are dropped.
    list_of_dataframes.append(df_period.dropna())

# Stack all period lengths into one long table with a fresh 0..N-1 index.
df_list = pd.concat(list_of_dataframes).reset_index(drop=True)
df_list.head(2)

Unnamed: 0,start_date,start_price,trading_days,end_date,end_price
0,1950-01-03,16.66,1,1950-01-04,16.85
1,1950-01-04,16.85,1,1950-01-05,16.93


In [8]:
# Sanity check: each period length n contributes len(df) - n rows,
# so the row count should be sum(len(df) - n for n in start_range..end_range)
df_list.shape

(101731140, 5)

In [9]:
# Absolute price change over each holding period.
df_list['price_difference'] = df_list['end_price'].sub(df_list['start_price'])
# Simple return over the period, expressed as a fraction of the start price (0.01 == 1%).
df_list['roi'] = df_list['price_difference'].div(df_list['start_price'])
df_list.head(2)

Unnamed: 0,start_date,start_price,trading_days,end_date,end_price,price_difference,roi
0,1950-01-03,16.66,1,1950-01-04,16.85,0.19,0.011405
1,1950-01-04,16.85,1,1950-01-05,16.93,0.08,0.004748


In [10]:
# Put the two dates first, then the period length, then the prices and the derived metrics.
column_order = ['start_date', 'end_date', 'trading_days', 'start_price', 'end_price', 'price_difference', 'roi']
df_list = df_list.loc[:, column_order]

In [11]:
# Inspect the last rows; they come from the last frame appended in the loop (trading_days == end_range)
df_list.tail(2)

Unnamed: 0,start_date,end_date,trading_days,start_price,end_price,price_difference,roi
101731138,1988-07-01,2018-07-02,7560,271.779999,2726.709961,2454.929962,9.032784
101731139,1988-07-05,2018-07-03,7560,275.809998,2713.219971,2437.409973,8.837279


In [12]:
# Row count should be unchanged by the two derived columns and the reorder; only the column count grows
df_list.shape

(101731140, 7)

In [13]:
# Index uniqueness check: the number of distinct index labels should equal the row count.
print(df_list.index.nunique(dropna=False))

101731140


In [14]:
# Stop the clock: wall-clock time at the end of the period calculations
time_end = time.time()

def timer(start,end):
    """Print the elapsed time between two timestamps as HH:MM:SS.ss.

    Parameters
    ----------
    start, end : float
        Timestamps in seconds (e.g. values returned by ``time.time()``).

    The duration is rounded to hundredths of a second *before* it is split
    into hours/minutes/seconds. Rounding only at format time lets a value
    such as 59.996 s print as the invalid ``00:00:60.00``; rounding first
    carries it over correctly to ``00:01:00.00``.
    """
    total_centiseconds = int(round((end - start) * 100))
    hours, rem = divmod(total_centiseconds, 360000)
    minutes, centiseconds = divmod(rem, 6000)
    print("Calculations Runtime: {:0>2}:{:0>2}:{:05.2f}".format(hours, minutes, centiseconds / 100))

# Report how long the period calculations took (time_start was captured before the loop cell)
timer(time_start, time_end)

Calculations Runtime: 00:01:12.77


## Store to_csv: df_list

In [15]:
# Store the full period table as CSV; the file name encodes the trading-day range.
prefix = 'sp500_timeperiodpricereturn_output_'
filetype = '.csv'
trading_days_range = '{}-{}'.format(start_range, end_range)
filename = ''.join([prefix, trading_days_range, filetype])

print(filename)
# index=False: the 0..N-1 row index is not written to the file.
df_list.to_csv(filename, index=False, chunksize=100000)

sp500_timeperiodpricereturn_output_1-7560.csv


# Summary of trading_days

In [3]:
# Reload the stored period table from disk (presumably so the summary can be built
# without re-running the generation loop).
# NOTE(review): the file name is hard-coded and must match the file written by the store cell above.
df_list = pd.read_csv('sp500_timeperiodpricereturn_output_1-7560.csv')

In [29]:
# One row per holding-period length: number of periods plus basic ROI statistics.
df_list_summary = (df_list.groupby(['trading_days'])
                   .agg({'trading_days': 'count', 'roi': ['mean', 'median', 'min', 'max']}))

# The aggregation yields two-level column labels such as ('roi', 'mean').
# Flatten them to 'roi_mean' etc. in a single pass and give the row count its
# final name directly. Iterating the column index already yields the label
# tuples, so no ravel() call (deprecated on Index in pandas 1.2) is needed.
df_list_summary.columns = [
    'number_of_periods' if column == 'trading_days' else '_'.join([column, statistic])
    for column, statistic in df_list_summary.columns
]
# Spread between the best and the worst return for each period length.
df_list_summary['roi_range'] = df_list_summary['roi_max'] - df_list_summary['roi_min']
df_list_summary.head(3)

Unnamed: 0_level_0,number_of_periods,roi_mean,roi_median,roi_min,roi_max,roi_range
trading_days,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,17236,0.000342,0.000468,-0.204669,0.1158,0.32047
2,17235,0.000686,0.001031,-0.245706,0.149173,0.394879
3,17234,0.001026,0.001707,-0.263375,0.13948,0.402855


In [30]:
# Count, per holding-period length, how many periods ended flat-or-up vs. down.
# Summing a boolean mask per group counts its True values without calling a
# Python lambda once per group. NaN ROI values compare False in both masks, so
# (as before) they are counted in neither frame.
period_length_key = df_list['trading_days']
df_count_nonneg = (df_list['roi'].ge(0).groupby(period_length_key).sum()
                   .astype('int64').to_frame(name='number_of_periods_nonneg'))
df_count_neg = (df_list['roi'].lt(0).groupby(period_length_key).sum()
                .astype('int64').to_frame(name='number_of_periods_neg'))

In [31]:
# Attach both count frames to the summary, matching rows on the shared trading_days index.
for df_counts in (df_count_nonneg, df_count_neg):
    df_list_summary = df_list_summary.merge(df_counts, how='left', left_index=True, right_index=True)
df_list_summary.head(3)

Unnamed: 0_level_0,number_of_periods,roi_mean,roi_median,roi_min,roi_max,roi_range,number_of_periods_nonneg,number_of_periods_neg
trading_days,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,17236,0.000342,0.000468,-0.204669,0.1158,0.32047,9255,7981
2,17235,0.000686,0.001031,-0.245706,0.149173,0.394879,9420,7815
3,17234,0.001026,0.001707,-0.263375,0.13948,0.402855,9614,7620


In [32]:
# Share of periods in each sign bucket. Despite the "percent" name the values are fractions (0..1).
for sign in ('nonneg', 'neg'):
    df_list_summary['percent_of_periods_' + sign] = (
        df_list_summary['number_of_periods_' + sign].div(df_list_summary['number_of_periods']))
df_list_summary.head(3)

Unnamed: 0_level_0,number_of_periods,roi_mean,roi_median,roi_min,roi_max,roi_range,number_of_periods_nonneg,number_of_periods_neg,percent_of_periods_nonneg,percent_of_periods_neg
trading_days,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,17236,0.000342,0.000468,-0.204669,0.1158,0.32047,9255,7981,0.536958,0.463042
2,17235,0.000686,0.001031,-0.245706,0.149173,0.394879,9420,7815,0.546562,0.453438
3,17234,0.001026,0.001707,-0.263375,0.13948,0.402855,9614,7620,0.557851,0.442149


In [33]:
nonneg_share = df_list_summary['percent_of_periods_nonneg']

# Change in the non-negative share versus the previous summary row;
# the first row has no predecessor and is therefore NaN.
df_list_summary['marginal_nonneg_periods_per_day'] = nonneg_share.diff()

# Change in the non-negative share relative to the first summary row.
# NOTE: the column name says "pos" but it is derived from the non-negative share.
df_list_summary['cumulative_pos_periods_per_day'] = nonneg_share.sub(nonneg_share.iloc[0])

df_list_summary.head(3)

Unnamed: 0_level_0,number_of_periods,roi_mean,roi_median,roi_min,roi_max,roi_range,number_of_periods_nonneg,number_of_periods_neg,percent_of_periods_nonneg,percent_of_periods_neg,marginal_nonneg_periods_per_day,cumulative_pos_periods_per_day
trading_days,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,17236,0.000342,0.000468,-0.204669,0.1158,0.32047,9255,7981,0.536958,0.463042,,0.0
2,17235,0.000686,0.001031,-0.245706,0.149173,0.394879,9420,7815,0.546562,0.453438,0.009605,0.009605
3,17234,0.001026,0.001707,-0.263375,0.13948,0.402855,9614,7620,0.557851,0.442149,0.011289,0.020893


In [34]:
# Inspect the last summary rows (the largest trading_days values in the index)
df_list_summary.tail()

Unnamed: 0_level_0,number_of_periods,roi_mean,roi_median,roi_min,roi_max,roi_range,number_of_periods_nonneg,number_of_periods_neg,percent_of_periods_nonneg,percent_of_periods_neg,marginal_nonneg_periods_per_day,cumulative_pos_periods_per_day
trading_days,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
7556,9681,8.047896,7.717431,3.042454,19.805455,16.763001,9681,0,1.0,0.0,0.0,0.463042
7557,9680,8.050274,7.721897,3.056901,19.601385,16.544483,9680,0,1.0,0.0,0.0,0.463042
7558,9679,8.052662,7.723589,3.06268,19.739295,16.676614,9679,0,1.0,0.0,0.0,0.463042
7559,9678,8.055076,7.716492,3.079911,19.635878,16.555967,9678,0,1.0,0.0,0.0,0.463042
7560,9677,8.057493,7.720622,3.095808,20.322556,17.226748,9677,0,1.0,0.0,0.0,0.463042


In [35]:
# Expect exactly one NaN, in "marginal_nonneg_periods_per_day": its first row has no previous row to difference against
df_list_summary.isnull().sum()

number_of_periods                  0
roi_mean                           0
roi_median                         0
roi_min                            0
roi_max                            0
roi_range                          0
number_of_periods_nonneg           0
number_of_periods_neg              0
percent_of_periods_nonneg          0
percent_of_periods_neg             0
marginal_nonneg_periods_per_day    1
cumulative_pos_periods_per_day     0
dtype: int64

In [36]:
# Only the first row of the marginal column is expected to be NaN (nothing to
# difference against), so fill that column alone. A frame-wide fillna would
# also zero out any unexpected NaN in the other columns and hide it from the
# null check on the next line.
marginal_col = 'marginal_nonneg_periods_per_day'
df_list_summary[marginal_col] = df_list_summary[marginal_col].fillna(0)
df_list_summary.isnull().sum()

number_of_periods                  0
roi_mean                           0
roi_median                         0
roi_min                            0
roi_max                            0
roi_range                          0
number_of_periods_nonneg           0
number_of_periods_neg              0
percent_of_periods_nonneg          0
percent_of_periods_neg             0
marginal_nonneg_periods_per_day    0
cumulative_pos_periods_per_day     0
dtype: int64

In [37]:
# Confirm the first row's marginal value now shows 0 instead of NaN
df_list_summary.head()

Unnamed: 0_level_0,number_of_periods,roi_mean,roi_median,roi_min,roi_max,roi_range,number_of_periods_nonneg,number_of_periods_neg,percent_of_periods_nonneg,percent_of_periods_neg,marginal_nonneg_periods_per_day,cumulative_pos_periods_per_day
trading_days,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,17236,0.000342,0.000468,-0.204669,0.1158,0.32047,9255,7981,0.536958,0.463042,0.0,0.0
2,17235,0.000686,0.001031,-0.245706,0.149173,0.394879,9420,7815,0.546562,0.453438,0.009605,0.009605
3,17234,0.001026,0.001707,-0.263375,0.13948,0.402855,9614,7620,0.557851,0.442149,0.011289,0.020893
4,17233,0.001366,0.002385,-0.285133,0.179735,0.464868,9685,7548,0.562003,0.437997,0.004152,0.025046
5,17232,0.001704,0.0028,-0.27328,0.191112,0.464391,9759,7473,0.56633,0.43367,0.004327,0.029373


In [38]:
# Largest holding-period length present in the summary (the index holds trading_days)
max(df_list_summary.index)

7560

In [39]:
# One summary row per distinct trading_days value
df_list_summary.shape

(7560, 12)

## Store to_csv: df_list_summary

In [41]:
# Store the summary table as CSV; the file name encodes the trading-day range.
# NOTE(review): start_range / end_range are assigned in the period-generation cell; confirm
# they are still defined when this cell runs after the table has been reloaded from CSV.
prefix = 'sp500_timeperiodpricereturn_output_summary_'
filetype = '.csv'
trading_days_range = '{}-{}'.format(start_range, end_range)
filename = ''.join([prefix, trading_days_range, filetype])

print(filename)
# The index is written too (no index=False), so trading_days ends up as the first CSV column.
df_list_summary.to_csv(filename, chunksize=100000)

sp500_timeperiodpricereturn_output_summary_1-7560.csv
