## This notebook analyses missingness of the base Index S&P500

In [1]:
import pandas as pd
import numpy as np
from datetime import date, datetime
import holidays

# Analysis for S&P500 index

### Are there missing values on the original data?

In [2]:
# Load the raw S&P 500 price history and preview its last rows.
# The path uses a plain forward slash: the previous '\/' was an invalid string
# escape that left a literal backslash in the path, which only resolves on
# Windows. The 'Date' column stays as text here; later cells parse it with
# strptime.
v_sp500 = pd.read_csv('input/S&P500.csv', parse_dates=True)
v_sp500.tail()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
5079,2020-03-12,2630.860107,2660.949951,2478.860107,2480.639893,2480.639893,8829380000
5080,2020-03-13,2569.98999,2711.330078,2492.370117,2711.02002,2711.02002,8258670000
5081,2020-03-16,2508.590088,2562.97998,2380.939941,2386.129883,2386.129883,7781540000
5082,2020-03-17,2425.659912,2553.929932,2367.040039,2529.189941,2529.189941,8358500000
5083,2020-03-18,2436.5,2453.570068,2280.52002,2398.100098,2398.100098,8755780000


In [3]:
# Report how many rows the raw file contains.
print("Number of lines: {}".format(len(v_sp500)))

Number of lines: 5084


In [4]:
# Boolean Series, one entry per row: True when the row has at least one
# missing value. Vectorised replacement for the row-wise apply/lambda, which
# compared len(row) with row.count() in Python for every row.
at_least_one_value_missing = v_sp500.isna().any(axis=1)

In [5]:
# Tally the per-row flags; if only `False` is listed, no row has a missing value.
at_least_one_value_missing.value_counts()

False    5084
dtype: int64

#### There are no missing values on the original data.
### Are there missing days on the original data?

In [6]:
# Every Monday-to-Friday calendar day in the analysed window, as a plain
# Python list of datetime.date objects.
business_dates = list(
    pd.bdate_range(start='2000-01-03', end='2020-03-18').date
)

In [7]:
# Size of the business-day calendar before any holidays are removed.
num_business_days = len(business_dates)
print("Number of business days: {}".format(num_business_days))

Number of business days: 5273


In [8]:
# How many business days have no corresponding row in the index file.
difference = num_business_days - len(v_sp500)
print(
    "Difference between number of business days and number of available data: {}".format(
        difference
    )
)

Difference between number of business days and number of available data: 189


#### There are 189 business days missing from the data.
### Are those holidays? 
#### It is possible: the data spans roughly 20 years, the USA has approximately 10 holidays per year, and a few of those fall on weekends (which are already excluded from the business-day count).

In [9]:
# Start with an empty list of expected trading days and build the US holiday
# calendar for each year from 2000 to 2020 inclusive.
dates = []
years = [*range(2000, 2021)]
us_holidays = holidays.UnitedStates(
    years=years,
    expand=False,
    observed=False,
)

In [10]:
# Split the business days into expected trading days (`dates`) and holidays,
# printing each holiday that falls on a business day.
# - The holiday keys are collected into a set once, instead of rebuilding a
#   list on every iteration.
# - The loop variable no longer shadows the `date` class imported from
#   `datetime`.
# - `dates` is rebuilt from scratch, so re-running this cell cannot append
#   duplicate entries.
holiday_dates = set(us_holidays.keys())
dates = []
for business_day in business_dates:
    if business_day in holiday_dates:
        print("Holiday", business_day)
    else:
        dates.append(business_day)

Holiday 2000-01-17
Holiday 2000-02-21
Holiday 2000-05-29
Holiday 2000-07-04
Holiday 2000-09-04
Holiday 2000-10-09
Holiday 2000-11-23
Holiday 2000-12-25
Holiday 2001-01-01
Holiday 2001-01-15
Holiday 2001-02-19
Holiday 2001-05-28
Holiday 2001-07-04
Holiday 2001-09-03
Holiday 2001-10-08
Holiday 2001-11-22
Holiday 2001-12-25
Holiday 2002-01-01
Holiday 2002-01-21
Holiday 2002-02-18
Holiday 2002-05-27
Holiday 2002-07-04
Holiday 2002-09-02
Holiday 2002-10-14
Holiday 2002-11-11
Holiday 2002-11-28
Holiday 2002-12-25
Holiday 2003-01-01
Holiday 2003-01-20
Holiday 2003-02-17
Holiday 2003-05-26
Holiday 2003-07-04
Holiday 2003-09-01
Holiday 2003-10-13
Holiday 2003-11-11
Holiday 2003-11-27
Holiday 2003-12-25
Holiday 2004-01-01
Holiday 2004-01-19
Holiday 2004-02-16
Holiday 2004-05-31
Holiday 2004-09-06
Holiday 2004-10-11
Holiday 2004-11-11
Holiday 2004-11-25
Holiday 2005-01-17
Holiday 2005-02-21
Holiday 2005-05-30
Holiday 2005-07-04
Holiday 2005-09-05
Holiday 2005-10-10
Holiday 2005-11-11
Holiday 2005

### Dates present in sp500 but not in our date list

In [11]:
# Print every date that has index data but is absent from `dates`, then the
# total. Each 'Date' string is parsed once, membership is checked against a
# set (the list lookup made this loop quadratic), and the loop variable no
# longer shadows the `date` class imported from `datetime`.
expected_days = set(dates)
count = 0
for date_text in v_sp500['Date']:
    trading_day = datetime.strptime(date_text, '%Y-%m-%d').date()
    if trading_day not in expected_days:
        print(trading_day)
        count += 1

print(count, "dates are missing from our dates list")

2000-10-09
2001-10-08
2002-10-14
2002-11-11
2003-10-13
2003-11-11
2004-10-11
2004-11-11
2005-10-10
2005-11-11
2006-10-09
2007-10-08
2008-10-13
2008-11-11
2009-10-12
2009-11-11
2010-10-11
2010-11-11
2011-10-10
2011-11-11
2012-10-08
2013-10-14
2013-11-11
2014-10-13
2014-11-11
2015-10-12
2015-11-11
2016-10-10
2016-11-11
2017-10-09
2018-10-08
2019-10-14
2019-11-11
33 dates are missing from our dates list


#### These are all holidays (Columbus Day and Veterans Day) on which the index was still reported. No problem here.


### Dates present on our date list but missing from sp500

In [12]:
# Print every expected trading day that has no row in the index file, then the
# total. The available 'Date' strings are collected into a set once (the
# original recomputed `.unique()` on every iteration), each day is formatted
# once, and the loop variable no longer shadows the `date` class imported from
# `datetime`.
available_days = set(v_sp500['Date'])
count = 0
for expected_day in dates:
    day_text = expected_day.strftime('%Y-%m-%d')
    if day_text not in available_days:
        print(day_text)
        count += 1
print(count, " dates missing from sp500 list")

2000-04-21
2001-04-13
2001-09-11
2001-09-12
2001-09-13
2001-09-14
2002-03-29
2003-04-18
2004-04-09
2004-06-11
2004-07-05
2004-12-24
2005-03-25
2005-12-26
2006-01-02
2006-04-14
2007-01-02
2007-04-06
2008-03-21
2009-04-10
2009-07-03
2010-04-02
2010-07-05
2010-12-24
2011-04-22
2011-12-26
2012-01-02
2012-04-06
2012-10-29
2012-10-30
2013-03-29
2014-04-18
2015-04-03
2015-07-03
2016-03-25
2016-12-26
2017-01-02
2017-04-14
2018-03-30
2018-12-05
2019-04-19
41  dates missing from sp500 list


#### 41 dates are missing and need further investigation

#### After checking each of the missing dates, it was found that they were all days on which the stock market was closed. These days were then added to the holiday list and the analysis was redone.

In [13]:
# Reset the expected-trading-day list and rebuild the US holiday calendar for
# 2000 to 2020 inclusive, so the second pass starts from a clean state.
dates = []
years = [*range(2000, 2021)]
us_holidays = holidays.UnitedStates(
    years=years,
    expand=False,
    observed=False,
)

In [14]:
# Days that were found missing from the index file and were manually
# identified as market closures. They are kept in one chronologically ordered
# table instead of one assignment per line; the original list contained
# 2019-04-19 twice.
extra_market_closures = {
    '2000-04-21': 'Good Friday',
    '2001-04-13': 'Good Friday',
    '2001-09-11': '9-11',
    '2001-09-12': '9-11',
    '2001-09-13': '9-11',
    '2001-09-14': '9-11',
    '2002-03-29': 'Good Friday',
    '2003-04-18': 'Good Friday',
    '2004-04-09': 'Good Friday',
    '2004-06-11': 'Mourning Reagan',
    '2004-07-05': 'Independence Day',
    '2004-12-24': 'Christmas',
    '2005-03-25': 'Good Friday',
    '2005-12-26': 'Christmas',
    '2006-01-02': "New Year's",
    '2006-04-14': 'Good Friday',
    '2007-01-02': 'Mourning Ford',
    '2007-04-06': 'Good Friday',
    '2008-03-21': 'Good Friday',
    '2009-04-10': 'Good Friday',
    '2009-07-03': 'Independence Day',
    '2010-04-02': 'Good Friday',
    '2010-07-05': 'Independence Day',
    '2010-12-24': 'Christmas',
    '2011-04-22': 'Good Friday',
    '2011-12-26': 'Christmas',
    '2012-01-02': "New Year's",
    '2012-04-06': 'Good Friday',
    '2012-10-29': 'Hurricane Sandy',
    '2012-10-30': 'Hurricane Sandy',
    '2013-03-29': 'Good Friday',
    '2014-04-18': 'Good Friday',
    '2015-04-03': 'Good Friday',
    '2015-07-03': 'Independence Day',
    '2016-03-25': 'Good Friday',
    '2016-12-26': 'Christmas',
    '2017-01-02': "New Year's",
    '2017-04-14': 'Good Friday',
    '2018-03-30': 'Good Friday',
    '2018-12-05': 'Mourning Bush',
    '2019-04-19': 'Good Friday',
}

# Register each closure on the holiday calendar, keyed by a datetime as before.
for closure_day, closure_name in extra_market_closures.items():
    us_holidays[datetime.strptime(closure_day, '%Y-%m-%d')] = closure_name


In [15]:
# Second pass: split the business days into expected trading days (`dates`)
# and holidays, now using the calendar extended with the extra market closures.
# - The holiday keys are collected into a set once, instead of rebuilding a
#   list on every iteration.
# - The loop variable no longer shadows the `date` class imported from
#   `datetime`.
# - `dates` is rebuilt from scratch, so re-running this cell cannot append
#   duplicate entries.
holiday_dates = set(us_holidays.keys())
dates = []
for business_day in business_dates:
    if business_day in holiday_dates:
        print("Holiday", business_day)
    else:
        dates.append(business_day)

Holiday 2000-01-17
Holiday 2000-02-21
Holiday 2000-04-21
Holiday 2000-05-29
Holiday 2000-07-04
Holiday 2000-09-04
Holiday 2000-10-09
Holiday 2000-11-23
Holiday 2000-12-25
Holiday 2001-01-01
Holiday 2001-01-15
Holiday 2001-02-19
Holiday 2001-04-13
Holiday 2001-05-28
Holiday 2001-07-04
Holiday 2001-09-03
Holiday 2001-09-11
Holiday 2001-09-12
Holiday 2001-09-13
Holiday 2001-09-14
Holiday 2001-10-08
Holiday 2001-11-22
Holiday 2001-12-25
Holiday 2002-01-01
Holiday 2002-01-21
Holiday 2002-02-18
Holiday 2002-03-29
Holiday 2002-05-27
Holiday 2002-07-04
Holiday 2002-09-02
Holiday 2002-10-14
Holiday 2002-11-11
Holiday 2002-11-28
Holiday 2002-12-25
Holiday 2003-01-01
Holiday 2003-01-20
Holiday 2003-02-17
Holiday 2003-04-18
Holiday 2003-05-26
Holiday 2003-07-04
Holiday 2003-09-01
Holiday 2003-10-13
Holiday 2003-11-11
Holiday 2003-11-27
Holiday 2003-12-25
Holiday 2004-01-01
Holiday 2004-01-19
Holiday 2004-02-16
Holiday 2004-04-09
Holiday 2004-05-31
Holiday 2004-06-11
Holiday 2004-07-05
Holiday 2004


### Dates present on our date list but missing from sp500

In [16]:
# Re-check, against the updated `dates` list, which expected trading days have
# no row in the index file, then print the total. The available 'Date' strings
# are collected into a set once (the original recomputed `.unique()` on every
# iteration), each day is formatted once, and the loop variable no longer
# shadows the `date` class imported from `datetime`.
available_days = set(v_sp500['Date'])
count = 0
for expected_day in dates:
    day_text = expected_day.strftime('%Y-%m-%d')
    if day_text not in available_days:
        print(day_text)
        count += 1
print(count, " dates missing from sp500 list")

0  dates missing from sp500 list


#### There are no missing dates