# Daily Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)

In [2]:
cta = pd.read_csv('../../capstone-data/daily/CTA_-_Ridership_-_Daily_Boarding_Totals.csv')
weather = pd.read_csv('../../capstone-data/daily/weather.csv')
gas = pd.read_excel('../../capstone-data/daily/gasoline_stocks.xlsx')
covid20 = pd.read_csv('../../capstone-data/covid/OxCGRT_withnotes_2020.csv')
covid21 = pd.read_csv('../../capstone-data/covid/OxCGRT_withnotes_2021.csv')
vax = pd.read_csv('../../capstone-data/covid/COVID-19_Daily_Vaccinations_-_Chicago_Residents.csv')
curve = pd.read_excel('../../capstone-data/daily/treasury_yield_curve.xlsx')
crash = pd.read_csv('../../capstone-data/monthly/Traffic_Crashes_-_Crashes.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


## CTA Data

In [3]:
cta.tail()

Unnamed: 0,service_date,day_type,bus,rail_boardings,total_rides
7696,11/26/2021,W,257700,189694,447394
7697,11/27/2021,A,237839,187065,424904
7698,11/28/2021,U,184817,147830,332647
7699,11/29/2021,W,421322,276090,697412
7700,11/30/2021,W,450230,302349,752579


In [4]:
cta.duplicated().sum()

62

In [5]:
cta.isna().sum()

service_date      0
day_type          0
bus               0
rail_boardings    0
total_rides       0
dtype: int64

In [6]:
cta.drop_duplicates(inplace=True)

In [7]:
cta.drop(columns={'day_type'},inplace=True)

In [8]:
cta.shape

(7639, 4)

In [9]:
cta.rename(columns={'service_date':'date'},inplace=True)

In [10]:
cta['date'] = pd.to_datetime(cta['date'])

In [11]:
cta.set_index('date',inplace=True)

cta.sort_index(inplace=True)

In [12]:
cta.dtypes

bus               int64
rail_boardings    int64
total_rides       int64
dtype: object

In [13]:
cta.isna().sum()

bus               0
rail_boardings    0
total_rides       0
dtype: int64

In [14]:
cta.duplicated().sum()

0

In [15]:
cta=cta.asfreq('d')
cta.index

DatetimeIndex(['2001-01-01', '2001-01-02', '2001-01-03', '2001-01-04',
               '2001-01-05', '2001-01-06', '2001-01-07', '2001-01-08',
               '2001-01-09', '2001-01-10',
               ...
               '2021-11-21', '2021-11-22', '2021-11-23', '2021-11-24',
               '2021-11-25', '2021-11-26', '2021-11-27', '2021-11-28',
               '2021-11-29', '2021-11-30'],
              dtype='datetime64[ns]', name='date', length=7639, freq='D')

## Weather Data

In [16]:
weather = pd.read_csv('../../capstone-data/daily/weather.csv')

In [17]:
def daily_weather(weather):
    """
    A function that cleans the weather dataset, which comes as daily. 
    Commented out is optional code that can be edited to suit the needed format,
    ex. ...resample('MS').sum() for month starting on first day of that month.
    """
    weather.columns = weather.columns.str.lower()
    weather.drop(columns='station',inplace=True)
    weather['date'] = pd.to_datetime(weather['date'])
    weather.set_index('date',inplace=True)
    weather.sort_index(inplace=True)
    weather.drop(columns={'snow','snwd'},inplace=True)
    weather = weather.fillna(0)
    weather=weather.asfreq('d')
    return weather

In [18]:
weather = daily_weather(weather)

In [19]:
weather.index

DatetimeIndex(['2001-01-01', '2001-01-02', '2001-01-03', '2001-01-04',
               '2001-01-05', '2001-01-06', '2001-01-07', '2001-01-08',
               '2001-01-09', '2001-01-10',
               ...
               '2022-01-15', '2022-01-16', '2022-01-17', '2022-01-18',
               '2022-01-19', '2022-01-20', '2022-01-21', '2022-01-22',
               '2022-01-23', '2022-01-24'],
              dtype='datetime64[ns]', name='date', length=7694, freq='D')

In [20]:
weather.index.duplicated().sum()

0

In [21]:
weather.isna().sum()

prcp    0
tmax    0
tmin    0
dtype: int64

In [22]:
cta.shape, weather.shape

((7639, 3), (7694, 3))

### Merge CTA/Weather

In [23]:
daily = cta.merge(weather, how='outer', left_index=True, right_index=True)
daily

Unnamed: 0_level_0,bus,rail_boardings,total_rides,prcp,tmax,tmin
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2001-01-01,297192.0,126455.0,423647.0,0.0,26,13
2001-01-02,780827.0,501952.0,1282779.0,0.0,21,7
2001-01-03,824923.0,536432.0,1361355.0,0.0,28,7
2001-01-04,870021.0,550011.0,1420032.0,0.0,32,23
2001-01-05,890426.0,557917.0,1448343.0,0.0,37,24
...,...,...,...,...,...,...
2022-01-20,,,,0.0,22,9
2022-01-21,,,,0.0,27,10
2022-01-22,,,,0.0,36,20
2022-01-23,,,,0.0,25,10


In [24]:
daily.loc[daily['bus'].isna()==True].head()

Unnamed: 0_level_0,bus,rail_boardings,total_rides,prcp,tmax,tmin
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-12-01,,,,0.0,48,35
2021-12-02,,,,0.0,59,44
2021-12-03,,,,0.0,51,39
2021-12-04,,,,0.0,46,31
2021-12-05,,,,0.0,43,35


In [25]:
dailydrop=daily.dropna()

In [26]:
dailydrop.isna().sum()

bus               0
rail_boardings    0
total_rides       0
prcp              0
tmax              0
tmin              0
dtype: int64

In [27]:
dailydrop.duplicated().sum()

0

In [28]:
dailydrop.index

DatetimeIndex(['2001-01-01', '2001-01-02', '2001-01-03', '2001-01-04',
               '2001-01-05', '2001-01-06', '2001-01-07', '2001-01-08',
               '2001-01-09', '2001-01-10',
               ...
               '2021-11-21', '2021-11-22', '2021-11-23', '2021-11-24',
               '2021-11-25', '2021-11-26', '2021-11-27', '2021-11-28',
               '2021-11-29', '2021-11-30'],
              dtype='datetime64[ns]', name='date', length=7639, freq='D')

## Gas Data

In [29]:
gas

Unnamed: 0,Date,Open,High,Low,Close*,Adj Close**,Volume
0,2001-01-02,0.794,0.804,0.78,0.795,0.795,14697
1,2001-01-03,0.799,0.812,0.792,0.81,0.81,10219
2,2001-01-04,0.815,0.83,0.813,0.8186,0.8186,17869
3,2001-01-05,0.815,0.838,0.814,0.8205,0.8205,18617
4,2001-01-08,0.827,0.853,0.825,0.835,0.835,24057
...,...,...,...,...,...,...,...
5360,2022-01-25,2.4215,2.465,2.4023,2.4595,2.4595,30337
5361,2022-01-26,2.4552,2.5347,2.4533,2.5229,2.5229,35505
5362,2022-01-27,2.5223,2.57,2.51,2.521,2.521,23546
5363,2022-01-28,2.5608,2.5798,2.5244,2.5423,2.5423,23546


In [30]:
gas.dtypes

Date           datetime64[ns]
Open                   object
High                   object
Low                    object
Close*                 object
Adj Close**            object
Volume                 object
dtype: object

In [31]:
gas.isna().sum()

Date           0
Open           0
High           0
Low            0
Close*         0
Adj Close**    0
Volume         0
dtype: int64

In [32]:
gas = pd.read_excel('../../capstone-data/daily/gasoline_stocks.xlsx')

In [33]:
gas.index

RangeIndex(start=0, stop=5365, step=1)

In [34]:
gas['Date'].duplicated().sum()

0

In [35]:
gas.isna().sum()

Date           0
Open           0
High           0
Low            0
Close*         0
Adj Close**    0
Volume         0
dtype: int64

In [36]:
def clean_gas(gas):
    
    gas.columns = gas.columns.str.lower()
    
    #gas['date'] = pd.to_datetime(gas['date'])
    
    gas.drop(columns={'high','low','adj close**'},inplace=True)
    
    gas.replace('-',np.nan,inplace=True)
    gas['open']= pd.to_numeric(gas['open'], downcast="float")
    gas['close*']= pd.to_numeric(gas['close*'], downcast="float")
    gas['volume']= pd.to_numeric(gas['volume'], downcast="float")
    
    # "Drop = True" causes duplicates
    gas.set_index('date',inplace=True)
    gas.sort_index(inplace=True)
   
    gas.rename(columns={'open':'gas_open','close*': 'gas_close',
                        'volume':'gas_volume'},inplace=True)
    gas = gas.asfreq('d')
    gas.fillna(method='ffill',inplace=True)

    return gas
    

In [37]:
gas = clean_gas(gas)

In [38]:
gas.duplicated().sum()

2404

In [39]:
gas.loc[gas.index.duplicated()==True]

Unnamed: 0_level_0,gas_open,gas_close,gas_volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


In [40]:
gas.index

DatetimeIndex(['2001-01-02', '2001-01-03', '2001-01-04', '2001-01-05',
               '2001-01-06', '2001-01-07', '2001-01-08', '2001-01-09',
               '2001-01-10', '2001-01-11',
               ...
               '2022-01-22', '2022-01-23', '2022-01-24', '2022-01-25',
               '2022-01-26', '2022-01-27', '2022-01-28', '2022-01-29',
               '2022-01-30', '2022-01-31'],
              dtype='datetime64[ns]', name='date', length=7700, freq='D')

In [41]:
gas.index.is_unique

True

In [42]:
gas.index.duplicated()

array([False, False, False, ..., False, False, False])

In [43]:
gas.isna().sum()

gas_open      0
gas_close     0
gas_volume    0
dtype: int64

In [44]:
daily.shape

(7694, 6)

In [45]:
dailydrop.shape

(7639, 6)

In [46]:
gas.shape

(7700, 3)

In [47]:
gas.isna().sum()

gas_open      0
gas_close     0
gas_volume    0
dtype: int64

### Gas Merge

In [48]:
dailydrop = dailydrop.merge(gas, how='outer', left_index=True, right_index=True)


In [49]:
dailydrop.isna().sum()

bus               62
rail_boardings    62
total_rides       62
prcp              62
tmax              62
tmin              62
gas_open           1
gas_close          1
gas_volume         1
dtype: int64

In [50]:
dailydrop.drop(dailydrop.tail(62).index,inplace=True)

In [51]:
dailydrop.bfill(inplace=True)

In [52]:
dailydrop.head()

Unnamed: 0_level_0,bus,rail_boardings,total_rides,prcp,tmax,tmin,gas_open,gas_close,gas_volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2001-01-01,297192.0,126455.0,423647.0,0.0,26.0,13.0,0.794,0.795,14697.0
2001-01-02,780827.0,501952.0,1282779.0,0.0,21.0,7.0,0.794,0.795,14697.0
2001-01-03,824923.0,536432.0,1361355.0,0.0,28.0,7.0,0.799,0.81,10219.0
2001-01-04,870021.0,550011.0,1420032.0,0.0,32.0,23.0,0.815,0.8186,17869.0
2001-01-05,890426.0,557917.0,1448343.0,0.0,37.0,24.0,0.815,0.8205,18617.0


In [53]:
dailydrop.tail()

Unnamed: 0_level_0,bus,rail_boardings,total_rides,prcp,tmax,tmin,gas_open,gas_close,gas_volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2021-11-26,257700.0,189694.0,447394.0,0.0,31.0,19.0,2.314,2.0294,17599.0
2021-11-27,237839.0,187065.0,424904.0,0.0,41.0,30.0,2.314,2.0294,17599.0
2021-11-28,184817.0,147830.0,332647.0,0.0,41.0,26.0,2.314,2.0294,17599.0
2021-11-29,421322.0,276090.0,697412.0,0.0,41.0,26.0,2.0785,2.0771,29118.0
2021-11-30,450230.0,302349.0,752579.0,0.0,53.0,33.0,2.1036,1.9801,83219.0


In [54]:
dailydrop.index.duplicated().sum()

0

In [55]:
dailydrop.index

DatetimeIndex(['2001-01-01', '2001-01-02', '2001-01-03', '2001-01-04',
               '2001-01-05', '2001-01-06', '2001-01-07', '2001-01-08',
               '2001-01-09', '2001-01-10',
               ...
               '2021-11-21', '2021-11-22', '2021-11-23', '2021-11-24',
               '2021-11-25', '2021-11-26', '2021-11-27', '2021-11-28',
               '2021-11-29', '2021-11-30'],
              dtype='datetime64[ns]', name='date', length=7639, freq='D')

## Treasury Curve

In [56]:
curve.dtypes

Date     datetime64[ns]
1 Mo            float64
2 Mo            float64
3 Mo            float64
6 Mo            float64
1 Yr            float64
2 Yr            float64
3 Yr            float64
5 Yr            float64
7 Yr            float64
10 Yr           float64
20 Yr           float64
30 Yr           float64
dtype: object

In [57]:
curve.shape

(5274, 13)

In [58]:
curve.isna().sum()

Date        0
1 Mo      145
2 Mo     4451
3 Mo        3
6 Mo        0
1 Yr        0
2 Yr        0
3 Yr        0
5 Yr        0
7 Yr        0
10 Yr       0
20 Yr       0
30 Yr     994
dtype: int64

In [59]:
def clean_curve(curve):
    curve.columns = curve.columns.str.lower()

    curve.set_index('date',inplace=True)

    curve.sort_index(inplace=True)
    curve.columns = curve.columns.str.replace(' ','_')
    curve.drop(columns={'1_mo','2_mo','30_yr'},inplace=True)
    return curve

In [60]:
curve= clean_curve(curve)

In [61]:
curve.index.duplicated().sum()

0

In [62]:
curve.head()

Unnamed: 0_level_0,3_mo,6_mo,1_yr,2_yr,3_yr,5_yr,7_yr,10_yr,20_yr
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2001-01-02,5.87,5.58,5.11,4.87,4.82,4.76,4.97,4.92,5.46
2001-01-03,5.69,5.44,5.04,4.92,4.92,4.94,5.18,5.14,5.62
2001-01-04,5.37,5.2,4.82,4.77,4.78,4.82,5.07,5.03,5.56
2001-01-05,5.12,4.98,4.6,4.56,4.57,4.66,4.93,4.93,5.5
2001-01-08,5.19,5.03,4.61,4.54,4.55,4.65,4.94,4.94,5.52


In [63]:
curve.fillna(method='ffill',inplace=True)

In [64]:
curve.isna().sum()

3_mo     0
6_mo     0
1_yr     0
2_yr     0
3_yr     0
5_yr     0
7_yr     0
10_yr    0
20_yr    0
dtype: int64

In [65]:
curve=curve.asfreq('d')

In [66]:
curve.isna().sum()

3_mo     2426
6_mo     2426
1_yr     2426
2_yr     2426
3_yr     2426
5_yr     2426
7_yr     2426
10_yr    2426
20_yr    2426
dtype: int64

In [67]:
curve.loc[curve['3_mo'].isna()==True]

Unnamed: 0_level_0,3_mo,6_mo,1_yr,2_yr,3_yr,5_yr,7_yr,10_yr,20_yr
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2001-01-06,,,,,,,,,
2001-01-07,,,,,,,,,
2001-01-13,,,,,,,,,
2001-01-14,,,,,,,,,
2001-01-15,,,,,,,,,
...,...,...,...,...,...,...,...,...,...
2022-01-17,,,,,,,,,
2022-01-22,,,,,,,,,
2022-01-23,,,,,,,,,
2022-01-29,,,,,,,,,


In [68]:
curve.ffill(inplace=True)

In [69]:
curve.index

DatetimeIndex(['2001-01-02', '2001-01-03', '2001-01-04', '2001-01-05',
               '2001-01-06', '2001-01-07', '2001-01-08', '2001-01-09',
               '2001-01-10', '2001-01-11',
               ...
               '2022-01-22', '2022-01-23', '2022-01-24', '2022-01-25',
               '2022-01-26', '2022-01-27', '2022-01-28', '2022-01-29',
               '2022-01-30', '2022-01-31'],
              dtype='datetime64[ns]', name='date', length=7700, freq='D')

### Merge Yield Curve

In [70]:
dailydrop = dailydrop.merge(curve, how='outer', left_index=True, right_index=True)


In [71]:
dailydrop.isna().sum()

bus               62
rail_boardings    62
total_rides       62
prcp              62
tmax              62
tmin              62
gas_open          62
gas_close         62
gas_volume        62
3_mo               1
6_mo               1
1_yr               1
2_yr               1
3_yr               1
5_yr               1
7_yr               1
10_yr              1
20_yr              1
dtype: int64

In [72]:
dailydrop = dailydrop.drop(dailydrop.tail(62).index)

In [73]:
dailydrop.bfill(inplace=True)

In [74]:
dailydrop.isna().sum()

bus               0
rail_boardings    0
total_rides       0
prcp              0
tmax              0
tmin              0
gas_open          0
gas_close         0
gas_volume        0
3_mo              0
6_mo              0
1_yr              0
2_yr              0
3_yr              0
5_yr              0
7_yr              0
10_yr             0
20_yr             0
dtype: int64

In [75]:
dailydrop.index

DatetimeIndex(['2001-01-01', '2001-01-02', '2001-01-03', '2001-01-04',
               '2001-01-05', '2001-01-06', '2001-01-07', '2001-01-08',
               '2001-01-09', '2001-01-10',
               ...
               '2021-11-21', '2021-11-22', '2021-11-23', '2021-11-24',
               '2021-11-25', '2021-11-26', '2021-11-27', '2021-11-28',
               '2021-11-29', '2021-11-30'],
              dtype='datetime64[ns]', name='date', length=7639, freq='D')

## Prep Covid

In [76]:
def merge_prep_covid(df):

    df = df.loc[(df['CountryCode']=='USA') & (df['RegionName']=='Illinois')]

    test20 = df.drop(columns={'CountryName','CountryCode','RegionName','RegionCode','Jurisdiction','H1_Flag',
                        'StringencyIndexForDisplay','StringencyLegacyIndexForDisplay','GovernmentResponseIndexForDisplay',
                        'ContainmentHealthIndexForDisplay','EconomicSupportIndexForDisplay',
                        'C7_Flag','E1_Flag','H6_Flag','H7_Flag','H8_Flag',
                        'V2B_Vaccine age eligibility/availability age floor (general population summary)',
                       'V2D_Medically/ clinically vulnerable (Non-elderly)', 'V2E_Education',
                       'V2F_Frontline workers  (non healthcare)',
                       'V2G_Frontline workers  (healthcare)','C1_Flag','C1_Notes','C2_Flag','C2_Notes',
                             'C3_Flag','C3_Notes'})
    
    test20 = test20.drop(test20.columns[test20.isnull().sum() > 30],axis=1)
    
    #test20['Date'] = pd.to_datetime(test20['Date'],format='%Y%m%d').dt.date
    test20['Date'] = pd.to_datetime(test20['Date'],format='%Y%m%d')
    # test20 = test20.groupby('Date').agg('mean')
    test20.set_index('Date',inplace=True)
    test20.sort_index(inplace=True)
    #test20=test20.groupby(pd.Grouper(freq='M')).nth(0)
    
    return test20

In [77]:
covid20.head()

Unnamed: 0,CountryName,CountryCode,RegionName,RegionCode,Jurisdiction,Date,C1_School closing,C1_Flag,C1_Notes,C2_Workplace closing,C2_Flag,C2_Notes,C3_Cancel public events,C3_Flag,C3_Notes,C4_Restrictions on gatherings,C4_Flag,C4_Notes,C5_Close public transport,C5_Flag,C5_Notes,C6_Stay at home requirements,C6_Flag,C6_Notes,C7_Restrictions on internal movement,C7_Flag,C7_Notes,C8_International travel controls,C8_Notes,E1_Income support,E1_Flag,E1_Notes,E2_Debt/contract relief,E2_Notes,E3_Fiscal measures,E3_Notes,E4_International support,E4_Notes,H1_Public information campaigns,H1_Flag,H1_Notes,H2_Testing policy,H2_Notes,H3_Contact tracing,H3_Notes,H4_Emergency investment in healthcare,H4_Notes,H5_Investment in vaccines,H5_Notes,H6_Facial Coverings,H6_Flag,H6_Notes,H7_Vaccination policy,H7_Flag,H7_Notes,H8_Protection of elderly people,H8_Flag,H8_Notes,M1_Wildcard,M1_Notes,V1_Vaccine Prioritisation (summary),V1_Notes,V2A_Vaccine Availability (summary),V2_Notes,V2B_Vaccine age eligibility/availability age floor (general population summary),V2C_Vaccine age eligibility/availability age floor (at risk summary),V2D_Medically/ clinically vulnerable (Non-elderly),V2E_Education,V2F_Frontline workers (non healthcare),V2G_Frontline workers (healthcare),V3_Vaccine Financial Support (summary),V3_Notes,ConfirmedCases,ConfirmedDeaths,StringencyIndex,StringencyIndexForDisplay,StringencyLegacyIndex,StringencyLegacyIndexForDisplay,GovernmentResponseIndex,GovernmentResponseIndexForDisplay,ContainmentHealthIndex,ContainmentHealthIndexForDisplay,EconomicSupportIndex,EconomicSupportIndexForDisplay
0,Aruba,ABW,,,NAT_TOTAL,20200101,0.0,,,0.0,,,0.0,,,0.0,,,0.0,,,0.0,,,0.0,,,0.0,,0.0,,,0.0,,0.0,,0.0,,0.0,,,0.0,,0.0,,0.0,,0.0,,0.0,,,0.0,,,0.0,,,,,0.0,,0.0,,,,,,,,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Aruba,ABW,,,NAT_TOTAL,20200102,0.0,,,0.0,,,0.0,,,0.0,,,0.0,,,0.0,,,0.0,,,0.0,,0.0,,,0.0,,0.0,,0.0,,0.0,,,0.0,,0.0,,0.0,,0.0,,0.0,,,0.0,,,0.0,,,,,0.0,,0.0,,,,,,,,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Aruba,ABW,,,NAT_TOTAL,20200103,0.0,,,0.0,,,0.0,,,0.0,,,0.0,,,0.0,,,0.0,,,0.0,,0.0,,,0.0,,0.0,,0.0,,0.0,,,0.0,,0.0,,0.0,,0.0,,0.0,,,0.0,,,0.0,,,,,0.0,,0.0,,,,,,,,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Aruba,ABW,,,NAT_TOTAL,20200104,0.0,,,0.0,,,0.0,,,0.0,,,0.0,,,0.0,,,0.0,,,0.0,,0.0,,,0.0,,0.0,,0.0,,0.0,,,0.0,,0.0,,0.0,,0.0,,0.0,,,0.0,,,0.0,,,,,0.0,,0.0,,,,,,,,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Aruba,ABW,,,NAT_TOTAL,20200105,0.0,,,0.0,,,0.0,,,0.0,,,0.0,,,0.0,,,0.0,,,0.0,,0.0,,,0.0,,0.0,,0.0,,0.0,,,0.0,,0.0,,0.0,,0.0,,0.0,,,0.0,,,0.0,,,,,0.0,,0.0,,,,,,,,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [78]:
covid20 = merge_prep_covid(covid20)

In [79]:
covid20.isna().sum()

C1_School closing                        0
C2_Workplace closing                     0
C3_Cancel public events                  0
C4_Restrictions on gatherings            0
C5_Close public transport                0
C6_Stay at home requirements             0
C7_Restrictions on internal movement     0
C8_International travel controls         0
E1_Income support                        0
E2_Debt/contract relief                  0
H1_Public information campaigns          0
H2_Testing policy                        0
H3_Contact tracing                       0
H6_Facial Coverings                      0
H7_Vaccination policy                    0
H8_Protection of elderly people          0
ConfirmedCases                          21
ConfirmedDeaths                         21
StringencyIndex                          0
StringencyLegacyIndex                    0
GovernmentResponseIndex                  0
ContainmentHealthIndex                   0
EconomicSupportIndex                     0
dtype: int6

In [80]:
covid20.index.duplicated().sum()

0

In [81]:
covid20.fillna(0)

Unnamed: 0_level_0,C1_School closing,C2_Workplace closing,C3_Cancel public events,C4_Restrictions on gatherings,C5_Close public transport,C6_Stay at home requirements,C7_Restrictions on internal movement,C8_International travel controls,E1_Income support,E2_Debt/contract relief,H1_Public information campaigns,H2_Testing policy,H3_Contact tracing,H6_Facial Coverings,H7_Vaccination policy,H8_Protection of elderly people,ConfirmedCases,ConfirmedDeaths,StringencyIndex,StringencyLegacyIndex,GovernmentResponseIndex,ContainmentHealthIndex,EconomicSupportIndex
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2020-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.0
2020-01-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.0
2020-01-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.0
2020-01-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.0
2020-01-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,0.00,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-12-27,2.0,1.0,2.0,4.0,0.0,2.0,2.0,3.0,1.0,2.0,2.0,3.0,1.0,3.0,1.0,1.0,937909.0,17336.0,66.67,64.29,62.71,62.74,62.5
2020-12-28,2.0,1.0,2.0,4.0,0.0,2.0,2.0,3.0,1.0,2.0,2.0,3.0,1.0,3.0,1.0,1.0,942362.0,17470.0,66.67,64.29,62.71,62.74,62.5
2020-12-29,2.0,1.0,2.0,4.0,0.0,2.0,2.0,3.0,1.0,2.0,2.0,3.0,1.0,3.0,2.0,1.0,948006.0,17596.0,66.67,64.29,63.96,64.17,62.5
2020-12-30,2.0,1.0,2.0,4.0,0.0,2.0,2.0,3.0,1.0,2.0,2.0,3.0,1.0,3.0,2.0,1.0,955380.0,17811.0,66.67,64.29,63.96,64.17,62.5


In [82]:
covid20 = covid20.asfreq('d')

In [83]:
covid20.isna().sum()

C1_School closing                        0
C2_Workplace closing                     0
C3_Cancel public events                  0
C4_Restrictions on gatherings            0
C5_Close public transport                0
C6_Stay at home requirements             0
C7_Restrictions on internal movement     0
C8_International travel controls         0
E1_Income support                        0
E2_Debt/contract relief                  0
H1_Public information campaigns          0
H2_Testing policy                        0
H3_Contact tracing                       0
H6_Facial Coverings                      0
H7_Vaccination policy                    0
H8_Protection of elderly people          0
ConfirmedCases                          21
ConfirmedDeaths                         21
StringencyIndex                          0
StringencyLegacyIndex                    0
GovernmentResponseIndex                  0
ContainmentHealthIndex                   0
EconomicSupportIndex                     0
dtype: int6

In [84]:
covid20.bfill(inplace=True)

In [85]:
covid20.index

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
               '2020-01-05', '2020-01-06', '2020-01-07', '2020-01-08',
               '2020-01-09', '2020-01-10',
               ...
               '2020-12-22', '2020-12-23', '2020-12-24', '2020-12-25',
               '2020-12-26', '2020-12-27', '2020-12-28', '2020-12-29',
               '2020-12-30', '2020-12-31'],
              dtype='datetime64[ns]', name='Date', length=366, freq='D')

In [86]:
covid20.index.duplicated().sum()

0

In [87]:
covid21 = merge_prep_covid(covid21)

In [88]:
covid21=covid21.asfreq('d')

In [89]:
covid21.isna().sum()

C1_School closing                       0
C2_Workplace closing                    0
C3_Cancel public events                 0
C4_Restrictions on gatherings           0
C5_Close public transport               0
C6_Stay at home requirements            0
C7_Restrictions on internal movement    0
C8_International travel controls        0
E1_Income support                       0
E2_Debt/contract relief                 0
H1_Public information campaigns         0
H2_Testing policy                       0
H3_Contact tracing                      0
H6_Facial Coverings                     0
H7_Vaccination policy                   0
H8_Protection of elderly people         0
ConfirmedCases                          0
ConfirmedDeaths                         0
StringencyIndex                         0
StringencyLegacyIndex                   0
GovernmentResponseIndex                 0
ContainmentHealthIndex                  0
EconomicSupportIndex                    0
dtype: int64

In [90]:
covid21.index.duplicated().sum()

0

In [91]:
covid21.index

DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',
               '2021-01-05', '2021-01-06', '2021-01-07', '2021-01-08',
               '2021-01-09', '2021-01-10',
               ...
               '2021-12-22', '2021-12-23', '2021-12-24', '2021-12-25',
               '2021-12-26', '2021-12-27', '2021-12-28', '2021-12-29',
               '2021-12-30', '2021-12-31'],
              dtype='datetime64[ns]', name='Date', length=365, freq='D')

In [92]:
covid_total = pd.concat([covid20, covid21], axis=0)

In [93]:
covid_total.isna().sum()

C1_School closing                       0
C2_Workplace closing                    0
C3_Cancel public events                 0
C4_Restrictions on gatherings           0
C5_Close public transport               0
C6_Stay at home requirements            0
C7_Restrictions on internal movement    0
C8_International travel controls        0
E1_Income support                       0
E2_Debt/contract relief                 0
H1_Public information campaigns         0
H2_Testing policy                       0
H3_Contact tracing                      0
H6_Facial Coverings                     0
H7_Vaccination policy                   0
H8_Protection of elderly people         0
ConfirmedCases                          0
ConfirmedDeaths                         0
StringencyIndex                         0
StringencyLegacyIndex                   0
GovernmentResponseIndex                 0
ContainmentHealthIndex                  0
EconomicSupportIndex                    0
dtype: int64

In [94]:
covid_total.index.duplicated().sum()

0

In [95]:
covid_total.index

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
               '2020-01-05', '2020-01-06', '2020-01-07', '2020-01-08',
               '2020-01-09', '2020-01-10',
               ...
               '2021-12-22', '2021-12-23', '2021-12-24', '2021-12-25',
               '2021-12-26', '2021-12-27', '2021-12-28', '2021-12-29',
               '2021-12-30', '2021-12-31'],
              dtype='datetime64[ns]', name='Date', length=731, freq='D')

In [96]:
#covid_total.to_csv('../../capstone-data/daily/covid_total(d).csv')

### Merge Covid Restrictions

In [97]:
dailydrop = dailydrop.merge(covid_total, how='outer', left_index=True, right_index=True)
dailydrop.index

DatetimeIndex(['2001-01-01', '2001-01-02', '2001-01-03', '2001-01-04',
               '2001-01-05', '2001-01-06', '2001-01-07', '2001-01-08',
               '2001-01-09', '2001-01-10',
               ...
               '2021-12-22', '2021-12-23', '2021-12-24', '2021-12-25',
               '2021-12-26', '2021-12-27', '2021-12-28', '2021-12-29',
               '2021-12-30', '2021-12-31'],
              dtype='datetime64[ns]', length=7670, freq='D')

In [98]:
dailydrop.isna().sum()

bus                                       31
rail_boardings                            31
total_rides                               31
prcp                                      31
tmax                                      31
tmin                                      31
gas_open                                  31
gas_close                                 31
gas_volume                                31
3_mo                                      31
6_mo                                      31
1_yr                                      31
2_yr                                      31
3_yr                                      31
5_yr                                      31
7_yr                                      31
10_yr                                     31
20_yr                                     31
C1_School closing                       6939
C2_Workplace closing                    6939
C3_Cancel public events                 6939
C4_Restrictions on gatherings           6939
C5_Close p

In [99]:
dailydrop = dailydrop.drop(dailydrop.tail(31).index)

In [100]:
dailydrop.fillna(0,inplace=True)

In [101]:
dailydrop.isna().sum()

bus                                     0
rail_boardings                          0
total_rides                             0
prcp                                    0
tmax                                    0
tmin                                    0
gas_open                                0
gas_close                               0
gas_volume                              0
3_mo                                    0
6_mo                                    0
1_yr                                    0
2_yr                                    0
3_yr                                    0
5_yr                                    0
7_yr                                    0
10_yr                                   0
20_yr                                   0
C1_School closing                       0
C2_Workplace closing                    0
C3_Cancel public events                 0
C4_Restrictions on gatherings           0
C5_Close public transport               0
C6_Stay at home requirements      

## Prep Vax

In [102]:
def vax_cleaning(vax):
    vax.columns = vax.columns.str.lower()
    vax['date'] = pd.to_datetime(vax['date'])
    vax.set_index('date',inplace=True)
    vax.sort_index(inplace=True)
    
    vax=vax[['total doses - daily','total doses - cumulative','1st dose - daily',
             '1st dose - cumulative','1st dose - percent population','vaccine series completed - daily',
             'vaccine series completed - cumulative','vaccine series completed - percent population']]
    
    vax = vax.rename(columns={'total doses - daily':'total_daily_doses',
           'total doses - cumulative':'total_daily_cum',
           '1st dose - daily':'first_dose_daily',
           '1st dose - cumulative':'first_dose_cum',
           '1st dose - percent population':'first_dose_percent_pop',
           'vaccine series completed - daily':'vax_series_completed_daily',
           'vaccine series completed - cumulative':'vax_series_cum',
           'vaccine series completed - percent population':'vax_series_percent'})
    return vax


In [103]:
vax = vax_cleaning(vax)

In [107]:
vax = vax.asfreq('d')

In [108]:
vax.isna().sum()

total_daily_doses             0
total_daily_cum               0
first_dose_daily              0
first_dose_cum                0
first_dose_percent_pop        0
vax_series_completed_daily    0
vax_series_cum                0
vax_series_percent            0
dtype: int64

In [109]:
vax.index

DatetimeIndex(['2020-12-15', '2020-12-16', '2020-12-17', '2020-12-18',
               '2020-12-19', '2020-12-20', '2020-12-21', '2020-12-22',
               '2020-12-23', '2020-12-24',
               ...
               '2022-01-04', '2022-01-05', '2022-01-06', '2022-01-07',
               '2022-01-08', '2022-01-09', '2022-01-10', '2022-01-11',
               '2022-01-12', '2022-01-13'],
              dtype='datetime64[ns]', name='date', length=395, freq='D')

In [110]:
vax.index.duplicated().sum()

0

### Merge Vax

In [112]:
dailydrop = dailydrop.merge(vax, how='outer', left_index=True, right_index=True)
dailydrop

Unnamed: 0,bus,rail_boardings,total_rides,prcp,tmax,tmin,gas_open,gas_close,gas_volume,3_mo,6_mo,1_yr,2_yr,3_yr,5_yr,7_yr,10_yr,20_yr,C1_School closing,C2_Workplace closing,C3_Cancel public events,C4_Restrictions on gatherings,C5_Close public transport,C6_Stay at home requirements,C7_Restrictions on internal movement,C8_International travel controls,E1_Income support,E2_Debt/contract relief,H1_Public information campaigns,H2_Testing policy,H3_Contact tracing,H6_Facial Coverings,H7_Vaccination policy,H8_Protection of elderly people,ConfirmedCases,ConfirmedDeaths,StringencyIndex,StringencyLegacyIndex,GovernmentResponseIndex,ContainmentHealthIndex,EconomicSupportIndex,total_daily_doses,total_daily_cum,first_dose_daily,first_dose_cum,first_dose_percent_pop,vax_series_completed_daily,vax_series_cum,vax_series_percent
2001-01-01,297192.0,126455.0,423647.0,0.0,26.0,13.0,0.794,0.7950,14697.0,5.87,5.58,5.11,4.87,4.82,4.76,4.97,4.92,5.46,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,
2001-01-02,780827.0,501952.0,1282779.0,0.0,21.0,7.0,0.794,0.7950,14697.0,5.87,5.58,5.11,4.87,4.82,4.76,4.97,4.92,5.46,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,
2001-01-03,824923.0,536432.0,1361355.0,0.0,28.0,7.0,0.799,0.8100,10219.0,5.69,5.44,5.04,4.92,4.92,4.94,5.18,5.14,5.62,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,
2001-01-04,870021.0,550011.0,1420032.0,0.0,32.0,23.0,0.815,0.8186,17869.0,5.37,5.20,4.82,4.77,4.78,4.82,5.07,5.03,5.56,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,
2001-01-05,890426.0,557917.0,1448343.0,0.0,37.0,24.0,0.815,0.8205,18617.0,5.12,4.98,4.60,4.56,4.57,4.66,4.93,4.93,5.50,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-01-09,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4623.0,4277770.0,1260.0,1963754.0,0.729,880.0,1757428.0,0.652
2022-01-10,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,11610.0,4289380.0,3705.0,1967459.0,0.730,1919.0,1759347.0,0.653
2022-01-11,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,13429.0,4302809.0,5114.0,1972573.0,0.732,2274.0,1761621.0,0.654
2022-01-12,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,13091.0,4315900.0,4857.0,1977430.0,0.734,2305.0,1763926.0,0.655


In [113]:
dailydrop.isna().sum()[0]

44

In [114]:
dailydrop = dailydrop.drop(dailydrop.tail(44).index)

In [116]:
dailydrop.isna().sum()

bus                                        0
rail_boardings                             0
total_rides                                0
prcp                                       0
tmax                                       0
tmin                                       0
gas_open                                   0
gas_close                                  0
gas_volume                                 0
3_mo                                       0
6_mo                                       0
1_yr                                       0
2_yr                                       0
3_yr                                       0
5_yr                                       0
7_yr                                       0
10_yr                                      0
20_yr                                      0
C1_School closing                          0
C2_Workplace closing                       0
C3_Cancel public events                    0
C4_Restrictions on gatherings              0
C5_Close p

In [118]:
dailydrop.fillna(0,inplace=True)

In [119]:
dailydrop.isna().sum()

bus                                     0
rail_boardings                          0
total_rides                             0
prcp                                    0
tmax                                    0
tmin                                    0
gas_open                                0
gas_close                               0
gas_volume                              0
3_mo                                    0
6_mo                                    0
1_yr                                    0
2_yr                                    0
3_yr                                    0
5_yr                                    0
7_yr                                    0
10_yr                                   0
20_yr                                   0
C1_School closing                       0
C2_Workplace closing                    0
C3_Cancel public events                 0
C4_Restrictions on gatherings           0
C5_Close public transport               0
C6_Stay at home requirements      

In [120]:
dailydrop.index

DatetimeIndex(['2001-01-01', '2001-01-02', '2001-01-03', '2001-01-04',
               '2001-01-05', '2001-01-06', '2001-01-07', '2001-01-08',
               '2001-01-09', '2001-01-10',
               ...
               '2021-11-21', '2021-11-22', '2021-11-23', '2021-11-24',
               '2021-11-25', '2021-11-26', '2021-11-27', '2021-11-28',
               '2021-11-29', '2021-11-30'],
              dtype='datetime64[ns]', length=7639, freq='D')

## Prep Crash

In [124]:
crash = crash[['CRASH_DATE','DAMAGE']].copy()

In [125]:
crash['crash_occurrences'] = 1

In [126]:
crash['CRASH_DATE'] = pd.to_datetime(crash['CRASH_DATE']).dt.date
crash['CRASH_DATE'] = pd.to_datetime(crash['CRASH_DATE'])

In [127]:
crash.columns = crash.columns.str.lower()

In [128]:
crash.dtypes

crash_date           datetime64[ns]
damage                       object
crash_occurrences             int64
dtype: object

In [129]:
crash['damage'].value_counts()

OVER $1,500      343278
$501 - $1,500    164722
$500 OR LESS      71227
Name: damage, dtype: int64

In [130]:
crash['damage'] = crash['damage'].replace('OVER $1,500', 3)
crash['damage'] = crash['damage'].replace('$501 - $1,500', 2)
crash['damage'] = crash['damage'].replace('$500 OR LESS', 1)

In [131]:
crash.rename(columns= {'crash_date':'date','damage':'damage_indicator'},inplace=True)

In [132]:
crash = crash.groupby('date').agg({'crash_occurrences':'sum','damage_indicator':'mean'})

In [133]:
crash.sort_index(inplace=True)

In [134]:
crash.isna().sum()

crash_occurrences    0
damage_indicator     0
dtype: int64

In [135]:
crash.index.duplicated().sum()

0

In [139]:
crash = crash.asfreq('d')

In [145]:
crash.fillna(0,inplace=True)

In [146]:
crash.loc[crash['damage_indicator'].isna()==True]

Unnamed: 0_level_0,crash_occurrences,damage_indicator
date,Unnamed: 1_level_1,Unnamed: 2_level_1


In [147]:
crash.index

DatetimeIndex(['2013-03-03', '2013-03-04', '2013-03-05', '2013-03-06',
               '2013-03-07', '2013-03-08', '2013-03-09', '2013-03-10',
               '2013-03-11', '2013-03-12',
               ...
               '2022-01-08', '2022-01-09', '2022-01-10', '2022-01-11',
               '2022-01-12', '2022-01-13', '2022-01-14', '2022-01-15',
               '2022-01-16', '2022-01-17'],
              dtype='datetime64[ns]', name='date', length=3243, freq='D')

### Merge Crash

In [148]:
dailydrop.index

DatetimeIndex(['2001-01-01', '2001-01-02', '2001-01-03', '2001-01-04',
               '2001-01-05', '2001-01-06', '2001-01-07', '2001-01-08',
               '2001-01-09', '2001-01-10',
               ...
               '2021-11-21', '2021-11-22', '2021-11-23', '2021-11-24',
               '2021-11-25', '2021-11-26', '2021-11-27', '2021-11-28',
               '2021-11-29', '2021-11-30'],
              dtype='datetime64[ns]', length=7639, freq='D')

In [149]:
dailydrop = dailydrop.merge(crash, how='outer', left_index=True, right_index=True)
dailydrop

Unnamed: 0,bus,rail_boardings,total_rides,prcp,tmax,tmin,gas_open,gas_close,gas_volume,3_mo,6_mo,1_yr,2_yr,3_yr,5_yr,7_yr,10_yr,20_yr,C1_School closing,C2_Workplace closing,C3_Cancel public events,C4_Restrictions on gatherings,C5_Close public transport,C6_Stay at home requirements,C7_Restrictions on internal movement,C8_International travel controls,E1_Income support,E2_Debt/contract relief,H1_Public information campaigns,H2_Testing policy,H3_Contact tracing,H6_Facial Coverings,H7_Vaccination policy,H8_Protection of elderly people,ConfirmedCases,ConfirmedDeaths,StringencyIndex,StringencyLegacyIndex,GovernmentResponseIndex,ContainmentHealthIndex,EconomicSupportIndex,total_daily_doses,total_daily_cum,first_dose_daily,first_dose_cum,first_dose_percent_pop,vax_series_completed_daily,vax_series_cum,vax_series_percent,crash_occurrences,damage_indicator
2001-01-01,297192.0,126455.0,423647.0,0.0,26.0,13.0,0.794,0.7950,14697.0,5.87,5.58,5.11,4.87,4.82,4.76,4.97,4.92,5.46,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
2001-01-02,780827.0,501952.0,1282779.0,0.0,21.0,7.0,0.794,0.7950,14697.0,5.87,5.58,5.11,4.87,4.82,4.76,4.97,4.92,5.46,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
2001-01-03,824923.0,536432.0,1361355.0,0.0,28.0,7.0,0.799,0.8100,10219.0,5.69,5.44,5.04,4.92,4.92,4.94,5.18,5.14,5.62,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
2001-01-04,870021.0,550011.0,1420032.0,0.0,32.0,23.0,0.815,0.8186,17869.0,5.37,5.20,4.82,4.77,4.78,4.82,5.07,5.03,5.56,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
2001-01-05,890426.0,557917.0,1448343.0,0.0,37.0,24.0,0.815,0.8205,18617.0,5.12,4.98,4.60,4.56,4.57,4.66,4.93,4.93,5.50,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-01-13,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,216.0,2.601852
2022-01-14,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,264.0,2.625000
2022-01-15,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,265.0,2.686792
2022-01-16,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,149.0,2.697987


In [150]:
dailydrop.isna().sum()[0]

48

In [151]:
dailydrop = dailydrop.drop(dailydrop.tail(48).index)

In [152]:
dailydrop.isna().sum()

bus                                        0
rail_boardings                             0
total_rides                                0
prcp                                       0
tmax                                       0
tmin                                       0
gas_open                                   0
gas_close                                  0
gas_volume                                 0
3_mo                                       0
6_mo                                       0
1_yr                                       0
2_yr                                       0
3_yr                                       0
5_yr                                       0
7_yr                                       0
10_yr                                      0
20_yr                                      0
C1_School closing                          0
C2_Workplace closing                       0
C3_Cancel public events                    0
C4_Restrictions on gatherings              0
C5_Close p

In [153]:
dailydrop.fillna(0,inplace=True)


In [155]:
dailydrop.isna().sum()

bus                                     0
rail_boardings                          0
total_rides                             0
prcp                                    0
tmax                                    0
tmin                                    0
gas_open                                0
gas_close                               0
gas_volume                              0
3_mo                                    0
6_mo                                    0
1_yr                                    0
2_yr                                    0
3_yr                                    0
5_yr                                    0
7_yr                                    0
10_yr                                   0
20_yr                                   0
C1_School closing                       0
C2_Workplace closing                    0
C3_Cancel public events                 0
C4_Restrictions on gatherings           0
C5_Close public transport               0
C6_Stay at home requirements      

In [156]:
dailydrop.index.duplicated().sum()

0

In [158]:
dailydrop.to_csv('../../capstone-data/daily-merged.csv')