# Daily Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Let wide/long frames render in full (up to 200 columns and 200 rows).
pd.set_option('display.max_columns', 200, 'display.max_rows', 200)

In [2]:
# Single place to change if the data folder moves.
DATA_DIR = '../capstone-data'
DAILY_DIR = f'{DATA_DIR}/daily'
MONTHLY_DIR = f'{DATA_DIR}/monthly'

# Raw inputs; each one is cleaned in its own section below.
cta = pd.read_csv(f'{DAILY_DIR}/CTA_-_Ridership_-_Daily_Boarding_Totals.csv')
weather = pd.read_csv(f'{DAILY_DIR}/weather.csv')
gas = pd.read_excel(f'{DAILY_DIR}/gasoline_stocks.xlsx')
# Note: the covid file lives in the monthly folder, unlike the other inputs.
covid = pd.read_csv(f'{MONTHLY_DIR}/covid-restrictions.csv')
curve = pd.read_excel(f'{DAILY_DIR}/treasury_yield_curve.xlsx')

## CTA Data

In [3]:
cta.tail()

Unnamed: 0,service_date,day_type,bus,rail_boardings,total_rides
7696,11/26/2021,W,257700,189694,447394
7697,11/27/2021,A,237839,187065,424904
7698,11/28/2021,U,184817,147830,332647
7699,11/29/2021,W,421322,276090,697412
7700,11/30/2021,W,450230,302349,752579


In [4]:
# The day-type flag is not used downstream; keep only the date and ride counts.
cta = cta.drop(columns=['day_type'])

In [5]:
cta.shape

(7701, 4)

In [6]:
# Use the same 'date' key name as the other datasets.
cta = cta.rename(columns={'service_date': 'date'})

In [7]:
# Service dates are written MM/DD/YYYY (e.g. 11/26/2021); stating the format
# avoids any day-first misinterpretation and skips per-row format inference.
cta['date'] = pd.to_datetime(cta['date'], format='%m/%d/%Y')

In [8]:
# Index by date and order chronologically so later index-on-index merges line up.
cta = cta.set_index('date').sort_index()

In [9]:
cta.dtypes

bus               int64
rail_boardings    int64
total_rides       int64
dtype: object

In [10]:
cta.isna().sum()

bus               0
rail_boardings    0
total_rides       0
dtype: int64

## Weather Data

In [11]:
def daily_weather(weather):
    """
    Clean the raw daily weather table and return it indexed by date.

    Steps, in order: lower-case the column names, drop the ``station``
    column, parse ``date`` to datetime, use it as a chronologically sorted
    index, and replace every missing value with 0.

    The input frame is left untouched, so the cell calling this function can
    be re-run on the same raw frame without raising a ``KeyError``.

    Parameters
    ----------
    weather : pandas.DataFrame
        Raw weather data containing at least ``station`` and ``date``
        columns (any letter case).

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by ``date`` with no missing values.

    Notes
    -----
    The result is daily. To aggregate, resample the returned frame, e.g.
    ``daily_weather(raw).resample('MS').sum()`` for months labelled by their
    first day, or ``.resample('W', closed='left').sum()`` for weeks (shift
    the index by ``pd.DateOffset(days=6)`` to label weeks by their start).
    """
    cleaned = weather.copy()
    cleaned.columns = cleaned.columns.str.lower()
    cleaned = cleaned.drop(columns='station')
    cleaned['date'] = pd.to_datetime(cleaned['date'])
    cleaned = cleaned.set_index('date').sort_index()

    # NOTE(review): zero-filling is kept for backward compatibility; it reads
    # naturally for precipitation/snow but would turn a missing temperature
    # into a literal 0 -- confirm that is intended.
    return cleaned.fillna(0)


In [12]:
weather = daily_weather(weather)

In [13]:
weather.isna().sum()

prcp    0
snow    0
snwd    0
tmax    0
tmin    0
dtype: int64

In [14]:
cta.isna().sum()

bus               0
rail_boardings    0
total_rides       0
dtype: int64

In [15]:
cta.shape, weather.shape

((7701, 3), (7694, 5))

## Merge CTA/Weather

In [16]:
# Index-on-index outer join: keeps every date present in either dataset.
daily = cta.join(weather, how='outer')
daily

Unnamed: 0_level_0,bus,rail_boardings,total_rides,prcp,snow,snwd,tmax,tmin
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2001-01-01,297192.0,126455.0,423647.0,0.0,0.0,16.0,26,13
2001-01-02,780827.0,501952.0,1282779.0,0.0,0.0,15.0,21,7
2001-01-03,824923.0,536432.0,1361355.0,0.0,0.0,15.0,28,7
2001-01-04,870021.0,550011.0,1420032.0,0.0,0.0,14.0,32,23
2001-01-05,890426.0,557917.0,1448343.0,0.0,0.0,12.0,37,24
...,...,...,...,...,...,...,...,...
2022-01-20,,,,0.0,0.0,0.0,22,9
2022-01-21,,,,0.0,0.0,0.0,27,10
2022-01-22,,,,0.0,0.2,0.0,36,20
2022-01-23,,,,0.0,3.6,0.0,25,10


In [17]:
# First few dates that have weather readings but no ridership figures.
daily[daily['bus'].isna()].head()

Unnamed: 0_level_0,bus,rail_boardings,total_rides,prcp,snow,snwd,tmax,tmin
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-12-01,,,,0.0,0.0,0.0,48,35
2021-12-02,,,,0.0,0.0,0.0,59,44
2021-12-03,,,,0.0,0.0,0.0,51,39
2021-12-04,,,,0.0,0.0,0.0,46,31
2021-12-05,,,,0.0,0.0,0.0,43,35


In [18]:
# Keep only the dates for which every column has a value.
dailydrop = daily.dropna(axis='index', how='any')

## Gas Data

In [19]:
gas

Unnamed: 0,Date,Open,High,Low,Close*,Adj Close**,Volume
0,2022-01-31,2.5599,2.5741,2.5388,2.5543,2.5543,22434
1,2022-01-28,2.5608,2.5798,2.5244,2.5423,2.5423,23546
2,2022-01-27,2.5223,2.57,2.51,2.521,2.521,23546
3,2022-01-26,2.4552,2.5347,2.4533,2.5229,2.5229,35505
4,2022-01-25,2.4215,2.465,2.4023,2.4595,2.4595,30337
...,...,...,...,...,...,...,...
5361,2001-01-08,0.827,0.853,0.825,0.835,0.835,24057
5362,2001-01-05,0.815,0.838,0.814,0.8205,0.8205,18617
5363,2001-01-04,0.815,0.83,0.813,0.8186,0.8186,17869
5364,2001-01-03,0.799,0.812,0.792,0.81,0.81,10219


In [20]:
gas.dtypes

Date           datetime64[ns]
Open                   object
High                   object
Low                    object
Close*                 object
Adj Close**            object
Volume                 object
dtype: object

In [21]:
gas.isna().sum()

Date           0
Open           0
High           0
Low            0
Close*         0
Adj Close**    0
Volume         0
dtype: int64

In [22]:
def clean_gas(gas):
    """
    Clean the raw gasoline price table and return it indexed by date.

    Steps, in order: lower-case the column names, parse ``date`` and use it
    as a chronologically sorted index, drop the ``high``, ``low`` and
    ``adj close**`` columns, turn the ``'-'`` placeholder into NaN, rename
    the remaining columns to ``gas_open`` / ``gas_close`` / ``gas_volume``,
    and convert them to numeric dtype.

    The input frame is left untouched, so the calling cell can be re-run.

    Parameters
    ----------
    gas : pandas.DataFrame
        Raw frame with ``Date``, ``Open``, ``High``, ``Low``, ``Close*``,
        ``Adj Close**`` and ``Volume`` columns (any letter case).

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by ``date`` with numeric ``gas_open``,
        ``gas_close`` and ``gas_volume`` columns (NaN where the source
        held ``'-'``).

    Raises
    ------
    KeyError
        If one of the expected columns is missing.
    ValueError
        If a remaining column holds a non-numeric value other than ``'-'``.
    """
    cleaned = gas.copy()
    cleaned.columns = cleaned.columns.str.lower()
    cleaned['date'] = pd.to_datetime(cleaned['date'])
    cleaned = (
        cleaned
        .set_index('date')
        .sort_index()
        .drop(columns=['high', 'low', 'adj close**'])
        .replace('-', np.nan)
        .rename(columns={'open': 'gas_open', 'close*': 'gas_close',
                         'volume': 'gas_volume'})
    )

    # The source columns arrive as object dtype because of the '-' markers.
    # Convert explicitly instead of relying on replace() to downcast, which
    # recent pandas versions no longer do.
    for column in cleaned.columns:
        cleaned[column] = pd.to_numeric(cleaned[column])
    return cleaned
    

In [23]:
gas = clean_gas(gas)

In [24]:
gas.isna().sum()

gas_open       69
gas_close      69
gas_volume    142
dtype: int64

In [25]:
gas['gas_volume'].isna().value_counts(normalize=True)

False    0.973537
True     0.026463
Name: gas_volume, dtype: float64

In [26]:
# First few trading days without a recorded volume.
gas[gas['gas_volume'].isna()].head()

Unnamed: 0_level_0,gas_open,gas_close,gas_volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2001-01-15,,,
2001-01-29,0.891,0.877,
2001-02-19,,,
2001-03-13,0.875,0.885,
2001-04-12,,,


In [27]:
gas.describe()

Unnamed: 0,gas_open,gas_close,gas_volume
count,5297.0,5297.0,5224.0
mean,1.829898,1.830688,40378.104326
std,0.703186,0.703521,21033.533512
min,0.4273,0.4118,5.0
25%,1.3066,1.3058,24325.0
50%,1.7639,1.7676,36029.5
75%,2.2975,2.3009,54194.75
max,3.563,3.571,190777.0


In [28]:
gas.tail()

Unnamed: 0_level_0,gas_open,gas_close,gas_volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-01-25,2.4215,2.4595,30337.0
2022-01-26,2.4552,2.5229,35505.0
2022-01-27,2.5223,2.521,23546.0
2022-01-28,2.5608,2.5423,23546.0
2022-01-31,2.5599,2.5543,22434.0


## Merge Gasoline

In [29]:
daily.shape

(7756, 8)

In [30]:
dailydrop.shape

(7701, 8)

In [31]:
gas.shape

(5366, 3)

In [32]:
gas.isna().sum()

gas_open       69
gas_close      69
gas_volume    142
dtype: int64

In [33]:
# Index-on-index outer join; dates that exist only in the gas data come
# through with empty ridership/weather columns and are removed in the next step.
dailydrop = dailydrop.join(gas, how='outer')


In [34]:
dailydrop.isna().sum()

bus                 42
rail_boardings      42
total_rides         42
prcp                42
snow                42
snwd                42
tmax                42
tmin                42
gas_open          2403
gas_close         2403
gas_volume        2476
dtype: int64

In [35]:
# Remove the dates that exist only in the gas data (no ridership recorded).
# Selecting on missing ridership instead of a hard-coded tail(42) stays
# correct if the inputs are refreshed and is safe to re-run: a second run
# drops nothing, whereas tail(42) would delete 42 more valid rows each time.
dailydrop = dailydrop.dropna(subset=['bus'])
dailydrop.tail()

Unnamed: 0_level_0,bus,rail_boardings,total_rides,prcp,snow,snwd,tmax,tmin,gas_open,gas_close,gas_volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2021-11-26,257700.0,189694.0,447394.0,0.0,0.0,0.0,31.0,19.0,2.314,2.0294,17599.0
2021-11-27,237839.0,187065.0,424904.0,0.0,0.0,0.0,41.0,30.0,,,
2021-11-28,184817.0,147830.0,332647.0,0.0,0.0,0.0,41.0,26.0,,,
2021-11-29,421322.0,276090.0,697412.0,0.0,0.0,0.0,41.0,26.0,2.0785,2.0771,29118.0
2021-11-30,450230.0,302349.0,752579.0,0.0,0.0,0.0,53.0,33.0,2.1036,1.9801,83219.0


## Treasury Curve

In [36]:
curve.dtypes

Date     datetime64[ns]
1 Mo            float64
2 Mo            float64
3 Mo            float64
6 Mo            float64
1 Yr            float64
2 Yr            float64
3 Yr            float64
5 Yr            float64
7 Yr            float64
10 Yr           float64
20 Yr           float64
30 Yr           float64
dtype: object

In [37]:
curve.shape

(5274, 13)

In [38]:
curve.isna().sum()

Date        0
1 Mo      145
2 Mo     4451
3 Mo        3
6 Mo        0
1 Yr        0
2 Yr        0
3 Yr        0
5 Yr        0
7 Yr        0
10 Yr       0
20 Yr       0
30 Yr     994
dtype: int64

In [39]:
def clean_curve(curve):
    """
    Clean the raw Treasury yield-curve table and return it indexed by date.

    Steps, in order: lower-case the column names, parse ``date`` and use it
    as a chronologically sorted index, replace spaces in the column names
    with underscores (``'10 Yr'`` -> ``'10_yr'``), and drop the ``1_mo``,
    ``2_mo`` and ``30_yr`` maturities.

    The input frame is left untouched, so the calling cell can be re-run.

    Parameters
    ----------
    curve : pandas.DataFrame
        Raw frame with a ``Date`` column and one column per maturity,
        including ``1 Mo``, ``2 Mo`` and ``30 Yr`` (any letter case).

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by ``date`` holding the remaining maturities.

    Raises
    ------
    KeyError
        If ``date`` or one of the dropped maturity columns is missing.
    """
    cleaned = curve.copy()
    cleaned.columns = cleaned.columns.str.lower()
    # No-op when the column is already datetime; guarantees a datetime index
    # (like the other cleaned datasets) when it is read in as text.
    cleaned['date'] = pd.to_datetime(cleaned['date'])
    cleaned = cleaned.set_index('date').sort_index()
    # Literal replacement -- the space is not meant as a regex pattern.
    cleaned.columns = cleaned.columns.str.replace(' ', '_', regex=False)
    return cleaned.drop(columns=['1_mo', '2_mo', '30_yr'])

In [40]:
curve= clean_curve(curve)

In [42]:
curve.head()

Unnamed: 0_level_0,3_mo,6_mo,1_yr,2_yr,3_yr,5_yr,7_yr,10_yr,20_yr
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2001-01-02,5.87,5.58,5.11,4.87,4.82,4.76,4.97,4.92,5.46
2001-01-03,5.69,5.44,5.04,4.92,4.92,4.94,5.18,5.14,5.62
2001-01-04,5.37,5.2,4.82,4.77,4.78,4.82,5.07,5.03,5.56
2001-01-05,5.12,4.98,4.6,4.56,4.57,4.66,4.93,4.93,5.5
2001-01-08,5.19,5.03,4.61,4.54,4.55,4.65,4.94,4.94,5.52


## Merge Yield Curve

In [43]:
# Index-on-index outer join; dates that exist only in the yield-curve data
# come through with empty ridership/weather columns and are removed below.
dailydrop = dailydrop.join(curve, how='outer')


In [46]:
# Missing-value count of the first column ('bus'). The result of sum() is
# labelled by column name, so take the first entry by position with .iloc
# rather than with a bare integer key.
dailydrop.isna().sum().iloc[0]

42

In [47]:
# Remove the dates that exist only in the yield-curve data (no ridership).
# Selecting on missing ridership instead of a hard-coded tail(42) stays
# correct if the inputs are refreshed and is safe to re-run: a second run
# drops nothing, whereas tail(42) would delete 42 more valid rows each time.
dailydrop = dailydrop.dropna(subset=['bus'])

In [48]:
dailydrop.tail()

Unnamed: 0_level_0,bus,rail_boardings,total_rides,prcp,snow,snwd,tmax,tmin,gas_open,gas_close,gas_volume,3_mo,6_mo,1_yr,2_yr,3_yr,5_yr,7_yr,10_yr,20_yr
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2021-11-26,257700.0,189694.0,447394.0,0.0,0.0,0.0,31.0,19.0,2.314,2.0294,17599.0,0.06,0.1,0.2,0.5,0.81,1.16,1.4,1.48,1.89
2021-11-27,237839.0,187065.0,424904.0,0.0,0.0,0.0,41.0,30.0,,,,,,,,,,,,
2021-11-28,184817.0,147830.0,332647.0,0.0,0.0,0.0,41.0,26.0,,,,,,,,,,,,
2021-11-29,421322.0,276090.0,697412.0,0.0,0.0,0.0,41.0,26.0,2.0785,2.0771,29118.0,0.06,0.1,0.21,0.51,0.83,1.18,1.42,1.52,1.93
2021-11-30,450230.0,302349.0,752579.0,0.0,0.0,0.0,53.0,33.0,2.1036,1.9801,83219.0,0.05,0.1,0.24,0.52,0.81,1.14,1.36,1.43,1.85


In [49]:
dailydrop.isna().sum()

bus                  0
rail_boardings       0
total_rides          0
prcp                 0
snow                 0
snwd                 0
tmax                 0
tmin                 0
gas_open          2403
gas_close         2403
gas_volume        2476
3_mo              2430
6_mo              2427
1_yr              2427
2_yr              2427
3_yr              2427
5_yr              2427
7_yr              2427
10_yr             2427
20_yr             2427
dtype: int64

## Prep Covid

## Prep Vax