# Data Clean Up

## Libraries

In [1]:
import pandas as pd
import pandas_datareader.data as web
import yfinance as yf
import datetime
import numpy as np
import xlwings as xw
from pathlib import Path
import fred_functions as ff

## ETF Daily Returns

### Data Clean Up

###### import ETF file

In [2]:
# Load the ETF workbook; the first column of 'Sheet1' becomes the index.
df_returns = pd.read_excel(
    '../df_returns.xlsx',
    sheet_name='Sheet1',
    index_col=0,
)

###### Select "SPY" and "VBMFX". SPY's inception was in 1993, while VBMFX started in 1986. The data set will cover dates from February 1, 1993 to September 30, 2025.

In [3]:
# Month-end panel for the two tickers over the study window.
# NOTE(review): despite its name, df_returns appears to hold price levels,
# since percent changes are computed from it below — confirm against the file.
df = (
    df_returns.loc['1993-02-01':'2025-09-30', ['SPY', 'VBMFX']]  # date window + tickers
    .resample("ME")     # bucket observations by calendar month end
    .last()             # keep the last observation in each month
    .dropna(how='all')  # drop months where every ticker is missing
)

# Simple month-over-month returns. The first month has no prior month to
# compare against, so its all-NaN row is dropped.
df_simple_returns = df.pct_change().dropna(how='all')

# Log returns, ln(P_t / P_{t-1}), kept in case they are needed later.
df_log_returns = np.log(df.div(df.shift(1))).dropna(how="all")

In [4]:
# Sanity check: show the index range, dtypes and non-null counts of the monthly simple returns.
df_simple_returns.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 391 entries, 1993-03-31 to 2025-09-30
Freq: ME
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   SPY     391 non-null    float64
 1   VBMFX   391 non-null    float64
dtypes: float64(2)
memory usage: 9.2 KB


## Macro Indicators (FRED)

### Data Clean Up


###### import Macro File

In [5]:
# Load the macro-indicator workbook (first column as index) and keep only
# the rows inside the study window.
df_macro = (
    pd.read_excel('../df_macro.xlsx', sheet_name='Sheet1', index_col=0)
    .loc['1993-01-01':'2025-09-30']
)

##### GDP

######  GDP is released quarterly. An advance estimate is first released a month after the quarter end. A second, preliminary estimate is released a month after the advance estimate. Finally, an adjusted number is released a month after the second estimate.

##### For our model, we need to resample to month end and forward fill the missing values. This means the GDP value for January will also be used for February and March.
##### The next step is to shift the GDP values so that we create a 2-month lag to simulate real-time analysis. In practice, the January GDP might not be known until April or even June. To simulate this timing issue, we will lag the GDP values by 1 month and by 2 months.

In [6]:
# Put the RealGDP column through the fred_functions helpers.
# NOTE(review): helper behavior is assumed from the names (fill gaps forward,
# then resample to month end) — confirm against fred_functions.
result = ff.forward_fill(df_macro['RealGDP'])
# The resampled series is only displayed here, not assigned; `result` remains
# bound to the forward_fill return value.
ff.resample_me(result)

DATE
1993-01-31    10576.275
1993-02-28    10576.275
1993-03-31    10576.275
1993-04-30    10637.847
1993-05-31    10637.847
                ...    
2025-05-31    23770.976
2025-06-30    23770.976
2025-07-31    23770.976
2025-08-31    23770.976
2025-09-30    23770.976
Freq: ME, Name: RealGDP, Length: 393, dtype: float64