# Data Clean Up

Libraries

In [6]:
import pandas as pd
import pandas_datareader.data as web
import yfinance as yf
import datetime
import numpy as np
import xlwings as xw
from pathlib import Path
import fred_functions as ff

### ETF Daily Returns

Import ETF file

In [7]:
# Load the ETF workbook; the first spreadsheet column becomes the index.
df_returns = pd.read_excel(
    '../df_returns.xlsx',
    sheet_name='Sheet1',
    index_col=0,
)

Select "SPY" and "VBMFX". SPY's inception was in 1993, while VBMFX started in 1986. The data set covers February 1, 1993 to September 30, 2025.

In [8]:
# Keep only the two tickers used in this study.
df = df_returns[['SPY','VBMFX']]
# Date filter is currently disabled; re-enable to restrict the sample window.
# df = df.loc['1993-02-01':'2025-09-30'] # filter for dates
# Take the last observation in each calendar month, then drop months where
# both tickers are missing.
df = df.resample("ME").last().dropna(how='all')
# Month-over-month simple returns. fill_method=None prevents pandas from
# forward-filling gaps before differencing (the deprecated default), which
# would report a fabricated 0% return for a missing month and disagree with
# the log returns computed below.
df_simple_returns = df.pct_change(fill_method=None).dropna(how='all')
# Log returns from the same month-end series, kept in case they are needed.
df_log_returns = np.log(df/df.shift(1)).dropna(how="all")

## Macro Indicators (FRED)

Import macro file

In [9]:
# Load the macro indicator workbook; the first spreadsheet column becomes the index.
df_macro = pd.read_excel(
    '../df_macro.xlsx',
    sheet_name='Sheet1',
    index_col=0,
)
# df_macro = df_macro.loc['1990-01-01':'2025-09-30'] # filter for dates

### GDP Data Preprocessing

To prepare quarterly GDP data for our model, we perform a two-step process to handle its frequency and reporting lag:

1.  **Resample to Monthly Frequency:** Since GDP is released **quarterly**, we first resample it to a monthly series. This is done using a **forward-fill** (`ffill`) method, where the GDP value for a given quarter is applied to each month within that quarter. For example, the Q1 GDP value is used for January, February, and March.

2.  **Simulate Reporting Lag:** In practice, GDP figures are not known in real-time (e.g., January's final GDP might not be available until April). To simulate this delay and prevent lookahead bias, we **shift** the resampled monthly data forward. We will create four lagged features: a 1-month lag, a 2-month lag, a 3-month lag, and a 4-month lag.

In [10]:
# Run the RealGDP column through the fred_functions helpers in sequence:
# forward_fill -> resample_me -> yoy_change.
# NOTE(review): helper semantics are inferred from their names (fill gaps,
# month-end resample, year-over-year change) -- confirm in fred_functions.
df_gdp = ff.yoy_change(
    ff.resample_me(
        ff.forward_fill(df_macro[['RealGDP']].copy())
    )
)

# One lagged frame per horizon (1 to 4 months), each renamed so the columns
# stay distinct after the join.
df_gdp_lag1, df_gdp_lag2, df_gdp_lag3, df_gdp_lag4 = (
    ff.shift_lag(df_gdp, lag_months=months).rename(
        columns={"RealGDP": f"RealGDP_lag{months}"}
    )
    for months in range(1, 5)
)

# Inner join keeps only the dates present in the base series and in every lag.
df_gdp_clean = df_gdp.join(
    [df_gdp_lag1, df_gdp_lag2, df_gdp_lag3, df_gdp_lag4],
    how='inner',
)

### Permits, CPI, CoreCPI, IndustrialProd, RealRetail


In [None]:
# Columns of df_macro handled together in this section.
indicator_index_cols = [
    'Permits',
    'CPI',
    'CoreCPI',
    'IndustrialProd',
    'RealRetail',
]
# Take a copy so later transforms leave df_macro untouched.
df_indicator_index = df_macro.loc[:, indicator_index_cols].copy()




Unnamed: 0_level_0,Permits,CPI,CoreCPI,IndustrialProd,RealRetail
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1985-01-01,1660.0,105.7,107.1,54.527,
1985-01-02,,,,,
1985-01-03,,,,,
1985-01-04,,,,,
1985-01-05,,,,,
...,...,...,...,...,...
2025-09-27,,,,,
2025-09-28,,,,,
2025-09-29,,,,,
2025-09-30,,,,,


### 