# Data Clean Up

Libraries

In [16]:
import pandas as pd
import pandas_datareader.data as web
import yfinance as yf
import datetime
import numpy as np
import xlwings as xw
from pathlib import Path
import fred_functions as ff
import index_prices as ip
import functions as fp

### ETF Daily Returns

Import ETF file

In [17]:
# Load the ETF workbook; the first spreadsheet column becomes the index.
# Despite the file name, the values are treated as prices downstream
# (returns are derived from them with pct_change).
df_prices = pd.read_excel(
    '../df_returns.xlsx',
    sheet_name='Sheet1',
    index_col=0,
)

The index funds "SPY" and "VBMFX" will be used for this project. 

SPY is an exchange-traded fund that tracks the performance of the S&P 500; its inception was in 1993.
VBMFX is a mutual fund that tracks the performance of the Bloomberg U.S. Aggregate Float Adjusted Index; it was started in 1986.

In [18]:
# Build the monthly fund table: month-end prices plus the derived return columns.
fund_tickers = ['SPY', 'VBMFX']  # single source of truth for the tickers used below

df_prices = df_prices[fund_tickers]  # keep only the selected tickers
df_prices.index = pd.to_datetime(df_prices.index)
df_prices = df_prices.sort_index()
# Last available observation of each calendar month; drop months where both funds are missing.
df_prices = df_prices.resample("ME").last().dropna(how='all')
# Month-over-month simple returns. fill_method=None is passed explicitly: the deprecated
# default ('pad') forward-fills a missing price before differencing, which reports a
# spurious 0% return for that month instead of NaN.
df_simple_returns = df_prices.pct_change(fill_method=None).dropna(how='all')
# Project helpers; judging by the displayed result they add the *_3m / *_ln_3m style
# columns per ticker -- see index_prices for the exact definitions.
df_cumm_returns = ip.cummulative_return(df_simple_returns, fund_tickers)
df_ln_returns = ip.ln_return(df_simple_returns, fund_tickers)
df_funds = df_cumm_returns.combine_first(df_ln_returns).sort_index()
# At this point the bare ticker columns hold monthly returns, not prices, so they are
# renamed to *_mom ...
df_funds = df_funds.rename(columns={'SPY': "SPY_mom", 'VBMFX': "VBMFX_mom"})
# ... which frees the names SPY / VBMFX for the month-end prices merged in here.
df_funds = df_funds.combine_first(df_prices).sort_index()


In [19]:
# Visual check of the combined fund table (month-end prices plus derived return columns).
df_funds

Unnamed: 0_level_0,SPY,SPY_3m,SPY_3m_ann,SPY_3m_ann_sd,SPY_3m_sd,SPY_ann_ln_3m,SPY_ln_3m,SPY_mom,VBMFX,VBMFX_3m,VBMFX_3m_ann,VBMFX_3m_ann_sd,VBMFX_3m_sd,VBMFX_ann_ln_3m,VBMFX_ln_3m,VBMFX_mom
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1986-12-31,,,,,,,,,1.420026,,,,,,,
1987-01-31,,,,,,,,,1.438512,,,,,,0.012934,0.013018
1987-02-28,,,,,,,,,1.446386,,,,,,0.005458,0.005473
1987-03-31,,,,,,,,,1.437078,0.012009,0.048907,0.019615,0.009808,0.047748,-0.006455,-0.006435
1987-04-30,,,,,,,,,1.391309,-0.032814,-0.124936,0.038128,0.019064,-0.133458,-0.032367,-0.031849
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-05-31,586.028015,-0.005078,-0.020156,0.119402,0.059701,-0.020362,0.060949,0.062845,9.439227,-0.002884,-0.011485,0.011429,0.005714,-0.011551,-0.007151,-0.007126
2025-06-30,616.141785,0.107772,0.505918,0.076822,0.038411,0.409403,0.050110,0.051386,9.587547,0.012662,0.051619,0.022839,0.011420,0.050331,0.015591,0.015713
2025-07-31,630.332458,0.143197,0.707987,0.040991,0.020495,0.535316,0.022770,0.023032,9.558923,0.005465,0.022039,0.024336,0.012168,0.021800,-0.002990,-0.002986
2025-08-31,643.266602,0.097672,0.451746,0.034284,0.017142,0.372767,0.020312,0.020520,9.669253,0.024369,0.101098,0.019632,0.009816,0.096308,0.011476,0.011542


## Macro Indicators (FRED)

Import macro file

In [20]:
# Load the raw macro indicator sheet; the first spreadsheet column becomes the index.
# No date window is applied here (an optional filter would be
# df_macro.loc['1990-01-01':'2025-09-30']).
df_macro = pd.read_excel(
    '../df_macro.xlsx',
    sheet_name='Sheet1',
    index_col=0,
)

### GDP Data Preprocessing

To prepare quarterly GDP data for our model, we perform a two-step process to handle its frequency and reporting lag:

1.  Resample to Monthly Frequency: Since GDP is released quarterly, we first resample it to a monthly series. This is done using a forward-fill (`ffill`) method, where the GDP value for a given quarter is applied to each month within that quarter. For example, the Q1 GDP value is used for January, February, and March.

2.  Simulate Reporting Lag: In practice, GDP figures are not known in real time (for example, the first estimate of first-quarter GDP is not published until the end of April). To simulate this delay and prevent lookahead bias, we shift the resampled monthly data forward by 3 months.

In [21]:
# GDP: forward-fill -> month-end resample -> year-over-year change, applied in that order.
df_gdp = (
    df_macro[['RealGDP']]
    .copy()
    .pipe(ff.forward_fill)
    .pipe(ff.resample_me)
    .pipe(ff.yoy_change)
)

# Only the 3-month lag is used. Other lags (1, 2 or 4 months) can be built the same way,
# e.g. ff.shift_lag(df_gdp, lag_months=1).rename(columns={"RealGDP": "RealGDP_1m"}),
# and appended to the list passed to join() below.
gdp_lag3_name = {"RealGDP": "RealGDP_3m"}
df_gdp_lag3 = ff.shift_lag(df_gdp, lag_months=3).rename(columns=gdp_lag3_name)

# NOTE(review): the unlagged RealGDP column stays in the result next to RealGDP_3m --
# confirm it is not fed to the model as a feature.
df_gdp_clean = df_gdp.join([df_gdp_lag3], how='inner')

### Permits, CPI, CoreCPI, IndustrialProd, RealRetail


In [22]:
# Indicators that get a one-month information lag.
indicator_index_cols = ['Permits', 'CPI', 'CoreCPI', 'IndustrialProd', 'RealRetail']

# Rename maps derived from the column list:
#   <name>    -> <name>_1m      (level lagged by one month)
#   <name>_1m -> <name>_1m_yoy  (year-over-year change of the lagged level)
col_rename_indicator_index = {name: f'{name}_1m' for name in indicator_index_cols}
col_rename_yoy = {f'{name}_1m': f'{name}_1m_yoy' for name in indicator_index_cols}

# Month-end levels, then the same levels shifted by one month.
df_indicator_index = ff.resample_me(df_macro[indicator_index_cols].copy())
df_lag1_indicator_index = ff.shift_lag(df_indicator_index, lag_months=1)

# rename_columns is called for its effect on the frame passed in (its return value is
# not used) -- presumably an in-place rename; see fred_functions.
ff.rename_columns(df_lag1_indicator_index, col_rename_indicator_index)

# Year-over-year change of the lagged levels.
df_indicator_yoy = ff.yoy_change(df_lag1_indicator_index)
ff.rename_columns(df_indicator_yoy, col_rename_yoy)

# Month-over-month change of the lagged levels, then the 3-month rolling columns.
df_indicator_mom = ff.mom_change(df_lag1_indicator_index)
df_indicator_mom = ff.rolling_3m(df_indicator_mom, df_indicator_mom.columns)

# Keep only the dates present in all three frames.
df_indicator_clean = df_indicator_index.join(
    [df_indicator_yoy, df_indicator_mom],
    how='inner',
)

### Unemployment

In [23]:
# Unemployment rate: forward-fill, then take month-end values.
df_unemployment = (
    df_macro[["UnemploymentRate"]]
    .copy()
    .pipe(ff.forward_fill)
    .pipe(ff.resample_me)
)

# One-month lag first, then the month-over-month change of the lagged series.
df_unemployment_1m = ff.mom_change(
    ff.shift_lag(df_unemployment, lag_months=1)
)

# Add the 3-month rolling columns for every column produced so far.
df_unemployment_clean = ff.rolling_3m(
    df_unemployment_1m,
    df_unemployment_1m.columns,
)

### 

### Rates - Mortgage 30 Year, US 10 Year Treasury, US 2 Year Treasury, US 3 Month Treasury, TIPS 10 Year, CorpBAA, CorpAAA, FedFundsRate

In [24]:
# Rate features from the project helper, requested with no lag (lag_months=0) and with
# delta_calc switched on -- see fred_functions.credit_spread for what each option adds.
df_rate_clean = ff.credit_spread(
    df_macro,
    lag_months=0,
    delta_calc=True,
)
# The raw rate columns are kept. To drop them instead:
#   col_remove = ['Mort30Y', 'UST10Y', 'UST2Y', 'UST3M', 'CorpBAA', 'CorpAAA', 'FedFundsRate']
#   df_rate_clean.drop(columns=col_remove, inplace=True)

### Combine all macro features

In [25]:
# Merge the macro feature frames. combine_first gives priority to the frame on the left,
# so the GDP frame wins on any overlapping cell, followed by the others in the order listed.
df_macro_clean = df_gdp_clean
for _macro_part in (df_indicator_clean, df_unemployment_clean, df_rate_clean):
    df_macro_clean = df_macro_clean.combine_first(_macro_part)
df_macro_clean = df_macro_clean.sort_index()

### Combine index fund df and macro df

In [26]:
# Merge fund columns with the macro features; fund values take priority on any overlap.
df_clean = df_funds.combine_first(df_macro_clean)
df_clean = df_clean.sort_index()

In [27]:
# (rows, columns) of the merged table; the column count is compared with the length of
# the column ordering in the next step.
df_clean.shape

(490, 79)

### Rearrange columns

In [28]:
# Column ordering supplied by the project helper.
column_order = fp.column_order()
# Printed so the count can be compared with the column count of df_clean shown earlier.
print(len(column_order))
# Reorder the columns; a label missing from df_clean raises KeyError.
df_clean = df_clean.loc[:, column_order]

79


### Filter data

In [29]:
# Restrict the table to the analysis window (label-based slice, both ends inclusive).
# NOTE(review): the start date presumably reflects SPY's 1993 inception plus the months
# needed for the 3-month columns -- confirm.
sample_start, sample_end = '1993-04-01', '2025-09-30'
df_clean = df_clean.loc[sample_start:sample_end]

### Export data to excel

In [None]:
def to_excel(df, file_name, out_dir=None):
    """Write ``df`` (index included) to an Excel workbook.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to export; only its ``to_excel`` method is used.
    file_name : str
        Name of the workbook to create inside ``out_dir``.
    out_dir : str or pathlib.Path, optional
        Destination folder. When omitted, the original location
        ``~/Documents/cs668/CS668_Capstone`` is used, so existing calls
        keep working unchanged.

    Returns
    -------
    None
    """
    if out_dir is None:
        out_dir = Path.home() / "Documents" / "cs668" / "CS668_Capstone"
    out_dir = Path(out_dir)
    # Create the folder first so the export does not fail on a machine where it is missing.
    out_dir.mkdir(parents=True, exist_ok=True)
    out = out_dir / file_name
    df.to_excel(out, index=True)  # index=True keeps the index in the sheet

# Call to_excel function to save as excel
# to_excel(df_clean,"df_SPY_VBMFX_Macro_Signals.xlsx")
