# Data Clean Up

Libraries

In [1]:
import pandas as pd
import pandas_datareader.data as web
import yfinance as yf
import datetime
import numpy as np
import xlwings as xw
from pathlib import Path
import fred_functions as ff
import index_prices as ip
import functions as fp

### ETF Daily Returns

Import ETF file

In [2]:
# Load the raw fund data; the first spreadsheet column becomes the index.
df_prices = pd.read_excel(
    '../df_returns.xlsx',
    sheet_name='Sheet1',
    index_col=0,
)

The index funds "SPY" and "VBMFX" will be used for this project. 

SPY is an exchange-traded fund that tracks the performance of the S&P 500.
SPY's inception was in 1993.
VBMFX is a mutual fund that tracks the performance of the Bloomberg U.S. Aggregate Float Adjusted Index.
VBMFX was started in 1986.

In [3]:
# Tickers used for this project (S&P 500 ETF and aggregate bond fund).
FUND_TICKERS = ['SPY', 'VBMFX']

# Keep only the two funds and make sure the index is a sorted DatetimeIndex,
# which resample() requires.
df_prices = df_prices[FUND_TICKERS]
df_prices.index = pd.to_datetime(df_prices.index)
df_prices = df_prices.sort_index()

# Last available observation of every calendar month; months where both
# funds are missing are discarded.
df_prices = df_prices.resample("ME").last().dropna(how='all')

# Month-over-month simple returns.
# fill_method=None disables pandas' deprecated implicit forward-fill, so a
# missing month stays NaN instead of silently becoming a fabricated 0% return
# (and the result no longer depends on the installed pandas version).
df_simple_returns = df_prices.pct_change(fill_method=None).dropna(how='all')

# Derived return features from the project-local index_prices helpers
# (presumably the *_3m / *_ln_3m style columns seen in the final table --
# see index_prices for the exact definitions). A fresh list is passed to each
# helper so neither call can affect the other through a shared argument.
df_cumm_returns = ip.cummulative_return(df_simple_returns, list(FUND_TICKERS))
df_ln_returns = ip.ln_return(df_simple_returns, list(FUND_TICKERS))

# Union of both feature sets on the date index; values from df_cumm_returns
# win where both frames define the same cell.
df_funds = df_cumm_returns.combine_first(df_ln_returns).sort_index()


## Macro Indicators (FRED)

Import macro file

In [4]:
# Load the macro indicator workbook; the first spreadsheet column becomes the index.
# No date filter is applied here -- the date window is cut once, on the final
# combined frame.
df_macro = pd.read_excel(
    '../df_macro.xlsx',
    sheet_name='Sheet1',
    index_col=0,
)

### GDP Data Preprocessing

To prepare quarterly GDP data for our model, we perform a two-step process to handle its frequency and reporting lag:

1.  Resample to Monthly Frequency: Since GDP is released quarterly, we first resample it to a monthly series. This is done using a forward-fill (`ffill`) method, where the GDP value for a given quarter is applied to each month within that quarter. For example, the Q1 GDP value is used for January, February, and March.

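2.  Simulate Reporting Lag: In practice, GDP figures are not known in real time (for example, the first estimate of Q1 GDP is typically not published until late April). To simulate this delay and prevent lookahead bias, we shift the resampled monthly data forward by 3 months. Note that in the code the monthly series is converted to a year-over-year change before the shift is applied.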

In [5]:
# Real GDP pipeline: forward-fill the quarterly series, align it to month-end,
# then express it as a year-over-year change (helpers live in fred_functions).
df_gdp = ff.yoy_change(
    ff.resample_me(
        ff.forward_fill(df_macro[['RealGDP']].copy())
    )
)

# Reporting-lag proxy: the same series shifted by 3 months, stored under its own name.
# Other lags (1, 2 and 4 months) are currently disabled; to re-enable one, build it
# with ff.shift_lag(df_gdp, lag_months=k), rename it to "RealGDP_<k>m" and add it
# to the join list below.
df_gdp_lag3 = ff.shift_lag(df_gdp, lag_months=3)
df_gdp_lag3 = df_gdp_lag3.rename(columns={"RealGDP": "RealGDP_3m"})

# Keep only the dates present in both the unshifted and the shifted series.
df_gdp_clean = df_gdp.join([df_gdp_lag3], how='inner')

### Permits, CPI, CoreCPI, IndustrialProd, RealRetail


In [6]:
# Monthly indicators that all go through the same treatment.
indicator_index_cols = ['Permits','CPI','CoreCPI','IndustrialProd','RealRetail']

# Month-end levels, plus a copy shifted by one month to account for information lag.
df_indicator_index = ff.resample_me(df_macro[indicator_index_cols].copy())
df_lag1_indicator_index = ff.shift_lag(df_indicator_index, lag_months=1)

# Rename maps derived from the column list instead of being typed out by hand:
#   'CPI' -> 'CPI_1m'   and   'CPI_1m' -> 'CPI_1m_yoy'
col_rename_indicator_index = {name: f"{name}_1m" for name in indicator_index_cols}
col_rename_yoy = {lagged: f"{lagged}_yoy" for lagged in col_rename_indicator_index.values()}

# NOTE(review): the return value of ff.rename_columns is discarded, so it
# presumably renames the frame in place -- confirm in fred_functions.
ff.rename_columns(df_lag1_indicator_index, col_rename_indicator_index)

# Year-over-year change of the lagged levels, labelled with the *_yoy names.
df_indicator_yoy = ff.yoy_change(df_lag1_indicator_index)
ff.rename_columns(df_indicator_yoy, col_rename_yoy)

# Month-over-month change of the lagged levels, then the 3-month rolling
# features computed over every resulting column.
df_indicator_mom = ff.mom_change(df_lag1_indicator_index)
df_indicator_mom = ff.rolling_3m(df_indicator_mom, df_indicator_mom.columns)

# Levels + YoY + MoM/rolling features, restricted to dates common to all three.
df_indicator_clean = df_indicator_index.join(
    [df_indicator_yoy, df_indicator_mom],
    how='inner',
)

### Unemployment

In [7]:
# Unemployment rate: forward-fill, then align to month-end.
df_unemployment = (
    df_macro[["UnemploymentRate"]]
    .copy()
    .pipe(ff.forward_fill)
    .pipe(ff.resample_me)
)

# One-month shift for information lag, followed by the month-over-month change.
df_unemployment_1m = (
    df_unemployment
    .pipe(ff.shift_lag, lag_months=1)
    .pipe(ff.mom_change)
)

# 3-month rolling features over every column produced above.
df_unemployment_clean = ff.rolling_3m(
    df_unemployment_1m,
    df_unemployment_1m.columns,
)

### Rates - Mortgage 30 Year, US 10 Year Treasury, US 2 Year Treasury, US 3 Month Treasury, TIPS 10 Year, CorpBAA, CorpAAA, FedFundsRate

In [8]:
# Rate levels plus slope / credit-spread columns and their deltas, with no lag applied.
df_rate_clean = ff.credit_spread(
    df_macro,
    lag_months=0,
    delta_calc=True,
)
# The raw rate columns ('Mort30Y', 'UST10Y', 'UST2Y', 'UST3M', 'CorpBAA',
# 'CorpAAA', 'FedFundsRate') are intentionally kept; drop them from
# df_rate_clean here if only the derived spread columns are wanted.

In [9]:
# Preview the rates frame (pandas abbreviates the middle rows/columns in the rendering).
df_rate_clean

Unnamed: 0_level_0,Mort30Y,UST10Y,UST2Y,UST3M,CorpBAA,CorpAAA,FedFundsRate,Slope_10Y_2Y,Slope_10Y_3M,Slope_10Y_FF,...,UST3M_Delta3m,CorpBAA_Delta3m,CorpAAA_Delta3m,FedFundsRate_Delta3m,Slope_10Y_2Y_Delta3m,Slope_10Y_3M_Delta3m,Slope_10Y_FF_Delta3m,BAA_minus_10Y_Delta3m,AAA_minus_10Y_Delta3m,BAA_minus_AAA_Delta3m
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1985-01-31,12.96,11.17,9.90,8.33,,11.82,8.73,1.27,2.84,2.44,...,,,,,,,,,,
1985-02-28,12.94,11.91,10.66,8.81,,12.56,8.74,1.25,3.10,3.17,...,,,,,,,,,,
1985-03-31,13.29,11.65,10.43,8.47,,12.42,8.58,1.22,3.18,3.07,...,,,,,,,,,,
1985-04-30,13.12,11.41,9.91,8.12,,12.18,8.58,1.50,3.29,2.83,...,-0.21,,0.36,-0.15,0.23,0.45,0.39,,0.12,
1985-05-31,12.71,10.28,8.92,7.38,,11.17,7.64,1.36,2.90,2.64,...,-1.43,,-1.39,-1.10,0.11,-0.20,-0.53,,0.24,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-06-30,6.77,4.24,3.72,4.41,5.99,5.34,4.33,0.52,-0.17,-0.09,...,0.09,0.00,0.03,0.00,0.18,-0.08,0.01,-0.01,0.02,-0.03
2025-07-31,6.72,4.37,3.94,4.41,6.04,5.41,4.33,0.43,-0.04,0.04,...,0.10,-0.09,0.03,0.00,-0.14,0.10,0.20,-0.29,-0.17,-0.12
2025-08-31,6.56,4.23,3.59,4.23,6.03,5.42,4.33,0.64,0.00,-0.10,...,-0.13,-0.22,-0.09,0.00,0.12,-0.05,-0.18,-0.04,0.09,-0.13
2025-09-30,6.30,4.16,3.60,4.02,5.83,5.22,4.09,0.56,0.14,0.07,...,-0.39,-0.16,-0.12,-0.24,0.04,0.31,0.16,-0.08,-0.04,-0.04


### Combine all macro features

In [10]:
# Fold every macro feature frame into one table on the shared date index.
# combine_first keeps the caller's value where both frames define a cell, so
# precedence is: GDP, then indicators, then unemployment, then rates.
df_macro_clean = df_gdp_clean.combine_first(df_indicator_clean)
df_macro_clean = df_macro_clean.combine_first(df_unemployment_clean)
df_macro_clean = df_macro_clean.combine_first(df_rate_clean)
df_macro_clean = df_macro_clean.sort_index()

### Combine index fund df and macro df

In [11]:
# Fund return features first, macro features fill in everything else;
# the result is ordered chronologically.
df_clean = df_funds.combine_first(df_macro_clean)
df_clean = df_clean.sort_index()

In [12]:
# Sanity check: (rows, columns) of the combined fund + macro table.
df_clean.shape

(490, 77)

### Rearrange columns

In [13]:
# Column layout comes from the project-local `functions` module.
column_order = fp.column_order()
# Printed so the count can be compared by eye with df_clean.shape.
print(len(column_order))
# Label-based selection: raises KeyError if any expected column is missing.
df_clean = df_clean.loc[:, column_order]

77


In [14]:
# Keep only rows dated 1993-04-01 through 2025-09-30; label slicing with .loc
# includes both endpoints.
# NOTE(review): the reason for the 1993-04-01 start is not stated here --
# presumably SPY's 1993 inception plus the warm-up for the 3-month features; confirm.
df_clean = df_clean.loc['1993-04-01':'2025-09-30'] # filter for dates

In [15]:
# Show every remaining row that has at least one missing value in any column.
df_clean[df_clean.isna().any(axis=1)]

Unnamed: 0,SPY,SPY_3m,SPY_3m_ann,SPY_3m_sd,SPY_3m_ann_sd,SPY_ln_3m,SPY_ann_ln_3m,VBMFX,VBMFX_3m,VBMFX_3m_ann,...,Slope_10Y_FF_Delta3m,AAA_minus_10Y,AAA_minus_10Y_Delta1m,AAA_minus_10Y_Delta3m,BAA_minus_10Y,BAA_minus_10Y_Delta1m,BAA_minus_10Y_Delta3m,BAA_minus_AAA,BAA_minus_AAA_Delta1m,BAA_minus_AAA_Delta3m
2025-09-30,0.03562,0.081212,0.366607,0.008091,0.016183,0.035001,0.312331,0.010419,0.01903,0.078321,...,0.16,1.06,-0.13,-0.04,1.67,-0.13,-0.08,0.61,0.0,-0.04


In [16]:
def to_excel(df, file_name, out_dir=None):
    """Write ``df`` to an Excel workbook, keeping its index as the first column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to export (any object exposing ``to_excel(path, index=...)`` works).
    file_name : str
        Name of the workbook to create, e.g. ``"df_clean.xlsx"``.
    out_dir : str or pathlib.Path, optional
        Folder to write into. When omitted, the original project location
        ``~/Documents/cs668/CS668_Capstone`` is used, so existing calls behave
        as before.

    Returns
    -------
    None
    """
    if out_dir is None:
        out_dir = Path.home() / "Documents" / "cs668" / "CS668_Capstone"
    out_dir = Path(out_dir)
    # Create the destination up front so the export does not fail on a machine
    # where the folder has not been created yet.
    out_dir.mkdir(parents=True, exist_ok=True)
    out = out_dir / file_name
    df.to_excel(out, index=True)  # index=True is the default; stated explicitly to keep the index column

# Persist the final, date-filtered table as df_clean.xlsx (index included).
to_excel(df_clean,"df_clean.xlsx")
