# Data Clean Up

Libraries

In [1]:
import pandas as pd
import pandas_datareader.data as web
import yfinance as yf
import datetime
import numpy as np
import xlwings as xw
from pathlib import Path
import fred_functions as ff
import index_prices as ip
import functions as fp

### ETF Daily Returns

Import ETF file

In [2]:
# Load the ETF workbook; the first spreadsheet column is used as the index.
df_prices = pd.read_excel(
    '../df_prices.xlsx',
    sheet_name='Sheet1',
    index_col=0,
)

The index funds "SPY" and "VBMFX" will be used for this project. 

SPY is an exchange-traded fund that tracks the performance of the S&P 500. Its inception was in 1993.
VBMFX is a mutual fund that tracks the performance of the Bloomberg U.S. Aggregate Float Adjusted Index. It was started in 1986.

In [3]:
# Build the monthly fund frame: month-end prices plus the return columns
# produced by the index_prices helpers.
fund_tickers = ['SPY', 'VBMFX']  # select tickers

prices_selected = df_prices[fund_tickers]
# df = df.loc['1993-02-01':'2025-09-30'] # filter for dates
prices_selected.index = pd.to_datetime(prices_selected.index)  # make sure the index is datetime

# Sort chronologically, keep the last observation of each month, and drop
# months in which every ticker is missing.
df_prices = (
    prices_selected
    .sort_index()
    .resample("ME")
    .last()
    .dropna(how='all')
)

# Percent change from the previous month-end; rows where all tickers are NaN are dropped.
df_simple_returns = df_prices.pct_change().dropna(how='all')

# Return features computed by the project helpers from the simple returns.
df_cumm_returns = ip.cummulative_return(df_simple_returns, fund_tickers)
df_ln_returns = ip.ln_return(df_simple_returns, fund_tickers)

# Per the original author's note, the SPY/VBMFX columns in the return frames
# show percentages, so they are renamed to *_mom and the price columns are
# then merged back in under the plain ticker names.
df_funds = (
    df_cumm_returns
    .combine_first(df_ln_returns)
    .sort_index()
    .rename(columns={'SPY': "SPY_mom", 'VBMFX': "VBMFX_mom"})
    .combine_first(df_prices)
    .sort_index()
)


## Macro Indicators (FRED)

Import macro file

In [4]:
# Load the macro-indicator workbook; the first spreadsheet column is used as the index.
df_macro = pd.read_excel(
    '../df_macro.xlsx',
    sheet_name='Sheet1',
    index_col=0,
)
# df_macro = df_macro.loc['1990-01-01':'2025-09-30'] # filter for dates

### GDP Data Preprocessing

To prepare quarterly GDP data for our model, we perform a two-step process to handle GDP data frequency and reporting lag:

GDP is reported on a quarterly basis. For our research we are using Real Gross Domestic Product (GDPC1) from the FRED website

Units: Billions of Chained 2017 Dollars

**Source:** [FRED GDPC1](https://fred.stlouisfed.org/series/GDPC1)

1.  Since GDP is reported quarterly, we use the forward-fill method to fill in the blank months. The GDP value for a quarter is the same for all the months within that quarter. For example, the Q1 GDP value is used for January, February, and March.

2.  Simulate Reporting Lag: In practice, GDP figures are not known in real-time (January's final GDP might not be available until April). To simulate this delay and prevent lookahead bias, we shift the resampled monthly data forward by 3 months.

In [None]:
# Real GDP features. The ff.* helpers live in fred_functions; the comments
# below describe only what the call sites show.
df_gdp = ff.forward_fill(df_macro[['RealGDP']].copy())  # forward fill to fill in the blank months
df_gdp = ff.resample_me(df_gdp)  # resampled for month-end value

# Shift the values by 3 months to simulate the GDP reporting lag.
df_gdp_lag = ff.shift_lag(df_gdp, lag_months=3).rename(columns={'RealGDP': 'RealGDP_lag'})

# NOTE(review): the change features are now derived from the *lagged* series.
# Previously they were computed from the unlagged df_gdp, so RealGDP_yoy and
# RealGDP_mom used values that the 3-month shift is meant to hide (lookahead
# bias). The monthly-indicator cell derives its yoy/mom features from its
# lagged frame in the same way. Output column names are unchanged.
df_gdp_yoy = ff.yoy_change(df_gdp_lag).rename(columns={'RealGDP_lag': 'RealGDP_yoy'})  # yoy change
df_gdp_mom = ff.mom_change(df_gdp_lag).rename(columns={'RealGDP_lag': 'RealGDP_mom'})  # mom change

# Combine the month-end level with the lagged level and the lagged change features.
df_gdp_clean = df_gdp.join([df_gdp_lag, df_gdp_yoy, df_gdp_mom], how='inner')

### Permits, CPI, CoreCPI, IndustrialProd, and RealRetail


### Permits, CPI, CoreCPI, IndustrialProd, and RealRetail Data Preprocessing

These key economic indicators are grouped together in the data preprocessing process since they are all reported monthly.

### PERMIT 

PERMIT is the number of new privately-owned housing units authorized by building permits. It is an annualized estimate based on that month's seasonally adjusted pace.

Units: Thousands of Units, Seasonally Adjusted Annual Rate (SAAR)
[Building Permit Survey link](https://www.census.gov/construction/bps/about.html)

**Source:** [FRED PERMIT](https://fred.stlouisfed.org/series/PERMIT)


In [7]:
# Monthly indicators: month-end levels joined with year-over-year and rolling
# 3-month features that are computed from the 1-month-lagged levels.
indicator_index_cols = ['Permits', 'CPI', 'CoreCPI', 'IndustrialProd', 'RealRetail']  # columns to process

# Work on a copy of the selected columns and take the month-end values.
df_indicator_index = ff.resample_me(df_macro[indicator_index_cols].copy())

# Shift by 1 month to account for information lag.
df_lag1_indicator_index = ff.shift_lag(df_indicator_index, lag_months=1)

# Rename maps, derived from the column list:
#   original name -> "<name>_1m"      (lagged level)
#   "<name>_1m"   -> "<name>_1m_yoy"  (year-over-year change of the lagged level)
col_rename_indicator_index = {name: f'{name}_1m' for name in indicator_index_cols}
col_rename_yoy = {
    lagged_name: f'{lagged_name}_yoy'
    for lagged_name in col_rename_indicator_index.values()
}

# ff.rename_columns is called for its effect on the frame; its return value is
# not used (presumably it renames in place - confirm in fred_functions).
ff.rename_columns(df_lag1_indicator_index, col_rename_indicator_index)

# Year-over-year change of the lagged levels, then renamed with the yoy map.
df_indicator_yoy = ff.yoy_change(df_lag1_indicator_index)
ff.rename_columns(df_indicator_yoy, col_rename_yoy)

# Month-over-month change of the lagged levels, fed into the 3-month rolling
# helper (per the original note: 3-month rolling average and its annualized form).
indicator_mom_raw = ff.mom_change(df_lag1_indicator_index)
df_indicator_mom = ff.rolling_3m(indicator_mom_raw, indicator_mom_raw.columns)

# Month-end levels + yoy features + rolling features, on their common dates.
df_indicator_clean = df_indicator_index.join(
    [df_indicator_yoy, df_indicator_mom],
    how='inner',
)

### Unemployment

In [8]:
# Unemployment rate: month-end level joined with rolling 3-month features that
# are computed from the 1-month-lagged series.

# Copy the column, forward fill it, and take the month-end values.
df_unemployment = ff.resample_me(
    ff.forward_fill(df_macro[["UnemploymentRate"]].copy())
)

# Lag by 1 month, then take the month-over-month change of the lagged series.
df_unemployment_1m = ff.mom_change(
    ff.shift_lag(df_unemployment, lag_months=1)
)

# Rolling 3-month features of the change, joined back onto the level.
df_unemployment_clean = df_unemployment.join(
    [ff.rolling_3m(df_unemployment_1m, df_unemployment_1m.columns)],
    how='inner',
)

In [9]:
# Show the combined unemployment frame (level plus the rolling 3-month columns) for inspection.
df_unemployment_clean

Unnamed: 0_level_0,UnemploymentRate,UnemploymentRate_r3m,UnemploymentRate_ann_r3m
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1985-01-31,7.3,,
1985-02-28,7.2,,
1985-03-31,7.2,,
1985-04-30,7.3,,
1985-05-31,7.2,0.000063,0.000761
...,...,...,...
2025-06-30,4.1,0.008130,0.102044
2025-07-31,4.2,-0.007937,-0.091189
2025-08-31,4.3,0.000194,0.002325
2025-09-30,4.3,0.008130,0.102044


### 

### Rates - Mortgage 30 Year, US 10 Year Treasury, US 2 Year Treasury, US 3 Month Treasury, TIPS 10 Year, CorpBAA, CorpAAA, FedFundsRate

In [10]:
# Build the rates frame with the project helper; no lag is applied
# (lag_months=0) and delta_calc is switched on.
df_rate_clean = ff.credit_spread(
    df_macro,
    lag_months=0,
    delta_calc=True,
)
# col_remove = ['Mort30Y', 'UST10Y', 'UST2Y', 'UST3M', 'CorpBAA', 'CorpAAA','FedFundsRate']
# df_rate_clean.drop(columns=col_remove,inplace=True)

### Combine all macro features

In [11]:
# Merge all macro feature frames, starting from the GDP frame. combine_first
# keeps the values already present and fills the rest from each later frame.
macro_parts = [df_indicator_clean, df_unemployment_clean, df_rate_clean]

df_macro_clean = df_gdp_clean
for part in macro_parts:
    df_macro_clean = df_macro_clean.combine_first(part)
df_macro_clean = df_macro_clean.sort_index()  # chronological order

### Combine index fund df and macro df

In [12]:
# Merge the fund frame with the macro frame and order the rows by date.
funds_with_macro = df_funds.combine_first(df_macro_clean)
df_clean = funds_with_macro.sort_index()

In [13]:
# Show the (rows, columns) dimensions of the combined frame.
df_clean.shape

(490, 80)

### Rearrange column

In [14]:
# Reorder the columns using the ordering returned by functions.column_order().
column_order = fp.column_order()
print(len(column_order))  # number of columns in the requested ordering
df_clean = df_clean.loc[:, column_order]

80


### Filter data

In [15]:
# Keep only the rows inside the sample window (both bounds inclusive for label slicing).
sample_start = '1993-04-01'
sample_end = '2025-09-30'
df_clean = df_clean.loc[sample_start:sample_end]  # filter for dates

### Export data to excel

In [16]:
def to_excel(df, file_name, out_dir=None):
    """Write ``df`` to an Excel workbook and return the path that was written.

    Parameters
    ----------
    df : object with a ``to_excel(path, index=...)`` method
        Typically a pandas DataFrame.
    file_name : str
        Name of the workbook file, e.g. ``"data.xlsx"``.
    out_dir : str or pathlib.Path, optional
        Directory to write into. Defaults to
        ``~/Documents/cs668/CS668_Capstone``, the location used originally.

    Returns
    -------
    pathlib.Path
        Full path of the written workbook.
    """
    if out_dir is None:
        target_dir = Path.home() / "Documents" / "cs668" / "CS668_Capstone"
    else:
        target_dir = Path(out_dir)
    # Create the folder when it is missing so the export does not fail on a
    # machine where the project directory has not been set up yet.
    target_dir.mkdir(parents=True, exist_ok=True)
    out = target_dir / file_name
    df.to_excel(out, index=True)  # keep the index in the exported sheet
    return out

# Call to_excel function to save as excel
# to_excel(df_clean,"df_SPY_VBMFX_Macro_Signals.xlsx")
