We want to take all of the FRED-pulled and FRED-derived values

and push them into a unified dataset for downstream processing (to be unified with the other compatible datasets).

Note that we are probably dealing with duplicate columns, so we merge on date,

preserving the date column while dropping duplicated non-date columns.

In [8]:
import os
import pandas as pd

In [None]:
# Merge every CSV in a folder into a single date-keyed frame. When the same
# (lower-cased) column name appears in several files, only the first
# occurrence is kept; later files contribute only their new columns.

# Path to your folder containing all relevant CSVs
folder_path = "fredstuff/"  # update this to your folder
merge_key = "date"

# os.listdir returns names in arbitrary order; sort them so that the file
# that "wins" a duplicated column name is the same on every run.
csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith(".csv"))

# Running result and the set of column names already present in it
merged_df = None
merged_columns = set()

for filename in csv_files:
    file_path = os.path.join(folder_path, filename)
    df = pd.read_csv(file_path)

    # Normalize column names so the duplicate check is case-insensitive
    df.columns = [col.lower() for col in df.columns]

    if merge_key not in df.columns:
        print(f"Skipping {filename}: no 'date' column")
        continue

    df[merge_key] = pd.to_datetime(df[merge_key])

    if merged_df is None:
        # First usable file seeds the result
        merged_df = df
        merged_columns.update(df.columns)
    else:
        # Only keep new columns + 'date'
        new_cols = [col for col in df.columns if col == merge_key or col not in merged_columns]
        df = df[new_cols]
        merged_df = pd.merge(merged_df, df, on=merge_key, how="outer")
        merged_columns.update(new_cols)

# Fail with a clear message instead of an AttributeError on None below
if merged_df is None:
    raise ValueError(
        f"No CSV file with a '{merge_key}' column was found in {folder_path!r}"
    )

# Interpolate numeric columns only, forward + backward. Linear interpolation
# is positional, so the rows must be in date order first.
# NOTE(review): limit_direction="both" also fills NaNs that precede a column's
# first observation, so early rows receive values derived from later dates --
# confirm this look-ahead is acceptable downstream.
merged_df = merged_df.sort_values(merge_key)
numeric_cols = merged_df.select_dtypes(include="number").columns
if len(numeric_cols) > 0:
    merged_df[numeric_cols] = merged_df[numeric_cols].interpolate(
        method="linear", limit_direction="both"
    )

# Optionally drop rows or columns that are still empty
merged_df = merged_df.dropna(axis=1, how="all")  # remove columns fully NaN
merged_df = merged_df.dropna(axis=0, how="any")  # or drop rows with any NaNs

# Final preview
print(merged_df.head())

# Optionally save
merged_df.to_csv("merged_fredstuff.csv", index=False)

Note that some signals, like aapl, are not really clear.

We considered dropping them, but after we unify our feature matrix for downstream

scalarization and blinding, the weak or redundant signals will be dropped anyway,

so we aren't really worried.

---

In [14]:
# Combine the merged FRED frame with the unified stock frame on 'date'.
# Columns present in both inputs are taken from the FRED frame.

# --- Load and prepare FRED dataset ---
fred_df = pd.read_csv("merged_fredstuff.csv")
fred_df['date'] = pd.to_datetime(fred_df['date'])

# --- Load and prepare stock dataset ---
stock_df = pd.read_csv("preview_unified_stock_data.csv")

# Normalize the date column name to 'date'. Match case-insensitively (e.g.
# 'Date', 'DATE') and only rename when no 'date' column exists yet, so a
# rename can never produce two columns both called 'date'.
if 'date' not in stock_df.columns:
    date_like = [col for col in stock_df.columns if str(col).strip().lower() == "date"]
    if not date_like:
        raise KeyError("preview_unified_stock_data.csv has no 'date' column")
    stock_df = stock_df.rename(columns={date_like[0]: "date"})

stock_df['date'] = pd.to_datetime(stock_df['date'])

# --- Identify shared columns (non-date) ---
shared_columns = set(fred_df.columns).intersection(stock_df.columns) - {"date"}

# Drop shared non-date columns from stock_df before merging
stock_df_filtered = stock_df.drop(columns=shared_columns)

# --- Merge on 'date' ---
merged_combined_df = pd.merge(fred_df, stock_df_filtered, on="date", how="outer")

# Interpolate numeric columns only, forward + backward. Linear interpolation
# is positional, so the rows must be in date order first.
# NOTE(review): limit_direction="both" also fills NaNs that precede a column's
# first observation, so early rows receive values derived from later dates --
# confirm this look-ahead is acceptable downstream.
merged_combined_df = merged_combined_df.sort_values("date")
numeric_cols = merged_combined_df.select_dtypes(include="number").columns
if len(numeric_cols) > 0:
    merged_combined_df[numeric_cols] = merged_combined_df[numeric_cols].interpolate(
        method="linear", limit_direction="both"
    )

# Optionally drop rows or columns that are still empty
merged_combined_df = merged_combined_df.dropna(axis=1, how="all")  # remove columns fully NaN
merged_combined_df = merged_combined_df.dropna(axis=0, how="any")  # or drop rows with any NaNs

# --- Optional: Save or inspect result ---
print(merged_combined_df.head())

# Save result if desired
merged_combined_df.to_csv("merged_fred_and_stock.csv", index=False)

        date  ^gspc  aapl  msft  qqq  xlf  gdp_growth  unemployment_rate  \
0 1997-01-01    0.0   0.0   0.0  0.0  0.0   10002.179                4.0   
1 1997-02-01    0.0   0.0   0.0  0.0  0.0   10002.179                4.0   
2 1997-03-01    0.0   0.0   0.0  0.0  0.0   10002.179                4.0   
3 1997-04-01    0.0   0.0   0.0  0.0  0.0   10002.179                4.0   
4 1997-05-01    0.0   0.0   0.0  0.0  0.0   10002.179                4.0   

   cpi_inflation  consumer_sentiment_index  ...  Low_FORD  Close_FORD  \
0          169.3                     112.0  ...    28.125      30.625   
1          169.3                     112.0  ...    28.125      30.625   
2          169.3                     112.0  ...    28.125      30.625   
3          169.3                     112.0  ...    28.125      30.625   
4          169.3                     112.0  ...    28.125      30.625   

   Adj Close_FORD  Volume_FORD  Open_AAPL  High_AAPL  Low_AAPL  Close_AAPL  \
0          30.625       42

Now we want to merge in the first week's data.

In [32]:
# Add the financial-signal columns to the FRED + stock frame, joined on
# 'date'. Columns already present in the master frame are not duplicated.

# --- Load existing merged FRED + Stock dataset ---
merged_df = pd.read_csv("merged_fred_and_stock.csv")
merged_df['date'] = pd.to_datetime(merged_df['date'])

# --- Load financial signals ---
financial_df = pd.read_csv("preview_financial_data_cleaned2.csv")

# Normalize column names (only when there is no lower-case 'date' column yet)
if 'date' not in financial_df.columns:
    financial_df.columns = [col.lower() for col in financial_df.columns]
financial_df['date'] = pd.to_datetime(financial_df['date'])

# --- Remove duplicate non-date columns ---
shared_cols = set(merged_df.columns).intersection(financial_df.columns) - {"date"}
financial_df = financial_df.drop(columns=shared_cols)

# --- Merge financial signals into the master frame ---
final_merged_df = pd.merge(merged_df, financial_df, on="date", how="outer")

# Interpolate numeric columns only, forward + backward. Linear interpolation
# is positional, so the rows must be in date order first.
# NOTE(review): limit_direction="both" also fills NaNs that precede a column's
# first observation, so early rows receive values derived from later dates --
# confirm this look-ahead is acceptable downstream.
final_merged_df = final_merged_df.sort_values("date")
numeric_cols = final_merged_df.select_dtypes(include="number").columns
if len(numeric_cols) > 0:
    final_merged_df[numeric_cols] = final_merged_df[numeric_cols].interpolate(
        method="linear", limit_direction="both"
    )

# Optionally drop rows or columns that are still empty
final_merged_df = final_merged_df.dropna(axis=1, how="all")  # remove columns fully NaN
final_merged_df = final_merged_df.dropna(axis=0, how="any")  # or drop rows with any NaNs

# --- Preview result ---
print(final_merged_df.head())

# Optional: save
final_merged_df.to_csv("merged_fred_stock_financial.csv", index=False)

        date  ^gspc  aapl  msft  qqq  xlf  gdp_growth  unemployment_rate  \
0 1997-01-01    0.0   0.0   0.0  0.0  0.0   10002.179                4.0   
1 1997-02-01    0.0   0.0   0.0  0.0  0.0   10002.179                4.0   
2 1997-03-01    0.0   0.0   0.0  0.0  0.0   10002.179                4.0   
3 1997-04-01    0.0   0.0   0.0  0.0  0.0   10002.179                4.0   
4 1997-05-01    0.0   0.0   0.0  0.0  0.0   10002.179                4.0   

   cpi_inflation  consumer_sentiment_index  ...  \
0          169.3                     112.0  ...   
1          169.3                     112.0  ...   
2          169.3                     112.0  ...   
3          169.3                     112.0  ...   
4          169.3                     112.0  ...   

   interest rate_lag30_lag90_lag180_rolling90_rolling180  \
0                                           1.746524       
1                                           1.746524       
2                                           1.746524    

Get the values from the NAV (net asset value) column and

rename the resulting columns using the associated scheme code,

to make the mutual-fund NAV data integrable.

In [34]:
import pandas as pd

# Reshape the long mutual-fund NAV table into one 'nav_<scheme code>' column
# per scheme, then merge those columns into the final feature matrix on 'date'.

# --- Load mutual fund NAV dataset ---
mf_df = pd.read_csv("preview_cleaned_mutual_fund_nav copy.csv")

# --- Normalize column names and types ---
mf_df.columns = [col.strip() for col in mf_df.columns]  # remove any whitespace
mf_df['NAV__Scheme Code'] = mf_df['NAV__Scheme Code'].astype(str)
mf_df['date'] = pd.to_datetime(mf_df['date'])

# DataFrame.pivot raises ValueError when an (index, column) pair occurs more
# than once, so keep only the last row for each (date, scheme code) pair.
mf_df = mf_df.drop_duplicates(subset=['date', 'NAV__Scheme Code'], keep='last')

# --- Pivot: one column per scheme code, values = NAV ---
mf_pivot = mf_df.pivot(
    index='date',
    columns='NAV__Scheme Code',
    values='NAV__Net Asset Value'
)

# --- Rename columns with prefix for clarity ---
mf_pivot.columns = [f"nav_{code}" for code in mf_pivot.columns]

# --- Reset index so 'date' becomes a column again ---
mf_pivot = mf_pivot.reset_index()

# --- Preview result ---
print(mf_pivot.head())

# Now this `mf_pivot` DataFrame can be merged on `date` with your final feature matrix like so:
final_merged_df_w_nav = pd.merge(final_merged_df, mf_pivot, on='date', how='outer')

# Interpolate numeric columns only, forward + backward. Linear interpolation
# is positional, so the rows must be in date order first.
# NOTE(review): limit_direction="both" also fills NaNs that precede a column's
# first observation, so rows dated before the NAV data begins receive NAV
# values taken from later dates -- confirm this look-ahead is acceptable.
final_merged_df_w_nav = final_merged_df_w_nav.sort_values("date")
numeric_cols = final_merged_df_w_nav.select_dtypes(include="number").columns
if len(numeric_cols) > 0:
    final_merged_df_w_nav[numeric_cols] = final_merged_df_w_nav[numeric_cols].interpolate(
        method="linear", limit_direction="both"
    )

# Optionally drop rows or columns that are still empty
final_merged_df_w_nav = final_merged_df_w_nav.dropna(axis=1, how="all")  # remove columns fully NaN
final_merged_df_w_nav = final_merged_df_w_nav.dropna(axis=0, how="any")  # or drop rows with any NaNs

# Optional: save
final_merged_df_w_nav.to_csv("merged_fred_stock_financial_w_nav.csv", index=False)

        date  nav_100794  nav_104681  nav_104683  nav_105603  nav_105604  \
0 2025-04-09     29.7061     15.0257     33.8509     31.4783     16.7995   
1 2025-04-10         NaN         NaN         NaN         NaN         NaN   

   nav_106793  nav_106795  nav_106797  nav_115398  ...  nav_149451  \
0      30.231      10.915       20.89         NaN  ...         NaN   
1         NaN         NaN         NaN    2477.978  ...      1000.0   

   nav_149452  nav_149453  nav_149454  nav_151134  nav_151135  nav_151136  \
0         NaN         NaN         NaN     18.7291     10.5205     11.1456   
1   1214.0747      1000.0   1213.8653         NaN         NaN         NaN   

   nav_151137  nav_151138  nav_151140  
0     10.9532     20.0211     10.5399  
1         NaN         NaN         NaN  

[2 rows x 51 columns]


In [36]:
# Display the first rows of the merged frame that includes the NAV columns.
final_merged_df_w_nav.head()

Unnamed: 0,date,^gspc,aapl,msft,qqq,xlf,gdp_growth,unemployment_rate,cpi_inflation,consumer_sentiment_index,...,nav_149451,nav_149452,nav_149453,nav_149454,nav_151134,nav_151135,nav_151136,nav_151137,nav_151138,nav_151140
0,1997-01-01,0.0,0.0,0.0,0.0,0.0,10002.179,4.0,169.3,112.0,...,1000.0,1214.0747,1000.0,1213.8653,18.7291,10.5205,11.1456,10.9532,20.0211,10.5399
1,1997-02-01,0.0,0.0,0.0,0.0,0.0,10002.179,4.0,169.3,112.0,...,1000.0,1214.0747,1000.0,1213.8653,18.7291,10.5205,11.1456,10.9532,20.0211,10.5399
2,1997-03-01,0.0,0.0,0.0,0.0,0.0,10002.179,4.0,169.3,112.0,...,1000.0,1214.0747,1000.0,1213.8653,18.7291,10.5205,11.1456,10.9532,20.0211,10.5399
3,1997-04-01,0.0,0.0,0.0,0.0,0.0,10002.179,4.0,169.3,112.0,...,1000.0,1214.0747,1000.0,1213.8653,18.7291,10.5205,11.1456,10.9532,20.0211,10.5399
4,1997-05-01,0.0,0.0,0.0,0.0,0.0,10002.179,4.0,169.3,112.0,...,1000.0,1214.0747,1000.0,1213.8653,18.7291,10.5205,11.1456,10.9532,20.0211,10.5399
