In [1]:
import pandas as pd
import numpy as np
import joblib
from numpy.lib.stride_tricks import sliding_window_view
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Read one CSV per year (2010 through 2023 inclusive) and tag every row of
# each frame with the year it was loaded for, so the frames stay
# distinguishable once they are stacked together.
years = list(range(2010, 2024))
data_list = [
    pd.read_csv("../Data/data_" + str(year) + ".csv").assign(year=year)
    for year in years
]

In [3]:
# Running total of the "NaN count" column per GEOIDFQ across all yearly frames.
# Each inner merge keeps only identifiers present in both sides, so a GEOIDFQ
# missing from any single year drops out of the tally entirely.
nan_df = data_list[0][["GEOIDFQ", "NaN count"]].rename(
    columns={"NaN count": "NaN count sum"}
)

for yearly_df in data_list[1:]:
    merged = nan_df.merge(yearly_df[["GEOIDFQ", "NaN count"]], on="GEOIDFQ", how="inner")
    merged["NaN count sum"] = merged["NaN count sum"] + merged["NaN count"]
    nan_df = merged.drop(columns="NaN count")

# Discard the three identifiers with the largest accumulated totals and keep
# the remaining GEOIDFQ values as a Series.
worst_rows = nan_df.nlargest(3, "NaN count sum").index
county_df = nan_df.drop(index=worst_rows)["GEOIDFQ"]

print(county_df.shape)

(3096,)


In [4]:
# Stack the yearly frames row-wise under a fresh 0..n-1 index, then remove the
# "NaN count" column from the stacked table.
df_combined = (
    pd.concat(data_list, axis=0, ignore_index=True)
    .drop(columns="NaN count")
)

df_combined.shape

(43518, 76)

In [5]:
# Keep only the rows whose GEOIDFQ appears among the identifiers in county_df.
is_kept_county = df_combined["GEOIDFQ"].isin(county_df)
df_filtered = df_combined[is_kept_county]

df_filtered.shape

(43344, 76)

In [6]:
# The column to predict, and the feature set: every remaining column except
# the identifier, the year tag and the target itself.
target = "Median household income ($)"
non_feature_columns = ["GEOIDFQ", "year", target]
features = df_filtered.columns.difference(non_feature_columns)

len(features)

73

In [7]:
# Print the Index of feature column names for a visual sanity check.
print(features)

Index(['18 to 64 years poverty [%]', '18 years and over [%]',
       '65 years and over [%]', '65 years and over poverty [%]',
       'All parents in family in labor force [%]',
       'American Indian and Alaska Native [%]', 'Armed Forces [%]',
       'Asian [%]', 'Black or African American [%]',
       'Civilian labor force [%]', 'Commuting: individual car [%]',
       'Commuting: public transport [%]', 'Commuting: work from home [%]',
       'Education: 9th-12th (no diploma) [%]', 'Education: <9th grade [%]',
       'Education: associate's degree [%]', 'Education: bachelor's degree [%]',
       'Education: bachelor's degree or higher [%]',
       'Education: graduate or professional degree [%]',
       'Education: high school graduate [%]',
       'Education: high school graduate or higher [%]',
       'Education: some college (no degree) [%]',
       'Female civilian labor force [%]', 'Foreign-born [%]',
       'Foreign-born: naturalized [%]', 'Gross rent <25% of income [%]',
     

In [8]:
# Standardise every feature column and min-max scale the target.  The scaled
# values are written into a copy, so df_filtered itself is left unmodified.
# NOTE(review): both scalers are fit on every row; if a train/test split is
# made downstream, statistics from the held-out rows leak into the scaling —
# confirm this is acceptable.
scaler_x = StandardScaler()
scaler_y = MinMaxScaler()

scaled_features = scaler_x.fit_transform(df_filtered[features])
# Double brackets keep the target two-dimensional for the scaler.
scaled_target = scaler_y.fit_transform(df_filtered[[target]])

df_norm = df_filtered.copy()
df_norm[features] = scaled_features
df_norm[target] = scaled_target

df_norm.shape

(43344, 76)

In [9]:
# Persist the fitted target scaler to disk with joblib.
# NOTE(review): scaler_x is not saved anywhere in the visible cells — confirm
# it is not needed later to transform new feature data.
joblib.dump(scaler_y, "../Data/scaler_y.pkl")

['../Data/scaler_y.pkl']

In [10]:
# Number of consecutive rows (years) of features that make up one sample.
window_size = 3

# df_norm is a copy of df_filtered, which was already restricted to the
# identifiers in county_df, so no further filtering is required here — only an
# ordering that puts each county's rows together, sorted by year.
df_geoid = df_norm.sort_values(["GEOIDFQ", "year"])

# Per-county outputs, keyed by GEOIDFQ.
windows = {}
targets = {}

# `geoid` rather than `id`: a loop variable at notebook scope outlives the
# loop, and naming it `id` would shadow the builtin for every later cell.
for geoid, group in df_geoid.groupby("GEOIDFQ"):
    features_x = group[features].to_numpy()
    # One target per window: the target value from the window's last row.
    targets_y = group[target].to_numpy()[window_size - 1:]

    # The window spans all feature columns, so it can only slide along rows.
    # The raw view is (n_windows, 1, window_size, n_features); indexing axis 1
    # removes the singleton, leaving (n_windows, window_size, n_features).
    window = sliding_window_view(
        features_x, (window_size, features_x.shape[1])
    )[:, 0, :]

    windows[geoid] = window
    targets[geoid] = targets_y

# Summary: number of counties, shape of one county's window array, and total
# number of windows.  The sample is taken from whichever county comes first
# instead of a hard-coded identifier (which would raise KeyError if that county
# were ever filtered out), and the total is summed per county rather than
# assuming every county contributes the same number of windows.
sample_shape = next(iter(windows.values())).shape if windows else None
total_windows = sum(county_windows.shape[0] for county_windows in windows.values())
print(len(windows), sample_shape, total_windows)

3096 (12, 3, 73) 37152


In [11]:
# Write each per-county dictionary to its own .npz archive; the dictionaries
# are unpacked as keyword arguments, so every key names one array in the file.
for out_path, arrays_by_county in (
    ("../Data/data_x.npz", windows),
    ("../Data/data_y.npz", targets),
):
    np.savez(out_path, **arrays_by_county)