In [100]:
import pandas as pd
import numpy as np
from numpy.lib.stride_tricks import sliding_window_view
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [101]:
# One CSV per year from 2010 through 2023. Each frame gets a "year" column
# holding the year it was read for, so rows stay identifiable once the frames
# are combined.
years = list(range(2010, 2024))
data_list = [
    pd.read_csv(f"../Data/data_{year}.csv").assign(year=year)
    for year in years
]

In [102]:
# Accumulate the "NaN count" of every GEOIDFQ across all yearly frames.
# The inner merge keeps only the GEOIDFQ values present in every frame.
nan_df = data_list[0][["GEOIDFQ", "NaN count"]].rename(
    columns={"NaN count": "NaN count sum"}
)

for yearly in data_list[1:]:
    merged = nan_df.merge(yearly[["GEOIDFQ", "NaN count"]], on="GEOIDFQ", how="inner")
    merged["NaN count sum"] = merged["NaN count sum"] + merged["NaN count"]
    nan_df = merged.drop(columns="NaN count")

# Discard the three rows with the largest accumulated NaN count and keep only
# the GEOIDFQ identifiers of the remaining rows.
worst = nan_df.nlargest(3, "NaN count sum")
county_df = nan_df.drop(index=worst.index)["GEOIDFQ"]

print(county_df.shape)

(3096,)


In [103]:
# Stack the yearly frames row-wise under a fresh index and remove the
# "NaN count" column from the stacked result.
df_combined = pd.concat(data_list, ignore_index=True).drop(columns="NaN count")

df_combined.shape

(43518, 76)

In [104]:
# Keep only the rows whose GEOIDFQ is among the identifiers held in county_df.
keep_mask = df_combined["GEOIDFQ"].isin(county_df)
df_filtered = df_combined.loc[keep_mask]

df_filtered.shape

(43344, 76)

In [None]:
# Column used as the target. Alternative value left by the author: "Owner-occupied [%]"
target = "Median household income ($)"

# Every column other than the identifier, the year and the target is a feature.
non_feature_cols = ["GEOIDFQ", "year", target]
features = df_filtered.columns.difference(non_feature_cols)

len(features)

73

In [106]:
# Feature columns go through a StandardScaler and the target column through a
# MinMaxScaler. Both scalers are fitted on all rows of the filtered frame.
scaler_x = StandardScaler()
scaler_y = MinMaxScaler()

scaled_features = scaler_x.fit_transform(df_filtered[features])
scaled_target = scaler_y.fit_transform(df_filtered[[target]])

# Write the scaled values into a copy so df_filtered keeps the unscaled data.
df_norm = df_filtered.copy()
df_norm[features] = scaled_features
df_norm[target] = scaled_target

df_norm.shape

(43344, 76)

In [107]:
# Number of consecutive rows (ordered by year) that make up one input window.
window_size = 3

# df_norm is built from the frame already filtered by county_df; the filter is
# repeated here as a cheap guard before ordering rows by GEOIDFQ and year.
df_geoid = df_norm[df_norm["GEOIDFQ"].isin(county_df)].sort_values(["GEOIDFQ", "year"])

# Both dicts are keyed by GEOIDFQ.
windows = {}
targets = {}

# The loop variable is `geoid`, not `id`: it lives in the notebook's global
# namespace, so naming it `id` would rebind the builtin id() for later cells.
for geoid, group in df_geoid.groupby("GEOIDFQ"):
    feature_rows = group[features].to_numpy()
    if feature_rows.shape[0] < window_size:
        # numpy would raise a ValueError here as well; this one says which
        # group is too short.
        raise ValueError(
            f"{geoid!r} has {feature_rows.shape[0]} rows, "
            f"fewer than window_size={window_size}"
        )

    # The window covers `window_size` rows and every feature column, so it can
    # only slide along the row axis. The length-1 axis left for the column
    # direction is indexed away, giving an array of shape
    # (n_rows - window_size + 1, window_size, n_features).
    windows[geoid] = sliding_window_view(
        feature_rows, (window_size, feature_rows.shape[1])
    )[:, 0, :]

    # One target per window: the target value on the window's last row.
    targets[geoid] = group[target].to_numpy()[window_size - 1:]

# Summary: number of groups, shape of the first group's windows, and the
# group count multiplied by that first group's window count.
first_shape = next(iter(windows.values())).shape
print(len(windows), first_shape, len(windows) * first_shape[0])

3096 (12, 3, 73) 37152


In [108]:
# Save the feature windows and the targets to separate .npz archives, one array
# per dict key. The target column name is part of each file name.
features_path = f"../Data/data_x_{target}.npz"
targets_path = f"../Data/data_y_{target}.npz"

np.savez(features_path, **windows)
np.savez(targets_path, **targets)