In [40]:
import pandas as pd
import numpy as np
from numpy.lib.stride_tricks import sliding_window_view
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [41]:
# One CSV per survey year; each frame is tagged with the year it came from.
years = list(range(2010, 2024))
data_list = [
    pd.read_csv(f"../Data/data_{year}.csv").assign(year=year)
    for year in years
]

In [42]:
# Running total of each county's "NaN count" across all yearly frames.
# The inner merge keeps only counties that are present in every year.
nan_df = data_list[0][["GEOIDFQ", "NaN count"]].rename(columns={"NaN count": "NaN count sum"})

for yearly_df in data_list[1:len(years)]:
    nan_df = nan_df.merge(yearly_df[["GEOIDFQ", "NaN count"]], on="GEOIDFQ", how="inner")
    nan_df["NaN count sum"] = nan_df["NaN count sum"] + nan_df["NaN count"]
    nan_df = nan_df.drop(columns="NaN count")

# Discard the three counties with the highest accumulated NaN count.
worst_rows = nan_df.nlargest(3, "NaN count sum").index
county_df = nan_df.drop(index=worst_rows)["GEOIDFQ"]

# County-level split: 20% of counties go to validation, the rest to training.
county_valid_df = county_df.sample(frac=0.2, random_state=42)
in_valid = county_df.isin(county_valid_df)
county_train_df = county_df[~in_valid]

for county_split in (county_df, county_valid_df, county_train_df):
    print(county_split.shape)

(3096,)
(619,)
(2477,)


In [43]:
# Stack all yearly frames into one table and drop the bookkeeping column.
df_combined = (
    pd.concat(data_list, axis=0, ignore_index=True)
    .drop(columns="NaN count")
)

df_combined.shape

(43518, 76)

In [44]:
# Column to predict. It is excluded from the feature set, scaled separately,
# and embedded in the names of the saved .npy files.
# Alternative target column noted by the author: "Owner-occupied [%]"
target = "Median household income ($)"

In [45]:
# Number of consecutive per-county rows (sorted by year) in each sliding feature window.
window_size = 3

In [46]:
# Every column except the county id, the year tag and the target is a feature.
non_feature_columns = ["GEOIDFQ", "year", target]
features = df_combined.columns.difference(non_feature_columns)

len(features)

73

In [47]:
# Keep only rows whose county survived the NaN-based filtering.
in_kept_counties = df_combined["GEOIDFQ"].isin(county_df)
df_filtered = df_combined.loc[in_kept_counties]

df_filtered.shape

(43344, 76)

In [48]:
# Standardise the features and min-max scale the target.
#
# Both scalers are fitted on the TRAINING counties only and then applied to
# every row. Fitting on the full table would let the mean/std and min/max of
# the held-out validation counties leak into the preprocessing, which makes
# the validation scores optimistic. Validation targets may therefore fall
# slightly outside [0, 1]; that is expected.
scaler_x = StandardScaler()
scaler_y = MinMaxScaler()

train_rows = df_filtered["GEOIDFQ"].isin(county_train_df)
scaler_x.fit(df_filtered.loc[train_rows, features])
scaler_y.fit(df_filtered.loc[train_rows, [target]])

# Work on a copy so df_filtered keeps the unscaled values.
df_norm = df_filtered.copy()

df_norm[features] = scaler_x.transform(df_norm[features])
# transform() returns an (n, 1) array; flatten it for the single-column assignment.
df_norm[target] = scaler_y.transform(df_norm[[target]]).ravel()

df_norm.shape

(43344, 76)

In [49]:
# Build sliding-window samples for the validation counties.
# Rows are sorted by county and year so each window covers consecutive rows
# of a single county in chronological order.
df_vaild = df_norm[df_norm["GEOIDFQ"].isin(county_valid_df)].sort_values(["GEOIDFQ", "year"])

windows_valid = []
targets_valid = []

# The group key is not used. It is bound to `_geoid` instead of `id`, because a
# top-level loop variable named `id` would replace the built-in id() for the
# rest of the kernel session.
for _geoid, group in df_vaild.groupby("GEOIDFQ"):
    features_x_df = group[features].to_numpy()
    # One target per window: the value in the window's last row.
    features_y_df = group[target].to_numpy()[window_size-1:]

    # The window spans all feature columns, so axis 1 of the 4-D result has
    # length 1; indexing it away leaves (n_windows, window_size, n_features).
    window = sliding_window_view(features_x_df, (window_size, features_x_df.shape[1]))[:, 0, :]
    windows_valid.append(window)
    targets_valid.append(features_y_df)

In [50]:
# Build sliding-window samples for the training counties.
# Rows are sorted by county and year so each window covers consecutive rows
# of a single county in chronological order.
df_train = df_norm[df_norm["GEOIDFQ"].isin(county_train_df)].sort_values(["GEOIDFQ", "year"])

windows_train = []
targets_train = []

# The group key is not used. It is bound to `_geoid` instead of `id`, because a
# top-level loop variable named `id` would replace the built-in id() for the
# rest of the kernel session.
for _geoid, group in df_train.groupby("GEOIDFQ"):
    features_x_df = group[features].to_numpy()
    # One target per window: the value in the window's last row.
    features_y_df = group[target].to_numpy()[window_size-1:]

    # The window spans all feature columns, so axis 1 of the 4-D result has
    # length 1; indexing it away leaves (n_windows, window_size, n_features).
    window = sliding_window_view(features_x_df, (window_size, features_x_df.shape[1]))[:, 0, :]
    windows_train.append(window)
    targets_train.append(features_y_df)

In [51]:
# Join the per-county arrays along the sample axis for each split.
data_valid_x = np.concatenate(windows_valid, axis=0)
data_valid_y = np.concatenate(targets_valid)
data_train_x = np.concatenate(windows_train, axis=0)
data_train_y = np.concatenate(targets_train)

# Full dataset: validation samples first, then training samples.
data_x = np.concatenate((data_valid_x, data_train_x), axis=0)
data_y = np.concatenate((data_valid_y, data_train_y), axis=0)

for split_x, split_y in (
    (data_valid_x, data_valid_y),
    (data_train_x, data_train_y),
    (data_x, data_y),
):
    print(split_x.shape, split_y.shape)

(7428, 3, 73) (7428,)
(29724, 3, 73) (29724,)
(37152, 3, 73) (37152,)


In [52]:
# Persist every split as ../Data/<name>_<target>.npy, in the same order as before.
arrays_to_save = {
    "data_valid_x": data_valid_x,
    "data_valid_y": data_valid_y,
    "data_train_x": data_train_x,
    "data_train_y": data_train_y,
    "data_x": data_x,
    "data_y": data_y,
}

for array_name, array in arrays_to_save.items():
    np.save(f"../Data/{array_name}_{target}.npy", array)