# Generates parameter for random projection models

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle

import torch
import torch.nn as nn
import torch.nn.functional as F

In [4]:
train_df = pd.read_parquet("train_20221130.parquet.gzip")

In [5]:
non_pred_cols = [
    "new_ind",
    "index",
    "ADM1DHS",
    "ADM1FIPS",
    "ADM1FIPSNA",
    "ADM1NAME",
    "ADM1SALBCO",
    "ADM1SALBNA",
    "ADM1SALNA",
    "ALT_DEM",
    "ALT_GPS",
    "CCFIPS",
    "DATUM",
    "DHSCC",
    "DHSCLUST",
    "DHSID",
    "DHSREGCO",
    "DHSREGNA",
    "DHSYEAR",
    "F21",
    "F22",
    "F23",
    "LATNUM",
    "LONGNUM",
    "SOURCE",
    "URBAN_RURA",
    "ZONECO",
    "ZONENA",
]
other_keys = [
    "Mean_BMI",
    "Under5_Mortality_Rate",
    "Stunted_Rate",
    "new_ind",
    "key1",
    "key2",
    "key3",
    "DATUM",
    "DHSCC",
    "DHSID_x",
    "DHSREGNA",
    "SOURCE",
    "URBAN_RURA_x",
    "CCFIPS",
    "DHSID_y",
    "URBAN_RURA_y",
    "ADM1NAME",
]

In [6]:
na = train_df.isna().mean()
na = na.sort_values()
drop_cols = na[na >= 0.53].keys()

In [9]:
drop = set(non_pred_cols+other_keys).union(drop_cols)
train_df.drop(drop, errors='ignore', inplace=True, axis = 1)

In [10]:
pickle.dump(drop, open("drop_cols.pickle", "wb"))

In [12]:
means = train_df.mean()
stds = train_df.std()

In [13]:
df_mean_std = pd.DataFrame()
df_mean_std["means"] = means
df_mean_std["stds"] = stds

In [14]:
df_mean_std

Unnamed: 0,means,stds
DHSCLUST_x,678395.131962,4.814126e+06
DHSYEAR_x,2009.431875,6.041852e+00
LATNUM_x,11.165772,1.655336e+01
LONGNUM_x,28.211007,5.578222e+01
ET_water_median@CAS/IGSNRR/PML/V2&timestamped,-0.538179,1.283995e+00
...,...,...
LATNUM_y,11.702778,1.641035e+01
LONGNUM_y,30.980501,5.403856e+01
Median_BMI,22.884374,3.563578e+00
Unmet_Need_Rate,34.470986,2.969996e+01


In [15]:
df_mean_std.to_parquet("train_means_std.parquet.gzip")

In [16]:
train_df.shape

(96808, 11342)

In [21]:
stds

DHSCLUST_x                                       4.814126e+06
DHSYEAR_x                                        6.041852e+00
LATNUM_x                                         1.655336e+01
LONGNUM_x                                        5.578222e+01
ET_water_median@CAS/IGSNRR/PML/V2&timestamped    1.283995e+00
                                                     ...     
LATNUM_y                                         1.641035e+01
LONGNUM_y                                        5.403856e+01
Median_BMI                                       3.563578e+00
Unmet_Need_Rate                                  2.969996e+01
Skilled_Birth_Attendant_Rate                     3.530865e+01
Length: 11342, dtype: float64

In [23]:
train_df_x_norm = (train_df_x - means)/stds

In [24]:
train_df_x_norm.to_parquet("train_normalized.parquet.gzip")

In [27]:
train_df_x_norm = pd.read_parquet("train_normalized.parquet.gzip")

In [28]:
train_df_x_norm.shape

(96808, 11342)

In [29]:
embed_dim = 2048
proj = torch.empty((train_df_x_norm.shape[1], embed_dim))
proj = torch.nn.init.xavier_normal_(proj)

In [30]:
with open("rand_proj.pt", "wb") as f:
    torch.save(proj, f)

In [29]:
proj

tensor([[ 0.0080, -0.0107, -0.0058,  ...,  0.0154,  0.0197, -0.0041],
        [ 0.0035, -0.0022, -0.0227,  ..., -0.0121,  0.0030,  0.0179],
        [-0.0015,  0.0119, -0.0019,  ..., -0.0188, -0.0067,  0.0148],
        ...,
        [-0.0043, -0.0092,  0.0108,  ..., -0.0126,  0.0015, -0.0215],
        [-0.0061,  0.0090, -0.0217,  ...,  0.0130, -0.0235, -0.0096],
        [ 0.0070,  0.0140, -0.0001,  ...,  0.0147, -0.0149, -0.0013]])

In [None]:
train_proj = train_df_x_norm @ proj

Data Normalizations

In [None]:
train_df = pd.read_parquet("train_20221130.parquet.gzip")
with open("drop_cols.pickle", "rb") as f:
    other_keys = pickle.load(f)