# Generate parameters for random projection models

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle

import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# Load the training table from its parquet snapshot.
train_df = pd.read_parquet(path="data/train_20221130.parquet.gzip")

In [3]:
# Columns removed from `train_df` before any statistics are computed.
# The two lists are later merged into a single set, so entries that appear
# in both (e.g. "new_ind", "DATUM") are harmless.
non_pred_cols = [
    "new_ind", "index",
    "ADM1DHS", "ADM1FIPS", "ADM1FIPSNA", "ADM1NAME",
    "ADM1SALBCO", "ADM1SALBNA", "ADM1SALNA",
    "ALT_DEM", "ALT_GPS",
    "CCFIPS", "DATUM",
    "DHSCC", "DHSCLUST", "DHSID", "DHSREGCO", "DHSREGNA", "DHSYEAR",
    "F21", "F22", "F23",
    "LATNUM", "LONGNUM",
    "SOURCE", "URBAN_RURA",
    "ZONECO", "ZONENA",
]

# Further columns to remove; the `_x` / `_y` suffixes look like leftovers
# from an upstream merge — TODO confirm.
other_keys = [
    "Mean_BMI", "Under5_Mortality_Rate", "Stunted_Rate",
    "new_ind", "key1", "key2", "key3",
    "DATUM", "DHSCC", "DHSID_x", "DHSREGNA", "SOURCE", "URBAN_RURA_x",
    "CCFIPS", "DHSID_y", "URBAN_RURA_y",
    "ADM1NAME", "Unnamed: 0",
    "DHSYEAR_y", "DHSCLUST_y", "LATNUM_y", "LONGNUM_y",
    "Median_BMI", "Unmet_Need_Rate", "Skilled_Birth_Attendant_Rate",
]

In [4]:
# Fraction of missing values per column, ordered from most to least complete.
na = train_df.isna().mean().sort_values()
# Columns with at least 53% missing values are marked for removal.
drop_cols = na.loc[na >= 0.53].index

In [5]:
# Full set of columns to remove: both hand-written lists plus the sparse columns.
drop = {*non_pred_cols, *other_keys, *drop_cols}
# errors="ignore" because not every listed name has to exist in the frame.
train_df.drop(columns=drop, errors="ignore", inplace=True)

In [6]:
# Per-column sample standard deviation (pandas defaults spelled out).
stds = train_df.std(axis=0, ddof=1)

In [7]:
# Names of the constant columns (standard deviation exactly zero).
zero_std = stds.loc[stds.eq(0)].index

In [8]:
# Add the constant columns to the drop set, then remove them from both the
# data frame and the already-computed standard deviations.
drop = drop | set(zero_std)
train_df.drop(columns=drop, errors="ignore", inplace=True)
stds.drop(index=drop, errors="ignore", inplace=True)

In [16]:
# Sanity check: no zero-std fields should remain (expect an empty Series).
stds.loc[stds.eq(0.0)]

Series([], dtype: float64)

In [17]:
# Spot-check one standard deviation by position. `stds` is indexed by column
# name, so a bare integer key only works through pandas' positional fallback,
# which is deprecated; `.iloc` makes the positional lookup explicit.
stds.iloc[11305]

0.013536716843997987

In [24]:
# Persist the set of dropped column names so the same columns can be removed
# elsewhere (presumably when preparing other data splits — confirm against
# the consumers of this file). The context manager ensures the file handle is
# flushed and closed, which the bare `open(...)` call did not guarantee.
with open("data/drop_cols.pickle", "wb") as drop_file:
    pickle.dump(drop, drop_file)

In [11]:
# Per-column mean of the remaining training features.
means = train_df.mean(axis=0)

In [26]:
# Sanity check: list any columns whose mean is NaN (expect an empty Series).
means.loc[means.isna()]

Series([], dtype: float64)

In [29]:
# Combine the per-column means and standard deviations into one two-column
# table; `concat` with axis=1 aligns the two Series on their column-name index.
df_mean_std = pd.concat([means, stds], axis=1)
df_mean_std.columns = ["means", "stds"]

# Display the combined table. The previous preview expression referenced the
# undefined name `std` and raised NameError on a fresh kernel.
df_mean_std

In [34]:
# Make sure there are no NaNs among the means: positions of NaN entries
# (expect an empty array).
np.flatnonzero(df_mean_std["means"].isna())

array([], dtype=int64)

In [35]:
# Persist the normalisation statistics.
# NOTE(review): despite the `.gzip` suffix no `compression=` is passed, so
# pandas' default codec is used — confirm this is intended.
df_mean_std.to_parquet(path="data/train_means_std.parquet.gzip")

In [9]:
# Reload the saved statistics so the cells below can run without recomputing them.
df_mean_std = pd.read_parquet(path="data/train_means_std.parquet.gzip")

In [11]:
# (rows, columns) of the statistics table; the row count is reused below as
# the input dimension of the projection matrix.
df_mean_std.shape

(11333, 2)

In [17]:
# Build the random projection matrix: one row per row of `df_mean_std`,
# one column per embedding dimension.
embed_dim = 512
# Seed torch's global RNG so the saved projection can be regenerated exactly;
# without a seed every run writes a different matrix to disk.
torch.manual_seed(0)
proj = torch.empty((df_mean_std.shape[0], embed_dim))
# Xavier/Glorot normal initialisation, applied in place and returned.
proj = torch.nn.init.xavier_normal_(proj)

In [18]:
# Confirm the projection is (rows of df_mean_std, embed_dim).
proj.shape

torch.Size([11333, 512])

In [19]:
# Write the projection matrix to disk; the embedding size is part of the
# file name. `torch.save` accepts a path and handles opening/closing itself.
torch.save(proj, f"data/rand_proj_{embed_dim}.pt")