In [1]:
import os
import joblib

import pandas as pd
import numpy as np

##### Helper functions

In [2]:
def read_file(filename, columns=None, **kwargs):
    """Load a tabular file into a DataFrame, dispatching on its extension.

    Args:
        filename: Path to a ``.parquet`` or ``.csv`` file. The extension is
            matched case-insensitively.
        columns: Optional subset of columns to load. Passed as ``columns`` to
            ``pd.read_parquet`` and as ``usecols`` to ``pd.read_csv``.
        **kwargs: Forwarded unchanged to the underlying pandas reader.

    Returns:
        pandas.DataFrame with the file contents.

    Raises:
        ValueError: If the extension is neither ``.parquet`` nor ``.csv``.
    """
    print(filename)  # echo the path so the cell output records what was loaded
    load_extension = os.path.splitext(filename)[-1].lower()
    if load_extension == ".parquet":
        return pd.read_parquet(filename, columns=columns, **kwargs)
    if load_extension == ".csv":
        return pd.read_csv(filename, usecols=columns, **kwargs)
    # Previously this fell through and returned None, so a bad path only
    # surfaced later as an unrelated error at the first use of the result.
    raise ValueError(
        f"Unsupported file extension {load_extension!r} for {filename!r}; "
        "expected '.parquet' or '.csv'"
    )

## Features up until 4 hours into admission

##### Paths

In [3]:
# Directory containing the `vocab/` and `features_sparse/` subfolders that the
# load cell reads (the `admissions_admission` feature set).
# NOTE(review): absolute, machine-specific path — consider deriving it from a
# configurable base directory so the notebook runs elsewhere.
features_path = '/local-scratch/nigam/projects/lguo/temp_ds_shift_robustness/features/admissions_admission/merged_features_binary'

##### Load Features

In [4]:
# Vocabulary table; later cells read its 'feature_id' column.
vocab = read_file(filename=f"{features_path}/vocab/vocab.parquet", engine="pyarrow")

# Row-id map stored next to the feature file under features_sparse/.
row_id_map = read_file(
    filename=f'{features_path}/features_sparse/features_row_id_map.parquet',
    engine='pyarrow',
)

# Presumably the sparse feature matrix itself (judging by the folder name).
# NOTE(review): joblib.load unpickles its input, so only point it at trusted files.
features = joblib.load(f'{features_path}/features_sparse/features.gz')

/local-scratch/nigam/projects/lguo/temp_ds_shift_robustness/features/admissions_admission/merged_features_binary/vocab/vocab.parquet
/local-scratch/nigam/projects/lguo/temp_ds_shift_robustness/features/admissions_admission/merged_features_binary/features_sparse/features_row_id_map.parquet


##### Columns

In [5]:
# A feature's category is built from the last (up to three) underscore-separated
# tokens of its id, keeping only the purely alphabetic ones.
# sorted() instead of list(set(...)): iteration order of a set of strings changes
# between interpreter runs (hash randomization), which made the row order of the
# summary table built from these lists non-reproducible.
feature_cats_all = sorted({
    '_'.join(token for token in feature_id.split('_')[-3:] if token.isalpha())
    for feature_id in vocab['feature_id']
})
# Categories reported once, under a single 'Static' column.
feature_cats_static = ['age_group','gender','race','ethnicity']
# Remaining categories are split on whether their name contains 'dt'.
feature_cats_date = [cat for cat in feature_cats_all if cat not in feature_cats_static and 'dt' not in cat]
feature_cats_dt = [cat for cat in feature_cats_all if cat not in feature_cats_static and 'dt' in cat]
# Substrings searched for inside feature ids: `time_bins` is paired with the
# non-'dt' categories and `hourly_time_bins` with the 'dt' categories.
time_bins = ['-36500_-31','-30_-8','-7_-1']
hourly_time_bins = ['-24_0',]

In [6]:
# Summary table: number of distinct feature ids per (feature category, time bin).
# Matching is by substring: an id counts towards a cell when it contains the
# category name (and, for non-static categories, the time-bin string).

# Only distinct ids are counted, so de-duplicate once instead of rescanning the
# full vocab column for every cell.
feature_ids = set(vocab['feature_id'])

# Collect (category, bin, count) records and build the frame in one go rather
# than growing it row by row with df.loc.
records = []
for feature_cat in feature_cats_all:
    if feature_cat in feature_cats_static:
        records.append(
            (feature_cat, 'Static', sum(feature_cat in fid for fid in feature_ids))
        )
        continue
    if feature_cat in feature_cats_date:
        bins_for_cat = time_bins
    elif feature_cat in feature_cats_dt:
        bins_for_cat = hourly_time_bins
    else:
        bins_for_cat = []
    for time_bin in bins_for_cat:
        records.append((
            feature_cat,
            time_bin,
            sum(feature_cat in fid and time_bin in fid for fid in feature_ids),
        ))

row_order = feature_cats_static + feature_cats_dt + feature_cats_date
col_order = time_bins + hourly_time_bins + ['Static']
df = (
    # dtype=object keeps counts as Python ints, so they still render as "16"
    # (not "16.0") once the pivot introduces missing cells.
    pd.DataFrame(records, columns=['Feature Category', 'Time Bin', 'Count'], dtype=object)
    .pivot(index='Feature Category', columns='Time Bin', values='Count')
    # Reindex both axes before filling: a category or time bin with no records
    # then shows '-' instead of raising KeyError or being left as NaN.
    .reindex(index=row_order, columns=col_order)
    .fillna('-')
)
df

Time Bin,-36500_-31,-30_-8,-7_-1,-24_0,Static
Feature Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
age_group,-,-,-,-,16
gender,-,-,-,-,3
race,-,-,-,-,11
ethnicity,-,-,-,-,3
drug_exposure_dt,-,-,-,2699,-
observation_dt,-,-,-,30,-
measurement_range_dt,-,-,-,968,-
device_exposure_dt,-,-,-,12,-
measurement_dt,-,-,-,1966,-
procedure_occurrence_delayed,17607,8614,6742,-,-


## Features up until discharge

In [7]:
# Directory containing the `vocab/` and `features_sparse/` subfolders that the
# load cell reads (the `admissions_discharge` feature set). Rebinding this name
# replaces the path used by the earlier admission section.
# NOTE(review): absolute, machine-specific path — consider deriving it from a
# configurable base directory so the notebook runs elsewhere.
features_path = '/local-scratch/nigam/projects/lguo/temp_ds_shift_robustness/features/admissions_discharge/merged_features_binary'

In [8]:
# Vocabulary table; later cells read its 'feature_id' column.
vocab = read_file(filename=f"{features_path}/vocab/vocab.parquet", engine="pyarrow")

# Row-id map stored next to the feature file under features_sparse/.
row_id_map = read_file(
    filename=f'{features_path}/features_sparse/features_row_id_map.parquet',
    engine='pyarrow',
)

# Presumably the sparse feature matrix itself (judging by the folder name).
# NOTE(review): joblib.load unpickles its input, so only point it at trusted files.
features = joblib.load(f'{features_path}/features_sparse/features.gz')

/local-scratch/nigam/projects/lguo/temp_ds_shift_robustness/features/admissions_discharge/merged_features_binary/vocab/vocab.parquet
/local-scratch/nigam/projects/lguo/temp_ds_shift_robustness/features/admissions_discharge/merged_features_binary/features_sparse/features_row_id_map.parquet


In [9]:
# A feature's category is built from the last (up to three) underscore-separated
# tokens of its id, keeping only the purely alphabetic ones.
# sorted() instead of list(set(...)): iteration order of a set of strings changes
# between interpreter runs (hash randomization), which made the row order of the
# summary table built from these lists non-reproducible.
feature_cats_all = sorted({
    '_'.join(token for token in feature_id.split('_')[-3:] if token.isalpha())
    for feature_id in vocab['feature_id']
})
# Categories reported once, under a single 'Static' column.
feature_cats_static = ['age_group','gender','race','ethnicity']
# Remaining categories are split on whether their name contains 'dt'.
feature_cats_date = [cat for cat in feature_cats_all if cat not in feature_cats_static and 'dt' not in cat]
feature_cats_dt = [cat for cat in feature_cats_all if cat not in feature_cats_static and 'dt' in cat]
# Substrings searched for inside feature ids: `time_bins` is paired with the
# non-'dt' categories and `hourly_time_bins` with the 'dt' categories.
time_bins = ['-36500_-31','-30_-8','-7_-1']
hourly_time_bins = ['-24_0',]

In [10]:
# Summary table: number of distinct feature ids per (feature category, time bin).
# Matching is by substring: an id counts towards a cell when it contains the
# category name (and, for non-static categories, the time-bin string).

# Only distinct ids are counted, so de-duplicate once instead of rescanning the
# full vocab column for every cell.
feature_ids = set(vocab['feature_id'])

# Collect (category, bin, count) records and build the frame in one go rather
# than growing it row by row with df.loc.
records = []
for feature_cat in feature_cats_all:
    if feature_cat in feature_cats_static:
        records.append(
            (feature_cat, 'Static', sum(feature_cat in fid for fid in feature_ids))
        )
        continue
    if feature_cat in feature_cats_date:
        bins_for_cat = time_bins
    elif feature_cat in feature_cats_dt:
        bins_for_cat = hourly_time_bins
    else:
        bins_for_cat = []
    for time_bin in bins_for_cat:
        records.append((
            feature_cat,
            time_bin,
            sum(feature_cat in fid and time_bin in fid for fid in feature_ids),
        ))

row_order = feature_cats_static + feature_cats_dt + feature_cats_date
col_order = time_bins + hourly_time_bins + ['Static']
df = (
    # dtype=object keeps counts as Python ints, so they still render as "16"
    # (not "16.0") once the pivot introduces missing cells.
    pd.DataFrame(records, columns=['Feature Category', 'Time Bin', 'Count'], dtype=object)
    .pivot(index='Feature Category', columns='Time Bin', values='Count')
    # Reindex both axes before filling: a category or time bin with no records
    # then shows '-' instead of raising KeyError or being left as NaN.
    .reindex(index=row_order, columns=col_order)
    .fillna('-')
)
df

Time Bin,-36500_-31,-30_-8,-7_-1,-24_0,Static
Feature Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
age_group,-,-,-,-,16
gender,-,-,-,-,3
race,-,-,-,-,11
ethnicity,-,-,-,-,3
drug_exposure_dt,-,-,-,5627,-
observation_dt,-,-,-,26,-
measurement_range_dt,-,-,-,1056,-
device_exposure_dt,-,-,-,43,-
measurement_dt,-,-,-,2250,-
procedure_occurrence_delayed,17785,9367,6590,-,-
