In [1]:
import os
import numpy as np
import pandas as pd
import joblib
import sklearn

from scipy.sparse import csr_matrix

In [2]:
# Outcome label: early sepsis.
label_col = "early_sepsis"

# Extraction root and the name of the merged feature set stored under it.
data_path = "/share/pi/nigam/projects/sepsis/extraction_201003/"
merged_name = "merged_features_binary"

# Generator seeded with 0 so any sampling is repeatable.
random_state = np.random.RandomState(seed=0)

In [3]:
# The feature matrix and its row-id map are stored in the same directory.
features_sparse_dir = os.path.join(data_path, merged_name, 'features_sparse')

# Cohort table.
cohort = pd.read_parquet(os.path.join(data_path, 'cohort', 'cohort_cv.parquet'))

# Feature matrix persisted with joblib (converted with .tocoo() further down).
features = joblib.load(os.path.join(features_sparse_dir, 'features.gz'))

# Row-id map; it is merged into the cohort in the next cell.
row_id_map = pd.read_parquet(
    os.path.join(features_sparse_dir, 'features_row_id_map.parquet')
)

# Vocabulary table of the merged feature set.
vocab = pd.read_parquet(
    os.path.join(data_path, merged_name, 'vocab', 'vocab.parquet')
)

In [4]:
# Join the cohort with the row-id map. No keys are given, so pandas joins on
# every column the two frames share, using its default inner join.
cohort = pd.merge(cohort, row_id_map)

In [5]:
# Predicate that excludes the "test" and "eval" folds.
train_fold_query = 'fold_id != "test" & fold_id != "eval"'

# Split the cohort on the adult_at_admission flag.
cohort_adult = cohort.query('adult_at_admission == 1')
cohort_pediatric = cohort.query('adult_at_admission == 0')

# Training subsets: everyone, adults only, pediatric only.
cohort_train = cohort.query(train_fold_query)
cohort_train_adult = cohort_adult.query(train_fold_query)
cohort_train_pediatric = cohort_pediatric.query(train_fold_query)

In [6]:
# Slice the feature matrix once per cohort subset, picking the rows listed in
# that subset's `features_row_id` column (order of targets matches the tuple).
(
    features_adult,
    features_pediatric,
    features_train_adult,
    features_train_pediatric,
) = (
    features[subset['features_row_id'].values]
    for subset in (
        cohort_adult,
        cohort_pediatric,
        cohort_train_adult,
        cohort_train_pediatric,
    )
)

In [7]:
# Every feature matrix must have exactly one row per record of its cohort frame.
assert all(
    matrix.shape[0] == frame.shape[0]
    for matrix, frame in (
        (features, cohort),
        (features_adult, cohort_adult),
        (features_pediatric, cohort_pediatric),
        (features_train_adult, cohort_train_adult),
        (features_train_pediatric, cohort_train_pediatric),
    )
)

In [9]:
# Column ids that have at least one stored entry in the pediatric training
# features, kept in order of first appearance and numbered 0..n-1 as
# `col_id_new`.
coo_train_pediatric = features_train_pediatric.tocoo()
vocab_pediatric_train = (
    pd.Series(coo_train_pediatric.col, name='col_id')
    .drop_duplicates()
    .reset_index(drop=True)
    .rename_axis('col_id_new')
    .reset_index()
)

In [10]:
# Inspect the pediatric training vocabulary (columns: col_id_new, col_id).
vocab_pediatric_train

Unnamed: 0,col_id_new,col_id
0,0,1
1,1,49
2,2,955
3,3,1509
4,4,1520
...,...,...
60795,60795,31207
60796,60796,41512
60797,60797,81209
60798,60798,60645


In [10]:
# Map vocabularies across datasets: re-express the full feature matrix in the
# column space of the pediatric training vocabulary.

# Long-format (row, column, value) view of the stored entries of `features`.
data_coo = features.tocoo()
data_coo_df = pd.DataFrame({
    'row_id': data_coo.row,
    'col_id': data_coo.col,
    'data': data_coo.data
})

# Inner join on the original column id: entries whose column is absent from
# the pediatric training vocabulary are dropped, and every surviving entry
# gains its compact `col_id_new`. The key is spelled out so an additional
# shared column can never silently change the join, and `validate` fails
# loudly if the vocabulary ever holds a duplicate `col_id` (which would
# duplicate entries and inflate values once the matrix is rebuilt).
data_coo_df_mapped = data_coo_df.merge(
    vocab_pediatric_train,
    on='col_id',
    how='inner',
    validate='many_to_one'
)

# Rebuild a CSR matrix with the same number of rows as `features` and one
# column per vocabulary entry; stored values are cast to int64.
features_mapped = csr_matrix(
    (
        data_coo_df_mapped['data'].values.astype(np.int64),
        (
            data_coo_df_mapped['row_id'].values,
            data_coo_df_mapped['col_id_new'].values
        )
    ),
    shape=(features.shape[0], vocab_pediatric_train.shape[0])
)

In [11]:
# Output root for the remapped feature set, with one sub-directory per
# artifact type; directories that already exist are left as they are.
result_path = os.path.join(data_path, 'features_mapped_pediatric')
for subdir in ('features_sparse', 'vocab'):
    os.makedirs(os.path.join(result_path, subdir), exist_ok=True)

In [12]:
# Persist the remapped feature matrix; the value returned by the call is the
# cell's displayed output.
joblib.dump(
    value=features_mapped,
    filename=os.path.join(result_path, 'features_sparse', 'features.gz')
)

['/share/pi/nigam/projects/sepsis/extraction_201003/features_mapped_pediatric/features_sparse/features.gz']

In [13]:
# Write the pediatric training vocabulary to the output tree, leaving the
# DataFrame index out of the file.
vocab_pediatric_train.to_parquet(
    os.path.join(result_path, 'vocab', 'vocab.parquet'),
    index=False
)

In [14]:
# Store the unchanged row-id map beside the remapped features (the remapped
# matrix is built with the same number of rows as `features`), leaving the
# DataFrame index out of the file.
row_id_map.to_parquet(
    os.path.join(
        result_path, 'features_sparse', 'features_row_id_map.parquet'
    ),
    index=False
)