In [32]:
import pandas as pd
import numpy as np
import os
import joblib
import pickle
import scipy.sparse as sp

In [33]:
# Every input directory used by this notebook sits under one experiment data
# root; the individual paths are that root plus a fixed sub-directory.
data_root = '/local-scratch/nigam/projects/jlemmon/transfer_learning/experiments/data'
merged_features_root = data_root + '/merged_features'

feat_path = merged_features_root + '/features_sparse'
vocab_path = merged_features_root + '/vocab'
analysis_path = data_root + '/features_by_analysis'
cohort_path = data_root + '/cohort'

In [34]:
def read_file(filename, columns=None, **kwargs):
    """Load a tabular file into a DataFrame, picking the reader from the extension.

    The path is echoed to stdout before loading.

    Args:
        filename: Path to a ``.parquet`` or ``.csv`` file (extension match is
            case-insensitive).
        columns: Optional subset of columns to load; forwarded as ``columns``
            to ``pd.read_parquet`` or as ``usecols`` to ``pd.read_csv``.
        **kwargs: Extra keyword arguments passed through to the pandas reader.

    Returns:
        The loaded ``pandas.DataFrame``.

    Raises:
        ValueError: If the extension is neither ``.parquet`` nor ``.csv``.
            Previously this case fell through and returned ``None``, which only
            surfaced later as an unrelated error at the first use of the result.
    """
    print(filename)
    load_extension = os.path.splitext(filename)[-1].lower()
    if load_extension == ".parquet":
        return pd.read_parquet(filename, columns=columns, **kwargs)
    if load_extension == ".csv":
        return pd.read_csv(filename, usecols=columns, **kwargs)
    raise ValueError(
        f"Unsupported file extension {load_extension!r} for {filename!r}; "
        "expected '.parquet' or '.csv'"
    )
def slice_sparse_matrix(mat, rows):
    """Return the sub-matrix of ``mat`` made of the requested rows.

    The rows are selected through a boolean mask, so the result always lists
    them in ascending row-index order with duplicates collapsed, whatever the
    order or multiplicity of ``rows``.

    Args:
        mat: Matrix supporting ``.shape`` and ``mat[index_array, :]`` indexing.
        rows: Row indices to keep (anything usable to index a 1-D numpy array).

    Returns:
        ``mat`` restricted to the selected rows, all columns kept.
    """
    n_rows = mat.shape[0]
    keep = np.zeros(n_rows, dtype=bool)
    keep[rows] = True
    # Positions of the True entries, i.e. the selected rows in ascending order.
    selected = np.nonzero(keep)[0]
    return mat[selected, :]

In [35]:
# Table with (at least) the prediction_id and features_row_id columns used below.
feats_id_map_file = os.path.join(feat_path, "features_row_id_map.parquet")
feats_id_map = read_file(feats_id_map_file, engine='pyarrow')

# Feature vocabulary table.
vocab_file = os.path.join(vocab_path, "vocab.parquet")
vocab = read_file(vocab_file, engine='pyarrow')

/local-scratch/nigam/projects/jlemmon/transfer_learning/experiments/data/merged_features/features_sparse/features_row_id_map.parquet
/local-scratch/nigam/projects/jlemmon/transfer_learning/experiments/data/merged_features/vocab/vocab.parquet


In [36]:
# Number of distinct prediction ids present in the feature row map.
unique_prediction_ids = feats_id_map['prediction_id'].unique()
print(len(unique_prediction_ids))

316548


In [37]:
# Full feature matrix; later cells slice it by row and sum it column-wise.
features_file = os.path.join(feat_path, "features.gz")
features = joblib.load(features_file)
# Disabled: loading previously saved per-group matrices instead of re-slicing.
# ped_feats = sp.load_npz('/local-scratch/nigam/projects/jlemmon/transfer_learning/experiments/data/bin_features/pediatric/full.npz')
# ad_feats = sp.load_npz('/local-scratch/nigam/projects/jlemmon/transfer_learning/experiments/data/bin_features/adult/full.npz')

In [51]:
cohort_file = os.path.join(cohort_path, "cohort_split.parquet")
# merge() gets no `on=` here, so pandas joins on the columns both frames share.
cohort = read_file(cohort_file, engine='pyarrow').merge(feats_id_map)

/local-scratch/nigam/projects/jlemmon/transfer_learning/experiments/data/cohort/cohort_split.parquet


In [52]:
# Distinct prediction ids on each side, materialised as plain lists.
cohort_pred_id = [*cohort['prediction_id'].unique()]
feat_pred_id = [*feats_id_map['prediction_id'].unique()]

In [53]:
# How many distinct prediction ids the feature row map contains.
n_feat_pred_ids = len(feat_pred_id)
print(n_feat_pred_ids)

316548


In [54]:
# Keep only the row-map entries whose prediction_id appears in the cohort.
in_cohort = feats_id_map['prediction_id'].isin(cohort_pred_id)
cohort_id_map = feats_id_map.loc[in_cohort]

In [55]:
# Attach person_id to the cohort's row-map entries.
# The map is rebuilt from feats_id_map rather than merged into itself: the
# self-referential form (`cohort_id_map = cohort_id_map.merge(...)`) merges
# person_id a second time whenever the cell is re-run. Starting from the
# source table gives the same result on first run and makes re-runs harmless.
cohort_id_map = (
    feats_id_map[feats_id_map['prediction_id'].isin(cohort_pred_id)]
    .merge(cohort[['person_id', 'prediction_id']], on='prediction_id')
)

In [56]:
# Rows of the full feature matrix that belong to the cohort.
cohort_row_ids = cohort_id_map['features_row_id'].tolist()
cohort_features = slice_sparse_matrix(features, cohort_row_ids)

In [57]:
# Per-feature (column-wise) totals over the cohort rows, with length-1 axes dropped.
cohort_col_totals = cohort_features.sum(axis=0)
sum_full_feats = np.asarray(cohort_col_totals).squeeze()

In [58]:
# Count the features whose column total over the cohort is exactly zero.
cnt = sum(1 for col_total in sum_full_feats if col_total == 0)
print(cnt)

0


In [59]:
# Cohort row count next to the sliced matrix shape, one per line.
print(len(cohort), cohort_features.shape, sep='\n')

316548
(316548, 132194)


In [63]:
# Pediatric subset: cohort rows whose age_group is exactly '<18'.
is_pediatric = cohort['age_group'] == '<18'
ped_df = cohort.loc[is_pediatric]

In [64]:
# Adult subset: every cohort row whose age_group is anything other than '<18'.
# NOTE(review): rows with a missing age_group also satisfy `!=` and would land
# here — confirm that is intended.
is_not_pediatric = cohort['age_group'] != '<18'
adult_df = cohort.loc[is_not_pediatric]

In [65]:
# slice_sparse_matrix selects rows through a boolean mask, so the matrices it
# returns are always in ascending, de-duplicated order of the original row id,
# whatever order the ids are passed in. The new->original row maps must follow
# that same order, otherwise row_idx_new would point at the wrong original row
# whenever the cohort frame is not already sorted by features_row_id. Sorting
# and de-duplicating here keeps the maps aligned with the sliced matrices.
ped_rows = np.unique(ped_df['features_row_id'].to_numpy()).tolist()
ad_rows = np.unique(adult_df['features_row_id'].to_numpy()).tolist()

# row_idx_new: row position in the sliced per-group matrix.
# row_idx_og:  row position in the full feature matrix.
ped_row_map = pd.DataFrame({'row_idx_new': list(range(len(ped_rows))), 'row_idx_og': ped_rows})
ad_row_map = pd.DataFrame({'row_idx_new': list(range(len(ad_rows))), 'row_idx_og': ad_rows})

In [66]:
# Persist the new->original row index maps, one CSV per age group.
for row_map, csv_path in (
    (ped_row_map, '/local-scratch/nigam/projects/jlemmon/transfer_learning/experiments/data/bin_features/pediatric/pat_map.csv'),
    (ad_row_map, '/local-scratch/nigam/projects/jlemmon/transfer_learning/experiments/data/bin_features/adult/pat_map.csv'),
):
    row_map.to_csv(csv_path, index=False)

In [67]:
# Per-group slices of the full feature matrix (despite the `_list` suffix these
# are whatever slice_sparse_matrix returns, not Python lists).
ad_list = slice_sparse_matrix(mat=features, rows=ad_rows)
ped_list = slice_sparse_matrix(mat=features, rows=ped_rows)

In [68]:
# Column-wise totals per age group, with length-1 axes dropped.
ped_col_totals = ped_list.sum(axis=0)
ad_col_totals = ad_list.sum(axis=0)
sum_ped_list = np.asarray(ped_col_totals).squeeze()
sum_ad_list = np.asarray(ad_col_totals).squeeze()

[0 0 0 ... 0 0 1]
[0 0 0 ... 1 1 0]


In [69]:
# Binarise the column totals: 1 when the feature has a non-zero total in the
# group, 0 when its total is exactly zero.
bin_ped_feat_list = [int(total != 0) for total in sum_ped_list]
bin_ad_feat_list = [int(total != 0) for total in sum_ad_list]

In [70]:
# Number of features flagged as used per group (pediatric first, then adult).
print(np.sum(bin_ped_feat_list), np.sum(bin_ad_feat_list), sep='\n')

63006
123645


In [71]:
# Bucket every feature index by which age group(s) flag it as used, keeping
# both a running count per bucket and the list of indices in each bucket.
feat_count_dict = dict.fromkeys(['neither', 'both', 'pediatric', 'adult'], 0)
ped_feat_indices, ad_feat_indices = [], []
shared_feat_indices, neither_feat_indices = [], []

# (pediatric flag, adult flag) -> (counter key, index list that receives the feature)
presence_buckets = {
    (0, 0): ('neither', neither_feat_indices),
    (1, 0): ('pediatric', ped_feat_indices),
    (0, 1): ('adult', ad_feat_indices),
    (1, 1): ('both', shared_feat_indices),
}

for i in range(len(bin_ped_feat_list)):
    bucket = presence_buckets.get((bin_ped_feat_list[i], bin_ad_feat_list[i]))
    if bucket is None:
        # Any flag pair other than the four 0/1 combinations is left uncounted.
        continue
    label, index_list = bucket
    feat_count_dict[label] += 1
    index_list.append(i)

In [72]:
# One "bucket:count" line per bucket, in the dict's insertion order.
for label, count in feat_count_dict.items():
    print(f'{label}:{count}')

neither:0
both:54457
pediatric:8549
adult:69188


In [73]:
# Persist the per-group 0/1 feature-usage flags as pickles. Note the files carry
# a .txt extension but are binary pickle data.
# `with` guarantees each handle is closed even if pickle.dump raises; the
# previous open()/close() pairs leaked the handle on error.
with open('/local-scratch/nigam/projects/jlemmon/transfer_learning/experiments/data/bin_features/adult/feat_used.txt', 'wb') as pkl_file:
    pickle.dump(bin_ad_feat_list, pkl_file)
with open('/local-scratch/nigam/projects/jlemmon/transfer_learning/experiments/data/bin_features/pediatric/feat_used.txt', 'wb') as pkl_file:
    pickle.dump(bin_ped_feat_list, pkl_file)

In [74]:
# Reload the per-group 0/1 feature-usage flags from their pickles.
# `with` guarantees each handle is closed even if pickle.load raises; the
# previous open()/close() pairs leaked the handle on error.
# NOTE(review): pickle.load executes arbitrary code from the file; these paths
# are the ones this notebook writes itself, so only load them from a trusted
# location.
with open('/local-scratch/nigam/projects/jlemmon/transfer_learning/experiments/data/bin_features/adult/feat_used.txt', 'rb') as pkl_file:
    bin_ad_feat_list = pickle.load(pkl_file)
with open('/local-scratch/nigam/projects/jlemmon/transfer_learning/experiments/data/bin_features/pediatric/feat_used.txt', 'rb') as pkl_file:
    bin_ped_feat_list = pickle.load(pkl_file)

In [75]:
# Write each bucket's feature indices to its own single-column CSV.
feature_group_exports = [
    (ad_feat_indices, '/local-scratch/nigam/projects/jlemmon/transfer_learning/experiments/data/bin_features/only_adult_feats.csv'),
    (ped_feat_indices, '/local-scratch/nigam/projects/jlemmon/transfer_learning/experiments/data/bin_features/only_pediatric_feats.csv'),
    (shared_feat_indices, '/local-scratch/nigam/projects/jlemmon/transfer_learning/experiments/data/bin_features/shared_feats.csv'),
]
for indices, csv_path in feature_group_exports:
    pd.DataFrame({'feat_indices': indices}).to_csv(csv_path, index=False)

In [76]:
# Peek at (up to) the first ten features that neither group uses.
neither_preview = neither_feat_indices[0:10]
print(neither_preview)

[]


In [77]:
# Display a dense copy of column 0 of the full feature matrix (one value per row).
features.getcol(0).toarray()

array([[1],
       [1],
       [1],
       ...,
       [0],
       [0],
       [0]])