In [7]:
import pickle
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
from sklearn_pandas import DataFrameMapper
import seaborn as sns

# Do not truncate columns when frames are displayed in later cells
pd.set_option('display.max_columns', None)

# Load the consolidated patient table from its pickle file.
# NOTE: pickle.load can execute arbitrary code contained in the file;
# only load pickles that come from a trusted source.
with open('data/consolidated_pat_tbl_tv.pickle', 'rb') as file:
    consolidated_pat_tbl = pickle.load(file)

In [8]:
# Number of distinct patients (subject_id values) in the consolidated table
consolidated_pat_tbl['subject_id'].nunique()

5904

In [9]:
def _one_hot_expand(df , column):
    """
    Replace `column` in `df` with one indicator column per category.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing `column`.
    column : str
        Name of the categorical column to encode.

    Returns
    -------
    expanded : pd.DataFrame
        `df` without `column`, with the indicator columns appended on the right.
    enc : OneHotEncoder
        The encoder fitted on `df[[column]]`.
    encoded : np.ndarray
        The dense indicator matrix produced by the encoder.
    """
    enc = OneHotEncoder(sparse_output=False)
    encoded = enc.fit_transform(df[[column]])
    dummies = pd.DataFrame(
        encoded ,
        # Take the labels from the fitted encoder so they are guaranteed to be
        # in the same order as the columns of `encoded`.
        columns = enc.categories_[0] ,
        # Reuse the source index: concat(axis=1) aligns on index, and a fresh
        # RangeIndex would misalign rows if `df` is not indexed 0..n-1.
        index = df.index
    )
    expanded = pd.concat([df.drop(column , axis = 1) , dummies] , axis = 1)
    return expanded , enc , encoded

# OneHotEncode race (ethnicity)
mod_df , encoder , race_enc = _one_hot_expand(consolidated_pat_tbl , 'race')

# OneHotEncode gender (sex)
mod_df , gender_encoder , gender_enc = _one_hot_expand(mod_df , 'gender')

In [10]:
def train_test_splitter_tv(df , test_size = 0.2 , val_size = 0.2 , patient_col = 'subject_id' , random_state = None):
    """
    Split a long-format frame into train / test / validation sets by patient.

    All rows that share a value of `patient_col` end up in the same split.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with one or more rows per patient.
    test_size , val_size : float
        Fraction of the unique patients assigned to the test and validation
        sets (each count is truncated with int()); the remainder is train.
    patient_col : str
        Column holding the patient identifier.
    random_state : int or None
        When None (default) the shuffle uses NumPy's global RNG, exactly as
        before. When given, a dedicated generator seeded with this value is
        used so the split is reproducible.

    Returns
    -------
    (df_train , df_test , df_val) : tuple of pd.DataFrame
        Note the order: train, test, validation.
    """
    # get all patients (unique() returns a fresh array, so shuffling is safe)
    pats = df[patient_col].unique()
    # inplace shuffle
    if random_state is None:
        np.random.shuffle(pats)
    else:
        np.random.default_rng(random_state).shuffle(pats)

    # get splits
    n_test = int(test_size * len(pats))
    n_val = int(val_size * len(pats))
    test_pats = pats[:n_test]
    val_pats = pats[n_test:n_test + n_val]
    train_pats = pats[n_test + n_val:]

    # allocate (use patient_col rather than a hard-coded column name)
    df_test = df[df[patient_col].isin(test_pats)]
    df_val = df[df[patient_col].isin(val_pats)]
    df_train = df[df[patient_col].isin(train_pats)]

    # check: every patient landed in exactly one split
    n_allocated = (
        df_train[patient_col].nunique()
        + df_test[patient_col].nunique()
        + df_val[patient_col].nunique()
    )
    assert n_allocated == df[patient_col].nunique()
    return df_train , df_test , df_val

# The splitter shuffles patients with NumPy's global RNG; seed it so the
# train / test / validation membership is the same on every Restart & Run All.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)
df_train , df_test , df_val = train_test_splitter_tv(mod_df)

In [4]:
# def impute(df):
#     _columns = df.columns.astype('str')
#     df.columns = _columns

#     # Imputation
#     imputer = SimpleImputer(strategy = 'most_frequent')
#     df = pd.DataFrame(imputer.fit_transform(df) , columns = _columns)

#     # Check for null
#     assert np.round(df.notnull().sum()/len(df)).sum() == df.shape[1]

#     # # name change
#     # df.rename(columns = {'50907':'cholesterol' , '50983':'sodium' , '51133':'lymphocyte' , '51222':'hemoglobin'} , inplace = True)

#     return df

# # use on train test val data
# df_train = impute(df_train)
# df_test = impute(df_test)
# df_val = impute(df_val)

In [59]:
# Continuous features that are standardised with a StandardScaler each
_SCALED_COLUMNS = [
    'cholesterol', 'sodium', 'lymphocyte', 'hemoglobin',
    'temperature', 'heartrate', 'resprate', 'o2sat', 'sbp', 'dbp','anchor_age','BMI (kg/m2)',
    'Height (Inches)', 'Weight (Lbs)'
]

# Columns passed through unchanged. 'subject_id' and 'timediff' are kept as the
# LAST two columns; later cells slice them off positionally.
_UNSCALED_COLUMNS = ['acebutolol', 'amlodipine', 'atenolol', 'benazepril', 'candesartan',
    'captopril', 'diltiazem', 'felodipine', 'irbesartan', 'lisinopril',
    'moexipril', 'nadolol', 'nebivolol', 'nicardipine', 'nifedipine',
    'olmesartan', 'propranolol', 'quinapril', 'ramipril', 'telmisartan',
    'trandolapril', 'valsartan', 'verapamil', 'ASIAN', 'BLACK', 'HISPANIC', 'NATIVE',
    'OTHER', 'WHITE' ,'F','M', 'subject_id','timediff']


def _build_feature_mapper():
    """Return an unfitted DataFrameMapper: one StandardScaler per scaled column, pass-through for the rest."""
    scale = [([col], StandardScaler()) for col in _SCALED_COLUMNS]
    no_scale = [(col, None) for col in _UNSCALED_COLUMNS]
    return DataFrameMapper(scale + no_scale)


def scaling(df , x_mapper = None):
    """
    Build the feature frame and the per-patient target frame for one split.

    Parameters
    ----------
    df : pd.DataFrame
        Long-format split containing the feature columns plus
        'subject_id', 'time_to_event' and 'death'.
    x_mapper : DataFrameMapper or None
        An already-fitted mapper to apply with `transform`. When None
        (default) a new mapper is created and fitted on `df` itself, which is
        the original behaviour. Pass the mapper fitted on the training split
        when scaling test / validation data so that every split is
        standardised with the same statistics.

    Returns
    -------
    x : pd.DataFrame
        Scaled columns followed by the pass-through columns, with a fresh
        RangeIndex and the same number of rows as `df`.
    y : pd.DataFrame
        One row per distinct ('subject_id', 'time_to_event', 'death')
        combination, with 'subject_id' dropped.

    Raises
    ------
    AssertionError
        If the number of target rows differs from the number of patients, or
        the number of feature rows differs from `df`.
    """
    if x_mapper is None:
        x_mapper = _build_feature_mapper()
        features = x_mapper.fit_transform(df)
    else:
        features = x_mapper.transform(df)

    # scale data
    x = pd.DataFrame(features ,
                        columns = _SCALED_COLUMNS + _UNSCALED_COLUMNS
                        )
    # get targets
    y = df[['subject_id','time_to_event','death']].drop_duplicates().drop('subject_id', axis = 1)

    # Check
    assert x.subject_id.nunique() == len(y) , 'target and feature length mismatch' 
    assert x.shape[0] == df.shape[0] , 'row mismatch'
    return x , y

# Scale: fit the scalers on the training split only, then apply the same
# fitted mapper to test and validation so all splits share one scale.
feature_mapper = _build_feature_mapper()
feature_mapper.fit(df_train)

x_train , y_train = scaling(df_train , feature_mapper)
x_test , y_test = scaling(df_test , feature_mapper)
x_val , y_val = scaling(df_val , feature_mapper)

In [60]:
def check_var(df):
    """
    List the columns of `df` whose variance is exactly zero.

    The last two columns of `df` are never inspected.

    Returns
    -------
    list
        Column labels, in frame order, for which `df[col].var() == 0`.
    """
    inspected = df.columns[:-2]
    return [name for name in inspected if df[name].var() == 0]

# Columns that are constant in the training features add nothing to training,
# so identify them on x_train only
zero_var_cols = check_var(x_train)
print(f'zero_var_cols {zero_var_cols}')

# Remove the same columns from every split so the feature sets stay aligned
for _split in (x_train , x_test , x_val):
    _split.drop(columns = zero_var_cols , inplace = True)

zero_var_cols ['candesartan', 'nebivolol']


In [94]:
def null_masking(df):
    '''
    Build a 0/1 missingness indicator frame for `df`.

    Each indicator column is named after its source column with a '_mask'
    suffix. Indicators for the last two columns of `df` are not returned.
    '''
    suffixed = [label + '_mask' for label in df.columns]
    indicator = df.isna().astype('int')
    indicator.columns = suffixed
    # keep everything except the indicators of the final two columns
    return indicator.iloc[: , :-2]

# Missingness indicators for each split
mask_train , mask_test , mask_val = (
    null_masking(features) for features in (x_train , x_test , x_val)
)

# Append the indicator columns to the right of each split's features
_x_train , _x_test , _x_val = (
    pd.concat(pair , axis = 1)
    for pair in ([x_train , mask_train] , [x_test , mask_test] , [x_val , mask_val])
)

In [108]:
def feature_reshape(df , subject_col):
    """
    Split a long-format frame into one 2-D array per patient.

    Rows are grouped by `subject_col` in order of first appearance. Within
    each patient, nulls are forward-filled and any that remain are then
    back-filled; a column that is entirely null for a patient stays null.
    `subject_col` itself is dropped from the per-patient arrays.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with one or more rows per patient.
    subject_col : str
        Column holding the patient identifier.

    Returns
    -------
    np.ndarray
        1-D object array of length n_patients. Element i is the 2-D array
        (rows of patient i, remaining columns). The container is always 1-D,
        including when every patient has the same number of rows.
    """
    per_patient = []

    # sort=False keeps patients in first-appearance order; a single groupby
    # pass avoids re-scanning the whole frame once per patient.
    for _ , pat_df in df.groupby(subject_col , sort = False):
        filled = pat_df.ffill().bfill() # first forward fill, then back fill the rest
        per_patient.append(filled.drop(subject_col , axis = 1).to_numpy())

    # Fill a pre-allocated object array element by element. np.array(list,
    # dtype=object) would merge equally-shaped patient arrays into one 3-D array.
    out = np.empty(len(per_patient) , dtype = 'object')
    for i , arr in enumerate(per_patient):
        out[i] = arr
    return out

# Build the per-patient, variable-length arrays for each split
x_train_reshape , x_test_reshape , x_val_reshape = (
    feature_reshape(split_df , 'subject_id')
    for split_df in (_x_train , _x_test , _x_val)
)

In [115]:
# Persist the reshaped per-patient arrays of each split as pickle files
for path , payload in (
    ('data/x_train_reshape_tv.pickle' , x_train_reshape) ,
    ('data/x_test_reshape_tv.pickle' , x_test_reshape) ,
    ('data/x_val_reshape_tv.pickle' , x_val_reshape) ,
):
    with open(path , 'wb') as file:
        pickle.dump(payload , file)

-----

In [None]:
# def feature_reshape(df, group_column = 0):
#     '''
#     Convert to 3D array
#     '''
#     # convert to numpy
#     df_arr = df.to_numpy()

#     # Step 1: Sort the array based on the group_column
#     sorted_data = df_arr[df_arr[:, group_column].argsort()]

#     # Step 2: Find unique values in the group_column
#     group_values, group_counts = np.unique(sorted_data[:, group_column], return_counts=True)

#     # Step 3: Use np.split() to split the sorted_data into separate arrays based on the unique values
#     grouped_data = np.split(sorted_data, np.cumsum(group_counts)[:-1])

#     # Step 4: Reshape the resulting arrays into a 3D matrix
#     result = np.array(grouped_data , dtype = 'object')

#     result_new = []

#     # # remove subject id
#     # for pat_img in result:
#     #     pat_img = pat_img[: , :-1]
#     #     result_new.append(pat_img)
    
#     # return np.array(result_new , dtype = 'object')
#     return np.array(result , dtype = 'object')

In [None]:
# # Create variable length 3D representations
# x_train_reshape = feature_reshape(x_train , group_column = -1)
# x_test_reshape = feature_reshape(x_test , group_column = -1)
# x_val_reshape = feature_reshape(x_val , group_column = -1)