In [1]:
import pickle
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
from sklearn_pandas import DataFrameMapper
import seaborn as sns
from scipy.interpolate import interp1d

import torch
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.optim import Optimizer

from torch import Tensor

import torch.nn as nn
import torch.nn.functional as F
from torch.nn.modules.loss import _Loss

# Load the pickled patient table from disk.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only point this at files produced by a trusted step of this project.
with open('data/consolidated_pat_tbl_tv.pickle', 'rb') as pickle_fh:
    consolidated_pat_tbl = pickle.load(pickle_fh)

# Do not truncate the column list when a DataFrame is displayed
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [2]:
# One-hot encode race (ethnicity).
encoder = OneHotEncoder(sparse_output=False)
race_enc = encoder.fit_transform(consolidated_pat_tbl[['race']])
mod_df = consolidated_pat_tbl.drop('race' , axis = 1)
mod_df = pd.concat(
    [
        mod_df ,
        pd.DataFrame(
            race_enc ,
            # Label the columns from the fitted encoder itself, so the names are
            # guaranteed to be in the same order as the encoded columns.
            columns = encoder.categories_[0] ,
            # Reuse the source index: with a fresh RangeIndex, concat(axis=1) would
            # align on labels and misplace rows whenever the table's index is not 0..n-1.
            index = mod_df.index ,
        ) ,
    ] ,
    axis = 1 ,
)

# Leave out one of the one-hot encoded columns so as to not raise multicollinearity issues
mod_df = mod_df.drop('OTHER' , axis = 1)

# One-hot encode gender (sex).
gender_encoder = OneHotEncoder(sparse_output=False)
gender_enc = gender_encoder.fit_transform(mod_df[['gender']])
mod_df = mod_df.drop('gender' , axis = 1)
mod_df = pd.concat(
    [
        mod_df ,
        pd.DataFrame(
            gender_enc ,
            columns = gender_encoder.categories_[0] ,  # same reasoning as for race
            index = mod_df.index ,
        ) ,
    ] ,
    axis = 1 ,
)

# Leave out one of the one-hot encoded columns so as to not raise multicollinearity issues
mod_df = mod_df.drop('F' , axis = 1)

In [3]:
def train_test_splitter_tv(df , test_size = 0.2 , val_size = 0.2 , patient_col = 'subject_id' , random_state = None):
    """
    Split a long-format table into train / test / validation parts by patient.

    All rows of a given patient end up in exactly one of the three parts, so no
    patient is shared between splits.

    Parameters
    ----------
    df : pd.DataFrame
        Table with one or more rows per patient.
    test_size, val_size : float
        Fraction of the unique patients assigned to the test / validation part
        (the counts are truncated with ``int``). The remainder goes to train.
    patient_col : str
        Name of the column identifying the patient.
    random_state : int or None
        Seed for the shuffle. ``None`` (default) keeps the previous behaviour of
        shuffling with NumPy's global random state.

    Returns
    -------
    (df_train, df_test, df_val) : tuple of pd.DataFrame
        Note the order: train, test, validation.
    """
    # get all patients (np.array makes an independent, writable copy to shuffle)
    pats = np.array(df[patient_col].unique())

    # inplace shuffle
    if random_state is None:
        np.random.shuffle(pats)
    else:
        np.random.default_rng(random_state).shuffle(pats)

    # get splits
    n_test = int(test_size * len(pats))
    n_val = int(val_size * len(pats))
    test_pats = pats[:n_test]
    val_pats = pats[n_test:n_test + n_val]
    train_pats = pats[n_test + n_val:]

    # allocate -- use patient_col throughout (it was previously ignored in favour
    # of a hard-coded 'subject_id')
    df_test = df[df[patient_col].isin(test_pats)]
    df_val = df[df[patient_col].isin(val_pats)]
    df_train = df[df[patient_col].isin(train_pats)]

    # check: every patient landed in exactly one split
    n_allocated = (
        df_train[patient_col].nunique()
        + df_test[patient_col].nunique()
        + df_val[patient_col].nunique()
    )
    assert n_allocated == df[patient_col].nunique()
    return df_train , df_test , df_val

# Seed NumPy's global RNG right before the split: the splitter shuffles patients
# with np.random, so without a seed every kernel restart yields different splits.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)
df_train , df_test , df_val = train_test_splitter_tv(mod_df)

In [7]:
def get_feature_target(df_original , subject_col , time_2_eve_col , event_col , timediff_col):
    """
    Separate a long-format patient table into features and per-patient targets.

    The input is copied, so ``df_original`` is left untouched. In the copy,
    ``timediff_col`` is replaced by its running (cumulative) sum within each
    ``subject_col`` group.

    Returns
    -------
    x : pd.DataFrame
        All columns except the two target columns, one row per input row.
    y : pd.DataFrame
        The distinct (time-to-event, event) pairs per subject, with the subject
        column removed. Expected to contain exactly one row per subject.

    Raises
    ------
    AssertionError
        If the number of target rows differs from the number of subjects, or
        the feature table lost/gained rows.
    """
    frame = df_original.copy()

    # turn per-row time gaps into elapsed time within each subject
    frame[timediff_col] = frame.groupby(subject_col)[timediff_col].cumsum()

    targets = [time_2_eve_col , event_col]

    # everything that is not a target is a feature
    feature_names = [name for name in frame.columns if name not in targets]
    x = frame[feature_names]

    # one target row per subject: de-duplicate, then discard the subject key
    per_subject = frame[[subject_col , *targets]].drop_duplicates()
    y = per_subject.drop(columns = subject_col)

    # consistency checks
    assert x[subject_col].nunique() == len(y) , 'target and feature length mismatch'
    assert x.shape[0] == frame.shape[0] , 'row mismatch'
    return x , y

x_train , y_train = get_feature_target(df_train , 'subject_id' , 'time_to_event' , 'death' , 'timediff')
x_test , y_test = get_feature_target(df_test , 'subject_id' , 'time_to_event' , 'death' , 'timediff')
x_val , y_val = get_feature_target(df_val , 'subject_id' , 'time_to_event' , 'death' , 'timediff')

# Check if cumulative time diff col has been properly added: the per-patient minimum
# of the cumulative timediff must sum to 0 (only checking train data should be enough).
# Select the column before aggregating so the result is a scalar; the previous
# `.sum()[0]` relied on positional integer indexing of a label-indexed Series,
# which pandas has deprecated.
assert x_train.groupby('subject_id')['timediff'].min().sum() == 0

In [8]:
def check_var(df):
    """
    Return the names of columns whose variance (``Series.var``) is exactly 0.

    The last two columns of ``df`` are never inspected.
    NOTE(review): the ``[:-2]`` slice is positional -- presumably it was meant to
    skip two specific bookkeeping columns; confirm they are still the last two.
    """
    inspected = df.columns[:-2]
    return [name for name in inspected if df[name].var() == 0]

# Columns with zero variance in the training split carry no signal for training,
# so identify them on train only ...
zero_var_cols = check_var(x_train)
print(f'zero_var_cols {zero_var_cols}')

# ... and remove the same columns from every split
_x_train = x_train.drop(columns = zero_var_cols)
_x_test = x_test.drop(columns = zero_var_cols)
_x_val = x_val.drop(columns = zero_var_cols)

zero_var_cols ['acebutolol', 'telmisartan']


In [9]:
def null_masking(df,subject_col):
    '''
    Build a missing-value indicator table for ``df``.

    Every column except ``subject_col`` yields one integer column named
    ``<column>_mask`` holding 1 where the value is null and 0 otherwise.
    ``df`` itself is not modified.
    '''
    # the subject key is not a feature, so it gets no indicator column
    indicators = df.drop(subject_col , axis = 1).isnull().astype('int')
    # add a suffix
    indicators.columns = [name+'_mask' for name in indicators.columns]
    return indicators

# Get masks (computed on the full feature tables, before zero-variance columns are dropped)
mask_train = null_masking(x_train,'subject_id')
mask_test = null_masking(x_test,'subject_id')
mask_val = null_masking(x_val,'subject_id')

# Add masks.
# Rebuild each table from x_* minus zero_var_cols instead of from _x_* itself:
# the first run gives the same result, and re-running this cell no longer
# prepends another copy of the mask columns.
_x_train = pd.concat([mask_train, x_train.drop(zero_var_cols, axis = 1)], axis = 1)
_x_test = pd.concat([mask_test, x_test.drop(zero_var_cols, axis = 1)], axis = 1)
_x_val = pd.concat([mask_val, x_val.drop(zero_var_cols, axis = 1)], axis = 1)

In [10]:
%%time

def feature_impute_and_reshape(df , subject_col , timediff_col , divisions):
    """
    Impute each patient's records, resample them onto a fixed-size time grid and
    return all patients as one normalized tensor.

    For every patient, in order of first appearance in ``df``:
      1. missing values are forward-filled, then anything still missing is filled
         with the patient's most frequent value of that column
         (``keep_empty_features=True`` keeps columns that are entirely missing);
      2. ``divisions`` evenly spaced time points are laid out between the
         patient's minimum and maximum ``timediff_col``;
      3. every feature column (all columns except ``subject_col`` and
         ``timediff_col``) is interpolated at those points with ``interp1d``.

    Each feature series is then divided by its L2 norm over the time points.
    NOTE(review): this per-patient normalization makes every feature that is
    constant over time collapse to the same magnitude regardless of its value --
    confirm this is intended.

    Parameters
    ----------
    df : pd.DataFrame
        Long-format table, one row per patient observation.
    subject_col : str
        Column identifying the patient.
    timediff_col : str
        Column holding the time coordinate used for interpolation.
    divisions : int
        Number of time points each patient is resampled to.

    Returns
    -------
    torch.Tensor
        Shape ``(n_patients, 1, divisions, n_features)``.

    Raises
    ------
    ValueError
        If ``df`` contains no patients.
    """
    # Feature columns are derived from the arguments (previously the names
    # 'subject_id' and 'timediff' were hard-coded here, ignoring the parameters).
    feature_cols = [col for col in df.columns if col != subject_col and col != timediff_col]

    # init container: one (divisions, n_features) matrix per patient
    df_reshape = []

    # One pass over the table; sort=False yields patients in order of first
    # appearance, i.e. the same order as df[subject_col].unique().
    for _ , pat_rows in df.groupby(subject_col , sort = False):
        pat_df = pat_rows.ffill() # forward fill (fillna(method=...) is deprecated)

        # Further Imputation
        imputer = SimpleImputer(strategy = 'most_frequent', keep_empty_features = True)
        imputed_pat_df = imputer.fit_transform(pat_df)
        _pat_df = pd.DataFrame(imputed_pat_df , columns = pat_df.columns)

        # quantized time difference
        timediff_div = np.linspace(
            _pat_df[timediff_col].min() ,
            _pat_df[timediff_col].max() ,
            divisions
        )

        # interpolate all feature columns at once along the time axis;
        # the result has shape (divisions, n_features)
        f = interp1d(
            _pat_df[timediff_col].to_numpy(dtype = 'float') ,
            _pat_df[feature_cols].to_numpy(dtype = 'float') ,
            axis = 0
        )
        df_reshape.append(f(timediff_div))

    if not df_reshape:
        raise ValueError('df contains no patients to reshape')

    # (n_patients, divisions, n_features)
    arr = np.stack(df_reshape).astype('float')

    # L2-norm of every feature series over the time axis -> (n_patients, n_features)
    norms = np.linalg.norm(arr, ord=2, axis=1)

    # To avoid division by zero, add a small epsilon to the norms
    epsilon = 1e-8
    normalized_array = arr / (norms[:, np.newaxis] + epsilon)

    # insert a singleton second dimension
    normalized_array = torch.Tensor(
        normalized_array.reshape(
            normalized_array.shape[0] ,
            1 ,
            normalized_array.shape[1] ,
            normalized_array.shape[2]
            )
        )

    return normalized_array

# Resample every split onto the same fixed-size time grid.
# The settings are shared so the three splits cannot drift apart.
_reshape_kwargs = dict(
    subject_col = 'subject_id' ,
    timediff_col = 'timediff' ,
    divisions = 10 ,
)

x_train_reshape = feature_impute_and_reshape(_x_train , **_reshape_kwargs)

x_test_reshape = feature_impute_and_reshape(_x_test , **_reshape_kwargs)

x_val_reshape = feature_impute_and_reshape(_x_val , **_reshape_kwargs)

CPU times: total: 1min 47s
Wall time: 1min 45s


In [11]:
def nan_check(arr):
    """Print the NaN count of each entry of ``arr`` that contains at least one NaN."""
    for sample in arr:
        nan_total = torch.isnan(sample).sum()
        if not nan_total > 0:
            continue
        print(nan_total)

# Report NaNs left in any of the reshaped splits (no output means none)
for _split in (x_train_reshape, x_test_reshape, x_val_reshape):
    nan_check(_split)

In [19]:
# Persist the reshaped feature tensors and the targets, one pickle file each.
# Insertion order of the dict is the write order.
_artifacts = {
    'data/x_train_reshape_tv.pickle': x_train_reshape,
    'data/x_test_reshape_tv.pickle': x_test_reshape,
    'data/x_val_reshape_tv.pickle': x_val_reshape,
    'data/y_train.pickle': y_train,
    'data/y_test.pickle': y_test,
    'data/y_val.pickle': y_val,
}

for _path, _obj in _artifacts.items():
    with open(_path, 'wb') as file:
        pickle.dump(_obj, file)

### **JUNK** (commented-out scratch code, kept for reference only)

In [16]:
# def feature_reshape(df, group_column = 0):
#     '''
#     Convert to 3D array
#     '''
#     # convert to numpy
#     df_arr = df.to_numpy()

#     # Step 1: Sort the array based on the group_column
#     sorted_data = df_arr[df_arr[:, group_column].argsort()]

#     # Step 2: Find unique values in the group_column
#     group_values, group_counts = np.unique(sorted_data[:, group_column], return_counts=True)

#     # Step 3: Use np.split() to split the sorted_data into separate arrays based on the unique values
#     grouped_data = np.split(sorted_data, np.cumsum(group_counts)[:-1])

#     # Step 4: Reshape the resulting arrays into a 3D matrix
#     result = np.array(grouped_data , dtype = 'object')

#     result_new = []

#     # # remove subject id
#     # for pat_img in result:
#     #     pat_img = pat_img[: , :-1]
#     #     result_new.append(pat_img)
    
#     # return np.array(result_new , dtype = 'object')
#     return np.array(result , dtype = 'object')

In [11]:
# # Create variable length 3D representations
# x_train_reshape = feature_reshape(x_train , group_column = -1)
# x_test_reshape = feature_reshape(x_test , group_column = -1)
# x_val_reshape = feature_reshape(x_val , group_column = -1)

In [12]:
# def scaling(df):
#     # Scaling
#     scaler = StandardScaler()
#     scaled_columns = [
#     'cholesterol', 'sodium', 'lymphocyte', 'hemoglobin',
#     'temperature', 'heartrate', 'resprate', 'o2sat', 'sbp', 'dbp','anchor_age','BMI (kg/m2)',
#     'Height (Inches)', 'Weight (Lbs)'
#     ]

#     unscaled_columns = ['acebutolol', 'amlodipine', 'atenolol', 'benazepril', 'candesartan',
#         'captopril', 'diltiazem', 'felodipine', 'irbesartan', 'lisinopril',
#         'moexipril', 'nadolol', 'nebivolol', 'nicardipine', 'nifedipine',
#         'olmesartan', 'propranolol', 'quinapril', 'ramipril', 'telmisartan',
#         'trandolapril', 'valsartan', 'verapamil', 'ASIAN', 'BLACK', 'HISPANIC', 'NATIVE',
#         'OTHER', 'WHITE' ,'F','M', 'subject_id','timediff']

#     # scale = [([col], StandardScaler()) for col in scaled_columns]
#     scale = [([col], None) for col in scaled_columns]
#     no_scale = [(col, None) for col in unscaled_columns]

#     x_mapper = DataFrameMapper(scale + no_scale)
    
#     # scale data
#     x = pd.DataFrame(x_mapper.fit_transform(df) , 
#                         columns = scaled_columns + unscaled_columns
#                         )
#     # add subject_id and timediff separately
    
#     # get targets
#     y = df[['subject_id','time_to_event','death']].drop_duplicates().drop('subject_id', axis = 1)

#     # Check
#     assert x.subject_id.nunique() == len(y) , 'target and feature length mismatch' 
#     assert x.shape[0] == df.shape[0] , 'row mismatch'
#     return x , y

# # Scale
# x_train , y_train = scaling(df_train)
# x_test , y_test = scaling(df_test)
# x_val , y_val = scaling(df_val)

In [13]:
# # def null_masking(df):
# #     '''
# #     create a null mask
# #     '''
# #     mask = df.isnull().astype('int')
# #     mask.columns = [col+'_mask' for col in df.isnull().astype('int').columns]
# #     mask = mask.iloc[: , :-2]
# #     return mask

# # # Get masks
# # mask_train = null_masking(x_train)
# # mask_test = null_masking(x_test)
# # mask_val = null_masking(x_val)

# # # Add masks
# # _x_train = pd.concat([mask_train, x_train], axis = 1)
# # _x_test = pd.concat([mask_test, x_test], axis = 1)
# # _x_val = pd.concat([mask_val, x_val], axis = 1)

# _x_train = x_train
# _x_test = x_test
# _x_val = x_val

In [14]:
# input = x_train_reshape.astype('float')
# input = torch.Tensor(input.reshape(input.shape[0] , 1 , input.shape[1] , input.shape[2]))
# print(input.shape)
# c2d = nn.Conv2d(1 , 14 , kernel_size = 3)
# # c2d(torch.Tensor(input))
# enc = Encoder()
# dec = Decoder()
# dec(enc(input)).shape , input.shape

# def create_distance_matrix(matrix_array):
#     distance_mtx = []
    
#     for i in range(matrix_array.shape[0]):
#         ssd = list(np.sum((matrix_array - matrix_array[i])**2 , axis = (1,2)))
#         frob = np.sqrt(ssd)
#         distance_mtx.append(frob)
    
#     return np.column_stack(distance_mtx)

# dm = create_distance_matrix(x_train_reshape)
# dm.shape

# import numpy as np
# from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
# import matplotlib.pyplot as plt

# # Step 1: Prepare the distance matrix
# # Replace this with your actual distance matrix
# distance_matrix = dm

# # Step 2: Choose the linkage method
# linkage_method = 'average'  # You can try 'single' or 'complete' as well

# # Step 3: Perform hierarchical clustering
# Z = linkage(distance_matrix, method=linkage_method)

# # Step 4: Determine the number of clusters using dendrogram
# dendrogram(Z)
# plt.xlabel('Data points')
# plt.ylabel('Distance')
# plt.title('Dendrogram')
# plt.show()

# # Step 5: Cut the dendrogram to get clusters
# num_clusters = 7 # You can adjust this based on the dendrogram visualization

# # Step 6: Assign data points to clusters
# clusters = fcluster(Z, num_clusters, criterion='maxclust')
# print("Data point clustering:", clusters)

In [15]:
# scaled_cols = ['temperature', 'heartrate', 'resprate', 'o2sat', 'sbp', 'dbp', 'cholesterol', 'sodium', 
#                 'lymphocyte', 'hemoglobin','BMI (kg/m2)','Height (Inches)', 'Weight (Lbs)','timediff']

# x_train_prp = feature_preprocess(
#     _x_train , 
#     subject_col = 'subject_id' , 
#     scaled_cols = scaled_cols ,
#     scaling_func = scaler_func ,
#     filling_func = fill_to_max_time , 
#     max_time = max_time
# )
# x_test_prp = feature_preprocess(
#     _x_test , 
#     subject_col = 'subject_id' , 
#     scaled_cols = scaled_cols ,
#     scaling_func = scaler_func ,
#     filling_func = fill_to_max_time , 
#     max_time = max_time
# )
# x_val_prp = feature_preprocess(
#     _x_val , 
#     subject_col = 'subject_id' , 
#     scaled_cols = scaled_cols ,
#     scaling_func = scaler_func ,
#     filling_func = fill_to_max_time , 
#     max_time = max_time
# )

# def scaler_func(df , scaled_cols):
#     unscaled_cols = [col for col in _x_train.columns if col not in scaled_cols]

#     # Create a StandardScaler object
#     my_scaler = StandardScaler()

#     # Fit the scaler to your data and then transform the data
#     scaled_data = pd.DataFrame(my_scaler.fit_transform(df[scaled_cols]) , columns = scaled_cols)
#     unscaled_data = df[unscaled_cols].reset_index().drop('index' , axis = 1)
#     assert scaled_data.shape[0] == unscaled_data.shape[0]
#     df_scaled = pd.concat([unscaled_data , scaled_data] , axis = 1)

#     return df_scaled
    
# def fill_to_max_time(original_tensor, max_time):
#     # Create a new zero-filled tensor of shape (1000, 20)
#     desired_shape = (max_time, original_tensor.shape[1])
#     new_tensor = torch.zeros(desired_shape)

#     # Copy the original data into the new tensor up to the original number of rows
#     new_tensor[:original_tensor.shape[0], :] = original_tensor

#     return new_tensor