#### Script to  Physician Action probability distribution (pi_behaviour )

In [1]:
### import overall usefull libraries
import os
import random
from multiprocessing import cpu_count

### import specific libraries for this project
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as pltt
import copy 
import joblib
### import KNN related libraries
from sklearn.neighbors import NearestNeighbors as KNN

### setup directory to get the data and write output 

In [2]:
# import your RL data dict from this directory 
data_dir = 'C:/Users/gmpxe/Downloads/MIMIC_data_allbins.csv/Extended_experiment/data/'
data_file = 'Extended_data_dict.pkl'
# write your physician KNN pi_behaviour here 
out_dir = 'C:/Users/gmpxe/Downloads/MIMIC_data_allbins.csv/Extended_experiment/data/'

#figures 
fig_dir = 'C:/Users/gmpxe/Downloads/MIMIC_data_allbins.csv/Extended_experiment/figures/'

#intermidir 
interimdir = 'C:/Users/gmpxe/Downloads/MIMIC_data_allbins.csv/Extended_experiment/interimfiles/'

# load data 
data_dict = joblib.load(os.path.join(data_dir, data_file))
print(data_dict.keys())

dict_keys(['train', 'val', 'test', 'v', 'featurenames'])


#### upweight Features based on article / luca
-  https://arxiv.org/abs/1811.09602:

In [3]:
########################################################################################
# Upweighted features for KNN distance metric
upweighted_features = ['Sofa_score', # SOFA score
                       'Lactate',    # Lactate levels
                       'total_UP',   # fluid output of current state
                       #'total_IV',   # iv fluid input of current state
                       'MAP',        # mean and blood pressure (MAP)
                       'DIA',        # diastolic blood pressure,
                       'PF_ratio',   # PaO2/FiO2 ratio
                       'Weight',     # weight,
                       'Age',        # age
                      ]

# features for modelin 
feature_names = data_dict['featurenames']

# Check if 
assert set(upweighted_features).issubset(feature_names), "Can't upweight non-existent feature."

# Upweight some features for distance metric
feature_weights = 1 + np.array([f in upweighted_features 
                                for f in feature_names], dtype=np.float)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


##### Set up config 

In [4]:
config = {'metric': 'wminkowski',                               
          'feature_weights' : [feature_weights],
          'algorithm': 'auto',                        
          'Minkowski_Power_parameter': 2, 
          'n_neighbors': 300,
         }
config_df = pd.DataFrame(config, index=[0])
config_df.to_csv(os.path.join(out_dir, 'KNNconfig.csv'), index=False)

In [5]:
##### Train the model to generate KNN physican behaviour pickle file 

In [6]:
import time 
import datetime
import math

In [7]:
########################################################################################
eval_types = ['test', 'val', 'train']
for eval_type in eval_types:
    eval_since = time.time()
    try:
        transition_dict = dict(zip(data_dict[eval_type]['state_id'], data_dict[eval_type]['next_state_id']))
        print("Physician model for: " + str(eval_type))
    except:
        print("Error using evaluate_model: Incorrect eval type. It should be 'test', 'val' or 'train'")

    # Get model-ready data
    state_space = data_dict[eval_type]['X']
    state_space[np.where(np.isinf(state_space))] = 0 # impute inf by mean,

    # K-Nearest-Neighbor
    state_space = data_dict[eval_type]['X']
    knn = KNN(n_neighbors=config['n_neighbors'],
              metric=config['metric'],
              p=config['Minkowski_Power_parameter'],
              metric_params={'w': config['feature_weights']},
              algorithm=config['algorithm'],
              n_jobs=cpu_count()-1)

    knn = knn.fit(state_space)
    #print(str(eval_type) + " Knn fitted")

    ################################################################################################################
    counter = 0
    step_size = 50
    total_steps = math.floor(state_space.shape[0]/step_size)
    final_step = state_space.shape[0] % step_size
    '''
    print('state space', state_space.shape[0])
    print('step size (states in a step):', step_size)
    print('final step state space count:', final_step)
    print('total steps:', total_steps+1)
    '''

    ################################################################################################################
    for i in range(0,state_space.shape[0],step_size):
        ### get subset of state space
        if i < total_steps*step_size:
            step = step_size
        else:
            step = final_step

        counter += 1
        if counter % 10 == 0 or i == 0:
            print('step ' + str(counter) + ' out of ' + str(total_steps+1) + '. Start state: ' + str(i) + '. End state: ' + str(i+step*10))

        ### get subset of state space
        state_space_subset = state_space[i:i+step]

        #### get distances and indices
        dist_subset, ind_subset = knn.kneighbors(state_space_subset) 
        if i == 0:
            dist, ind = dist_subset, ind_subset
        else:
            dist = np.append(dist,dist_subset,axis=0)
            ind = np.append(ind,ind_subset,axis=0)

        ### save interim models every slighty less then 10%
        if i % (math.floor(total_steps/10)*step_size) == 0 and i > 0:
                saving_step = math.ceil((i / state_space.shape[0])*100)
                #print('fitted AND SAVED all data up to ' + str(saving_step) + "%")
                time_elapsed = time.time() - eval_since
                hours = time_elapsed//3600
                temp = time_elapsed - 3600*hours
                minutes = temp//60
                seconds = temp - 60*minutes
                print('KNN INTERIM STEP completed in %d hours, %d minutes and %d seconds' %(hours,minutes,seconds))
                dist_df = pd.DataFrame(dist)
                ind_df = pd.DataFrame(ind)
                dist_df.to_csv(os.path.join(interimdir, 'KNN_pi_behavior_interim_step_' + str(i) + '_' +str(eval_type) + '_dist.csv'), index=False)
                ind_df.to_csv(os.path.join(interimdir, 'KNN_pi_behavior_interim_step_' + str(i) + '_' +str(eval_type) + '_ind.csv'), index=False)

    print(str(eval_type) + " state space processed")

    ###########################################
    all_states_action_probabilities = np.zeros([state_space.shape[0],len(np.unique(data_dict[eval_type]['action']))])
    # For each state in the state_space assign the probability of each action to the appropriate column in the final_df
    for i in range(state_space.shape[0]):
        if i % 5000 == 0: # and i > 0:
            print(str(eval_type) + " APPENDING step " + str(i) + " out of " + str(state_space.shape[0]) + ".")

        # get count of actions for this state
        this_state_action = data_dict[eval_type]['action'][(i)]
        similar_actions_for_this_state = data_dict[eval_type]['action'][(ind[i])]
        all_state_actions= np.append(this_state_action, similar_actions_for_this_state)

        # count frequency of each possible (out of ?) actions from (the performed action +  smiliar actions performed by clinicians in similar states)
        all_action_count = []

        # add the action count of each action for j in action range (0 to unique amount of actions)
        for j in np.unique(data_dict[eval_type]['action']):
            all_action_count.append(all_state_actions.tolist().count(j))

        # get the probability of each action in this state out of [similar_action_for_this_sate+this_state_action]
        all_action_probability = [x / sum(all_action_count) for x in all_action_count]
        all_action_probs = np.around(all_action_probability,3)

        # add to results matrix
        all_states_action_probabilities[i,:] = all_action_probs

    # visual inspection of final dataframe with assigned action probabilities
    results_df = pd.DataFrame.from_records(all_states_action_probabilities)

    # save results
    results_df.columns = ['A' + str(i) for i in np.unique(data_dict[eval_type]['action'])]

    # save to pickle
    results_df.to_pickle(os.path.join(out_dir, 'KNN_pi_behavior_' + str(eval_type) +'data.pkl'))
    
    ################################################################################################################
   
print('the shape of the df is ', results_df.shape)
#####



Physician model for: test
step 1 out of 456. Start state: 0. End state: 500
step 10 out of 456. Start state: 450. End state: 950
step 20 out of 456. Start state: 950. End state: 1450
step 30 out of 456. Start state: 1450. End state: 1950
step 40 out of 456. Start state: 1950. End state: 2450
KNN INTERIM STEP completed in 0 hours, 0 minutes and 18 seconds
step 50 out of 456. Start state: 2450. End state: 2950
step 60 out of 456. Start state: 2950. End state: 3450
step 70 out of 456. Start state: 3450. End state: 3950
step 80 out of 456. Start state: 3950. End state: 4450
step 90 out of 456. Start state: 4450. End state: 4950
KNN INTERIM STEP completed in 0 hours, 0 minutes and 35 seconds
step 100 out of 456. Start state: 4950. End state: 5450
step 110 out of 456. Start state: 5450. End state: 5950
step 120 out of 456. Start state: 5950. End state: 6450
step 130 out of 456. Start state: 6450. End state: 6950
KNN INTERIM STEP completed in 0 hours, 0 minutes and 54 seconds
step 140 out of 

In [8]:
results_df.head(n=5)

Unnamed: 0,A0,A1,A2,A3,A4,A5,A6,A7,A8,A9,...,A11,A12,A13,A14,A15,A16,A17,A18,A19,A20
0,0.983,0.01,0.007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.983,0.01,0.007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.983,0.01,0.007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.983,0.01,0.007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.983,0.01,0.007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
