In [1]:
import scipy.io
from pylab import *
from matplotlib import *
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
def convert_to_df(file):
    # use scipy to load matlab file
    matlab_data = scipy.io.loadmat('data/' + str(file))
    
    # create dataframe and transpose
    df = pd.DataFrame(matlab_data['all_data']).T
    
    # create column headers
    df.columns = [
        'Mean Area Under Heart Beat',
        'Mean R-to-R Peak Interval',
        'Heart Rate',
        'Peak to Peak Blood Pressure',
        'Systolic Blood Pressure',
        'Diastolic Blood Pressure',
        'Pulse Pressure'
    ]
    
    # add labels column
    df['Golden Alarms'] = matlab_data['all_labels'][0]
    
    return df

filenames = ['1_a41178.mat', '2_a42126.mat', '3_a40076.mat', \
             '4_a40050.mat', '5_a41287.mat', '6_a41846.mat', \
             '7_a41846.mat', '8_a42008.mat', '9_a41846.mat']

patient_data = [convert_to_df(file) for file in filenames]

# floor all data
for df in patient_data:
    for c in df.columns:
        df[c] = df[c].apply(np.floor)

In [3]:
# create arrays to hold train and test dataframes for each patient
train_data = []
test_data = []

for df in patient_data:
    # get split index
    total = len(df)
    split_idx = int(total * 2 / 3)
    
    # append [0, split_idx) to train_data
    train_data.append(df.head(split_idx))
    
    # append [split_idx, total) to test data
    test_data.append(df.tail(total - split_idx))

In [4]:
prior_probabilities = pd.DataFrame(columns=['PH0', 'PH1'])

for i in range(9):
    PH1 = train_data[i]['Golden Alarms'].sum() / len(train_data[i]['Golden Alarms'])
    PH0 = 1 - PH1
    prior_probabilities.loc[i] = [PH0, PH1]

# for index, df in enumerate(train_data):
#     PH1 = df['Golden Alarms'].sum() / len(df['Golden Alarms'])
#     PH0 = 1 - PH1
#     print('Patient %d \tP(H0): %f \tP(H1): %f' % (index, PH0, PH1))

prior_probabilities

Unnamed: 0,PH0,PH1
0,0.973482,0.026518
1,0.983653,0.016347
2,0.997905,0.002095
3,0.997507,0.002493
4,0.998954,0.001046
5,0.97823,0.02177
6,0.982897,0.017103
7,0.99221,0.00779
8,0.97823,0.02177


In [5]:
df = train_data[0]
df = df[df['Golden Alarms'] == 1]

# 9 x 6 matrix of dataframes
likelihood_matrix = []

for patient in train_data:
    # get patient data for each hypothesis
    h1 = patient[patient['Golden Alarms'] == 1]
    h0 = patient[patient['Golden Alarms'] == 0]
        
    # generate likelihood matrix for each patient
    patient_likelihood_matrix = []

    # drop golden alarms
    columns = patient.columns.drop('Golden Alarms')

    for col in columns:
        # get unique value counts for each feature
        # scale by length of patient dataframe hypothesis to get probability
        h0_val_counts = h0[col].value_counts() / len(h0)
        h1_val_counts = h1[col].value_counts() / len(h1)
        
        patient_likelihood_dataframe = pd.DataFrame([h0_val_counts, h1_val_counts]).T
        patient_likelihood_dataframe.columns = ['%s H0' % col, '%s H1' % col]
        
        patient_likelihood_dataframe.fillna(0, inplace=True)
        
        # append to patient likelihood matrix
        patient_likelihood_matrix.append(patient_likelihood_dataframe)
        
    # append patient likelihood matrix to likelihood matrix
    likelihood_matrix.append(patient_likelihood_matrix)

In [6]:
features = [
    'Mean Area Under Heart Beat',
    'Mean R-to-R Peak Interval',
    'Heart Rate',
    'Peak to Peak Blood Pressure',
    'Systolic Blood Pressure',
    'Diastolic Blood Pressure',
    'Pulse Pressure'
]

bars = []
for patient_index, patient in enumerate(likelihood_matrix):
    for feature_index, feature in enumerate(patient):
        fig, ax = plt.subplots(figsize=(15, 10))
        ax.bar(feature.index - 0.2, feature[feature.columns[0]], width=0.4, alpha=0.5, color='green')
        ax.bar(feature.index + 0.2, feature[feature.columns[1]], width=0.4, alpha=0.5, color='blue')
        ax.legend(['H0', 'H1'])
        ax.set_title('Patient %d - %s' % (patient_index, columns[feature_index]))
        bars.append(ax)

# for bar in bars:
#     plt.show()



In [7]:
for patient_index, patient in enumerate(likelihood_matrix):
    for feature_index, feature in enumerate(patient):
        feature_h0 = feature[feature.columns[0]]
        feature_h1 = feature[feature.columns[1]]
        prior_h0 = prior_probabilities['PH0'].loc[patient_index]
        prior_h1 = prior_probabilities['PH1'].loc[patient_index]
        
        feature['ML']  = (feature_h1 >= feature_h0).astype(int)
        feature['MAP'] = (prior_h1 * feature_h1 >= prior_h0 * feature_h0).astype(int)
        
        
df = likelihood_matrix[0][0]
df

Unnamed: 0,Mean Area Under Heart Beat H0,Mean Area Under Heart Beat H1,ML,MAP
-8.0,0.001075,0.0,0,0
-7.0,0.000717,0.0,0,0
-6.0,0.020072,0.0,0,0
-5.0,0.650538,0.0,0,0
-4.0,0.135125,0.0,0,0
-3.0,0.041219,0.0,0,0
-2.0,0.036918,0.013158,0,0
-1.0,0.029391,0.026316,0,0
0.0,0.02724,0.026316,0,0
1.0,0.017921,0.092105,1,0


In [8]:
# for patient_index, patient in enumerate(test_data):
#     for col in patient.columns:
#         for i in range(len(patient)):
#             print(patient[col].iloc[i])


generated_alarms = []
for patient_index, patient in enumerate(test_data):
    generated_alarms.append(pd.DataFrame())
    for feature_index, feature in enumerate(features):
        df = generated_alarms[patient_index]
        df[['%s ML'  % str(feature)]] = patient[[feature]]
#         for row in range(len(df)):
#             value = df[['%s ML'  % str(feature)]].iloc[row]
#             df[['%s ML'  % str(feature)]].iloc[row] = likelihood_matrix[patient_index][feature_index]['ML'].loc[value]
        df[['%s MAP' % str(feature)]] = patient[[feature]]
        
generated_alarms[0]
        
# test_data[0][['Mean Area Under Heart Beat']] = test_data[0][['Mean Area Under Heart Beat']].apply(lambda row: lookup_ML(0,0,row), axis=1)
# test_data[0][['Mean Area Under Heart Beat']]
        
#         df['Value'] = df.apply(my_test2, axis=1)
# df = generated_alarms[0]
# df['test'] = df.apply(lambda x: lookup_ML(0,0,-2.0), axis=1)
# df

# df_new = pd.DataFrame()
# # df_new = test_data[0]['Golden Alarms'].copy(deep=True)
# df_new['MAP'] = test_data[0]['Heart Rate'].copy(deep=True)
# df_new
    
# df = test_data[0]
# df[df.columns[0]].iloc[0]

# # df.loc[df['column_name'] == some_value]

# # likelihood_matrix[0][0]['MAP'].loc[2.0]
# likelihood_matrix[0][0]
# df

Unnamed: 0,Mean Area Under Heart Beat ML,Mean Area Under Heart Beat MAP,Mean R-to-R Peak Interval ML,Mean R-to-R Peak Interval MAP,Heart Rate ML,Heart Rate MAP,Peak to Peak Blood Pressure ML,Peak to Peak Blood Pressure MAP,Systolic Blood Pressure ML,Systolic Blood Pressure MAP,Diastolic Blood Pressure ML,Diastolic Blood Pressure MAP,Pulse Pressure ML,Pulse Pressure MAP
2866,2.0,2.0,77.0,77.0,96.0,96.0,78.0,78.0,82.0,82.0,62.0,62.0,19.0,19.0
2867,-2.0,-2.0,97.0,97.0,78.0,78.0,86.0,86.0,85.0,85.0,64.0,64.0,20.0,20.0
2868,2.0,2.0,89.0,89.0,84.0,84.0,97.0,97.0,82.0,82.0,63.0,63.0,19.0,19.0
2869,-3.0,-3.0,89.0,89.0,84.0,84.0,85.0,85.0,83.0,83.0,65.0,65.0,18.0,18.0
2870,-4.0,-4.0,90.0,90.0,84.0,84.0,80.0,80.0,83.0,83.0,65.0,65.0,18.0,18.0
2871,-3.0,-3.0,103.0,103.0,72.0,72.0,95.0,95.0,84.0,84.0,65.0,65.0,19.0,19.0
2872,-5.0,-5.0,98.0,98.0,78.0,78.0,96.0,96.0,85.0,85.0,61.0,61.0,23.0,23.0
2873,-5.0,-5.0,92.0,92.0,78.0,78.0,100.0,100.0,84.0,84.0,63.0,63.0,20.0,20.0
2874,-5.0,-5.0,98.0,98.0,78.0,78.0,100.0,100.0,82.0,82.0,60.0,60.0,22.0,22.0
2875,-5.0,-5.0,96.0,96.0,72.0,72.0,98.0,98.0,83.0,83.0,62.0,62.0,21.0,21.0


In [14]:
def lookup_MAP(row):
    return 1
def lookup_ML(patient_index, feature_index, value):
#     return 1
    return likelihood_matrix[patient_index][feature_index]['ML'].loc[value]

In [19]:
likelihood_matrix[0][0]

Unnamed: 0,Mean Area Under Heart Beat H0,Mean Area Under Heart Beat H1,ML,MAP
-8.0,0.001075,0.0,0,0
-7.0,0.000717,0.0,0,0
-6.0,0.020072,0.0,0,0
-5.0,0.650538,0.0,0,0
-4.0,0.135125,0.0,0,0
-3.0,0.041219,0.0,0,0
-2.0,0.036918,0.013158,0,0
-1.0,0.029391,0.026316,0,0
0.0,0.02724,0.026316,0,0
1.0,0.017921,0.092105,1,0


In [21]:
lookup_ML(0,0,0.0)

0

In [124]:
corrcoef = []
for i in range(9):
    temp = []
    for j in range(9):
        a = patient_data[i]['Heart Rate']
        b = patient_data[j]['Heart Rate']
        length = min(len(a), len(b))
        temp.append(abs(np.corrcoef(np.asarray(a[:length]), np.asarray(b[:length]))[0][1]))
    corrcoef.append(temp)
df = pd.DataFrame(corrcoef)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1.0,0.083897,0.188648,0.046239,0.035651,0.146867,0.167403,0.151857,0.146867
1,0.083897,1.0,0.415132,0.067249,0.110244,0.010139,0.343511,0.142679,0.010139
2,0.188648,0.415132,1.0,0.051025,0.212478,0.031632,0.58361,0.270494,0.031632
3,0.046239,0.067249,0.051025,1.0,0.070834,0.100789,0.02885,0.138294,0.100789
4,0.035651,0.110244,0.212478,0.070834,1.0,0.094013,0.096442,0.193773,0.094013
5,0.146867,0.010139,0.031632,0.100789,0.094013,1.0,0.072926,0.313802,1.0
6,0.167403,0.343511,0.58361,0.02885,0.096442,0.072926,1.0,0.220698,0.072926
7,0.151857,0.142679,0.270494,0.138294,0.193773,0.313802,0.220698,1.0,0.313802
8,0.146867,0.010139,0.031632,0.100789,0.094013,1.0,0.072926,0.313802,1.0


In [125]:
for col in df.columns:
    print('Patient %d\t%f' % (col, df[col].sum()-1))

Patient 0	0.967429
Patient 1	1.182990
Patient 2	1.784650
Patient 3	0.604068
Patient 4	0.907448
Patient 5	1.770169
Patient 6	1.586366
Patient 7	1.745399
Patient 8	1.770169


Patient 3 has the least total correlation with the other patients, implying that their data is problematic and is an outlier compared to other patients.  A high total correlation means that any given patient is closely related to the rest of the patients