In [20]:
import os
import json
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats.stats import pearsonr
from get_12ECG_features import get_12ECG_features
from driver import load_challenge_data

import sklearn
from sklearn.metrics import mutual_info_score

In [2]:
# Define directories here.
# The defaults are the original cluster locations. Set PHYSIONET_DATA_DIR and/or
# PHYSIONET_CACHE_DIR in the environment to run the notebook on another machine
# without editing this cell.
# data_dir: root folder that is searched recursively for .mat recordings.
data_dir = os.environ.get("PHYSIONET_DATA_DIR", "/share/workhorse3/vsanil/physionet/Training_WFDB")
# data_json_path: folder the pickle caches are read from / written to
# (despite the name, the files stored there are pickles, not JSON).
data_json_path = os.environ.get("PHYSIONET_CACHE_DIR", "/share/workhorse3/vsanil/physionet/")

In [3]:
# Program-wide state: record name -> [signal data, header lines].
# Starts empty and is filled by the loading cell further down.
data_dict = dict()

In [27]:
# Loads every .mat recording under a directory tree and stores it in a dictionary
def extract_all_data(path_name, data_dict, loader=None):
    """Load all ``.mat`` records found under ``path_name`` into ``data_dict``.

    Args:
        path_name: Root directory; it is walked recursively.
        data_dict: Dictionary that is filled in place. Each key is the file
            name without its trailing ``.mat`` and each value is the
            two-element list ``[data, header]`` produced by the loader.
        loader: Optional callable taking a file path and returning a
            ``(data, header)`` pair. Defaults to ``load_challenge_data``.

    Returns:
        The same ``data_dict`` object that was passed in.
    """
    if loader is None:
        loader = load_challenge_data

    suffix = '.mat'
    for root, dirs, files in os.walk(path_name):
        # Sorting ``dirs`` in place fixes the order in which os.walk descends,
        # and sorting ``files`` fixes the order within a directory. Without
        # this the insertion order of ``data_dict`` depends on the filesystem.
        dirs.sort()
        for filename in sorted(files):
            if not filename.endswith(suffix):
                continue
            full_path = os.path.join(root, filename)
            print(full_path)
            data, header = loader(full_path)
            # Strip only the trailing extension; str.replace would also delete
            # any other ".mat" occurring inside the name.
            record_name = filename[:-len(suffix)]
            data_dict[record_name] = [data, header]

    return data_dict

# Serialize an object to "<path>/<filename>.p"
def save_pickle(path, data_dict, filename):
    """Pickle ``data_dict`` into the directory ``path``.

    The output file is named ``filename`` with a ``.p`` suffix appended and is
    opened in binary write mode, so an existing file of that name is replaced.
    The highest pickle protocol supported by the running interpreter is used.
    """
    target = os.path.join(path, '{}.p'.format(filename))
    with open(target, 'wb') as handle:
        pickle.dump(data_dict, handle, pickle.HIGHEST_PROTOCOL)
        
# Collect the set of diagnosis codes that occur in the loaded records
def get_labels(data_dict):
    """Return a dict holding every diagnosis code seen in ``data_dict``.

    Each value of ``data_dict`` is indexed as ``value[1][-4]`` to obtain the
    "#Dx: " header line, which is split on commas. Every code is mapped to 0,
    and keys appear in the order in which the codes were first encountered.
    """
    seen_codes = {}
    for record in data_dict.values():
        dx_line = record[1][-4]
        cleaned = dx_line.replace("#Dx: ","").replace("\n","")
        for code in cleaned.split(','):
            # setdefault keeps the first insertion, preserving first-seen order
            seen_codes.setdefault(code, 0)

    return seen_codes

In [5]:
# Reuse the pickled record cache when it exists; otherwise read the raw
# recordings and write the cache so the next run can skip the extraction.
# NOTE: pickle.load can execute arbitrary code - only load caches you created.
cache_file = os.path.join(data_json_path, 'data.p')
if not os.path.exists(cache_file):
    data_dict = extract_all_data(data_dir, data_dict)
    save_pickle(data_json_path, data_dict, "data")
else:
    with open(cache_file, 'rb') as fp:
        data_dict = pickle.load(fp)

# Integer encoding for the "#Sex" header value
sex_dict = {"Male":0, "Female":1}
# Label vocabulary: every diagnosis code found in the records, mapped to 0
label_dict = get_labels(data_dict)

In [14]:
# Build one observation per record: age, encoded sex, multi-hot diagnosis
# vector, an integer id for each distinct label combination, the loaded
# signal data and the record name.
obs_dict = {'Age':[], 'Sex':[], 'Hashed Label':[], 'Label':[], 'Feature':[], 'Filename':[]}
feature_hash = {}   # multi-hot label tuple -> id, assigned in first-seen order
count = 0           # next unused id for feature_hash

for filename, feature_list in data_dict.items():
    feat_lead_12 = feature_list[0]
    header_data = feature_list[1]

    # Get metadata. The negative indices rely on the header ending with the
    # "#Age", "#Sex" and "#Dx" lines followed by exactly three more lines.
    age = header_data[-6].replace("#Age: ","").replace("\n","").split(',')[0]

    # Records without an age are dropped. This check runs before the sex is
    # decoded so that an unrecognised sex value on a record that is discarded
    # anyway cannot raise KeyError.
    if age == "NaN":
        continue

    sex = sex_dict[header_data[-5].replace("#Sex: ","").replace("\n","").split(',')[0]]
    label = header_data[-4].replace("#Dx: ","").replace("\n","").split(',')

    # Generate the multi-hot label vector, one slot per key of label_dict
    # (in label_dict's key order)
    feat_label = [1 if k in label else 0 for k in label_dict]
    label_key = tuple(feat_label)
    if label_key not in feature_hash:
        feature_hash[label_key] = count
        count += 1

    # Append this record to the column lists
    obs_dict['Age'].append(int(age))
    obs_dict['Sex'].append(sex)
    obs_dict['Hashed Label'].append(feature_hash[label_key])
    obs_dict['Label'].append(feat_label)
    obs_dict['Feature'].append(feat_lead_12)
    obs_dict['Filename'].append(filename)

df = pd.DataFrame(obs_dict, columns = list(obs_dict.keys()))

In [15]:
# Show the observation table built from obs_dict (the rich display truncates the middle rows)
df

Unnamed: 0,Age,Sex,Hashed Label,Label,Feature,Filename
0,28,1,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0]","[[73.0, 122.0, 121.0, 135.0, 126.0, 111.0, 108...",A4086
1,17,0,1,"[0, 1, 0, 0, 0, 0, 0, 0, 0]","[[-36.0, -51.0, -47.0, -46.0, -38.0, -44.0, -4...",A4721
2,90,0,1,"[0, 1, 0, 0, 0, 0, 0, 0, 0]","[[6.0, 5.0, 9.0, 8.0, 4.0, 8.0, 10.0, 10.0, 8....",A0198
3,63,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0]","[[-77.0, -75.0, -72.0, -65.0, -61.0, -60.0, -5...",A5161
4,51,0,2,"[0, 0, 1, 0, 0, 0, 0, 0, 0]","[[102.0, 153.0, 145.0, 155.0, 158.0, 158.0, 15...",A6074
...,...,...,...,...,...,...
6867,76,1,10,"[0, 0, 0, 0, 0, 0, 0, 0, 1]","[[-36.0, -52.0, -47.0, -49.0, -48.0, -48.0, -4...",A3692
6868,27,1,5,"[0, 0, 0, 0, 0, 1, 0, 0, 0]","[[14.0, 12.0, 13.0, 12.0, 11.0, 10.0, 10.0, 8....",A0020
6869,82,0,8,"[0, 0, 0, 0, 0, 0, 0, 1, 0]","[[15.0, 8.0, 8.0, 17.0, 6.0, 2.0, -25.0, -53.0...",A0787
6870,39,0,0,"[1, 0, 0, 0, 0, 0, 0, 0, 0]","[[34.0, 33.0, 33.0, 27.0, 17.0, 4.0, -12.0, -2...",A1660


In [16]:
# Pairwise correlation of the numeric columns only. The columns are selected
# explicitly because 'Label', 'Feature' and 'Filename' hold lists, arrays and
# strings, and pandas >= 2.0 raises when corr() meets non-numeric columns.
# NOTE: 'Hashed Label' ids are assigned in first-seen order, so its correlation
# values depend on the order of the records rather than on any natural scale.
df[['Age', 'Sex', 'Hashed Label']].corr()

Unnamed: 0,Age,Sex,Hashed Label
Age,1.0,-0.130352,0.085924
Sex,-0.130352,1.0,-0.051748
Hashed Label,0.085924,-0.051748,1.0


In [29]:
# Persist the observation columns (obs_dict, not the DataFrame) as observations.p
save_pickle(path=data_json_path, data_dict=obs_dict, filename="observations")