Fetch the data from the web and convert it to DataFrames by running the companion notebook

In [1]:
# Execute the companion notebook before anything else in this notebook.
# NOTE(review): presumably it produces the HDF5 file read further down — confirm.
%run "cog-web-to-dataframe.ipynb"

[33mYou are using pip version 8.1.2, however version 9.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


Install PyTables (the `tables` package), which pandas needs for HDF5 I/O

In [2]:
# PyTables backs the pandas HDF5 calls (HDFStore / read_hdf / to_hdf) used in this notebook.
!pip3 install tables

[33mYou are using pip version 8.1.2, however version 9.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


Import libraries and define the path of the input HDF5 data

In [3]:
import pandas as pd
import os
import re
from datetime import datetime
import numpy as np


# Path (relative to the notebook's working directory) of the HDF5 store that
# is handed to extract_and_stage_ml at the end of this notebook.
hdf_filename = 'data/hdf5/mit-bih.hdf'



In [4]:
def generate_normal_and_arrythmia_samples(anECGDataFrame, anOffsetWindow=(5000, 5000)):
    '''
    Cut a window around every annotated beat of one record and turn each
    window into a labelled feature row.

    Args:
        anECGDataFrame : mitdb data for one record; must provide the columns
                         'MLII_milliVolts', 'V5_milliVolts', 'normal_events'
                         and 'arrythmia_events', and an index that supports
                         adding/subtracting a timedelta
        anOffsetWindow : pair (before, after); each value is handed to
                         generate_time_interval as its ``microseconds``
                         argument (see the note there on how it is parsed)

    Returns:
        DataFrame with one row per window: the raw samples of both leads
        (flattened row by row into columns 0..n-1), '<lead>_max' and
        '<lead>_var' for each lead, and 'labels' (0 = normal, 1 = arrythmia).
        Normal rows come first; arrythmia rows, if any, are appended.
    '''
    def generate_time_interval(aFormatString=None, hours=0, minutes=0, seconds=0, microseconds=0):
        '''
        Args:
            aFormatString   : optional 'H:M:S.f' string; when given, the
                              numeric arguments below are ignored
            hours           : number of hours
            minutes         : number of minutes
            seconds         : number of seconds
            microseconds    : digits placed after the decimal point of the
                              seconds field

        Returns:
            time interval (datetime.timedelta)

        NOTE(review): ``microseconds`` is appended as text and parsed with
        '%f', i.e. as a decimal fraction of a second, so 5000 becomes
        '0.5000' = 0.5 s rather than 5000 microseconds. This is kept unchanged
        because it determines the window length — confirm intent before
        altering it.
        '''
        timeFormat = '%H:%M:%S.%f'
        if not aFormatString:
            aFormatString = ':'.join(map(str, [hours, minutes, seconds])) + '.' + str(microseconds)

        # subtracting the parsed zero time turns the datetime into a timedelta
        return datetime.strptime(aFormatString, timeFormat) - datetime.strptime('0:0:0.0', timeFormat)

    def generate_sample_intervals(aDataFrame, aTimeIndex, aLabel, aStartInterval, anEndInterval,
                                  aColumnList):
        '''
        Args:
            aDataFrame      : time series data
            aTimeIndex      : index of events in data
            aLabel          : class to associate with these events
            aStartInterval  : how far back to go from each event
            anEndInterval   : how far forward to go from each event
            aColumnList     : names of the signal columns to extract

        Returns:
            DataFrame with one row per event window (the first event is
            skipped); empty when there are fewer than two events
        '''
        def max_amplitude_filter(aSingleChannel):
            '''
            Args:
                aSingleChannel  : Series holding the samples of one lead

            Returns:
                Series: the signal weighted by its rolling kurtosis
                (window 100) and by the rolling standard deviation
                (window 10) of the signal minus its rolling mean (window 10);
                NaN wherever a rolling window is not yet full
            '''
            modeFit = aSingleChannel.rolling(window=100, center=False).kurt()
            detrended = aSingleChannel - aSingleChannel.rolling(window=10, center=False).mean()
            stdDev = detrended.rolling(window=10, center=False).std()

            return aSingleChannel * modeFit * stdDev

        # for each event, generate an interval around it
        startIntervalList = aTimeIndex - aStartInterval
        endIntervalList = aTimeIndex + anEndInterval

        intervals = list(zip(startIntervalList, endIntervalList))
        sampleIntervals = []

        # for each event interval, save off series data and also features
        # NOTE(review): the first event is skipped — presumably so the leading
        # window cannot start before the record does; confirm.
        for start, end in intervals[1:]:

            # all mV values in a single series (label-based slice of the window)
            intervalSamples = aDataFrame.loc[start:end, aColumnList]
            # .values replaces DataFrame.as_matrix(), which pandas 1.0 removed
            intervalSeries = pd.Series(intervalSamples.values.ravel())

            # enhance features in each lead series
            # save off the max and var
            for signalName in aColumnList:
                lead_filtered = max_amplitude_filter(intervalSamples[signalName])
                intervalSeries[signalName + '_max'] = lead_filtered.max()
                intervalSeries[signalName + '_var'] = lead_filtered.var()

            intervalSeries['labels'] = aLabel

            sampleIntervals.append(intervalSeries)

        return pd.DataFrame(sampleIntervals)

    windowStartOffset, windowEndOffset = anOffsetWindow[0], anOffsetWindow[1]
    startInterval = generate_time_interval(microseconds=windowStartOffset)
    endInterval = generate_time_interval(microseconds=windowEndOffset)

    normalIndex = anECGDataFrame[anECGDataFrame.normal_events == 1].index
    arrythmiaIndex = anECGDataFrame[anECGDataFrame.arrythmia_events == 1].index

    leadNames = ['MLII_milliVolts', 'V5_milliVolts']
    normalFeatures    = generate_sample_intervals(anECGDataFrame, normalIndex, 0, startInterval, endInterval, leadNames)
    arrythmiaFeatures = generate_sample_intervals(anECGDataFrame, arrythmiaIndex, 1, startInterval, endInterval, leadNames)

    # combine data frames
    if arrythmiaFeatures.shape[0] > 0:
        normalFeatures = pd.concat([normalFeatures, arrythmiaFeatures])

    return normalFeatures

def generate_all_sample_record_intervals(anEcgDataFrame, anEqualSampling=True):
    '''
    Grab records from the HDF5 datastore and process them into one large set
    of feature vectors.

    Args:
        anEcgDataFrame  : mitDB data; a mapping-like object (e.g. pd.HDFStore)
                          whose keys containing 'Record_' map to record
                          DataFrames with an 'arrythmia_events' column
        anEqualSampling : make equal number of each class by randomly
                          down-sampling the normal rows

    Returns:
        DataFrame of feature rows with a fresh 0..n-1 index, or None when no
        record contributes any rows

    Raises:
        ValueError : from np.random.choice when anEqualSampling is set and
                     there are no normal rows to draw from
    '''
    # get a list of all the recordings
    recordKeys = [key for key in anEcgDataFrame.keys() if re.search('Record_', key)]

    # generate all the sample data intervals for each record that has arrythmias
    # each record will have time around each annotated event
    # also, derived features are around each event
    perRecordSamples = []
    for key in recordKeys:
        record = anEcgDataFrame[key]
        if record.arrythmia_events.sum() <= 1:
            continue
        if len(record[record.arrythmia_events == 1].index) == 0:
            continue
        perRecordSamples.append(generate_normal_and_arrythmia_samples(record))

    # concatenate once instead of growing the frame inside the loop
    if perRecordSamples:
        mlStage = pd.concat(perRecordSamples).reset_index(drop=True)
    else:
        mlStage = pd.DataFrame()

    if len(mlStage.index) == 0:
        print("no arrythmia records, nothing to learn...")
        return None

    # Reduce the number of normal samples to match the number of arrythmia samples
    if anEqualSampling:
        arrythmiaMask = mlStage['labels'] == 1  # 1 = arrythmia
        arrythmiaRows = mlStage[arrythmiaMask].index
        normalRows = mlStage[~arrythmiaMask].index
        size = len(arrythmiaRows)  # total arrythmias
        # Draw distinct normal rows whenever enough exist; only fall back to
        # sampling with replacement when normals are the minority class.
        randNormIndex = np.random.choice(normalRows, size, replace=len(normalRows) < size)
        index = np.concatenate([randNormIndex, arrythmiaRows])
        # the index was reset above, so label-based .loc selects exactly the
        # rows the removed .ix indexer used to
        mlStage = mlStage.loc[index].reset_index(drop=True)

    return mlStage

Load the cached ML dataset if it already exists; otherwise build it from the HDF5 store and cache it.

In [7]:
def extract_and_stage_ml(hdf_filename, anEqualSampling=True, useCached=True):
    '''
    Query the file for the arrythmia data, bring in normal data too

    Args:
        hdf_filename        : path of the HDF5 store holding the records
        anEqualSampling     : equalize the amount of each class
        useCached           : load the staged data from the cache file when it
                              exists, and write the cache file after a rebuild

    Returns:
        DataFrame of staged feature rows, or None when the store yields no
        rows (nothing is cached in that case)
    '''
    # Keep balanced and unbalanced results in separate cache files so a cache
    # written in one mode is never returned for the other.
    cache = 'cached_eq_ml_data.hdf' if anEqualSampling else 'cached_ml_data.hdf'

    if useCached and os.path.isfile(cache):
        staged_data = pd.read_hdf(cache, key='cached_data')
        print('Using cached ML data...')
        return staged_data

    # Read-only: the store is only queried here, and a missing file should
    # raise rather than be silently created. The context manager closes it.
    with pd.HDFStore(hdf_filename, mode='r') as ecgStore:
        staged_data = generate_all_sample_record_intervals(ecgStore, anEqualSampling)

    if useCached and staged_data is not None:
        print('Creating cached ML data...')
        staged_data.to_hdf(cache, key='cached_data')

    return staged_data

# Load the staged feature rows from the cache file if present, otherwise build them from the HDF5 store.
samples = extract_and_stage_ml(hdf_filename, anEqualSampling=True, useCached=True)

Using cached ML data...


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->axis0] [items->None]

  f(store)
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->block0_items] [items->None]

  f(store)
