### Pre-processing
Dealing with windowing, flagging AF windows, and exploring 'Pause' periods.

In [1]:
import pandas as pd
import numpy as np
import os
import gc
from matplotlib import pyplot as plt
# The bundled seaborn style sheets were renamed with a "seaborn-v0_8" prefix in
# Matplotlib 3.6 and the old names were later removed, so use whichever name
# this installation provides.
plt.style.use("seaborn" if "seaborn" in plt.style.available else "seaborn-v0_8")

In [2]:
# Location of the combined series file, relative to the working directory.
filepath = os.path.join('Data', 'combined_series.csv')

# Explicit column dtypes: narrow unsigned integers for the numeric columns,
# plain objects for the free-text annotation.
column_dtypes = {
    'interval': 'uint16',
    'file_num': 'uint16',
    'row_num': 'uint32',
    'annotation': 'object',
}


def parse_clock_time(values):
    """Parse 'HH:MM:SS' strings into datetimes."""
    return pd.to_datetime(values, format='%H:%M:%S')


# NOTE(review): `date_parser` is deprecated in pandas 2.x in favour of
# `date_format` - confirm the target pandas version before switching.
df = pd.read_csv(
    filepath,
    parse_dates=['time'],
    date_parser=parse_clock_time,
    dtype=column_dtypes,
)

#### Investigate number of 'Pause' periods in each 30-beat segment.

In [3]:
# Intervals longer than this are treated as 'Pause' periods.
# NOTE(review): the unit is presumably milliseconds - confirm against the source data.
PAUSE_THRESHOLD = 3000

# Flag every beat whose interval exceeds the pause threshold.
df['pause_threshold'] = df['interval'] > PAUSE_THRESHOLD
# Replace the interval values for pauses with missing values, so that they do not get used in
# calculating summary statistics. Masking with NaN keeps the column numeric (float64) instead
# of degrading it to an object column of Python ints and None.
df['interval'] = df['interval'].where(~df['pause_threshold'])

In [4]:
# # 1. Filter to records exceeding the pause threshold, 2. Count the number of these per 30-beat segment in each file.
# pause_counts = df[df['pause_threshold']].groupby(by=['file_num', df['row_num']//30]).size() 
# pause_counts.hist(bins=30)   # Plot a histogram. Min is 1 and max is 30.
# plt.title("# of pause periods per 30-beat segment", size=14)
# plt.xlabel("# of pause periods")
# plt.ylabel("# of segments")

# # Count the total number of 30-beat segments across files to compare the % with Pauses.
# index_counts = df.groupby(by=['file_num', df['row_num']//30]).size()
# print("{0:.1f}% of 30-beat segments contain at least one pause".format(100*len(pause_counts)/len(index_counts)))

#### Find and tag all AF periods.
Primarily, this means detecting START AF in the annotation field and labelling all beats from that point until END AF appears.
If no END AF appears, then AF is present from START AF until the end of the file. Similarly, if END AF is written but no START AF then AF is present from the start of the file until END AF.
There are a small number of variations of these annotations which should be captured. E.g. in one file, the start flag is "AF START", and the end flag is written in various files as "AF END", "EN AF", "STOP AF", and "EINDE EPISODE AF".
- The condition to detect the start flag should be the presence of 'start' and 'af' in the annotation, regardless of order.
- The condition to detect the stop flag should be the presence of 'af' and ('end' or 'en af' or 'stop' or 'einde episode').

Method: 
1. Create a new column with a simplified 'start' / 'end' (encoded as integers 1 and 2).
2. Check the first non-NA value in this column for each file. If it is 'end' then place a 'start' in the first row for that file.
3. Fill forward the values in the column.
4. Replace the 'end's (2s) and NAs with 0 (AF beats will be represented by 1).

1. Create a new column with a simplified 'start' / 'end'

In [5]:
def simplify_start_end(col):
    """Encode AF start/end annotations as integers.

    Parameters
    ----------
    col : pandas.Series
        Free-text annotations; missing entries are allowed.

    Returns
    -------
    numpy.ndarray
        Object array aligned with ``col`` holding 1 where the annotation marks
        the start of AF, 2 where it marks the end of AF, and None otherwise.
    """
    # Every flag must mention 'af'; na=False makes missing annotations count as "no match".
    mentions_af = col.str.contains('af', case=False, na=False)
    is_start = col.str.contains('start', case=False, na=False)
    # Plain alternation without capture groups, so pandas does not emit its
    # "pattern has match groups" UserWarning; the matched strings are unchanged.
    is_end = col.str.contains('end|en af|stop|einde episode', case=False, na=False)

    return np.select(
        [
            pd.isnull(col),  # If there is no recorded annotation.
            ~mentions_af,    # If there is no 'af' in the annotation then return None.
            is_start,        # Condition for start af (wins if both 'start' and an end word appear).
            is_end,          # Condition for end af
        ],
        [
            None,
            None,
            1,  # Input a 1 for 'start'
            2,  # Input a 2 for 'end'   - use integers rather than strings to optimise for memory usage.
        ],
        default=None,  # Mentions 'af' but is neither a start nor an end flag.
    )

In [6]:
# Encode each annotation as 1 (AF start), 2 (AF end) or None (anything else).
df['af_flag'] = simplify_start_end(df['annotation'])

# Get rid of the annotation string column to free up memory.
del df['annotation']
# Run a garbage-collection pass now that the column is gone; as the last
# expression in the cell, its return value is shown as the cell output.
gc.collect()

  return func(self, *args, **kwargs)


32

2. Check the first non-NA value in this column for each file. If it is 'end' (2) then place a 'start' (1) in the first row for that file.

In [7]:
# First non-null AF flag per file, as a two-column frame (file_num, af_flag).
first_flag = df.groupby('file_num')['af_flag'].first().reset_index()
# Files whose earliest flag is an 'end' (encoded as 2), i.e. an end flag with no start before it.
files_with_end_first = first_flag.loc[first_flag['af_flag'] == 2, 'file_num']
# Boolean mask selecting the opening row (row_num 0) of each of those files.
opens_in_af = df['file_num'].isin(files_with_end_first.values) & df['row_num'].eq(0)
# Plant a 'start' (encoded as 1) on those rows so AF runs from the top of the file.
df.loc[opens_in_af, 'af_flag'] = 1

3. Fill forward the values in the column.
4. Replace the 'end's (2s) and NAs with 0 (AF beats will be represented by 1).

In [8]:
# Carry the most recent flag forward within each file: a 'start' (1) spreads across its AF
# episode and an 'end' (2) across the stretch that follows it. The group-wise ffill already
# keeps every existing flag, so no additional fillna() pass is needed.
# A beat is in AF exactly when the most recent flag is a 'start'; 'end' (2) and beats with no
# flag yet both become 0. The result is stored as uint8 instead of an object column, and
# re-running the cell leaves the column unchanged.
df['af_flag'] = (
    df.groupby('file_num')['af_flag']
    .ffill()
    .eq(1)
    .astype('uint8')
)

#### Windowing
Create windows of the beats within which we will calculate summary statistics and try to predict if an AF episode is captured by the window (segment).

Create segments of 30 beats in three sets offset from each other by 10 beats (i.e. adjacent windows overlap by 67%), so that we can capture AF episodes earlier and we ensure we capture as many AF episodes as possible.

Tolerate up to 3 pause periods in the middle of a 30-beat segment, but exclude them when calculating summary statistics; ΔRR should be null for those intervals. The limit of 3 is chosen based on the distribution of pause periods per segment, and allows us to capture the vast majority of valid segments.

In [9]:
# Beats per window, and the shift between the three sets of windows.
WINDOW_SIZE = 30
WINDOW_OFFSET = 10

# Create the windows based on taking every 30 values of the row number.
df['window_1'] = df['row_num'] // WINDOW_SIZE
# Create the second and third windows with a 10 & 20 beat offset, respectively. (i.e overlaps of 67%)
# Vectorised instead of a Python-level loop over every row. 'row_num' is loaded as an unsigned
# integer, so it is cast to a signed type before subtracting to avoid wrap-around, and rows that
# come before the offset get a missing window index.
df['window_2'] = (
    (df['row_num'].astype('int64') - WINDOW_OFFSET) // WINDOW_SIZE
).where(df['row_num'] >= WINDOW_OFFSET)
df['window_3'] = (
    (df['row_num'].astype('int64') - 2 * WINDOW_OFFSET) // WINDOW_SIZE
).where(df['row_num'] >= 2 * WINDOW_OFFSET)

#### Write processed data to file.

In [12]:
# df.to_csv(os.path.join('Data', 'data_window_mapping.csv'), index=False, na_rep='', chunksize=100000)