In [1]:
# globally useful imports of standard libraries needed in this notebook
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# import project specific modules used in this notebook
import sys
sys.path.append('../src')
import mindwandering.data

# Mind Wandering Data Transformers

In this notebook we first set up some data transformer pipelines to create
a useful set of cleaned data from our raw mind wandering dataset.

We then do some feature scaling on the feature dataset, so that all features
have the same scale, to avoid problems with some ML classifiers that are sensitive
to feature scale differences.

# Load Raw Data

Original raw data in raw data file will never be modified.  Load the raw data into a
dataframe named `df_raw` that will be starting point of all data transformation
pipelines we create for project.

In [2]:
# load the raw dataset; every pipeline below starts from this dataframe
df_raw = mindwandering.data.get_df_raw()

# there are 2 empty rows in the raw data file.  They are NOT dropped here (the
# line below is left disabled); each pipeline below drops its own empty rows.
#df_raw = df_raw.dropna(subset=['TrialID'])

# The raw data has 4078 rows (4076 experiment trials plus the 2 empty rows), and
# 129 columns of raw features
df_raw.shape

(4078, 129)

# Experimental Metadata Transformer Pipeline

Pipeline to transform raw data into a dataframe we name `df_experiment_metadata`.  This
dataframe contains all experiment metadata of the experiment participants.  This includes
the participant id, location, and begin and end date times of each experiment trial.
It also includes a segment index, which is a useful attribute for sequencing the
trials chronologically for each participant.

In [3]:
# custom transformers for experimental metadata pipelines, though
# some may be reused in other pipeline transformers below as well

class RenameColumnsUsingMapTransformer(BaseEstimator, TransformerMixin):
    """Rename dataframe columns according to a given map.

    Parameters
    ----------
    columns_rename_map : dict-like
        Mapping of current column name -> new column name.  It is handed
        unchanged to `DataFrame.rename(columns=...)`.

    Only the column labels change.  The column order of the incoming
    dataframe is kept as is, so select the columns in the desired order
    before using this transformer if ordering matters.
    """
    def __init__(self, columns_rename_map):
        self.columns_rename_map = columns_rename_map

    def fit(self, df, y=None):
        # stateless transformer, nothing is learned from the data
        return self

    def transform(self, df, y=None):
        # rename returns a new dataframe, the input is left untouched
        return df.rename(columns=self.columns_rename_map)

class DropRowsWithEmptyValuesInColumnTransformer(BaseEstimator, TransformerMixin):
    """Drop every row that has an empty/NaN value in one of the checked columns.

    Rows are only dropped because of the columns this transformer is asked
    to check; missing values in any other column are ignored.

    Parameters
    ----------
    columns_to_check : list of str, str, or None
        Column name(s) to inspect for missing values.  A single column name
        given as a plain string is treated as a one element list.  When None
        (the default) the `segment_index` column is checked.
    """
    def __init__(self, columns_to_check=None):
        # stored unmodified, the default is resolved in transform()
        self.columns_to_check = columns_to_check

    def fit(self, df, y=None):
        return self # nothing else to do

    def transform(self, df, y=None):
        """Return a new dataframe without the rows that have a missing
        value in any of the checked columns.

        Raises KeyError if a checked column is not present in `df`.
        """
        columns = self.columns_to_check
        if columns is None:
            columns = ['segment_index']
        elif isinstance(columns, str):
            # a bare string is one column name, not a sequence of characters
            columns = [columns]

        return df.dropna(subset=columns)

class ParticipantIdTransformer(BaseEstimator, TransformerMixin):
    """Split the encoded `participant_id` column into a unique participant id
    and a participant location.

    The incoming `participant_id` values are expected to be strings made of
    exactly 3 parts separated by '-'.  The first part is discarded, the
    second part is the participant number with a one character prefix, and
    the third part is the location ('Memphis' or 'ND').

    The returned dataframe has
    - `participant_id` replaced by '<number without prefix>-<location>', which
      keeps ids unique even though the same numbers are used at both locations
    - a new `participant_location` column ('UM' or 'ND') inserted at position 1

    Locations other than 'Memphis' and 'ND' map to NaN, and so does the
    participant id derived from them.
    """
    def fit(self, df, y=None):
        return self # nothing else to do

    def transform(self, df, y=None):
        """Return a new dataframe with the participant columns transformed.
        The dataframe passed in is not modified.

        Raises ValueError if the ids do not split into exactly 3 parts.
        """
        # split the encoded id into its 3 parts, the first part is not needed
        fields = df['participant_id'].str.split('-', expand=True)
        fields.columns = ['BE7', 'participant_id', 'participant_location']

        # map all memphis locations to UM to regularize categorical variable and resulting
        # participant ids
        location = fields['participant_location'].map({'Memphis': 'UM', 'ND': 'ND'})

        # there are duplicate participant ids from the 2 locations, so combine the id
        # (without its one character prefix) with the derived location
        participant_id = fields['participant_id'].str[1:] + '-' + location

        # work on a copy so that the dataframe of the caller is never modified
        result = df.copy()
        result['participant_id'] = participant_id

        # the location column belongs right after the first column
        result.insert(1, 'participant_location', location)

        return result
    
class TrialDateTimeTransformer(BaseEstimator, TransformerMixin):
    """Transformer to fix the time information in this dataset.  The time
    information was split into 2 parts which need to be added together
    to get a valid unix milliseconds (ms) since the epoch result.

    For every (time, timestamp) pair of column names the two columns are
    added, the sum is converted to a datetime, and the result replaces the
    time column.  The timestamp column is then dropped.  Rows where either
    part is missing end up with NaT.

    Parameters
    ----------
    time_field_pairs : sequence of (str, str)
        Pairs of (time column, timestamp column) to combine.
    """
    def __init__(self, time_field_pairs = (('start_time', 'start_timestamp'), ('end_time', 'end_timestamp'))):
        self.time_field_pairs = time_field_pairs

    def fit(self, df, y=None):
        return self # nothing else to do

    def transform(self, df, y=None):
        """Return a new dataframe with the time columns turned into datetimes
        and the timestamp columns removed.  The dataframe passed in is not
        modified.

        Raises KeyError if one of the named columns is missing.
        """
        # work on a copy so that the dataframe of the caller is never modified
        result = df.copy()

        # iterate through all given pairs of time and timestamp to combine
        for (time, timestamp) in self.time_field_pairs:
            # create a valid datetime value for the pair, replacing the time field with the new datetime value
            result[time] = pd.to_datetime(result[timestamp] + result[time], unit='ms')

            # drop the no longer needed timestamp field from the dataframe
            result = result.drop(columns=timestamp)

        return result
    
class SetFeatureTypeTransformer(BaseEstimator, TransformerMixin):
    """Given a sequence of (feature name, desired type) tuples, convert
    each named feature to the indicated data type.

    Parameters
    ----------
    feature_type_pairs : sequence of (str, type)
        Pairs of (column name, target type).  The type is anything that
        `Series.astype` accepts.
    """
    def __init__(self, feature_type_pairs = (('segment_index', int),)):
        self.feature_type_pairs = feature_type_pairs

    def fit(self, df, y=None):
        return self # nothing else to do

    def transform(self, df, y=None):
        """Return a new dataframe with the requested columns converted.
        The dataframe passed in is not modified.

        The conversion is done with `Series.astype`, so it raises when a
        value cannot be represented in the target type (for example a NaN
        when converting to int).  Drop or fill missing values first.
        """
        # work on a copy so that the dataframe of the caller is never modified
        result = df.copy()

        # iterate through given pairs of feature name and desired type, converting all indicated
        # features to the new type
        for (feature, new_type) in self.feature_type_pairs:
            result[feature] = result[feature].astype(new_type)

        return result

In [4]:
# raw column name -> cleaned column name for the experiment metadata.  Only these
# raw columns are selected (in this order) and handed to the pipeline below.
experiment_metadata_features_map = dict([
    ('ParticipantID',  'participant_id'),
    ('SegmentIndex',   'segment_index'),
    ('StartTime(ms)',  'start_time'),
    ('EndTime(ms)',    'end_time'),
    ('Length(ms)',     'trial_length'),
    ('StartTimestamp', 'start_timestamp'),
    ('EndTimestamp',   'end_timestamp'),
])

# build the transformation pipeline: rename, drop the empty rows, derive participant
# id and location, build datetimes, and finally set integer types
experiment_metadata_pipeline = Pipeline([
    ('rename_columns', RenameColumnsUsingMapTransformer(experiment_metadata_features_map)),
    ('drop_empty_rows', DropRowsWithEmptyValuesInColumnTransformer(['segment_index'])),
    ('extract_participant_id', ParticipantIdTransformer()),
    ('transform_time_values', TrialDateTimeTransformer([
        ('start_time', 'start_timestamp'),
        ('end_time', 'end_timestamp'),
    ])),
    ('transform_feature_types', SetFeatureTypeTransformer([
        ('segment_index', int),
        ('trial_length', int),
    ])),
])

# run the pipeline on just the raw columns named in the map
df_experiment_metadata = experiment_metadata_pipeline.fit_transform(
    df_raw[list(experiment_metadata_features_map)]
)

# sanity check of this cell: we expect experimental metadata with 4076 trials and
# 6 feature columns, 135 unique participants, and 57 distinct trial segment indexes
num_rows, num_features = df_experiment_metadata.shape
num_participants = len(df_experiment_metadata['participant_id'].unique())
num_segments = len(df_experiment_metadata['segment_index'].unique())

print('Number of experimental trial/segment rows: ', num_rows)
print('Current number of feature columns: ', num_features)
print('Number of unique participants: ', num_participants)
print('Maximum number of trial/segments for a participant: ', num_segments)

Number of experimental trial/segment rows:  4076
Current number of feature columns:  6
Number of unique participants:  135
Maximum number of trial/segments for a participant:  57


In [5]:
# display the transformed experiment metadata to inspect the result of the pipeline
df_experiment_metadata

Unnamed: 0,participant_id,participant_location,segment_index,start_time,end_time,trial_length
1,1002-UM,UM,57,2013-10-18 18:34:54.808000000,2013-10-18 18:35:22.271000064,27463
2,1002-UM,UM,56,2013-10-18 18:34:34.590000128,2013-10-18 18:34:54.808000000,20218
3,1002-UM,UM,55,2013-10-18 18:34:11.124999936,2013-10-18 18:34:34.590000128,23465
4,1002-UM,UM,54,2013-10-18 18:33:47.726000128,2013-10-18 18:34:11.124999936,23399
5,1002-UM,UM,53,2013-10-18 18:33:27.673999872,2013-10-18 18:33:47.726000128,20052
...,...,...,...,...,...,...
4072,1104-ND,ND,5,2013-12-15 17:55:09.703000064,2013-12-15 17:55:36.766000128,27063
4073,1104-ND,ND,4,2013-12-15 17:54:43.564000000,2013-12-15 17:55:09.703000064,26139
4074,1104-ND,ND,3,2013-12-15 17:54:16.943000064,2013-12-15 17:54:43.564000000,26621
4075,1104-ND,ND,2,2013-12-15 17:53:49.171000064,2013-12-15 17:54:16.943000064,27772


# Target Label Transformer Pipeline

Pipeline to transform raw data into dataframe we name `df_label`.  This
dataframe contains all columns/features of original dataset that are possibly
useful as target labels for training a classifier.  In particular, we
infer a `mind_wandered_label` binary feature from the original data, which
is useful as basic label for training a binary classifier.  But the other
features in this dataframe may also be useful for other classifiers.

In [6]:
class CreateMindWanderedLabelTransformer(BaseEstimator, TransformerMixin):
    """Infer a boolean label (False/True) that indicates whether mind wandering
    was reported during a trial.

    The label is derived from `first_report_type` only: it is True where that
    column equals 'self-caught' and False for every other value (including
    'none' and missing values).  The result is added as the new column
    `mind_wandered_label`.

    The `number_of_reports` column (1 or greater when mind wandering was
    recorded) carries the same information but is not used here.
    """
    def fit(self, df, y=None):
        return self # nothing else to do

    def transform(self, df, y=None):
        """Return a new dataframe with the `mind_wandered_label` column added.
        The dataframe passed in is not modified.

        Raises KeyError if `first_report_type` is missing.
        """
        return df.assign(mind_wandered_label=(df['first_report_type'] == 'self-caught'))


In [7]:
# raw column name -> cleaned column name for everything that may serve as a target
# label.  Only these raw columns are selected and handed to the pipeline below.
label_features_map = dict([
    ('NumberOfReports',            'number_of_reports'),
    ('FirstReportType',            'first_report_type'),
    ('FirstReportContent',         'first_report_content'),
    ('FirstReportTimestamp',       'first_report_timestamp'),
    ('FirstReportTrialTime(ms)',   'first_report_trial_time'),
    ('FirstReportSegmentTime(ms)', 'first_report_segment_time'),
])

# build the transformation pipeline.  first_report_segment_time is deliberately not
# converted to int: the column keeps NaN values, which an int column can not hold.
label_pipeline = Pipeline([
    ('rename_columns', RenameColumnsUsingMapTransformer(label_features_map)),
    ('drop_empty_rows', DropRowsWithEmptyValuesInColumnTransformer(['number_of_reports'])),
    ('transform_time_values', TrialDateTimeTransformer([
        ('first_report_trial_time', 'first_report_timestamp'),
    ])),
    ('create_mind_wandered_label', CreateMindWanderedLabelTransformer()),
])

# run the pipeline on just the raw columns named in the map
df_label = label_pipeline.fit_transform(df_raw[list(label_features_map)])

# sanity check of this pipeline: we expect 4076 trial rows and 6 columns in the label
# dataframe.  The inferred mind_wandered_label should have 2963 negative and 1113
# positive instances, totaling the 4076 trials.
num_rows, num_features = df_label.shape
num_negative_labels = df_label['mind_wandered_label'].eq(False).sum()
num_positive_labels = df_label['mind_wandered_label'].eq(True).sum()

print('Number of trials in the label dataframe: ', num_rows)
print('Number of features in the label dataframe: ', num_features)
print('Number of negative labels: ', num_negative_labels)
print('Number of positive labels: ', num_positive_labels)

Number of trials in the label dataframe:  4076
Number of features in the label dataframe:  6
Number of negative labels:  2963
Number of positive labels:  1113


In [8]:
# display the label dataframe, including the inferred mind_wandered_label column
df_label

Unnamed: 0,number_of_reports,first_report_type,first_report_content,first_report_trial_time,first_report_segment_time,mind_wandered_label
1,0.0,none,none,NaT,,False
2,0.0,none,none,NaT,,False
3,0.0,none,none,NaT,,False
4,0.0,none,none,NaT,,False
5,1.0,self-caught,other,2013-10-18 18:33:39.182000128,11508.0,True
...,...,...,...,...,...,...
4072,0.0,none,none,NaT,,False
4073,0.0,none,none,NaT,,False
4074,0.0,none,none,NaT,,False
4075,1.0,self-caught,task-related,2013-12-15 17:54:12.535000064,23364.0,True


# Basic Training Features Transformer Pipeline

Pipeline to transform raw data into dataframe we name df_features.  This dataframe is specifically created to extract and replicate the
initial 62 features used in the following paper to build classification models:

- Faber, M., Bixler, R., & D’Mello, S. K. (2018). An automated behavioral measure of mind
  wandering during computerized reading. Behavior Research Methods, 50(1), 134-150.

As far as we can tell, the 62 features extracted should be almost identical to the ones used in that paper.  One unresolved discrepancy
of this feature dataset is that the paper states a total of 132 unique participants, whereas the raw data we have in this repository
appears to have a total of 135 unique participants.  

In [9]:
class NumberOfBlinksTransformer(BaseEstimator, TransformerMixin):
    """Turn the `number_of_blinks` column into whole number counts.

    Number of blinks looks like it should hold whole numbers, but a number of
    values have fractional parts.  Looking at the mean, min and max blink
    durations, it appears that values between 0 and 1 should actually be a
    single blink.  So the values are rounded up (ceiling) and then converted
    to int.
    """
    def fit(self, df, y=None):
        return self # nothing else to do

    def transform(self, df, y=None):
        """Return a new dataframe with `number_of_blinks` rounded up to int.
        The dataframe passed in is not modified.

        The int conversion raises if the column still contains missing
        values, so rows with empty values have to be dropped beforehand.
        """
        whole_blinks = np.ceil(df['number_of_blinks']).astype(int)
        return df.assign(number_of_blinks=whole_blinks)

class FillMissingValuesTransformer(BaseEstimator, TransformerMixin):
    """General transformer to fill in the missing values of a feature or
    features with an indicated value.

    Parameters
    ----------
    feature_value_pairs : sequence of (str, value)
        Pairs of (column name, fill value).  Every missing value in the
        named column is replaced with the given fill value.
    """
    def __init__(self, feature_value_pairs = (('blink_duration_mean', 0.0),)):
        self.feature_value_pairs = feature_value_pairs

    def fit(self, df, y=None):
        return self # nothing else to do

    def transform(self, df, y=None):
        """Return a new dataframe with the missing values filled.
        The dataframe passed in is not modified.

        Raises KeyError if one of the named columns is missing.
        """
        # work on a copy so that the dataframe of the caller is never modified
        result = df.copy()

        # iterate over all features whose missing values we are asked to fill
        for (feature, value) in self.feature_value_pairs:
            result[feature] = result[feature].fillna(value)

        return result


In [10]:
# the 48 features listed as the eye movement descriptive features in the paper we are replicating.
# Each map below goes from raw column name -> cleaned feature name.
eye_movement_descriptive_features_map = {
    'FixDurMed':      'fixation_duration_median',
    'FixDurMean':     'fixation_duration_mean',
    'FixDurSD':       'fixation_duration_standard_deviation',
    'FixDurMin':      'fixation_duration_minimum',
    'FixDurMax':      'fixation_duration_maximum',
    'FixDurRange':    'fixation_duration_range',
    'FixDurSkew':     'fixation_duration_skew',
    'FixDurKur':      'fixation_duration_kurtosis',
    'SacDurMed':      'saccade_duration_median',
    'SacDurMean':     'saccade_duration_mean',
    'SacDurSD':       'saccade_duration_standard_deviation',
    'SacDurMin':      'saccade_duration_minimum',
    'SacDurMax':      'saccade_duration_maximum',
    'SacDurRange':    'saccade_duration_range',
    'SacDurSkew':     'saccade_duration_skew',
    'SacDurKur':      'saccade_duration_kurtosis',
    'SacAmpMed':      'saccade_amplitude_median',
    'SacAmpMean':     'saccade_amplitude_mean',
    'SacAmpSD':       'saccade_amplitude_standard_deviation',
    'SacAmpMin':      'saccade_amplitude_minimum',
    'SacAmpMax':      'saccade_amplitude_maximum',
    'SacAmpRange':    'saccade_amplitude_range',
    'SacAmpSkew':     'saccade_amplitude_skew',
    'SacAmpKur':      'saccade_amplitude_kurtosis',
    # NOTE(review): the velocity names use the short sd/min/max suffixes, unlike the
    # other groups which spell out standard_deviation/minimum/maximum
    'SacVelMed':      'saccade_velocity_median',
    'SacVelMean':     'saccade_velocity_mean',
    'SacVelSD':       'saccade_velocity_sd',
    'SacVelMin':      'saccade_velocity_min',
    'SacVelMax':      'saccade_velocity_max',
    'SacVelRange':    'saccade_velocity_range',
    'SacVelSkew':     'saccade_velocity_skew',
    'SacVelKur':      'saccade_velocity_kurtosis',
    'SacAngAbsMed':   'saccade_angle_absolute_median',
    'SacAngAbsMean':  'saccade_angle_absolute_mean',
    'SacAngAbsSD':    'saccade_angle_absolute_standard_deviation',
    'SacAngAbsMin':   'saccade_angle_absolute_minimum',
    'SacAngAbsMax':   'saccade_angle_absolute_maximum',
    'SacAngAbsRange': 'saccade_angle_absolute_range',
    'SacAngAbsSkew':  'saccade_angle_absolute_skew',
    'SacAngAbsKur':   'saccade_angle_absolute_kurtosis',
    'SacAngRelMed':   'saccade_angle_relative_median',
    'SacAngRelMean':  'saccade_angle_relative_mean',
    'SacAngRelSD':    'saccade_angle_relative_standard_deviation',
    'SacAngRelMin':   'saccade_angle_relative_minimum',
    'SacAngRelMax':   'saccade_angle_relative_maximum',
    'SacAngRelRange': 'saccade_angle_relative_range',
    'SacAngRelSkew':  'saccade_angle_relative_skew',
    'SacAngRelKur':   'saccade_angle_relative_kurtosis',
}

# the 8 pupil diameter descriptive features
pupil_diameter_descriptive_features_map = {
    'PupilDiametersZMed':   'pupil_diameter_median',
    'PupilDiametersZMean':  'pupil_diameter_mean',
    'PupilDiametersZSD':    'pupil_diameter_standard_deviation',
    'PupilDiametersZMin':   'pupil_diameter_minimum',
    'PupilDiametersZMax':   'pupil_diameter_maximum',
    'PupilDiametersZRange': 'pupil_diameter_range',
    'PupilDiametersZSkew':  'pupil_diameter_skew',
    'PupilDiametersZKur':   'pupil_diameter_kurtosis',
}

# The 2 blink features used.  We do not use all of the other derived statistics here because many
# times the number of blinks is 0 or 1 for a trial, meaning mean, standard deviation, and other measures are not really meaningful.
# There are actually 2260 trials where no blinks occur, and none of these would have meaningful statistics, and of the remaining,
# something like 1191 had a single blink, meaning many statistics like standard deviation don't make sense in those cases.
blink_features_map = {
    'BlinkDurN':     'number_of_blinks',
    'BlinkDurMean':  'blink_duration_mean',
}

# the 4 miscellaneous features used in the results
# NOTE(review): 'fixation_saccade_durtion_ratio' looks like a typo for 'duration'.  It is
# left unchanged because the name is the column label shown in the outputs of this notebook;
# confirm that nothing else depends on it before renaming.
miscellaneous_features_map = {
    'SacDurN':               'number_of_saccades',
    'horizontalSaccadeProp': 'horizontal_saccade_proportion',
    'FxDisp':                'fixation_dispersion',
    'FxSacRatio':            'fixation_saccade_durtion_ratio',
}

# combine all 4 types of feature dictionaries into a merged dictionary of the 62 features
# (48 eye movement + 8 pupil diameter + 2 blink + 4 miscellaneous)
feature_map = {
    **eye_movement_descriptive_features_map,             
    **pupil_diameter_descriptive_features_map, 
    **blink_features_map, 
    **miscellaneous_features_map
}

# build the transformation pipeline: rename, drop the empty rows, make the blink
# counts whole numbers and fill the blink duration of trials without blinks with 0
feature_pipeline = Pipeline([
    ('rename_columns', RenameColumnsUsingMapTransformer(feature_map)),
    ('drop_empty_rows', DropRowsWithEmptyValuesInColumnTransformer(['fixation_duration_mean'])),
    ('transform_number_of_blinks', NumberOfBlinksTransformer()),
    ('fill_missing_blink_durations', FillMissingValuesTransformer([('blink_duration_mean', 0.0)])),
])

# run the pipeline on just the raw columns named in the merged feature map
df_features = feature_pipeline.fit_transform(df_raw[list(feature_map)])

# sanity check of this pipeline: we expect 4076 trial rows and 62 features in the
# features dataframe.  There should be no missing values left (all missing blink
# duration means are filled with 0), and number_of_blinks should have just 7 unique values.
num_rows, num_features = df_features.shape
number_of_missing_values = df_features.isna().to_numpy().sum()

print('Number of trials in the features dataframe: ', num_rows)
print('Number of features in the features dataframe: ', num_features)
print('Number of missing values: ', number_of_missing_values)
print('number_of_blinks unique values: ', df_features['number_of_blinks'].unique())

Number of trials in the features dataframe:  4076
Number of features in the features dataframe:  62
Number of missing values:  0
number_of_blinks unique values:  [0 3 1 2 4 5 6]


In [11]:
# display the cleaned, still unscaled, feature dataframe
df_features

Unnamed: 0,fixation_duration_median,fixation_duration_mean,fixation_duration_standard_deviation,fixation_duration_minimum,fixation_duration_maximum,fixation_duration_range,fixation_duration_skew,fixation_duration_kurtosis,saccade_duration_median,saccade_duration_mean,...,pupil_diameter_maximum,pupil_diameter_range,pupil_diameter_skew,pupil_diameter_kurtosis,number_of_blinks,blink_duration_mean,number_of_saccades,horizontal_saccade_proportion,fixation_dispersion,fixation_saccade_durtion_ratio
1,200.0,221.000000,101.294620,83.0,366.0,283.0,0.132989,-1.389706,17.0,123.300000,...,-0.868433,1.496751,-0.302367,-1.148603,0,0.000000,10.0,1.000000,0.429,1.972
2,183.0,209.090909,107.757556,133.0,499.0,366.0,2.241717,5.662505,25.5,104.800000,...,0.475580,1.414238,-0.449710,-0.030409,0,0.000000,10.0,1.000000,0.436,2.195
3,167.0,188.538462,100.261005,83.0,416.0,333.0,1.010131,0.510088,42.0,120.500000,...,1.834626,2.006478,-0.345951,-0.968456,0,0.000000,12.0,1.000000,0.554,1.695
4,158.0,198.714286,125.994157,83.0,516.0,433.0,1.562621,2.184748,17.0,76.846154,...,-0.070521,1.493373,0.559856,-0.078687,3,161.333333,13.0,0.923077,0.333,2.785
5,191.5,190.166667,41.252732,116.0,250.0,134.0,-0.243659,-0.937062,67.0,122.636364,...,1.426649,1.094245,0.619191,1.268933,1,233.000000,11.0,1.000000,0.502,1.692
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4072,192.0,208.400000,64.849056,142.0,350.0,208.0,1.226822,0.509694,17.0,57.000000,...,1.676945,1.372384,0.437309,-0.593770,1,208.000000,14.0,0.928571,0.393,3.917
4073,179.0,204.750000,76.666812,125.0,425.0,300.0,1.939411,3.944185,16.0,28.800000,...,0.997046,1.481454,0.052573,0.136014,0,0.000000,15.0,1.000000,0.458,7.583
4074,200.0,239.214286,141.660580,91.0,642.0,551.0,2.126507,4.905940,17.0,30.153846,...,1.247847,1.113773,0.141168,-0.498063,1,150.000000,13.0,0.923077,0.416,8.543
4075,192.0,195.470588,53.755369,92.0,300.0,208.0,0.075579,-0.241790,13.0,31.375000,...,1.279968,1.428811,1.069052,2.495914,0,0.000000,16.0,0.937500,0.582,6.620


# Feature Scaling and Normalization Transformer Pipeline

This pipeline reuses the result of the basic features pipeline of the previous section and adds standard feature scaling.  So we assume that
`df_features` and the feature maps are defined before running the following pipelines.

All features are scaled/normalized to have similar ranges, necessary for some ML models that are sensitive to differences in scale of the features being trained with.  We create both a set of features with standard scaling using the  `StandardScaler()`, which transforms all
features to have a mean of 0 and a standard deviation of 1.

We also create a set of features with min-max scaling using the `MinMaxScaler()`, which transforms all features to have
values between 0 and 1.

## Standard Scaled Features

In [12]:
# execute transformation pipeline
features_standard_scaled_pipeline = Pipeline([
    ('standard_scaler',              StandardScaler()),
])

# this pipeline reuses the results of the standard df_features, and adds standard scaling
df_features_standard_scaled_nparray = features_standard_scaled_pipeline.fit_transform(df_features.copy())

# StandardScaler returns a plain NumPy array, so the row index and the column labels
# are lost.  Put the result back into a Pandas DataFrame, reusing both the columns AND
# the index of df_features.  df_features keeps the row labels of the raw data (rows
# were dropped, so they are not 0..N-1); without the index the scaled rows would no
# longer line up with df_label or df_experiment_metadata when joined on the index.
df_features_standard_scaled = pd.DataFrame(
    df_features_standard_scaled_nparray,
    index = df_features.index,
    columns = df_features.columns,
)

# To double check this pipeline, we expect to have 4076 trial rows and 62 features in the scaled
# features dataframe.  There should be no missing values, and number_of_blinks should still have
# just 7 unique values (now standard scaled rather than whole numbers).
num_rows, num_features = df_features_standard_scaled.shape
number_of_missing_values = df_features_standard_scaled.isna().sum().sum()

print('Number of trials in the features dataframe: ', num_rows)
print('Number of features in the features dataframe: ', num_features)
print('Number of missing values: ', number_of_missing_values)
print('number_of_blinks unique values: ', df_features_standard_scaled.number_of_blinks.unique())

Number of trials in the features dataframe:  4076
Number of features in the features dataframe:  62
Number of missing values:  0
number_of_blinks unique values:  [-0.73626571  2.62433792  0.3839355   1.50413671  3.74453913  4.86474034
  5.98494156]


In [13]:
# summary statistics: after standard scaling every feature should show a mean of ~0 and a std of ~1
df_features_standard_scaled.describe()

Unnamed: 0,fixation_duration_median,fixation_duration_mean,fixation_duration_standard_deviation,fixation_duration_minimum,fixation_duration_maximum,fixation_duration_range,fixation_duration_skew,fixation_duration_kurtosis,saccade_duration_median,saccade_duration_mean,...,pupil_diameter_maximum,pupil_diameter_range,pupil_diameter_skew,pupil_diameter_kurtosis,number_of_blinks,blink_duration_mean,number_of_saccades,horizontal_saccade_proportion,fixation_dispersion,fixation_saccade_durtion_ratio
count,4076.0,4076.0,4076.0,4076.0,4076.0,4076.0,4076.0,4076.0,4076.0,4076.0,...,4076.0,4076.0,4076.0,4076.0,4076.0,4076.0,4076.0,4076.0,4076.0,4076.0
mean,1.436535e-16,4.233611e-16,1.847557e-16,1.0895220000000001e-18,1.655256e-16,5.692753000000001e-17,-7.117303e-17,-6.213000000000001e-17,-2.253077e-15,1.305792e-16,...,1.617396e-16,1.036136e-16,-7.55175e-18,4.75304e-17,5.319047e-16,1.1784e-15,-2.195823e-15,4.326628e-16,-3.982203e-16,-1.615353e-16
std,1.000123,1.000123,1.000123,1.000123,1.000123,1.000123,1.000123,1.000123,1.000123,1.000123,...,1.000123,1.000123,1.000123,1.000123,1.000123,1.000123,1.000123,1.000123,1.000123,1.000123
min,-2.232735,-2.173276,-1.357598,-1.39564,-1.587998,-1.622102,-4.33205,-1.639986,-0.8295678,-0.9272281,...,-3.354454,-2.120504,-5.090566,-1.685798,-0.7362657,-0.7923815,-2.125203,-7.694547,-2.918593,-1.4448
25%,-0.70651,-0.6861085,-0.6368714,-0.7950409,-0.6660583,-0.6737353,-0.6897181,-0.7293568,-0.4794688,-0.5840373,...,-0.6452865,-0.695099,-0.5753198,-0.5645888,-0.7362657,-0.7923815,-0.7465099,0.4577536,-0.7355,-0.7542147
50%,-0.2268394,-0.1958532,-0.2563098,-0.1944416,-0.2287956,-0.2478221,-0.09336446,-0.3161902,-0.3238692,-0.3395569,...,-0.04660406,-0.1599973,-0.01272094,-0.2378913,-0.7362657,-0.7923815,-0.05716365,0.4577536,-0.01243736,-0.1697047
75%,0.4926665,0.4595249,0.3178335,0.6534634,0.3823186,0.3711718,0.6517878,0.4700829,0.1429295,0.1740982,...,0.5890354,0.5295374,0.5738029,0.242231,0.3839355,0.7214874,0.6321826,0.4577536,0.6967202,0.5378297
max,6.750188,6.565541,8.056259,5.670234,7.631397,6.02743,3.385362,4.143071,14.41919,8.337266,...,7.378747,8.543861,5.118857,10.76329,5.984942,3.611601,3.044895,0.4577536,3.593727,12.72503


## Min-Max Scaled Features

In [14]:
# execute transformation pipeline
features_minmax_scaled_pipeline = Pipeline([
    ('minmax_scaler',              MinMaxScaler()),
])

# this pipeline reuses the results of the standard df_features, and adds min-max scaling
df_features_minmax_scaled_nparray = features_minmax_scaled_pipeline.fit_transform(df_features.copy())

# MinMaxScaler returns a plain NumPy array, so the row index and the column labels
# are lost.  Put the result back into a Pandas DataFrame, reusing both the columns AND
# the index of df_features.  df_features keeps the row labels of the raw data (rows
# were dropped, so they are not 0..N-1); without the index the scaled rows would no
# longer line up with df_label or df_experiment_metadata when joined on the index.
df_features_minmax_scaled = pd.DataFrame(
    df_features_minmax_scaled_nparray,
    index = df_features.index,
    columns = df_features.columns,
)

# To double check this pipeline, we expect to have 4076 trial rows and 62 features in the scaled
# features dataframe.  There should be no missing values, and number_of_blinks should still have
# just 7 unique values (now scaled into the range 0 to 1).
num_rows, num_features = df_features_minmax_scaled.shape
number_of_missing_values = df_features_minmax_scaled.isna().sum().sum()

print('Number of trials in the features dataframe: ', num_rows)
print('Number of features in the features dataframe: ', num_features)
print('Number of missing values: ', number_of_missing_values)
print('number_of_blinks unique values: ', df_features_minmax_scaled.number_of_blinks.unique())

Number of trials in the features dataframe:  4076
Number of features in the features dataframe:  62
Number of missing values:  0
number_of_blinks unique values:  [0.         0.5        0.16666667 0.33333333 0.66666667 0.83333333
 1.        ]


In [15]:
# summary statistics: after min-max scaling every feature should show a min of 0 and a max of 1
df_features_minmax_scaled.describe()

Unnamed: 0,fixation_duration_median,fixation_duration_mean,fixation_duration_standard_deviation,fixation_duration_minimum,fixation_duration_maximum,fixation_duration_range,fixation_duration_skew,fixation_duration_kurtosis,saccade_duration_median,saccade_duration_mean,...,pupil_diameter_maximum,pupil_diameter_range,pupil_diameter_skew,pupil_diameter_kurtosis,number_of_blinks,blink_duration_mean,number_of_saccades,horizontal_saccade_proportion,fixation_dispersion,fixation_saccade_durtion_ratio
count,4076.0,4076.0,4076.0,4076.0,4076.0,4076.0,4076.0,4076.0,4076.0,4076.0,...,4076.0,4076.0,4076.0,4076.0,4076.0,4076.0,4076.0,4076.0,4076.0,4076.0
mean,0.248553,0.248692,0.144213,0.197518,0.172245,0.212052,0.561335,0.283585,0.054402,0.100084,...,0.312531,0.19884,0.498614,0.135415,0.109544,0.179924,0.411057,0.94385,0.448165,0.101963
std,0.111336,0.114446,0.106239,0.141543,0.10848,0.130743,0.129593,0.17294,0.065587,0.107952,...,0.09318,0.093782,0.097961,0.080337,0.148801,0.227095,0.193444,0.12268,0.153574,0.070581
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.169903,0.170179,0.07656,0.085,0.1,0.123977,0.471963,0.157465,0.022959,0.037044,...,0.25241,0.133661,0.442263,0.090064,0.0,0.0,0.266667,1.0,0.335225,0.048736
50%,0.223301,0.22628,0.116986,0.17,0.147429,0.179655,0.549237,0.228909,0.033163,0.063433,...,0.308189,0.183837,0.497368,0.116306,0.0,0.0,0.4,1.0,0.446255,0.089987
75%,0.303398,0.301277,0.177975,0.29,0.213714,0.260575,0.645791,0.364871,0.063776,0.118876,...,0.36741,0.248495,0.554818,0.154873,0.166667,0.34375,0.533333,1.0,0.55515,0.139919
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Outlier Transformations

In reference paper 2 they specifically mention using outlier replacement (Winsorization), where values that are more than 3 standard deviations
above or below the mean are replaced with the value at 3 standard deviations.  We can easily apply Winsorization as a feature transformation on the
standard scaled features.  Since all features in that dataframe are scaled to have a mean of 0 and a standard deviation of 1, any value
below -3 or above 3 is more than 3 standard deviations away from the mean, and can be replaced with -3 or 3 respectively to Winsorize it.

In [16]:
# just out of curiosity: per feature, how many values lie more than 3 standard
# deviations below or above the mean of the standard scaled features
outliers = df_features_standard_scaled.lt(-3.0) | df_features_standard_scaled.gt(3.0)
outliers.sum()

fixation_duration_median                63
fixation_duration_mean                  65
fixation_duration_standard_deviation    84
fixation_duration_minimum               44
fixation_duration_maximum               69
                                        ..
blink_duration_mean                     34
number_of_saccades                       2
horizontal_saccade_proportion           96
fixation_dispersion                      5
fixation_saccade_durtion_ratio          52
Length: 62, dtype: int64

In [17]:
class WinsorizationOutlierTransformer(BaseEstimator, TransformerMixin):
    """Winsorize every feature of a standard scaled dataframe.

    The transformer assumes the dataframe has already been standard scaled,
    such that each feature has a mean of 0.0 and a standard deviation of 1.0.
    Under that assumption a value is an outlier when its magnitude exceeds
    `outlier_threshold`, and it is replaced by the threshold carrying the
    same sign.  Values inside the band are returned unchanged.

    All columns are transformed; a more specialized transformer would be
    needed to restrict the operation to a subset of (already scaled) features.

    Parameters
    ----------
    outlier_threshold : float, default 3.0
        Number of standard deviations from the mean beyond which a value
        is treated as an outlier.
    """
    def __init__(self, outlier_threshold=3.0):
        # stored under the same name as the argument, as BaseEstimator
        # expects for get_params / set_params to work
        self.outlier_threshold = outlier_threshold

    def fit(self, df, y=None):
        """Nothing is learned from the data; returns self."""
        return self # nothing else to do

    def transform(self, df, y=None):
        """Return a new dataframe with all values limited to
        [-outlier_threshold, outlier_threshold].

        The input dataframe is not modified.
        """
        # clip caps high outliers at +threshold and raises low outliers to
        # -threshold in one step.  The lower bound must be the negative
        # threshold; comparing against the positive threshold here would
        # overwrite every ordinary value with the lower bound.
        return df.clip(lower=-self.outlier_threshold, upper=self.outlier_threshold)

In [18]:
# build the outlier removal pipeline, it has a single Winsorization step
outlier_winsorization_pipeline = Pipeline(steps=[
    ('outlier_winsorization', WinsorizationOutlierTransformer()),
])

# the pipeline starts from a copy of the standard scaled features, so the
# df_features_standard_scaled dataframe itself is left as it was
df_features_outliers_removed = outlier_winsorization_pipeline.fit_transform(
    df_features_standard_scaled.copy()
)

# To double check this pipeline, we expect to still have 4076 trial rows and 62 features.
# There should be no missing values, and no value should remain beyond
# 3 standard deviations above or below the mean.
num_rows, num_features = df_features_outliers_removed.shape
number_of_missing_values = df_features_outliers_removed.isna().sum().sum()
outliers = df_features_outliers_removed.abs() > 3.0
number_of_outliers = outliers.sum().sum()

summary_lines = [
    ('Number of trials in the features dataframe: ', num_rows),
    ('Number of features in the features dataframe: ', num_features),
    ('Number of missing values: ', number_of_missing_values),
    ('Number of outliers remaining: ', number_of_outliers),
]
for summary_label, summary_value in summary_lines:
    print(summary_label, summary_value)

Number of trials in the features dataframe:  4076
Number of features in the features dataframe:  62
Number of missing values:  0
Number of outliers remaining:  0


In [19]:
# double check, feature by feature, that no value beyond +/- 3 standard deviations remains
outliers = df_features_outliers_removed.abs() > 3.0
outliers.sum()

fixation_duration_median                0
fixation_duration_mean                  0
fixation_duration_standard_deviation    0
fixation_duration_minimum               0
fixation_duration_maximum               0
                                       ..
blink_duration_mean                     0
number_of_saccades                      0
horizontal_saccade_proportion           0
fixation_dispersion                     0
fixation_saccade_durtion_ratio          0
Length: 62, dtype: int64

# Outlier Transformations Revisited

To support grid search, we might want to perform the Winsorization outlier transformation on data that hasn't been scaled.  Here we develop
a method to do that with any data, scaled or not.

In [20]:
# make a small dataframe of standard normal samples for testing
np.random.seed(42)
df_features = pd.DataFrame(data=np.random.randn(10, 5), columns=list('ABCDE'))
print(df_features)

# give each column its own mean and standard deviation,
# the table maps column name -> (target standard deviation, target mean)
target_distributions = {
    'A': (2.0, 5.0),
    'B': (10.0, 100.0),
    'C': (5.0, -10.0),
    'D': (0.5, 2.0),
    'E': (4.0, -25.0),
}
for column_name, (target_std, target_mean) in target_distributions.items():
    df_features[column_name] = (df_features[column_name] * target_std) + target_mean

print(df_features)
print(df_features.mean())
print(df_features.std())

          A         B         C         D         E
0  0.496714 -0.138264  0.647689  1.523030 -0.234153
1 -0.234137  1.579213  0.767435 -0.469474  0.542560
2 -0.463418 -0.465730  0.241962 -1.913280 -1.724918
3 -0.562288 -1.012831  0.314247 -0.908024 -1.412304
4  1.465649 -0.225776  0.067528 -1.424748 -0.544383
5  0.110923 -1.150994  0.375698 -0.600639 -0.291694
6 -0.601707  1.852278 -0.013497 -1.057711  0.822545
7 -1.220844  0.208864 -1.959670 -1.328186  0.196861
8  0.738467  0.171368 -0.115648 -0.301104 -1.478522
9 -0.719844 -0.460639  1.057122  0.343618 -1.763040
          A           B          C         D          E
0  5.993428   98.617357  -6.761557  2.761515 -25.936613
1  4.531726  115.792128  -6.162826  1.765263 -22.829760
2  4.073165   95.342702  -8.790189  1.043360 -31.899671
3  3.875425   89.871689  -8.428763  1.545988 -30.649215
4  7.931298   97.742237  -9.662359  1.287626 -27.177531
5  5.221845   88.490064  -8.121510  1.699681 -26.166775
6  3.796587  118.522782 -10.067486  

In [21]:
# outlier threshold, measured in standard deviations.  Only 1.0 is used for this
# small test, with 3.0 none of the 10 test rows would be flagged as an outlier
outlier_threshold = 1.0

# per column mean and standard deviation of the features
feature_means = df_features.mean()
feature_standard_deviations = df_features.std()

# standard scale a copy of the data: subtract the column mean and divide by the
# column standard deviation, so each value becomes its distance from the mean in standard deviations
df_outliers = df_features.sub(feature_means, axis='columns').div(feature_standard_deviations, axis='columns')

# boolean matrix, True where a value lies further than the threshold above/below the mean
df_outliers_bool_mask = df_outliers.abs().gt(outlier_threshold)

In [22]:
# display the mask, True marks a value further than outlier_threshold standard deviations from its column mean
df_outliers_bool_mask

Unnamed: 0,A,B,C,D,E
0,False,False,False,True,False
1,False,True,False,False,True
2,False,False,False,True,True
3,False,True,False,False,False
4,True,False,False,False,False
5,False,True,False,False,False
6,False,True,False,False,True
7,True,False,True,False,False
8,True,False,False,False,False
9,False,False,True,False,True


In [23]:
# recompute the standard scaled values and display them, so the entries
# flagged in the boolean mask can be compared against their actual magnitude
df_outliers = df_features.sub(feature_means, axis='columns').div(feature_standard_deviations, axis='columns')
df_outliers

Unnamed: 0,A,B,C,D,E
0,0.74144,-0.175632,0.619179,2.168015,0.37044
1,-0.168121,1.557823,0.764731,0.146292,1.18196
2,-0.453466,-0.506144,0.126018,-1.318687,-1.18713
3,-0.576512,-1.058335,0.213881,-0.298689,-0.860507
4,1.947301,-0.263958,-0.086007,-0.822991,0.046308
5,0.261314,-1.197783,0.288574,0.013204,0.310321
6,-0.62557,1.833428,-0.184493,-0.450571,1.474492
7,-1.3961,0.174725,-2.550069,-0.725013,0.82077
8,1.042307,0.136881,-0.308658,0.317132,-0.929693
9,-0.772595,-0.501006,1.116846,0.971308,-1.226961


In [24]:
# cap the scaled values at +/- outlier_threshold, everything inside the band is kept as is
df_outliers = df_outliers.clip(lower=-outlier_threshold, upper=outlier_threshold)
df_outliers

Unnamed: 0,A,B,C,D,E
0,0.74144,-0.175632,0.619179,1.0,0.37044
1,-0.168121,1.0,0.764731,0.146292,1.0
2,-0.453466,-0.506144,0.126018,-1.0,-1.0
3,-0.576512,-1.0,0.213881,-0.298689,-0.860507
4,1.0,-0.263958,-0.086007,-0.822991,0.046308
5,0.261314,-1.0,0.288574,0.013204,0.310321
6,-0.62557,1.0,-0.184493,-0.450571,1.0
7,-1.0,0.174725,-1.0,-0.725013,0.82077
8,1.0,0.136881,-0.308658,0.317132,-0.929693
9,-0.772595,-0.501006,1.0,0.971308,-1.0


In [25]:
# undo the standard scaling (multiply by the column standard deviation, add back the
# column mean) to express the thresholded values in the original units of each feature
(df_outliers * feature_standard_deviations) + feature_means

Unnamed: 0,A,B,C,D,E
0,5.993428,98.617357,-6.761557,2.185948,-25.936613
1,4.531726,110.265318,-6.162826,1.765263,-23.526383
2,4.073165,95.342702,-8.790189,1.2004,-31.183255
3,3.875425,90.44966,-8.428763,1.545988,-30.649215
4,6.408945,97.742237,-9.662359,1.287626,-27.177531
5,5.221845,90.44966,-8.12151,1.699681,-26.166775
6,3.796587,110.265318,-10.067486,1.471145,-23.526383
7,3.194861,102.088636,-13.422096,1.335907,-24.212555
8,6.408945,101.713683,-10.578241,1.849448,-30.914088
9,3.560312,95.393612,-5.195039,2.171809,-31.183255


This procedure should work in general.  Let's create a function to do the work.

In [26]:
def transform_outliers_in_dataframe(df_features, outlier_threshold=3.0):
    """Given a dataframe of features and an outlier threshold, perform outlier
    Winsorization.  Any value that is further above or below its column mean
    than the threshold (measured in standard deviations of that column) is
    replaced by the value lying exactly at the threshold.

    The data does not need to be scaled.  Instead of standard scaling the values,
    thresholding them and undoing the scaling, the threshold is expressed in
    the original units of each column (mean +/- outlier_threshold * std) and the
    values are clipped against those bounds directly.  This yields the same
    replacement values, but never divides by the standard deviation, so a
    constant column (standard deviation of 0) is returned as is rather than
    being turned into missing values, and values that are not outliers come
    back exactly as given.

    Parameters
    ----------
    df_features : pandas.DataFrame
        Numeric feature columns to be Winsorized.  Not modified.
    outlier_threshold : float, default 3.0
        Number of standard deviations from the column mean beyond which
        a value is considered an outlier.

    Returns
    -------
    pandas.DataFrame
        New dataframe of the same shape, with outliers replaced.
    """
    # get mean and standard deviation of each feature in the dataframe
    feature_means = df_features.mean()
    feature_standard_deviations = df_features.std()

    # per feature floor and ceiling, in the original units of the feature
    outlier_distances = outlier_threshold * feature_standard_deviations
    lower_bounds = feature_means - outlier_distances
    upper_bounds = feature_means + outlier_distances

    # the bounds are indexed by feature name, so align them with the columns
    return df_features.clip(lower=lower_bounds, upper=upper_bounds, axis='columns')

In [27]:
# test as above: run the small test dataframe through the function with the same
# threshold of 1.0, should get the same matrix as the step by step computation
transform_outliers_in_dataframe(df_features, outlier_threshold=1.0)

Unnamed: 0,A,B,C,D,E
0,5.993428,98.617357,-6.761557,2.185948,-25.936613
1,4.531726,110.265318,-6.162826,1.765263,-23.526383
2,4.073165,95.342702,-8.790189,1.2004,-31.183255
3,3.875425,90.44966,-8.428763,1.545988,-30.649215
4,6.408945,97.742237,-9.662359,1.287626,-27.177531
5,5.221845,90.44966,-8.12151,1.699681,-26.166775
6,3.796587,110.265318,-10.067486,1.471145,-23.526383
7,3.194861,102.088636,-13.422096,1.335907,-24.212555
8,6.408945,101.713683,-10.578241,1.849448,-30.914088
9,3.560312,95.393612,-5.195039,2.171809,-31.183255


In [28]:
# another test, get in the unscaled features, and remove outliers at 3.0 threshold.  Then
# perform standard scaling and see if, in fact, no feature is above/below the outlier threshold
# NOTE: this rebinds df_features, replacing the small random test dataframe used above
df_features = mindwandering.data.get_df_features()

In [29]:
# largest value of each feature as loaded, before any outlier replacement
df_features.max()

fixation_duration_median                 529.000000
fixation_duration_mean                   597.166667
fixation_duration_standard_deviation     609.742276
fixation_duration_minimum                283.000000
fixation_duration_maximum               1925.000000
                                           ...     
blink_duration_mean                      400.000000
number_of_saccades                        20.000000
horizontal_saccade_proportion              1.000000
fixation_dispersion                        0.736342
fixation_saccade_durtion_ratio            61.776000
Length: 62, dtype: float64

In [30]:
# smallest value of each feature as loaded, before any outlier replacement
df_features.min()

fixation_duration_median                117.000000
fixation_duration_mean                  127.142857
fixation_duration_standard_deviation     24.369380
fixation_duration_minimum                83.000000
fixation_duration_maximum               175.000000
                                           ...    
blink_duration_mean                       0.000000
number_of_saccades                        5.000000
horizontal_saccade_proportion             0.375000
fixation_dispersion                       0.268000
fixation_saccade_durtion_ratio            0.328000
Length: 62, dtype: float64

In [31]:
# we need to save the means and stds to do the standard scaling ourselves.
# They have to be captured here, before the outlier replacement below overwrites
# df_features, since replacing outliers changes the mean and std of a feature
feature_means = df_features.mean()
feature_standard_deviations = df_features.std()

In [32]:
# replace outliers that are more than 3.0 standard deviations from the feature mean.
# NOTE: this overwrites df_features with its own transformed version, so re-running only
# this cell Winsorizes already Winsorized data; rerun from the get_df_features() cell instead
df_features = transform_outliers_in_dataframe(df_features, outlier_threshold=3.0)

In [33]:
# largest value of each feature after outlier replacement, to compare
# against the maximums displayed before the transformation
df_features.max()

fixation_duration_median                 357.015247
fixation_duration_mean                   405.411155
fixation_duration_standard_deviation     295.356651
fixation_duration_minimum                207.429277
fixation_duration_maximum               1045.950894
                                           ...     
blink_duration_mean                      344.483641
number_of_saccades                        19.870815
horizontal_saccade_proportion              1.000000
fixation_dispersion                        0.693670
fixation_saccade_durtion_ratio            19.604642
Length: 62, dtype: float64

In [34]:
# smallest value of each feature after outlier replacement, to compare
# against the minimums displayed before the transformation
df_features.min()

fixation_duration_median                117.000000
fixation_duration_mean                  127.142857
fixation_duration_standard_deviation     24.369380
fixation_duration_minimum                83.000000
fixation_duration_maximum               175.000000
                                           ...    
blink_duration_mean                       0.000000
number_of_saccades                        5.000000
horizontal_saccade_proportion             0.734881
fixation_dispersion                       0.268000
fixation_saccade_durtion_ratio            0.328000
Length: 62, dtype: float64

In [35]:
# standard scale by hand using the mean and standard deviation saved before the outlier
# replacement.  Rescaling with statistics of the transformed data would not give the
# expected result, as the replaced values would no longer sit exactly on the threshold
df_scaled = df_features.sub(feature_means, axis='columns').div(feature_standard_deviations, axis='columns')

In [36]:
# no scaled feature should exceed the outlier threshold of 3.0 any more
df_scaled.max()

fixation_duration_median                3.000000
fixation_duration_mean                  3.000000
fixation_duration_standard_deviation    3.000000
fixation_duration_minimum               3.000000
fixation_duration_maximum               3.000000
                                          ...   
blink_duration_mean                     3.000000
number_of_saccades                      3.000000
horizontal_saccade_proportion           0.457697
fixation_dispersion                     3.000000
fixation_saccade_durtion_ratio          3.000000
Length: 62, dtype: float64

In [37]:
# likewise no scaled feature should fall below the negative threshold of -3.0
df_scaled.min()

fixation_duration_median               -2.232461
fixation_duration_mean                 -2.173009
fixation_duration_standard_deviation   -1.357432
fixation_duration_minimum              -1.395469
fixation_duration_maximum              -1.587803
                                          ...   
blink_duration_mean                    -0.792284
number_of_saccades                     -2.124942
horizontal_saccade_proportion          -3.000000
fixation_dispersion                    -2.918235
fixation_saccade_durtion_ratio         -1.444623
Length: 62, dtype: float64