<a href="https://colab.research.google.com/github/sidt-ai/data-science-competitions/blob/main/dphi/ds75-child-healthcare/notebooks/02_data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
import os
import gc
import warnings

# Make sure automatic garbage collection is switched on.
gc.enable()
# Silence every warning for the rest of the notebook.
warnings.filterwarnings('ignore')

# Toggle IPython pretty printing (the cell output shows it ends up OFF).
%pprint

Pretty printing has been turned OFF


In [2]:
import numpy as np
import pandas as pd
# Use the fully-qualified option name: the bare 'precision' pattern is
# ambiguous on pandas versions that also define 'styler.format.precision'
# and raises OptionError there.
pd.set_option('display.precision', 4)
# Show every column when a wide frame is displayed.
pd.set_option('display.max_columns', None)

In [3]:
# Single seed shared by everything in this notebook that needs randomness.
SEED = 2311

# Seed NumPy's global generator and export the hash seed as a string.
np.random.seed(SEED)
os.environ['PYTHONHASHSEED'] = f'{SEED}'

In [4]:
# Locations of the raw train/test CSV files on GitHub.
train_url = 'https://raw.githubusercontent.com/sidt-ai/data-science-competitions/main/dphi/ds75-child-healthcare/data/train_dataset.csv'
test_url = 'https://raw.githubusercontent.com/sidt-ai/data-science-competitions/main/dphi/ds75-child-healthcare/data/test_dataset.csv'

In [5]:
# Load both splits with the same reader call.
train, test = (pd.read_csv(csv_url) for csv_url in (train_url, test_url))

In [6]:
# Model inputs: every column of the test split.
features = list(test.columns)

# Preprocessing

In [7]:
# First look at the raw training rows.
train.head()

Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,mean_value_of_long_term_variability,histogram_width,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency,fetal_health
0,133.0,0.003,0.0,0.004,0.004,0.0,0.0,30.0,1.5,0.0,5.3,102.0,67.0,169.0,9.0,1.0,147.0,137.0,144.0,40.0,1.0,1.0
1,130.0,0.001,0.001,0.012,0.01,0.0,0.001,62.0,2.2,0.0,0.0,161.0,50.0,211.0,9.0,0.0,60.0,89.0,113.0,250.0,0.0,3.0
2,141.0,0.0,0.008,0.0,0.0,0.0,0.0,75.0,0.3,49.0,4.6,9.0,136.0,145.0,1.0,0.0,143.0,141.0,143.0,0.0,1.0,2.0
3,144.0,0.0,0.002,0.002,0.0,0.0,0.0,84.0,0.3,34.0,5.5,38.0,132.0,170.0,2.0,0.0,144.0,143.0,145.0,0.0,-1.0,3.0
4,106.0,0.001,0.0,0.011,0.0,0.0,0.0,63.0,0.6,0.0,11.5,30.0,95.0,125.0,1.0,0.0,112.0,110.0,112.0,1.0,0.0,1.0


In [8]:
# The target shows up as floats such as 1.0, 2.0, 3.0 in the preview; store it as a compact integer.
train['fetal_health'] = train['fetal_health'].astype('int8')

In [9]:
def fix_datatypes(df):
    """Convert the float-typed measurement columns to compact integer dtypes.

    The six rate columns (accelerations, fetal movement, uterine contractions
    and the three deceleration types) are multiplied by 1000, rounded to the
    nearest integer and stored as ``int16``. The remaining listed columns are
    cast directly to ``int8`` or ``int16``. Columns that are not listed (for
    example the two ``mean_value_of_*`` columns) are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named below. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned so the call can be reassigned.

    Notes
    -----
    Not idempotent: calling it again on an already-converted frame multiplies
    the rate columns by 1000 a second time.
    """
    rate_columns = (
        'accelerations',
        'fetal_movement',
        'uterine_contractions',
        'light_decelerations',
        'severe_decelerations',
        'prolongued_decelerations',
    )
    for column in rate_columns:
        # round() before the cast: a float product can land a hair below the
        # intended integer, and a plain integer cast would truncate it.
        # int16 rather than int8: any scaled value above 127 would silently
        # wrap around in int8.
        df[column] = df[column].mul(1000).round().astype('int16')

    # Columns cast as-is to the narrowest dtype used for them.
    direct_casts = {
        'baseline value': 'int16',
        'abnormal_short_term_variability': 'int8',
        'percentage_of_time_with_abnormal_long_term_variability': 'int8',
        'histogram_width': 'int16',
        'histogram_min': 'int16',
        'histogram_max': 'int16',
        'histogram_number_of_peaks': 'int8',
        'histogram_number_of_zeroes': 'int8',
        'histogram_mode': 'int16',
        'histogram_mean': 'int16',
        'histogram_median': 'int16',
        'histogram_variance': 'int16',
        'histogram_tendency': 'int8',
    }
    for column, dtype in direct_casts.items():
        df[column] = df[column].astype(dtype)

    return df

In [10]:
# NOTE: not idempotent - fix_datatypes multiplies the rate columns by 1000,
# so re-running this cell on already-converted frames rescales them again.
train = fix_datatypes(train)
test = fix_datatypes(test)

# gc.collect() returns the number of unreachable objects it found; that count is the cell output.
gc.collect()

50

In [11]:
# Check the conversion: the rate columns now hold integers (e.g. 0.003 became 3).
train.head()

Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,mean_value_of_long_term_variability,histogram_width,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency,fetal_health
0,133,3,0,4,4,0,0,30,1.5,0,5.3,102,67,169,9,1,147,137,144,40,1,1
1,130,1,1,12,10,0,1,62,2.2,0,0.0,161,50,211,9,0,60,89,113,250,0,3
2,141,0,8,0,0,0,0,75,0.3,49,4.6,9,136,145,1,0,143,141,143,0,1,2
3,144,0,2,2,0,0,0,84,0.3,34,5.5,38,132,170,2,0,144,143,145,0,-1,3
4,106,1,0,11,0,0,0,63,0.6,0,11.5,30,95,125,1,0,112,110,112,1,0,1


# Feature Engineering

In [12]:
def transform_features(df, accelerations_threshold=5):
    """Add engineered indicator and aggregate columns to ``df``.

    New columns (all 0/1 indicators are stored as ``int64``):

    * ``accelerations_cat`` -- 1 where ``accelerations`` exceeds
      ``accelerations_threshold``, else 0
    * ``fetal_movement_cat`` -- 1 where ``fetal_movement`` > 0, else 0
    * ``uterine_contractions_cat`` -- 1 where ``uterine_contractions`` > 0,
      else 0
    * ``decelerations`` -- ``light_decelerations`` + ``severe_decelerations``
      + ``prolongued_decelerations``
    * ``abnormal_long_term_variability_cat`` -- 1 where
      ``percentage_of_time_with_abnormal_long_term_variability`` > 0, else 0

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the source columns named above. It is modified in
        place, so pass a copy to keep the original unchanged.
    accelerations_threshold : int or float, default 5
        Strict lower bound used for ``accelerations_cat``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the five columns added.
    """
    # Vectorized comparisons replace the per-element Python lambdas.
    df['accelerations_cat'] = df['accelerations'].gt(accelerations_threshold).astype('int64')
    df['fetal_movement_cat'] = df['fetal_movement'].gt(0).astype('int64')
    df['uterine_contractions_cat'] = df['uterine_contractions'].gt(0).astype('int64')

    # NOTE: the sum keeps the dtype of its inputs; with narrow integer inputs
    # a very large total could wrap around.
    df['decelerations'] = (
        df['light_decelerations']
        + df['severe_decelerations']
        + df['prolongued_decelerations']
    )

    df['abnormal_long_term_variability_cat'] = (
        df['percentage_of_time_with_abnormal_long_term_variability'].gt(0).astype('int64')
    )

    return df


In [13]:
# transform_features adds its columns to the frame it receives, so work on
# copies and leave `train`/`test` without the engineered columns.
train_ext = transform_features(train.copy())
test_ext = transform_features(test.copy())

gc.collect()

50

### Feature sets

In [14]:
# Read the names from the extended frame: `test` itself never receives the
# engineered columns (transform_features ran on a copy), so taking them from
# `test` would only repeat the original feature list.
extended_features = test_ext.columns.to_list()

In [15]:
# Feature subset: raw `severe_decelerations` plus the five columns added by transform_features.
cat_features = [
    'severe_decelerations',
    'accelerations_cat',
    'fetal_movement_cat',
    'uterine_contractions_cat',
    'decelerations',
    'abnormal_long_term_variability_cat',
]

In [16]:
# The ten histogram_* columns.
hist_features = [
    'histogram_width',
    'histogram_min',
    'histogram_max',
    'histogram_number_of_peaks',
    'histogram_number_of_zeroes',
    'histogram_mode',
    'histogram_mean',
    'histogram_median',
    'histogram_variance',
    'histogram_tendency',
]

# Everything in extended_features that is not a histogram column, original order kept.
non_hist_features = list(filter(lambda name: name not in hist_features, extended_features))

# Processed datasets

Cleaned dataset: the original columns with corrected integer dtypes (no engineered features).

In [17]:
# Write the dtype-fixed frames to the working directory, leaving out the index column.
train.to_csv('train_proc.csv', index=False)
test.to_csv('test_proc.csv', index=False)

In [18]:
# Sanity-check the first lines of the written file (shell `head`).
!head train_proc.csv

baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,mean_value_of_long_term_variability,histogram_width,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency,fetal_health
133,3,0,4,4,0,0,30,1.5,0,5.3,102,67,169,9,1,147,137,144,40,1,1
130,1,1,12,10,0,1,62,2.2,0,0.0,161,50,211,9,0,60,89,113,250,0,3
141,0,8,0,0,0,0,75,0.3,49,4.6,9,136,145,1,0,143,141,143,0,1,2
144,0,2,2,0,0,0,84,0.3,34,5.5,38,132,170,2,0,144,143,145,0,-1,3
106,1,0,11,0,0,0,63,0.6,0,11.5,30,95,125,1,0,112,110,112,1,0,1
135,6,0,6,5,0,0,27,1.5,0,5.8,96,69,165,5,2,143,139,142,17,1,1
127,0,0,7,6,0,0,20,2.2,0,19.3,78,81,159,4,0,131,122,124,23,0,1
110,3,0,3,1,0,0,64,1.6,0,6.8,116,62,178,12,1,107,105,108,37,0,1
140,0,22,

In [19]:
# Sanity-check the first lines of the written file (shell `head`).
!head test_proc.csv

baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,mean_value_of_long_term_variability,histogram_width,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency
134,0,0,4,7,0,0,37,1.1,22,2.9,93,50,143,5,0,138,129,136,31,1
143,0,17,0,0,0,0,70,0.3,50,5.1,13,142,155,1,0,147,146,148,0,0
144,10,0,6,0,0,0,33,1.1,0,5.3,54,120,174,1,0,155,154,155,5,0
137,0,1,6,3,0,0,59,1.8,0,6.5,102,64,166,5,2,142,134,140,14,1
134,0,0,9,6,0,2,64,1.5,0,3.8,70,83,153,5,0,137,119,124,50,1
122,2,0,3,0,0,0,21,1.8,0,16.3,44,108,152,1,0,125,127,127,5,0
140,6,0,8,0,0,0,38,5.4,0,34.7,149,50,199,8,1,148,148,151,43,1
139,10,1,8,4,0,0,50,1.4,0,15.0,102,76,178,10,0,162,144,149,57,1
141,1,0,5,0,0,0,25,1.5,0,13.4,103,7

Extended dataset: the cleaned columns plus the engineered features added by `transform_features`.

In [20]:
# Write the frames that include the engineered columns, leaving out the index column.
train_ext.to_csv('train_ext.csv', index=False)
test_ext.to_csv('test_ext.csv', index=False)

In [21]:
# Sanity-check the first lines of the written file (shell `head`).
!head train_ext.csv

baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,mean_value_of_long_term_variability,histogram_width,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency,fetal_health,accelerations_cat,fetal_movement_cat,uterine_contractions_cat,decelerations,abnormal_long_term_variability_cat
133,3,0,4,4,0,0,30,1.5,0,5.3,102,67,169,9,1,147,137,144,40,1,1,0,0,1,4,0
130,1,1,12,10,0,1,62,2.2,0,0.0,161,50,211,9,0,60,89,113,250,0,3,0,1,1,11,0
141,0,8,0,0,0,0,75,0.3,49,4.6,9,136,145,1,0,143,141,143,0,1,2,0,1,0,0,1
144,0,2,2,0,0,0,84,0.3,34,5.5,38,132,170,2,0,144,143,145,0,-1,3,0,1,1,0,1
106,1,0,11,0,0,0,63,0.6,0,11.5,30,95,125,1,0,112,110,112,1,0,1,0,0,1,0,0
135,6,0,6,5,0,0,27,1.5,0,5.8,96,69,1

In [22]:
# Sanity-check the first lines of the written file (shell `head`).
!head test_ext.csv

baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,mean_value_of_long_term_variability,histogram_width,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency,accelerations_cat,fetal_movement_cat,uterine_contractions_cat,decelerations,abnormal_long_term_variability_cat
134,0,0,4,7,0,0,37,1.1,22,2.9,93,50,143,5,0,138,129,136,31,1,0,0,1,7,1
143,0,17,0,0,0,0,70,0.3,50,5.1,13,142,155,1,0,147,146,148,0,0,0,1,0,0,1
144,10,0,6,0,0,0,33,1.1,0,5.3,54,120,174,1,0,155,154,155,5,0,1,0,1,0,0
137,0,1,6,3,0,0,59,1.8,0,6.5,102,64,166,5,2,142,134,140,14,1,0,1,1,3,0
134,0,0,9,6,0,2,64,1.5,0,3.8,70,83,153,5,0,137,119,124,50,1,0,0,1,8,0
122,2,0,3,0,0,0,21,1.8,0,16.3,44,108,152,1,0,125,127,127,5,0,