In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np

from tasrif.processing_pipeline import ProcessingPipeline
from tasrif.processing_pipeline.pandas import ConvertToDatetimeOperator, SortOperator, ReplaceOperator
from tasrif.processing_pipeline.pandas import DropDuplicatesOperator, DropNAOperator, DropFeaturesOperator
from tasrif.processing_pipeline.custom import OneHotEncoderOperator

from tasrif.data_readers.sleep_health import MyHealthDataset


In [2]:
def col_stats(df):
    """Print a missing-value summary of ``df`` to stdout.

    The report starts with the total row count, followed by one line per
    column giving the number of null entries, the number of non-null
    entries out of the total, and the null share as a percentage with two
    decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    None
        The function only prints.
    """
    total_rows = len(df)
    print('Some important stats:')
    print('\t- This dataset contains', total_rows, ' rows.')
    for col in df.columns:
        column = df[col]
        missing = column.isnull().sum()
        present = column.count().sum()
        # Percentage is pre-formatted so it always shows exactly two decimals.
        null_percentage = "{:.2f}".format(missing / total_rows * 100)
        print('\t - ``', col, '`` has', missing, 'NAs (', present, '/', total_rows, ') =',
              null_percentage, '%')

In [3]:
# Full MyHealthDataset, loaded with no processing pipeline (pipeline=None).
mf = MyHealthDataset(shc_folder="../../data/sleephealth/", pipeline=None)
# Work on a copy so changes to `df` do not touch `mf.raw_df`.
df = mf.raw_df.copy()
print("Shape:", df.shape)
df.head()

Shape: (1551, 114)


Unnamed: 0,participantId,age_allergies,current_allergies,impactsleep_allergies,sleepimpact_allergies,allergies,anxiety,age_anxiety,current_anxiety,impactsleep_anxiety,...,risk,sleep_trouble,social_activities,stressed,uars,age_uars,current_uars,impactsleep_uars,sleepimpact_uars,timestamp
0,f2514967-9173-4834-96f7-0acdd0298e84,,,,,2.0,3.0,,,,...,4,2.0,4.0,3,3.0,,,,,2016-03-03T07:28:00-08:00
1,7a8e66eb-7c5d-4e55-8967-f3f46f781253,5.0,1.0,1.0,2.0,1.0,1.0,32.0,1.0,1.0,...,23,3.0,5.0,5,2.0,,,,,2016-09-17T23:05:39-05:00
2,2b8a2d5f-f9b1-416d-84f5-a2d87384cc56,43.0,1.0,2.0,1.0,1.0,1.0,42.0,1.0,1.0,...,4,3.0,3.0,4,2.0,,,,,2016-09-18T10:28:16-04:00
3,11599500-9817-47ff-b036-019d6fa85bbf,15.0,2.0,1.0,2.0,1.0,2.0,,,,...,4,2.0,3.0,3,2.0,,,,,2016-03-04T01:25:44-06:00
4,0eba3ad1-3fd1-46d3-9771-dfff5c477b96,50.0,1.0,1.0,1.0,1.0,1.0,44.0,1.0,1.0,...,3,2.0,1.0,1,2.0,,,,,2016-03-24T21:39:55+13:00


In [4]:
# Print the per-column null counts and percentages for the unprocessed frame.
col_stats(df)

Some important stats:
	- This dataset contains 1551  rows.
	 - `` participantId `` has 0 NAs ( 1551 / 1551 ) = 0.00 %
	 - `` age_allergies `` has 867 NAs ( 684 / 1551 ) = 55.90 %
	 - `` current_allergies `` has 833 NAs ( 718 / 1551 ) = 53.71 %
	 - `` impactsleep_allergies `` has 833 NAs ( 718 / 1551 ) = 53.71 %
	 - `` sleepimpact_allergies `` has 835 NAs ( 716 / 1551 ) = 53.84 %
	 - `` allergies `` has 3 NAs ( 1548 / 1551 ) = 0.19 %
	 - `` anxiety `` has 3 NAs ( 1548 / 1551 ) = 0.19 %
	 - `` age_anxiety `` has 1094 NAs ( 457 / 1551 ) = 70.54 %
	 - `` current_anxiety `` has 1085 NAs ( 466 / 1551 ) = 69.95 %
	 - `` impactsleep_anxiety `` has 1085 NAs ( 466 / 1551 ) = 69.95 %
	 - `` sleepimpact_anxiety `` has 1084 NAs ( 467 / 1551 ) = 69.89 %
	 - `` anxious `` has 1 NAs ( 1550 / 1551 ) = 0.06 %
	 - `` apnea `` has 3 NAs ( 1548 / 1551 ) = 0.19 %
	 - `` age_apnea `` has 1228 NAs ( 323 / 1551 ) = 79.17 %
	 - `` current_apnea `` has 1223 NAs ( 328 / 1551 ) = 78.85 %
	 - `` impactsleep_apnea `

In [37]:
# Number of distinct participantId values; if smaller than the row count,
# some participants appear in more than one row.
print("Shape after dropping duplicate participants:", df["participantId"].drop_duplicates().shape)

Shape after dropping duplicate participants: (1478,)


In [12]:
# Default Pipeline
#
# NOTE(review): the saved output under this cell shows `current_allergies=...`
# columns, which the operators below do not produce, and the execution counts
# around it are out of order. The output looks stale; restart the kernel and
# run all cells to refresh it.

# Columns in which the value 3 is replaced with NaN.
# NOTE(review): 3 presumably encodes a "don't know"/no-answer response;
# confirm against the dataset documentation.
CONDITION_COLUMNS = [
    "allergies", "anxiety", "apnea", "asthma", "atrial", "hi_blood_pressure",
    "cancer", "depression", "diabetes", "erectile", "gastroesophageal", "heart_disease",
    "insomnia", "lung", "narcolepsy", "nocturia", "restless_legs_syndrome", "stroke",
    "uars",
]

# Features that are required to be non-null and are then one-hot encoded.
# Kept in one place so the drop-NA step and the encoding step cannot drift apart.
SURVEY_FEATURES = [
    "anxious", "cardiovascular", "compare_one_year", "day_to_day", "depressed", "emotional",
    "fatigued", "general_health", "mental_health", "physical_activities", "physical_health",
    "risk", "sleep_trouble", "social_activities", "stressed",
]

pipeline = ProcessingPipeline([
    # Parse the ISO-8601 timestamps (with UTC offset) and normalise them to UTC.
    ConvertToDatetimeOperator(feature_names="timestamp", format="%Y-%m-%dT%H:%M:%S%z", utc=True),
    # Sort by participant, then time, so that keep="last" below retains each
    # participant's row with the latest timestamp (assuming these operators wrap
    # the pandas operations of the same name; verify against tasrif).
    SortOperator(by=["participantId", "timestamp"]),
    DropDuplicatesOperator(subset="participantId", keep="last"),
    # Same nested mapping as spelling out {column: {3: np.nan}} for every column.
    ReplaceOperator(to_replace={column: {3: np.nan} for column in CONDITION_COLUMNS}),
    # Each operator gets its own copy of the list, as in the original literals.
    DropNAOperator(subset=list(SURVEY_FEATURES)),
    OneHotEncoderOperator(feature_names=list(SURVEY_FEATURES), drop_first=True),
])

mypipe = MyHealthDataset(shc_folder="../../data/sleephealth/", pipeline=pipeline)
df_piped = mypipe.processed_dataframe()
print("Shape:", df_piped.shape)
df_piped.head()

Shape: (1478, 115)


Unnamed: 0,participantId,age_allergies,impactsleep_allergies,sleepimpact_allergies,allergies,anxiety,age_anxiety,current_anxiety,impactsleep_anxiety,sleepimpact_anxiety,...,social_activities,stressed,uars,age_uars,current_uars,impactsleep_uars,sleepimpact_uars,timestamp,current_allergies=2.0,current_allergies=nan
1494,00a3be71-6b4c-46c5-9e50-acf2db62f27b,6.0,2.0,2.0,1.0,1.0,40.0,1.0,1.0,1.0,...,3.0,4,2.0,,,,,2019-02-06 05:03:51+00:00,0,0
502,00c13261-dd38-4730-90c4-beb25ff35822,15.0,1.0,1.0,1.0,2.0,,,,,...,3.0,3,2.0,,,,,2016-07-24 16:23:38+00:00,1,0
287,00d1fe00-fa24-4dcf-a8e8-baafab0cf945,25.0,2.0,2.0,1.0,2.0,,,,,...,3.0,3,2.0,,,,,2016-03-07 17:12:07+00:00,1,0
1064,00e076a6-c761-43f1-8bf3-22ca0e0cd1a9,,,,2.0,2.0,,,,,...,3.0,4,2.0,,,,,2017-02-25 17:40:20+00:00,0,1
1360,00fd4039-9b5e-4bbb-8295-4983a3f58371,,,,2.0,1.0,24.0,2.0,1.0,1.0,...,4.0,4,2.0,,,,,2018-08-17 12:30:04+00:00,0,1
