In [3]:
%load_ext autoreload
%autoreload 2

import numpy as np

from tasrif.processing_pipeline import ProcessingPipeline
from tasrif.processing_pipeline.pandas import ConvertToDatetimeOperator, SortOperator, ReplaceOperator
from tasrif.processing_pipeline.pandas import DropDuplicatesOperator, DropNAOperator, DropFeaturesOperator
from tasrif.processing_pipeline.custom import OneHotEncoderOperator

from tasrif.data_readers.sleep_health import SleepAssessmentDataset


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
def col_stats(df):
    print('Some important stats:')
    print('\t- This dataset contains', len(df) ,' rows.')
    for col in df.columns:
        null_percentage = "{:.2f}".format(df[col].isnull().sum()/len(df)*100)
        print('\t - ``', col, '`` has', df[col].isnull().sum(), 'NAs (', df[col].count().sum(), '/', len(df), ') =',
              null_percentage, '%')

In [6]:
# Full MyFamilyDataset
mf = SleepAssessmentDataset(shc_folder="../../data/sleephealth/", pipeline=None)
df = mf.raw_df.copy()
print("Shape:", df.shape)
df.head()

Shape: (2325, 23)


Unnamed: 0,participantId,alcohol,concentrating_problem_one,concentrating_problem_two,discomfort_in_sleep,exercise,fatigue_limit,feel_tired_frequency,felt_alert,had_problem,...,sleep_aids,sleep_problem,think_clearly,tired_easily,told_by_doctor,told_by_doctor_specify,told_to_doctor,other_selected,trouble_staying_awake,timestamp
0,6da7e848-daaa-410c-a83f-35b63498595c,5.0,2.0,3.0,12356,2.0,3.0,3.0,3.0,2.0,...,5.0,1.0,2.0,4.0,1.0,1.0,1.0,,3.0,2016-03-02T00:10:29-05:00
1,b57890d5-2346-4e14-aae1-90d8e8206faf,4.0,2.0,3.0,24,3.0,2.0,4.0,2.0,4.0,...,5.0,1.0,2.0,3.0,2.0,,1.0,,2.0,2016-03-03T12:21:19+02:00
2,f2514967-9173-4834-96f7-0acdd0298e84,5.0,3.0,2.0,1456,1.0,3.0,3.0,1.0,3.0,...,4.0,3.0,2.0,3.0,2.0,,1.0,,2.0,2016-03-02T00:25:14-08:00
3,a051552c-d12f-46fc-8a16-082349220970,4.0,1.0,1.0,146,5.0,1.0,2.0,4.0,3.0,...,1.0,3.0,1.0,2.0,2.0,,1.0,,1.0,2016-09-22T09:56:19-07:00
4,f2ac8670-492a-42ce-8c6c-c694f42f3ddf,4.0,4.0,4.0,234,1.0,4.0,4.0,3.0,3.0,...,4.0,3.0,4.0,4.0,2.0,,2.0,,4.0,2016-03-03T05:25:42-05:00


In [7]:
col_stats(df)

Some important stats:
	- This dataset contains 2325  rows.
	 - `` participantId `` has 0 NAs ( 2325 / 2325 ) = 0.00 %
	 - `` alcohol `` has 2 NAs ( 2323 / 2325 ) = 0.09 %
	 - `` concentrating_problem_one `` has 4 NAs ( 2321 / 2325 ) = 0.17 %
	 - `` concentrating_problem_two `` has 4 NAs ( 2321 / 2325 ) = 0.17 %
	 - `` discomfort_in_sleep `` has 7 NAs ( 2318 / 2325 ) = 0.30 %
	 - `` exercise `` has 30 NAs ( 2295 / 2325 ) = 1.29 %
	 - `` fatigue_limit `` has 9 NAs ( 2316 / 2325 ) = 0.39 %
	 - `` feel_tired_frequency `` has 1 NAs ( 2324 / 2325 ) = 0.04 %
	 - `` felt_alert `` has 1 NAs ( 2324 / 2325 ) = 0.04 %
	 - `` had_problem `` has 6 NAs ( 2319 / 2325 ) = 0.26 %
	 - `` hard_times `` has 3 NAs ( 2322 / 2325 ) = 0.13 %
	 - `` medication_by_doctor `` has 4 NAs ( 2321 / 2325 ) = 0.17 %
	 - `` poor_sleep_problems `` has 0 NAs ( 2325 / 2325 ) = 0.00 %
	 - `` sleep_aids `` has 4 NAs ( 2321 / 2325 ) = 0.17 %
	 - `` sleep_problem `` has 8 NAs ( 2317 / 2325 ) = 0.34 %
	 - `` think_clearly `` has

In [8]:
print("Shape after dropping duplicate participants:", df["participantId"].drop_duplicates().shape)

Shape after dropping duplicate participants: (2228,)


In [25]:
# Default Pipeline
pipeline = ProcessingPipeline([
    ConvertToDatetimeOperator(feature_names="timestamp", format="%Y-%m-%dT%H:%M:%S%z", utc=True),
    SortOperator(by=["participantId", "timestamp"]),
    DropDuplicatesOperator(subset="participantId", keep="last"),
    ReplaceOperator(to_replace={"alcohol": {7: np.nan},
                                "medication_by_doctor": {7: np.nan},
                                "sleep_aids": {7: np.nan},
                                "told_by_doctor": {3: np.nan},
                                "told_to_doctor": {3: np.nan},
                                "told_by_doctor_specify": {np.nan: '8'},
                                "other_selected": {np.nan: ''},
                                }),
    DropNAOperator(subset=['alcohol', 'concentrating_problem_one', 'concentrating_problem_two',
                           'discomfort_in_sleep', 'exercise', 'fatigue_limit', 'feel_tired_frequency',
                           'felt_alert', 'had_problem', 'hard_times', 'medication_by_doctor',
                           'poor_sleep_problems', 'sleep_aids', 'sleep_problem', 'think_clearly',
                           'tired_easily', 'told_by_doctor',  'told_to_doctor', 'trouble_staying_awake']),
    OneHotEncoderOperator(feature_names=['alcohol', 'concentrating_problem_one', 'concentrating_problem_two',
                           'discomfort_in_sleep', 'exercise', 'fatigue_limit', 'feel_tired_frequency',
                           'felt_alert', 'had_problem', 'hard_times', 'medication_by_doctor',
                           'poor_sleep_problems', 'sleep_aids', 'sleep_problem', 'think_clearly',
                           'tired_easily', 'told_by_doctor',  'told_to_doctor', 'trouble_staying_awake',
                            'told_by_doctor_specify'],
                            drop_first=True),
    ])


mypipe = SleepAssessmentDataset(shc_folder="../../data/sleephealth/", pipeline=pipeline)
df_piped = mypipe.processed_dataframe()
print("Shape:", df_piped.shape)
df_piped.head()

Shape: (2123, 83)


Unnamed: 0,participantId,other_selected,timestamp,alcohol=2.0,alcohol=3.0,alcohol=4.0,alcohol=5.0,alcohol=6.0,concentrating_problem_one=2.0,concentrating_problem_one=3.0,...,trouble_staying_awake=3.0,trouble_staying_awake=4.0,trouble_staying_awake=5.0,told_by_doctor_specify=2,told_by_doctor_specify=3,told_by_doctor_specify=4,told_by_doctor_specify=5,told_by_doctor_specify=6,told_by_doctor_specify=7,told_by_doctor_specify=8
2251,00a3be71-6b4c-46c5-9e50-acf2db62f27b,,2019-02-01 15:14:34+00:00,0,1,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
1146,00a55fb5-da33-4e2e-ae61-28f589fcc174,,2016-05-13 11:11:28+00:00,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,1
894,00c13261-dd38-4730-90c4-beb25ff35822,,2016-07-24 16:25:08+00:00,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1030,00dbfe89-8c89-4933-9e84-bb8624787026,,2016-03-29 14:54:45+00:00,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,1
1707,00e076a6-c761-43f1-8bf3-22ca0e0cd1a9,Bi-polar disorder,2017-02-25 17:43:16+00:00,0,0,1,0,0,1,0,...,0,0,0,1,0,0,0,0,1,0


In [26]:
df_piped["other_selected"].unique()

array(['', 'Bi-polar disorder',
       'Obstructive, Cental, & hypopnea Sleep apnea’s with a rating of 27 for moderate to severe. I also have Postural Orthostatic Tachycardia Syndrome & chronic hypotension ',
       'Narcolepsy with Cataplexy  Periodic Limb Movement Disorder  Restless Leg Syndrome ',
       'PLMD', 'OSA', '"Unidentified Sleep Disorder"',
       "Spent a night in a sleep lab. Was told that I don't go into REM sleep but have micro burst of activity that keep me from going into a deep sleep",
       'I think there’s a glitch in your system. I did not check other. I checked sleep apnea, insomnia, hypersomnia, and circadian rhythm disorder.',
       'Bipolar Disorder',
       "Hypersomnia (from Lyme's Disease) that struggle with daily and periods of insomnia (prob from Hypersomnia medication Midafinil)",
       'Dysautonomia sometimes has sleep problems as a symptom',
       'Recurrent hypersomnia ', 'Sleep paralysis/night terrors',
       'Primary Central Sleep Apnea Shall