In [41]:
from datetime import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [42]:
# Read each raw training-period CSV into its own DataFrame. Paths are relative
# to the notebook's working directory.
(
    daily_activity,
    hourly_intensity,
    minute_intensity,
    minute_sleep,
    hourly_steps,
) = [
    pd.read_csv(csv_path)
    for csv_path in (
        "./training/DAILYactivity.csv",
        "./training/INTENSITYhour.csv",
        "./training/INTENSITYminuteNarrow_merged.csv",
        "./training/SLEEPmin.csv",
        "./training/STEPShourly.csv",
    )
]

## Cleaning and manipulating training data

In [46]:
# Parse every "ActivityMinute" stamp once, vectorized, instead of running
# datetime.strptime twice per row through .apply. Both derived columns stay
# plain strings: ISO "YYYY-MM-DD" for the date and 24-hour "HH:MM:SS" for the
# time, the same text str(date) / str(time) produced before.
activity_minute = pd.to_datetime(
    minute_intensity["ActivityMinute"], format="%m/%d/%Y %I:%M:%S %p"
)
minute_intensity["calendar date"] = activity_minute.dt.strftime("%Y-%m-%d")
minute_intensity["unit time"] = activity_minute.dt.strftime("%H:%M:%S")

Unnamed: 0,Id,ActivityMinute,Intensity,calendar date,unit time
0,1503960366,3/12/2016 12:00:00 AM,0,2016-03-12,00:00:00
1,1503960366,3/12/2016 12:01:00 AM,0,2016-03-12,00:01:00
2,1503960366,3/12/2016 12:02:00 AM,0,2016-03-12,00:02:00
3,1503960366,3/12/2016 12:03:00 AM,0,2016-03-12,00:03:00
4,1503960366,3/12/2016 12:04:00 AM,0,2016-03-12,00:04:00
...,...,...,...,...,...
1445035,8877689391,4/12/2016 8:55:00 AM,0,2016-04-12,08:55:00
1445036,8877689391,4/12/2016 8:56:00 AM,0,2016-04-12,08:56:00
1445037,8877689391,4/12/2016 8:57:00 AM,0,2016-04-12,08:57:00
1445038,8877689391,4/12/2016 8:58:00 AM,0,2016-04-12,08:58:00


In [48]:
# Parse the minute-level sleep stamps once (vectorized) and keep the date and
# time parts as strings: "YYYY-MM-DD" and "HH:MM:SS".
sleep_minute = pd.to_datetime(minute_sleep["date"], format="%m/%d/%Y %I:%M:%S %p")
minute_sleep["ActivityDate"] = sleep_minute.dt.strftime("%Y-%m-%d")
minute_sleep["unit time"] = sleep_minute.dt.strftime("%H:%M:%S")

# One row per (Id, ActivityDate); "unit time" holds the number of minute
# records logged for that person on that calendar date. Selecting the column
# before counting avoids counting every other column only to discard it.
# NOTE(review): every logged row is counted regardless of its other columns,
# and rows are bucketed by the calendar date of the stamp — confirm both are
# the intended definition of "minutes of sleep per day".
number_of_mins_of_sleep_per_person_per_day = (
    minute_sleep.groupby(["Id", "ActivityDate"])["unit time"].count().reset_index()
)

In [49]:
# Inspect the per-person, per-day minute counts.
# TODO: bin these counts into poor, adequate, and over-sleep categories.
number_of_mins_of_sleep_per_person_per_day

Unnamed: 0,Id,ActivityDate,unit time
0,1503960366,2016-03-13,426
1,1503960366,2016-03-14,386
2,1503960366,2016-03-15,335
3,1503960366,2016-03-16,366
4,1503960366,2016-03-17,437
...,...,...,...
462,8792009665,2016-04-05,414
463,8792009665,2016-04-06,463
464,8792009665,2016-04-07,475
465,8792009665,2016-04-08,447


In [52]:
# Re-express the sleep dates from "YYYY-MM-DD" as zero-padded "MM/DD/YYYY"
# strings so they can serve as a merge key alongside the daily-activity dates.
# Note: this overwrites the column in place, so the cell cannot be re-run
# without first regenerating the frame (the new strings no longer match the
# "%Y-%m-%d" input format).
number_of_mins_of_sleep_per_person_per_day["ActivityDate"] = [
    datetime.strptime(iso_date, "%Y-%m-%d").strftime("%m/%d/%Y")
    for iso_date in number_of_mins_of_sleep_per_person_per_day["ActivityDate"]
]

In [55]:
# Round-trip the activity dates through strptime/strftime so every value is a
# zero-padded "MM/DD/YYYY" string (e.g. "3/25/2016" -> "03/25/2016").
daily_activity["ActivityDate"] = [
    datetime.strptime(raw_date, "%m/%d/%Y").strftime("%m/%d/%Y")
    for raw_date in daily_activity["ActivityDate"]
]

In [56]:
# Left-join the per-day sleep counts onto the daily activity rows; activity
# days without a matching (Id, ActivityDate) sleep row get NaN in "unit time".
merged_main = daily_activity.merge(
    number_of_mins_of_sleep_per_person_per_day,
    how="left",
    on=["Id", "ActivityDate"],
)

In [57]:
merged_main = (
    merged_main
    # Give the sleep count a descriptive name.
    .rename(columns={"unit time": "number_of_mins_of_sleep_per_day"})
    # Weed out records we don't have sleep data for, either entirely or for
    # certain dates.
    .loc[lambda frame: frame["number_of_mins_of_sleep_per_day"].notna()]
    # Remove 'Calories' and every column whose name contains 'Distance'.
    .drop(
        columns=["Calories"]
        + [col for col in merged_main.columns if "Distance" in col]
    )
)

In [69]:
# The preceding cell already removes 'Calories' and the '*Distance*' columns,
# so on a fresh top-to-bottom run a plain drop here raises KeyError for
# 'Calories'. errors="ignore" makes this cell a harmless no-op in that case
# while still dropping the columns if they happen to be present.
merged_main = merged_main.drop(
    columns=["Calories"] + [col for col in merged_main.columns if "Distance" in col],
    errors="ignore",
)

In [70]:
# Inspect the training frame after the sleep merge and column pruning.
merged_main

Unnamed: 0,Id,ActivityDate,TotalSteps,VeryActiveMinutes,FairlyActiveMinutes,LightlyActiveMinutes,SedentaryMinutes,number_of_mins_of_sleep_per_day
0,1503960366,03/25/2016,11004,33,12,205,804,386.0
1,1503960366,03/26/2016,17609,89,17,274,588,472.0
2,1503960366,03/27/2016,12736,56,5,268,605,506.0
3,1503960366,03/28/2016,13231,39,20,224,1080,77.0
4,1503960366,03/29/2016,12041,28,28,243,763,378.0
...,...,...,...,...,...,...,...,...
437,8792009665,04/05/2016,2332,1,10,111,904,414.0
438,8792009665,04/06/2016,2121,0,0,122,855,463.0
439,8792009665,04/07/2016,1291,0,0,77,888,475.0
440,8792009665,04/08/2016,1467,2,8,71,912,447.0


## Cleaning and manipulating test data

In [139]:
# Read the test-period CSVs into DataFrames. Paths are relative to the
# notebook's working directory.
test_daily_activity, test_sleep, intensity = [
    pd.read_csv(csv_path)
    for csv_path in (
        "./test/dailyActivity_merged.csv",
        "./test/SLEEPDay_merged.csv",
        "./test/INTENSITYdaily.csv",
    )
]

In [140]:
def _sleep_day_to_activity_date(sleep_day):
    """Convert a "SleepDay" stamp into an unpadded "M/D/YYYY" date string.

    The previous implementation used strftime("%-m/%-d/%Y"); the "%-m" / "%-d"
    directives are a platform-specific extension that is not available
    everywhere (e.g. it raises on Windows). Formatting the parsed fields
    directly gives the same text on every platform.

    Args:
        sleep_day: Timestamp text in "%m/%d/%Y %I:%M:%S %p" form.

    Returns:
        The date as "M/D/YYYY" with no zero padding on month or day.

    Raises:
        ValueError: If ``sleep_day`` does not match the expected format.
    """
    parsed = datetime.strptime(sleep_day, "%m/%d/%Y %I:%M:%S %p")
    return "{}/{}/{}".format(parsed.month, parsed.day, parsed.year)


# Build the merge key for the test sleep data as an unpadded date string.
test_sleep["ActivityDate"] = test_sleep["SleepDay"].apply(_sleep_day_to_activity_date)

In [141]:
# Left-join the daily sleep summary onto the test activity rows; activity days
# without a matching (Id, ActivityDate) sleep row get NaN in the sleep columns.
merged_main_test = test_daily_activity.merge(
    test_sleep,
    how="left",
    on=["Id", "ActivityDate"],
)

In [142]:
merged_main_test = (
    merged_main_test
    # Weed out records we don't have sleep data for, either entirely or for
    # certain dates.
    .loc[lambda frame: frame["TotalMinutesAsleep"].notna()]
    # Use the same sleep-column name as the training frame.
    .rename(columns={"TotalMinutesAsleep": "number_of_mins_of_sleep_per_day"})
    # Remove the listed columns plus every column whose name contains 'Distance'.
    .drop(
        columns=["Calories", "SleepDay", "TotalSleepRecords", "TotalTimeInBed"]
        + [col for col in merged_main_test.columns if "Distance" in col]
    )
)

In [143]:
# Inspect the test frame after the sleep merge and column pruning.
merged_main_test

Unnamed: 0,Id,ActivityDate,TotalSteps,VeryActiveMinutes,FairlyActiveMinutes,LightlyActiveMinutes,SedentaryMinutes,number_of_mins_of_sleep_per_day
0,1503960366,4/12/2016,13162,25,13,328,728,327.0
1,1503960366,4/13/2016,10735,21,19,217,776,384.0
3,1503960366,4/15/2016,9762,29,34,209,726,412.0
4,1503960366,4/16/2016,12669,36,10,221,773,340.0
5,1503960366,4/17/2016,9705,38,20,164,539,700.0
...,...,...,...,...,...,...,...,...
901,8792009665,4/30/2016,7174,10,20,301,749,343.0
902,8792009665,5/1/2016,1619,0,0,79,834,503.0
903,8792009665,5/2/2016,1831,0,0,101,916,415.0
904,8792009665,5/3/2016,2421,0,0,156,739,516.0
