In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go


# Read the daily-activity CSV into a DataFrame and preview its first rows.
DATA_PATH = "./data/dailyActivity_merged.csv"
data = pd.read_csv(DATA_PATH)
print(data.head())

           Id ActivityDate  TotalSteps  TotalDistance  TrackerDistance  \
0  1503960366    4/12/2016       13162           8.50             8.50   
1  1503960366    4/13/2016       10735           6.97             6.97   
2  1503960366    4/14/2016       10460           6.74             6.74   
3  1503960366    4/15/2016        9762           6.28             6.28   
4  1503960366    4/16/2016       12669           8.16             8.16   

   LoggedActivitiesDistance  VeryActiveDistance  ModeratelyActiveDistance  \
0                       0.0                1.88                      0.55   
1                       0.0                1.57                      0.69   
2                       0.0                2.44                      0.40   
3                       0.0                2.14                      1.26   
4                       0.0                2.71                      0.41   

   LightActiveDistance  SedentaryActiveDistance  VeryActiveMinutes  \
0                 6.06

Before starting the analysis, we must check whether the dataset contains any missing values

In [15]:
# Count the missing entries in every column.
null_counts = data.isnull().sum()
print(null_counts)

Id                          0
ActivityDate                0
TotalSteps                  0
TotalDistance               0
TrackerDistance             0
LoggedActivitiesDistance    0
VeryActiveDistance          0
ModeratelyActiveDistance    0
LightActiveDistance         0
SedentaryActiveDistance     0
VeryActiveMinutes           0
FairlyActiveMinutes         0
LightlyActiveMinutes        0
SedentaryMinutes            0
Calories                    0
dtype: int64


Having verified that there are no null values, we can look at the columns
and their data types

In [16]:
# Summarize the columns, their non-null counts and their dtypes.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 940 entries, 0 to 939
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Id                        940 non-null    int64  
 1   ActivityDate              940 non-null    object 
 2   TotalSteps                940 non-null    int64  
 3   TotalDistance             940 non-null    float64
 4   TrackerDistance           940 non-null    float64
 5   LoggedActivitiesDistance  940 non-null    float64
 6   VeryActiveDistance        940 non-null    float64
 7   ModeratelyActiveDistance  940 non-null    float64
 8   LightActiveDistance       940 non-null    float64
 9   SedentaryActiveDistance   940 non-null    float64
 10  VeryActiveMinutes         940 non-null    int64  
 11  FairlyActiveMinutes       940 non-null    int64  
 12  LightlyActiveMinutes      940 non-null    int64  
 13  SedentaryMinutes          940 non-null    int64  
 14  Calories  

Our target column will be ActivityDate, since it contains the first data we will analyze, so we need to convert it to a datetime type

In [17]:
# Parse the month/day/year strings in ActivityDate into datetime values.
data['ActivityDate'] = pd.to_datetime(data['ActivityDate'], format='%m/%d/%Y')

# Confirm the new dtype. info() prints its own report and returns None, so it
# is not wrapped in print() (that would add a stray "None" line).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 940 entries, 0 to 939
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Id                        940 non-null    int64         
 1   ActivityDate              940 non-null    datetime64[ns]
 2   TotalSteps                940 non-null    int64         
 3   TotalDistance             940 non-null    float64       
 4   TrackerDistance           940 non-null    float64       
 5   LoggedActivitiesDistance  940 non-null    float64       
 6   VeryActiveDistance        940 non-null    float64       
 7   ModeratelyActiveDistance  940 non-null    float64       
 8   LightActiveDistance       940 non-null    float64       
 9   SedentaryActiveDistance   940 non-null    float64       
 10  VeryActiveMinutes         940 non-null    int64         
 11  FairlyActiveMinutes       940 non-null    int64         
 12  LightlyActiveMinutes  


We now combine the VeryActiveMinutes, FairlyActiveMinutes, LightlyActiveMinutes and SedentaryMinutes columns of the dataset
in order to obtain the total number of minutes recorded per day (TotalMinutes)

In [18]:
# Total tracked minutes per record: the sum of the four activity-level columns.
data['TotalMinutes'] = (
    data['VeryActiveMinutes']
    + data['FairlyActiveMinutes']
    + data['LightlyActiveMinutes']
    + data['SedentaryMinutes']
)

# Preview a few random rows; the fixed random_state makes the preview
# reproducible across runs.
print(data['TotalMinutes'].sample(5, random_state=42))

530    1067
3       998
234    1440
338    1104
194     902
Name: TotalMinutes, dtype: int64



Now let's have a look at the descriptive statistics of the dataset

In [19]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column.
summary_stats = data.describe()
print(summary_stats)

                 Id    TotalSteps  TotalDistance  TrackerDistance  \
count  9.400000e+02    940.000000     940.000000       940.000000   
mean   4.855407e+09   7637.910638       5.489702         5.475351   
std    2.424805e+09   5087.150742       3.924606         3.907276   
min    1.503960e+09      0.000000       0.000000         0.000000   
25%    2.320127e+09   3789.750000       2.620000         2.620000   
50%    4.445115e+09   7405.500000       5.245000         5.245000   
75%    6.962181e+09  10727.000000       7.712500         7.710000   
max    8.877689e+09  36019.000000      28.030001        28.030001   

       LoggedActivitiesDistance  VeryActiveDistance  ModeratelyActiveDistance  \
count                940.000000          940.000000                940.000000   
mean                   0.108171            1.502681                  0.567543   
std                    0.619897            2.658941                  0.883580   
min                    0.000000            0.000000   

The dataset has a Calories column, which contains the number of calories burned per day. Let's look at the relationship between the number of calories burned and the total number of steps per day

In [None]:
# Scatter plot of calories against total steps; marker size follows
# VeryActiveMinutes and an OLS trendline is overlaid.
figure = px.scatter(
    data,
    x='Calories',
    y='TotalSteps',
    size='VeryActiveMinutes',
    trendline='ols',
    title='Relationship between Calories & Total Steps',
)
figure.show()

<img src = media/scatter.jpg>


Now let's look at the average number of minutes per day spent at each activity level


In [None]:
# Pie chart of the mean minutes spent at each activity level.
label = ["Very Active Minutes", "Fairly Active Minutes",
         "Lightly Active Minutes", "Inactive Minutes"]
# Column means, listed in the same order as the labels above.
counts = data[["VeryActiveMinutes", "FairlyActiveMinutes",
               "LightlyActiveMinutes", "SedentaryMinutes"]].mean()
colors = ['gold', 'lightgreen', "pink", "blue"]

fig = go.Figure(data=[go.Pie(labels=label, values=counts)])
fig.update_layout(title_text='Total Active Minutes')
fig.update_traces(hoverinfo='label+percent', textinfo='value', textfont_size=30,
                  marker=dict(colors=colors, line=dict(color='black', width=3)))
# Render explicitly, like the other plotting cells in this notebook.
fig.show()

<img src = media/pie.jpg>

Earlier we converted the 'ActivityDate' column to a datetime type. Let's use that column to add a new column containing the day of the week for each record, which will let us find the least active ("lazy") days

<img src = media/specs.png>

In [23]:
# Derive the weekday name of each record from its ActivityDate.
weekday_names = data['ActivityDate'].dt.day_name()
data['Day'] = weekday_names
print(data['Day'].head())

0      Tuesday
1    Wednesday
2     Thursday
3       Friday
4     Saturday
Name: Day, dtype: object


Let's plot the active minutes for each day of the week


In [None]:
# Grouped bar chart of active minutes by weekday: one trace per activity
# level, added in the order listed here.
activity_traces = [
    ('VeryActiveMinutes', 'Very Active', 'purple'),
    ('FairlyActiveMinutes', 'Fairly Active', 'green'),
    ('LightlyActiveMinutes', 'Lightly Active', 'pink'),
]

fig = go.Figure()
for column, trace_name, bar_color in activity_traces:
    fig.add_trace(go.Bar(
        x=data['Day'],
        y=data[column],
        name=trace_name,
        marker_color=bar_color,
    ))
fig.update_layout(barmode='group', xaxis_tickangle=-45)
fig.show()

<img src = media/figure.jpg>


Now let's see the number of inactive minutes on each day of the week

In [None]:
# Total sedentary minutes per weekday.
# The values are aggregated by 'Day' so each of the seven labels is paired
# with the total for that weekday; passing the raw per-record column would
# give 940 values for 7 labels and the slices would not represent weekdays.
sedentary_by_day = data.groupby('Day')['SedentaryMinutes'].sum()
label = sedentary_by_day.index
counts = sedentary_by_day.values
colors = ['gold', 'lightgreen', 'pink', 'blue', 'skyblue', 'cyan', 'orange']

fig = go.Figure(data=[go.Pie(labels=label, values=counts)])
fig.update_layout(title_text='Inactive Minutes Daily')
fig.update_traces(hoverinfo='label+percent', textinfo='value', textfont_size=30,
                  marker=dict(colors=colors, line=dict(color='black', width=3)))
fig.show()




<img  src = media/pie2.jpg>


Based on the information collected from several people, we can conclude that Thursdays
are the least productive days in general.

Now let's see the number of calories burned for each day of the week

In [None]:
# Total calories burned per weekday.
# Calories are aggregated by 'Day' so each weekday label is paired with that
# weekday's total rather than with an unrelated raw record.
calories = data.groupby('Day')['Calories'].sum()
label = calories.index
counts = calories.values
# One colour per weekday (seven slices).
colors = ['gold', 'lightgreen', 'pink', 'blue', 'purple', 'orange', 'skyblue']

fig = go.Figure(data=[go.Pie(labels=label, values=counts)])
fig.update_layout(title_text='Calories Burned Daily')
fig.update_traces(hoverinfo='label+percent', textinfo='value', textfont_size=30,
                  marker=dict(colors=colors, line=dict(color='black', width=3)))
fig.show()




<img src = media/pie3.jpg>

In conclusion, we can observe that Tuesday is indeed one of the days with the greatest movement for most of the individuals in the dataset, since most of the calories are burned on that day  