# Google Data Analytics Professional Certificate Case Study.
Bellabeat, a high-tech manufacturer of health-focused products for women, is a successful small company with the potential to become a larger player in the global smart device market. Urška Sršen, cofounder and Chief Creative Officer of Bellabeat, believes that analyzing smart device fitness data could help unlock new growth opportunities for the company. Hence, the objective of this case study is to analyze smart device data to gain insight into how consumers use their smart devices. Based on the available dataset, activity and sleep data will be analyzed.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### There are 4 tables to create: daily_activity,  hourly_activity,   daily_sleep,  minute_sleep
### 1. Prepare the daily_activity dataset:

In [None]:
# Load the merged daily-activity CSV and display it.
daily_activity = pd.read_csv(
    '../input/fitbit/Fitabase Data 4.12.16-5.12.16/dailyActivity_merged.csv'
)
daily_activity

In [None]:
daily_activity.info()
#no null columns.

In [None]:
daily_activity[daily_activity.duplicated()]
# no duplicate rows.

In [None]:
# Treat the user identifier as a label rather than a number, and parse the date column.
daily_activity['Id'] = daily_activity['Id'].astype('category')
daily_activity['ActivityDate'] = pd.to_datetime(daily_activity['ActivityDate'])

# Sum of the four activity-level minute counters, expressed in hours.
daily_activity['TotalHours'] = ((daily_activity.VeryActiveMinutes + daily_activity.FairlyActiveMinutes
                               + daily_activity.LightlyActiveMinutes + daily_activity.SedentaryMinutes) / 60).round(2)

# Weekday name stored as a categorical whose categories run Monday..Sunday, so
# plots and group-bys list the days in calendar order instead of alphabetically.
# Built with pd.Categorical because Series.cat.reorder_categories(..., inplace=True)
# no longer exists in pandas 2.x, and reorder_categories raises when a weekday
# is absent from the data.
week_day = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
daily_activity['Day'] = pd.Categorical(daily_activity['ActivityDate'].dt.day_name(), categories=week_day)

In [None]:
daily_activity

### 2. Prepare the hourly_activity dataset:

In [None]:
# hourly_activity = "hourlyCalories_merged.csv" + "hourlyIntensities_merged.csv" + "hourlySteps_merged.csv"
# Load the three hourly CSVs that are merged into hourly_activity in a later cell.

hourly_calories = pd.read_csv(
    '../input/fitbit/Fitabase Data 4.12.16-5.12.16/hourlyCalories_merged.csv'
)
hourly_intensities = pd.read_csv(
    '../input/fitbit/Fitabase Data 4.12.16-5.12.16/hourlyIntensities_merged.csv'
)
hourly_steps = pd.read_csv(
    '../input/fitbit/Fitabase Data 4.12.16-5.12.16/hourlySteps_merged.csv'
)

In [None]:
# Print the row count and column names of each hourly table before merging them.
for table_name, table in (
    ('hourly_calories', hourly_calories),
    ('hourly_intensities', hourly_intensities),
    ('hourly_steps', hourly_steps),
):
    print(f'{table_name} | Total rows:', len(table), '| Column Names:', table.columns)

In [None]:
# Left-join intensities and steps onto the calories table, matching on user and hour.
merge_keys = ['Id', 'ActivityHour']
hourly_activity = hourly_calories.merge(hourly_intensities, how='left', on=merge_keys)
hourly_activity = hourly_activity.merge(hourly_steps, how='left', on=merge_keys)

In [None]:
hourly_activity

In [None]:
hourly_activity.info()
# no null values

In [None]:
hourly_activity[hourly_activity.duplicated()]
# no duplicate rows

In [None]:
# Categorical user id, parsed timestamp, and the hour of day (0-23) taken from it.
hourly_activity['Id'] = hourly_activity['Id'].astype('category')
hourly_activity['ActivityHour'] = pd.to_datetime(hourly_activity['ActivityHour'])
hourly_activity['Hour'] = hourly_activity['ActivityHour'].dt.hour

In [None]:
hourly_activity

### 3. Prepare the daily_sleep dataset:

In [None]:
daily_sleep = pd.read_csv('../input/fitbit/Fitabase Data 4.12.16-5.12.16/sleepDay_merged.csv')
daily_sleep

In [None]:
# Rename the columns positionally.
# NOTE(review): this assumes the CSV has exactly five columns in this order; verify against the file.
daily_sleep.columns = ['Id', 'Date', 'TotalSleepRecords', 'MinutesAsleep', 'MinutesInBed']

In [None]:
daily_sleep.info()
# no null values.

In [None]:
daily_sleep[daily_sleep.duplicated()]

In [None]:
# Drop exact duplicate rows. The explicit .copy() makes the result an independent
# frame, so the column assignments in later cells do not trigger pandas'
# SettingWithCopyWarning.
daily_sleep = daily_sleep.drop_duplicates().copy()
daily_sleep
# 3 duplicate rows removed

In [None]:
import warnings

# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices from pandas and seaborn.
warnings.filterwarnings('ignore')

# Categorical user id and parsed date.
daily_sleep['Id'] = daily_sleep['Id'].astype('category')
daily_sleep['Date'] = pd.to_datetime(daily_sleep['Date'])

In [None]:
# Durations in hours, time spent in bed but not asleep, and the share of time in bed spent asleep.
daily_sleep['HoursAsleep'] = (daily_sleep.MinutesAsleep / 60).round(2)
daily_sleep['HoursInBed'] = (daily_sleep.MinutesInBed / 60).round(2)
daily_sleep['MinutesNotAsleep'] = daily_sleep.MinutesInBed - daily_sleep.MinutesAsleep
daily_sleep['PercentAsleep'] = (daily_sleep.MinutesAsleep / daily_sleep.MinutesInBed).round(2)

# Weekday name stored as a categorical whose categories run Monday..Sunday, so
# plots and group-bys list the days in calendar order. Built with pd.Categorical
# because Series.cat.reorder_categories(..., inplace=True) no longer exists in
# pandas 2.x, and reorder_categories raises when a weekday is absent from the data.
week_day = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
daily_sleep['Day'] = pd.Categorical(daily_sleep['Date'].dt.day_name(), categories=week_day)

In [None]:
daily_sleep

### 4. Prepare the minute_sleep dataset:

In [None]:
minute_sleep = pd.read_csv('../input/fitbit/Fitabase Data 4.12.16-5.12.16/minuteSleep_merged.csv')
minute_sleep

In [None]:
minute_sleep.info()
#no null values

In [None]:
minute_sleep[minute_sleep.duplicated()]

In [None]:
#Randomly see what the duplicate rows look like.
minute_sleep[(minute_sleep.Id == 4702921684) & (minute_sleep.date == '5/7/2016 6:12:00 AM')]

In [None]:
# Drop exact duplicate rows. The explicit .copy() makes the result an independent
# frame, so the column assignments in later cells do not trigger pandas'
# SettingWithCopyWarning.
minute_sleep = minute_sleep.drop_duplicates().copy()
minute_sleep
# 543 rows removed

In [None]:
minute_sleep.info()

In [None]:
# Rename the columns positionally.
# NOTE(review): this assumes the CSV has exactly four columns in this order; verify against the file.
minute_sleep.columns = ['Id', 'Date', 'Value', 'LogId']

In [None]:
# Both identifiers become categoricals; the timestamp string is parsed to datetime.
for id_column in ('Id', 'LogId'):
    minute_sleep[id_column] = minute_sleep[id_column].astype('category')
minute_sleep['Date'] = pd.to_datetime(minute_sleep['Date'])

In [None]:
minute_sleep

# Let's Start Analysing!

### Q1: Let's look at people's activity-level breakdown.

In [None]:
# Descriptive statistics, one row per numeric column. Restricting to numeric
# dtypes keeps the datetime column out of the table: pandas 2.x summarises
# datetimes by default, which would add a row and shift the positions of the others.
summary_daily_activity = daily_activity.describe(include=[np.number]).transpose().round(2)
summary_daily_activity

In [None]:
# Mean minutes per day at each activity level. The rows are selected by label
# instead of by position, so the result does not depend on the column order of
# the summary table.
activity_minute_columns = ['VeryActiveMinutes', 'FairlyActiveMinutes',
                           'LightlyActiveMinutes', 'SedentaryMinutes']
Q1_df = summary_daily_activity.loc[activity_minute_columns, 'mean']
Q1_df

In [None]:
# Pie chart of the mean minutes spent at each activity level.
pie, ax = plt.subplots(figsize=[8,8])
labels = Q1_df.keys()
ax.pie(
    x=Q1_df,
    labels=labels,
    autopct="%.1f%%",
    pctdistance=0.5,
    labeldistance=1.2,
)
ax.set_title("Activity Level Breakdown", fontsize=14)
plt.show()

# On average 81.3% of the time people are sedentary.

### Q2: How does people's activity vary throughout the week?

In [None]:
# Default figure size and plot style for the rest of the notebook.
plt.rcParams['figure.figsize'] = (10, 6)
sns.set_style('darkgrid')

# Two stacked boxplots sharing the weekday axis: calories on top, steps below.
f, axes = plt.subplots(2, 1, figsize=(10, 12), sharex=True)
for panel, metric in zip(axes, ['Calories', 'TotalSteps']):
    sns.boxplot(data=daily_activity, x='Day', y=metric, ax=panel)
plt.show()

# The calories burned do not vary much throughout the week.

# Although the median number of steps does not differ much throughout the week,
# on Saturdays the data is skewed towards people taking more steps, while on Sundays they take fewer steps.

### Q3: How do Calories and Steps vary by the hour?

In [None]:
# Create a table to summarise the average values per hour.

# Average of every numeric column per hour of day. numeric_only=True skips the
# categorical Id and the datetime ActivityHour columns; without it pandas 2.x
# raises a TypeError instead of silently dropping them.
summary_hourly_activity = hourly_activity.groupby('Hour').mean(numeric_only=True).reset_index().round(2)

# Positional rename.
# NOTE(review): this assumes the remaining numeric columns are, in order, calories,
# total intensity, average intensity and steps; confirm against the merged table.
summary_hourly_activity.columns = ['Hour', 'AvgCalories', 'AvgTotalIntensity', 'AvgIntensity', 'AvgSteps']
summary_hourly_activity

In [None]:
# Three stacked line plots over hour of day: calories, steps, intensity.
f, axes = plt.subplots(3, 1, figsize=(10, 7), sharex=True)

Q3_axis = np.arange(24)
Q3_f1, Q3_f2, Q3_f3 = [
    sns.lineplot(data=summary_hourly_activity, x='Hour', y=metric, ax=panel)
    for metric, panel in zip(['AvgCalories', 'AvgSteps', 'AvgIntensity'], axes)
]

# One tick per hour, set on the first panel.
Q3_f1.set_xticks(Q3_axis)
plt.show()

In [None]:
#The three plots above have very similar shapes, showing they are very highly correlated with each other. 
#Let's verify this with a heatmap.

# Pairwise correlation of the hourly averages, drawn as an annotated heatmap.
correlations = summary_hourly_activity.corr()
sns.heatmap(data=correlations, annot=True)
plt.show()

In [None]:
#Just to prove the correlation, we can see here that StepTotal and Calories are positively correlated.

# Scatter of hourly steps against hourly calories with a fitted regression line.
sns.lmplot(
    x='StepTotal',
    y='Calories',
    data=hourly_activity,
    height=6,
    aspect=1.5,
)
plt.show()

### Q5: How many hours of sleep do fitbit users get in a day?

In [None]:
daily_sleep.describe().transpose()

In [None]:
# Distribution of hours asleep per record, with one x tick per hour from 0 to 13.
Q5_axis = np.arange(14)
Q5_f1 = sns.histplot(data=daily_sleep, x='HoursAsleep')
Q5_f1.set_xticks(Q5_axis)
plt.show()

# On average people get approximately 7 hours of sleep per day.
# It is interesting to note the number of people who record only 1-2 hours of sleep a day.

### Q6: How much time do people spend awake in bed?

In [None]:
# Distribution of minutes in bed but not asleep, 40 bins, x ticks every 20 minutes.
Q6_axis = np.arange(0, 400, 20)
Q6_f1 = sns.histplot(data=daily_sleep, x='MinutesNotAsleep', bins=40)
Q6_f1.set_xticks(Q6_axis)
plt.show()

# On average people spend 40 minutes lying awake in bed.

### Q7: Do people have different sleep patterns on different days of the week?

In [None]:
# Hours asleep per weekday.
sns.boxplot(data=daily_sleep, x='Day', y='HoursAsleep')
plt.show()

# There is a wider variation in sleep time on the weekends.
# Also people spend more time in bed on Sundays.

In [None]:
# Mean of every numeric sleep column per weekday. numeric_only=True skips the
# categorical Id and the datetime Date columns; without it pandas 2.x raises a
# TypeError. observed=False keeps every weekday category in the result.
Q7_df = daily_sleep.groupby('Day', observed=False).mean(numeric_only=True)
Q7_df

In [None]:
# Average hours asleep for each weekday.
sns.barplot(data=Q7_df.reset_index(), x='Day', y='HoursAsleep')
plt.show()
# On average people get more than 7 hours of sleep on Wednesday and Sunday, and less than 7 on other days.

### Q8: How frequently do people take naps?

In [None]:
daily_sleep.head()

In [None]:
# Here I assume that more than 1 sleep record means the user took a nap sometime in the day in addition to their normal sleep at night.

# Number of rows for each value of TotalSleepRecords.
daily_sleep.groupby('TotalSleepRecords')[['Id']].count()

In [None]:
# Count, per weekday, the rows that have more than one sleep record.
multi_record_days = daily_sleep[daily_sleep['TotalSleepRecords'] > 1]
Q8_df = (
    multi_record_days.groupby(['Day'])['Id']
    .count()
    .to_frame()
    .reset_index()
    .rename(columns={'Id': 'NapCount'})
)
Q8_df

In [None]:
# Bar chart of the nap counts per weekday.
sns.catplot(
    x='Day',
    y='NapCount',
    data=Q8_df,
    kind='bar',
    height=6,
    aspect=1.5,
)

plt.show()

### Q9: What time do people usually go to bed?

In [None]:
len(minute_sleep.LogId.unique())

# There are 459 sleep "sessions". So let's create a table to summarise these sleep sessions.

In [None]:
# One row per LogId: the number of minute-level rows in the session, converted to hours.
minutes_per_log = minute_sleep.groupby(['LogId'])['Id'].count()
sleep_session = minutes_per_log.to_frame().reset_index().rename(columns={'Id': 'SleepHours'})
sleep_session['SleepHours'] = sleep_session['SleepHours'].div(60).round(2)

In [None]:
# Earliest timestamp of every sleep session, in the same order as sleep_session.
# A single group-by replaces re-filtering the whole minute table once per LogId.
x = list(sleep_session.LogId)
first_minute_by_log = minute_sleep.groupby('LogId', observed=True)['Date'].min().to_dict()
time_list = [first_minute_by_log[record] for record in x]

In [None]:
# Attach each session's start time and derive its hour of day.
sleep_session['StartTime'] = time_list
sleep_session['StartHour'] = sleep_session['StartTime'].dt.hour

# Weekday name of the start time, stored as a categorical whose categories run
# Monday..Sunday. Built with pd.Categorical because
# Series.cat.reorder_categories(..., inplace=True) no longer exists in pandas 2.x,
# and reorder_categories raises when a weekday is absent from the data.
week_day = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
sleep_session['Day'] = pd.Categorical(sleep_session['StartTime'].dt.day_name(), categories=week_day)

In [None]:
# Flag a session as a nap when it starts during the daytime (start hour from 9 to 19 inclusive).
starts_in_daytime = sleep_session['StartHour'].between(9, 19)
sleep_session['Nap'] = np.where(starts_in_daytime, 'Yes', 'No')

In [None]:
sleep_session

In [None]:
# Histogram of session start hours with one bar per hour. discrete=True uses
# unit-width bins centred on each integer; the previous bins=23 spread 24
# possible hours over 23 bins, so hours 22 and 23 were drawn as a single bar.
Q9_axis = np.arange(0, 24)

Q9_f1 = sns.histplot(sleep_session.StartHour, discrete=True)
Q9_f1.set_xticks(Q9_axis)
plt.show()

# Most people go to bed before midnight.

In [None]:
# Now let's see how bedtime differs by Day of the week.
# Here we can see that more people are going to bed after midnight as the weekend gets nearer.

# One start-hour histogram per weekday. Explicit edges 0, 1, ..., 24 give each
# hour its own bin; the previous bins=23 merged hours 22 and 23 into one bar
# whenever the data spanned 0-23, and let bin edges differ between facets.
Q9_f2 = sns.FacetGrid(sleep_session, col='Day')
kws = dict(bins=np.arange(25))
Q9_f2.map(plt.hist, 'StartHour', **kws)
plt.show()

### Q10: How are bedtime and sleep duration correlated?

In [None]:
f, axes = plt.subplots(1, 1, figsize=(12, 7))

# Sessions flagged as naps are left out to reduce clutter.
non_nap_sessions = sleep_session[sleep_session['Nap'] == 'No']
Q10_f1 = sns.boxplot(data=non_nap_sessions, x='StartHour', y='SleepHours')

# Dashed guide lines at 7 and 9 hours of sleep.
for guide_hours in (7, 9):
    Q10_f1.axhline(guide_hours, c='gray', ls='--')
plt.show()

# As expected those who go to bed earlier tend to get the recommended 7-9 hours of sleep.

### Q11: Do people regularly wear their Fitbit throughout the day and to monitor their sleep?

In [None]:
# First we count how many times the user used their Fitbit to record their sleep in the month under review.

# Number of sleep rows per user, sorted ascending by count.
sleep_records = (
    daily_sleep.groupby(['Id'])['Date']
    .count()
    .to_frame()
    .sort_values(by='Date', ascending=True)
    .reset_index()
    .rename(columns={'Date': 'TotalSleepRecords'})
)
sleep_records.head()

In [None]:
# Then we count how many days each user wore their Fitbit throughout the month. 

# Number of activity rows per user, sorted ascending by count.
activity_records = (
    daily_activity.groupby(['Id'])['ActivityDate']
    .count()
    .to_frame()
    .sort_values(by='ActivityDate', ascending=True)
    .reset_index()
    .rename(columns={'ActivityDate': 'TotalActivityRecords'})
)
activity_records.head()

In [None]:
#And now we combine the two.

# Outer join keeps users that appear in only one of the two tables.
user_records = activity_records.merge(sleep_records, how='outer', on=['Id'])

user_records.head()

In [None]:
# Users without any sleep rows get a count of 0; counts become integers and Id a categorical.
user_records['TotalSleepRecords'] = user_records['TotalSleepRecords'].fillna(0).astype('int')
user_records['Id'] = user_records['Id'].astype('category')

In [None]:
user_records.sort_values(by='TotalSleepRecords')

#Here we can see that some people do not regularly wear their Fitbit to monitor their sleep.

In [None]:
user_records.describe()

# On average a person wears their Fitbit for 28 days, but only in 12 of those days do they monitor their sleep.
# Half of all users record their sleep only 5 times or fewer throughout the month.