### Importing Packages

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from bokeh.plotting import figure, show
from bokeh.io import output_notebook

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)
        

### Reading in Data

In [None]:
# Root folder of the Fitabase export. The trailing slash matters because file
# names are appended with plain string concatenation.
file_path = '/kaggle/input/fitbit/Fitabase Data 4.12.16-5.12.16/'
df_daily = pd.read_csv(file_path + 'dailyActivity_merged.csv')

# Keep an independent snapshot of the raw data. A plain assignment would only
# bind a second name to the same DataFrame, so every column added to df_daily
# later (e.g. 'weekday') would silently appear in the "original" as well.
df_daily_orig = df_daily.copy()
df_daily.head()

### Data Preprocessing of Daily Activity

In [None]:
# Print the column names, dtypes and non-null counts of the daily activity frame.
df_daily.info()

##### A quick check on our data tells us that no values are null in the dataset

In [None]:
# True if at least one cell anywhere in the frame is missing.
has_missing_values = df_daily.isnull().values.any()
has_missing_values

##### Converting ActivityDate to a datetime column

In [None]:
# Parse the date column into datetime values and write it back in place.
parsed_dates = pd.to_datetime(df_daily['ActivityDate'])
df_daily['ActivityDate'] = parsed_dates
df_daily['ActivityDate'].head()

# EDA of Daily Activity 

##### Our distributions of both Total Distance and Total Steps are Right Skewed.

##### Many days are spent walking less than one thousand steps or a mile

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(14, 6))

# One histogram per panel: distance on the left, steps on the right.
histogram_columns = ('TotalDistance', 'TotalSteps')
for panel_index, column in enumerate(histogram_columns):
    sns.histplot(data=df_daily[column], ax=ax[panel_index])

In [None]:
# Pairwise scatter plots for a hand-picked subset of the daily metrics.
pair_columns = [
    'TotalSteps',
    'TotalDistance',
    'Calories',
    'SedentaryMinutes',
    'VeryActiveMinutes',
]
scatter_plots = sns.PairGrid(df_daily[pair_columns])
scatter_plots.map(plt.scatter)

#### Findings from PairPlot

* Calories has a high positive correlation with Steps, Distance, and Very Active Minutes.
* Sedentary Minutes has a negative correlation with Total Steps, Very Active Minutes, and Total Distance.

##### Taking a Closer Look at Total Steps and Total Distance vs Calories, they are very similar

In [None]:
# Exclude rows with zero steps / zero distance before fitting each line
# (presumably days on which the device was not worn - confirm with the data).
df_steps = df_daily[df_daily['TotalSteps'] != 0]
df_distance = df_daily[df_daily['TotalDistance'] != 0]

fig, ax = plt.subplots(1, 2, figsize=(14, 6))

# Each panel is drawn from its own filtered frame. The filtered frames used to
# be computed and then ignored, with the unfiltered df_daily plotted instead.
ax_steps = sns.regplot(data=df_steps, x='TotalSteps', y='Calories', ax=ax[0])

ax_distance = sns.regplot(data=df_distance, x='TotalDistance', y='Calories', ax=ax[1])

#### As the month progresses, the number of people who track their daily activity drops

In [None]:
fig, ax = plt.subplots(2, 1, figsize=(16, 12))
top_panel, bottom_panel = ax

# Top panel: how many activity rows exist for each date.
df_daily_num_logs = pd.DataFrame(df_daily['ActivityDate'].value_counts())
ax_user_count = sns.lineplot(data=df_daily_num_logs, ax=top_panel)
ax_user_count.set_title('Count of Users that Logged Their Daily Activity')
ax_user_count.set_xlabel('Date')
ax_user_count.set_ylabel('Count of Activity Log')
ax_user_count.set_ylim(0, 35)
ax_user_count.grid()

# Bottom panel: mean of TotalSteps over all rows sharing a date.
mean_steps_per_date = df_daily.groupby('ActivityDate')['TotalSteps'].mean()
ax_mean_steps = sns.lineplot(data=mean_steps_per_date, ax=bottom_panel)
ax_mean_steps.set_title('Mean Steps of 33 Unique Users')
ax_mean_steps.set_xlabel('Date')
ax_mean_steps.set_ylabel('Total Mean Steps Of All Users')
ax_mean_steps.grid()

plt.show()

### Correlation Matrix and Correlation Heatmap

In [None]:
# Pairwise correlation matrix of the numeric daily-activity columns.
# 'Id' is an identifier, not a measurement, so it is removed first.
# numeric_only=True (available since pandas 1.5) keeps the datetime
# 'ActivityDate' column out of the computation explicitly; since pandas 2.0
# non-numeric columns are no longer dropped silently by default.
corr = df_daily.drop(columns='Id').corr(numeric_only=True)
corr

##### The total distance someone travels has the highest correlation with calories burnt

In [None]:
# Correlation of every column with Calories, strongest first.
# The row is selected by label instead of position: 'Calories' is only the
# last row as long as no further numeric column (such as 'weekday') has been
# appended to df_daily before the matrix is computed.
corr.loc['Calories'].sort_values(ascending=False)

##### Heatmap of daily activities. Lighter values closer to 1 are positively correlated, while negative values are negatively correlated

In [None]:
# Annotated heatmap of the correlation matrix of the numeric columns.
# numeric_only=True (pandas >= 1.5) excludes the datetime 'ActivityDate'
# column explicitly; pandas 2.0+ no longer drops non-numeric columns by default.
plt.figure(figsize=(12, 12))
sns.heatmap(df_daily.drop(columns='Id').corr(numeric_only=True), annot=True)

#### Regressionplot of Activity Distance and Calories, Very Active is the most correlated with Calories

In [None]:
fig, ax = plt.subplots(2, 2, figsize=(16, 10))

# Grid position -> distance column regressed against Calories in that panel.
distance_panels = {
    (0, 0): 'VeryActiveDistance',
    (0, 1): 'ModeratelyActiveDistance',
    (1, 0): 'LightActiveDistance',
    (1, 1): 'SedentaryActiveDistance',
}
for (row, col), column in distance_panels.items():
    sns.regplot(ax=ax[row, col], data=df_daily, x=column, y='Calories')

#### Activity Type Minutes Shown more in magnitude, minutes spent sedentary have a negative correlation with calories burned

In [None]:
# Regression plots of Calories against the minutes spent at each activity level.
fig, ax = plt.subplots(2, 2, figsize=(16, 10))

# Grid position -> minutes column shown in that panel.
minutes_panels = {
    (0, 0): 'VeryActiveMinutes',
    (0, 1): 'FairlyActiveMinutes',
    (1, 0): 'LightlyActiveMinutes',
    (1, 1): 'SedentaryMinutes',
}
for (row, col), column in minutes_panels.items():
    # Semi-transparent markers so overlapping points remain distinguishable.
    sns.regplot(ax=ax[row, col], data=df_daily, x=column, y='Calories',
                scatter_kws={'alpha': 0.5})

## Breaking Down Metrics by Weekday

In [None]:
# Report the first and last date present in the daily data.
first_day = df_daily['ActivityDate'].min()
last_day = df_daily['ActivityDate'].max()
print("The data is recorded from:", first_day, "to", last_day)

#### 2016-04-12 was a Tuesday, and 2016-05-12 was a Thursday. Visualizing the weekdays and the number of activity logs on each day shows that people are least likely to wear their Fitbit on Monday.

#### The day people are most likely to wear a Fitbit is Tuesday; the counts then decrease steadily through to the following Monday

In [None]:
# Day-of-week index of each row: Monday = 0 ... Sunday = 6.
df_daily['weekday'] = df_daily['ActivityDate'].dt.weekday

# Number of activity rows per weekday, drawn in Monday -> Sunday order.
rows_per_weekday = df_daily['weekday'].value_counts()
rows_per_weekday.sort_index(ascending=True).plot(kind='bar')

In [None]:
# Mean daily calories of each user, then the smallest and largest of those means.
mean_calories_per_user = df_daily.groupby('Id')['Calories'].mean()
lowest_mean = int(mean_calories_per_user.min())
highest_mean = int(mean_calories_per_user.max())
print("Range of our User's Average Burned Calories:", lowest_mean, "-", highest_mean)

In [None]:
# Bar chart of each user's mean daily calories, smallest to largest.
calories_by_user = df_daily.groupby('Id')['Calories'].mean()
calories_by_user.sort_values(ascending=True).plot(kind='bar')

## Weekday vs Activity Types

#### From a top-level view, there isn't a large difference between the days on which people do activities

In [None]:
# Mean minutes at each activity level per weekday, one single-column frame each.
rows_by_weekday = df_daily.groupby('weekday')

df_groupby_sedentary = pd.DataFrame(rows_by_weekday['SedentaryMinutes'].mean())
df_groupby_lightly = pd.DataFrame(rows_by_weekday['LightlyActiveMinutes'].mean())
df_groupby_fairly = pd.DataFrame(rows_by_weekday['FairlyActiveMinutes'].mean())
# The spelling of this name is kept as-is because other cells may refer to it.
df_gropuby_very_active = pd.DataFrame(rows_by_weekday['VeryActiveMinutes'].mean())

In [None]:
# Plot the mean minutes per activity level across weekdays
# (Monday = 0, Sunday = 6), one panel per activity level.
# Every panel used to plot the sedentary frame; each one now uses the frame
# and column of its own activity level.

fig, ax = plt.subplots(2, 2, figsize=(16, 10))

sns.barplot(ax=ax[0, 0], data=df_groupby_sedentary, x=df_groupby_sedentary.index, y='SedentaryMinutes')

sns.barplot(ax=ax[0, 1], data=df_groupby_lightly, x=df_groupby_lightly.index, y='LightlyActiveMinutes')

sns.barplot(ax=ax[1, 0], data=df_groupby_fairly, x=df_groupby_fairly.index, y='FairlyActiveMinutes')

# NOTE: 'df_gropuby_very_active' is the (misspelled) name under which the
# very-active frame is defined, so it has to be referenced exactly like this.
sns.barplot(ax=ax[1, 1], data=df_gropuby_very_active, x=df_gropuby_very_active.index, y='VeryActiveMinutes')

### There are 1440 minutes in a day, and it seems that there are some days where people are completely sedentary. There also seems to be a separation around the 900-minute mark. This could be the split between people who wear their Fitbit to sleep and those who do not

In [None]:
plt.figure(figsize=(18, 12))
ax_strip_plot = sns.stripplot(data=df_daily, x="weekday", y="SedentaryMinutes")

# Horizontal reference line at 24 hours expressed in minutes.
minutes_per_day = 1440
ax_strip_plot.axhline(minutes_per_day)

##### Most of people's days are spent sedentary

In [None]:
# Distribution of sedentary minutes for each weekday (0 = Monday ... 6 = Sunday).
wide_figsize = (18, 6)
plt.figure(figsize=wide_figsize)
sns.boxplot(data=df_daily, x="weekday", y="SedentaryMinutes")

In [None]:
# Distribution of lightly active minutes for each weekday (0 = Monday ... 6 = Sunday).
wide_figsize = (18, 6)
plt.figure(figsize=wide_figsize)
sns.boxplot(data=df_daily, x="weekday", y="LightlyActiveMinutes")

In [None]:
# Distribution of fairly active minutes for each weekday (0 = Monday ... 6 = Sunday).
wide_figsize = (18, 6)
plt.figure(figsize=wide_figsize)
sns.boxplot(data=df_daily, x="weekday", y="FairlyActiveMinutes")

##### Lots of outliers on Sunday, yet the median and IQR is very low. This indicates that there are many people who prefer to exercise on Sunday and those who choose to take a break on that day.

##### Days like Monday, where there are few outliers and the box contains more of the data points, indicate that many people prefer to exercise on Monday.

In [None]:
# Distribution of very active minutes for each weekday (0 = Monday ... 6 = Sunday).
wide_figsize = (18, 6)
plt.figure(figsize=wide_figsize)
sns.boxplot(data=df_daily, x="weekday", y="VeryActiveMinutes")

# EDA of Minute Data

### Calories Burnt by Hour

In [None]:
# Load the wide minute-level calories file and parse its timestamp column.
minute_calories_csv = file_path + 'minuteCaloriesWide_merged.csv'
df_minutes = pd.read_csv(minute_calories_csv)

activity_hours = pd.to_datetime(df_minutes['ActivityHour'])
df_minutes['ActivityHour'] = activity_hours

In [None]:
# Independent snapshot of the minute-level data. A plain assignment would only
# alias df_minutes, so helper columns added to it afterwards (for example
# 'ActivityHourExtractedhour') would show up in the "original" too.
df_minutes_orig = df_minutes.copy()

##### Plot of mean calories burned every hour of every day summed together

In [None]:
# Hour of day (0-23) taken from each row's timestamp.
hour_of_day = df_minutes['ActivityHour'].dt.hour
df_minutes['ActivityHourExtractedhour'] = hour_of_day

# Column-wise mean of the remaining columns for every hour of day.
columns_to_average = df_minutes.drop(columns=['Id', 'ActivityHour'])
groupby_minutes_aggregate = columns_to_average.groupby('ActivityHourExtractedhour').mean()
groupby_minutes_aggregate

In [None]:
plt.figure(figsize=(22, 8))

# Per-hour average over all columns: row total divided by the column count.
column_count = len(groupby_minutes_aggregate.columns)
groupby_minutes_hour_mean = groupby_minutes_aggregate.sum(axis=1) / column_count

ax = sns.lineplot(data=groupby_minutes_hour_mean)
# One tick for every hour of the day.
plt.xticks(list(range(24)))
ax.set_title("Mean Calories of All Hours Aggregate")
ax.set_ylabel('Mean Calories')
plt.show()

##### Interactive Line Plot, zooming in, we can see consistent peaks in the morning hours, afternoon hours, evening hours

In [None]:
# Mean (over users) of each remaining column for every recorded timestamp,
# then summed across columns to get one value per timestamp.
# Assumes the remaining columns are the per-minute calorie columns - TODO confirm
# against the CSV header.
#
# 'ActivityHourExtractedhour' is a helper column that an earlier cell adds to
# df_minutes. It must be excluded here, otherwise the hour of day (0-23) is
# added into every calorie total. errors='ignore' keeps this cell runnable when
# that helper column has not been created.
helper_columns = ['ActivityHourExtractedhour']
df_minutes_date_groupby = (
    df_minutes
    .drop(columns=['Id'])
    .drop(columns=helper_columns, errors='ignore')
    .groupby('ActivityHour')
    .mean()
)

df_minutes_date_groupby_mean = df_minutes_date_groupby.sum(axis=1)

df_minutes_date_groupby_mean

In [None]:
# Empty Bokeh figure with a datetime x axis; the data is drawn onto it separately.
figure_options = {
    'title': 'Mean Calories Burnt by Hour',
    'x_axis_label': 'ActivityDateTime',
    'y_axis_label': 'Mean Calories Burned',
    'x_axis_type': 'datetime',
}
p = figure(**figure_options)

In [None]:
# Render Bokeh output inline in the notebook.
output_notebook()

# x: timestamps from the series index, y: the summed mean values.
timestamps = df_minutes_date_groupby_mean.index
summed_means = df_minutes_date_groupby_mean.values
p.line(x=timestamps, y=summed_means, line_width=2)

show(p)

## Quick Linear Regression on Predicting Daily Calories Burned

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import datetime as dt

# Work on a private copy so the datetime conversion below never modifies the
# frame that df_daily_orig refers to (a plain assignment would share it).
df_daily_activities = df_daily_orig.copy()
df_daily_activities['ActivityDate'] = pd.to_datetime(df_daily_activities['ActivityDate'])
df_daily_activities

### Data Preprocessing

###### Drop columns that we cannot train our model on.

In [None]:
# Columns that are excluded from the model's feature set.
non_feature_columns = ['Id']
df_daily_activities = df_daily_activities.drop(columns=non_feature_columns)

##### Convert Activity Date to ordinal data. Though this can be useful for predicting the weight of a single user. For multiple users however, it can confuse the algorithm

In [None]:
# Replace each date with its proleptic Gregorian ordinal so the column is a plain integer feature.
to_ordinal = dt.datetime.toordinal
ordinal_dates = df_daily_activities['ActivityDate'].map(to_ordinal)
df_daily_activities['ActivityDate'] = ordinal_dates

##### Splitting our Data into Train and Testing Data

In [None]:
# Features are every column except the target; the target is Calories.
feature_frame = df_daily_activities.drop(columns=['Calories'])
target_series = df_daily_activities['Calories']

# Fixed random_state so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame, target_series, random_state=42
)

##### From a small data set of just 33 individuals, we can predict with a fairly decent $R^2$ value

In [None]:
# Fit an ordinary least squares model on the training split
# (fit() returns the estimator itself, so construction and fitting are chained).
Linear_Regression = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows.
prediction = Linear_Regression.predict(X_test)

# Score of the fitted model on the held-out split (R^2 for this estimator).
Linear_Regression.score(X_test, y_test)