In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

The following notebook is a partial version of the work, published on Kaggle. For the complete project, please refer to the [GitHub project](https://github.com/peeush-the-developer/projects/tree/main/CaloriesBurnedPrediction).

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

## 1. Load data from csv files in Pandas DataFrame

In [None]:
# Read both CSV files into DataFrames (calories first, then exercise).
calories_df, exercise_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/fmendesdat263xdemos/calories.csv',
        '/kaggle/input/fmendesdat263xdemos/exercise.csv',
    )
)

## 2. Glance at the data in both dataframes

In [None]:
# Preview the first rows of the calories table.
calories_df.head()

In [None]:
# Preview the first rows of the exercise table.
exercise_df.head()

In [None]:
# Report the (rows, columns) shape of each source table.
calories_shape, exercise_shape = calories_df.shape, exercise_df.shape
print(f'Shape: Calories - {calories_shape}')
print(f'Shape: Exercise - {exercise_shape}')

Both dataframes contain a 'User_ID' column, so we can merge the two datasets on it.

## 3. Merge both dataframes on 'User_ID' column

In [None]:
# Join the exercise and calories tables on their shared 'User_ID' key
# (default join type, i.e. inner), then preview the result.
df = exercise_df.merge(calories_df, on='User_ID')
df.head()

In [None]:
# (rows, columns) of the merged frame.
df.shape

Now we can get rid of the 'User_ID' column, as we don't need it for the analysis.

In [None]:
# Drop the identifier column by rebinding `df` rather than mutating it in place.
# errors='ignore' keeps this cell idempotent: re-running it after 'User_ID' has
# already been removed no longer raises a KeyError.
df = df.drop(columns='User_ID', errors='ignore')
df.head()

So, now our dataframe is ready for the exploration.

## 4. Data exploration

### 1. Distribution of data over 'Gender'

In [None]:
# Number of records per 'Gender' value.
sns.catplot(data=df, x='Gender', kind='count')

In [None]:
# Box plot of 'Calories' for each 'Gender' value.
sns.catplot(data=df, x='Gender', y='Calories', kind='box')

### 2. Distribution of data over 'Age'

In [None]:
# `sns.distplot` is deprecated; `histplot` with a KDE overlay is its replacement.
# stat='density' keeps the y-axis on the density scale that distplot used.
sns.histplot(data=df, x='Age', kde=True, stat='density')

In [None]:
# Joint plot of 'Age' against 'Calories'.
sns.jointplot(data=df, x='Age', y='Calories')

Let's define a Python function that draws these plots for any numeric column, so we can apply it to each one.

In [None]:
def plot_numeric_col(col_name):
    """Plot one numeric column's distribution and its relationship to 'Calories'.

    Draws two side-by-side panels from the notebook-level ``df``: a histogram
    with a KDE overlay of ``col_name`` (left) and a regression plot of
    ``col_name`` against 'Calories' (right), then shows the figure.

    Args:
        col_name: Name of a numeric column in ``df``.
    """
    fig, (dist_ax, reg_ax) = plt.subplots(1, 2, figsize=(10, 6))
    fig.suptitle(f'Plots for "{col_name}"')

    # sns.distplot is deprecated; histplot(kde=True, stat='density') is the
    # documented replacement and keeps the density-scaled y-axis.
    sns.histplot(data=df, x=col_name, kde=True, stat='density', ax=dist_ax)
    dist_ax.set_title(f'Distribution of {col_name}')

    sns.regplot(x=col_name, y='Calories', data=df, ax=reg_ax)
    reg_ax.set_title(f'{col_name} vs Calories')

    plt.show()

In [None]:
def get_numeric_cols(data=None, target='Calories'):
    """Return the names of the numeric feature columns, excluding the target.

    Args:
        data: DataFrame to inspect. Defaults to the notebook-level ``df``.
        target: Column name to leave out of the result. Defaults to 'Calories'.

    Returns:
        list: Numeric column names in their original order, without ``target``.
    """
    frame = df if data is None else data
    numeric_cols = frame.select_dtypes(include=np.number).columns.tolist()
    # Exclude the target by name: deleting "the last numeric column" silently
    # removes a real feature whenever the target is absent or not last.
    return [name for name in numeric_cols if name != target]

In [None]:
# Draw the distribution / regression panels for every numeric feature column.
numeric_features = get_numeric_cols()
for feature in numeric_features:
    plot_numeric_col(feature)

Looking at the plots above, we can see that 'Duration', 'Heart_Rate' and 'Body_Temp' are the most interesting columns for predicting the calories burnt.

## Prepare for model building

### 1. Model with 'Duration' column

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [None]:
# Single-feature design matrix (the list selector keeps it 2-D) and the target series.
X = df.loc[:, ['Duration']]
y = df.loc[:, 'Calories']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Print train/test shapes: features first, then targets.
for train_part, test_part in ((X_train, X_test), (y_train, y_test)):
    print(train_part.shape, test_part.shape)

In [None]:
# Fit a linear regression on the training split and predict the test split.
lr_1 = LinearRegression()
lr_1.fit(X_train, y_train)

y_pred = lr_1.predict(X_test)
# scikit-learn metrics take (y_true, y_pred) in that order. MSE happens to be
# symmetric, but using the documented order avoids wrong results if an
# asymmetric metric (e.g. r2_score) is ever swapped in here.
mse = mean_squared_error(y_test, y_pred)
print('Root Mean Squared Error: {}'.format(np.sqrt(mse)))

In [None]:
# Observed test points (scatter) with the fitted regression line on top.
# Axis labels and a legend are added so the figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(X_test['Duration'], y_test, color='blue', label='Observed')
ax.plot(X_test['Duration'], y_pred, color='red', label='Predicted')
ax.set_xlabel('Duration')
ax.set_ylabel('Calories')
ax.set_title('Calories vs Duration (test set)')
ax.legend()
plt.show()

### 2. Model with 'Duration', 'Heart_Rate', 'Body_Temp' columns

In [None]:
# Three-feature design matrix and the target series.
X = df.loc[:, ['Duration', 'Heart_Rate', 'Body_Temp']]
y = df.loc[:, 'Calories']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Print train/test shapes: features first, then targets.
for train_part, test_part in ((X_train, X_test), (y_train, y_test)):
    print(train_part.shape, test_part.shape)

In [None]:
# Fit a linear regression on the training split and predict the test split.
lr_2 = LinearRegression()
lr_2.fit(X_train, y_train)

y_pred = lr_2.predict(X_test)
# scikit-learn metrics take (y_true, y_pred) in that order. MSE happens to be
# symmetric, but using the documented order avoids wrong results if an
# asymmetric metric (e.g. r2_score) is ever swapped in here.
mse = mean_squared_error(y_test, y_pred)
print('Root Mean Squared Error: {}'.format(np.sqrt(mse)))

In [None]:
# The model uses several features, so its predictions are not a single curve over
# 'Duration', and the test rows are unsorted. Joining them with plt.plot draws a
# zig-zag scribble, so the predictions are drawn as points instead.
plt.scatter(X_test['Duration'], y_test, color='blue', label='Observed')
plt.scatter(X_test['Duration'], y_pred, color='red', s=10, label='Predicted')
plt.xlabel('Duration')
plt.ylabel('Calories')
plt.legend()
plt.show()

In [None]:
# The model uses several features, so its predictions are not a single curve over
# 'Heart_Rate', and the test rows are unsorted. Joining them with plt.plot draws a
# zig-zag scribble, so the predictions are drawn as points instead.
plt.scatter(X_test['Heart_Rate'], y_test, color='blue', label='Observed')
plt.scatter(X_test['Heart_Rate'], y_pred, color='red', s=10, label='Predicted')
plt.xlabel('Heart_Rate')
plt.ylabel('Calories')
plt.legend()
plt.show()

In [None]:
# The model uses several features, so its predictions are not a single curve over
# 'Body_Temp', and the test rows are unsorted. Joining them with plt.plot draws a
# zig-zag scribble, so the predictions are drawn as points instead.
plt.scatter(X_test['Body_Temp'], y_test, color='blue', label='Observed')
plt.scatter(X_test['Body_Temp'], y_pred, color='red', s=10, label='Predicted')
plt.xlabel('Body_Temp')
plt.ylabel('Calories')
plt.legend()
plt.show()

### 3. Model with All columns

To build a model with all columns (including 'Gender', which is a categorical column), we need to convert the 'Gender' column from strings to numbers.

We can map the values as follows:
+ 'Male':1
+ 'Female':0

We can do this either by applying the `map` function to the column, or by using `LabelEncoder` from the sklearn package.
Here, I'm going to use `LabelEncoder`.

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Preview the frame before 'Gender' is encoded.
df.head()

In [None]:
le = LabelEncoder()
# Assign through df['Gender'] so the column is replaced and takes the encoder's
# integer dtype. `df.loc[:, 'Gender'] = ...` writes into the existing column in
# place under pandas >= 2.0, which leaves integers inside an object-dtype column.
# LabelEncoder sorts the class labels, so the alphabetically first label becomes 0.
df['Gender'] = le.fit_transform(df['Gender'])
df.head()

Split the data into train and validation sets

In [None]:
# Every column except 'Calories' is a feature; 'Calories' (as an array) is the target.
# 30% of the rows are held out for validation, with a fixed seed.
x_train, x_val, y_train, y_val = train_test_split(
    df.drop(columns='Calories'),
    df['Calories'].values,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Print train/validation shapes: features first, then targets.
for train_part, val_part in ((x_train, x_val), (y_train, y_val)):
    print(train_part.shape, val_part.shape)

#### Linear regression model

In [None]:
# Fit on the training split, then score the validation predictions.
lr = LinearRegression().fit(x_train, y_train)  # fit() returns the estimator itself
preds = lr.predict(x_val)

r2 = r2_score(y_val, preds)
rmse = np.sqrt(mean_squared_error(y_val, preds))
print(f'Model=LinearRegression, RMSE={rmse:.3f}, R2={r2:.3f}')

#### Decision tree model

In [None]:
# Fit on the training split, then score the validation predictions.
dt = DecisionTreeRegressor(random_state=42).fit(x_train, y_train)  # fit() returns the estimator itself
preds = dt.predict(x_val)

r2 = r2_score(y_val, preds)
rmse = np.sqrt(mean_squared_error(y_val, preds))
print(f'Model=DecisionTree, RMSE={rmse:.3f}, R2={r2:.3f}')

#### Random forest model

In [None]:
# Fit a 30-tree forest on the training split, then score the validation predictions.
rf = RandomForestRegressor(n_estimators=30, random_state=42).fit(x_train, y_train)  # fit() returns the estimator itself
preds = rf.predict(x_val)

r2 = r2_score(y_val, preds)
rmse = np.sqrt(mean_squared_error(y_val, preds))
print(f'Model=RandomForest, RMSE={rmse:.3f}, R2={r2:.3f}')

As we can see above, the RandomForest model gives us the best R2 score on the validation dataset.

The notebook above is a partial version of the work, published on Kaggle. For the complete project, please refer to the [GitHub project](https://github.com/peeush-the-developer/projects/tree/main/CaloriesBurnedPrediction).