# Load libraries and data

In [31]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Read the three competition files from the working directory, in a fixed
# order: training data, test data, and the submission template.
train, test, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Preview the first rows of the training data.
train.head()

Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,0,male,36,189.0,82.0,26.0,101.0,41.0,150.0
1,1,female,64,163.0,60.0,8.0,85.0,39.7,34.0
2,2,female,51,161.0,64.0,7.0,84.0,39.8,29.0
3,3,male,20,192.0,90.0,25.0,105.0,40.7,140.0
4,4,female,38,166.0,61.0,25.0,102.0,40.6,146.0


In [32]:
# Preview the first five rows of the test data.
test.head(n=5)

Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
0,750000,male,45,177.0,81.0,7.0,87.0,39.8
1,750001,male,26,200.0,97.0,20.0,101.0,40.5
2,750002,female,29,188.0,85.0,16.0,102.0,40.4
3,750003,female,39,172.0,73.0,20.0,107.0,40.6
4,750004,female,30,173.0,67.0,16.0,94.0,40.5


In [33]:
# Preview the first five rows of the submission template (id + Calories).
sample_submission.head(n=5)

Unnamed: 0,id,Calories
0,750000,88.283
1,750001,88.283
2,750002,88.283
3,750003,88.283
4,750004,88.283


In [34]:
# Print column dtypes, non-null counts and memory usage for the training data.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   id          750000 non-null  int64  
 1   Sex         750000 non-null  object 
 2   Age         750000 non-null  int64  
 3   Height      750000 non-null  float64
 4   Weight      750000 non-null  float64
 5   Duration    750000 non-null  float64
 6   Heart_Rate  750000 non-null  float64
 7   Body_Temp   750000 non-null  float64
 8   Calories    750000 non-null  float64
dtypes: float64(6), int64(2), object(1)
memory usage: 51.5+ MB


In [35]:
# Print column dtypes, non-null counts and memory usage for the test data.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   id          250000 non-null  int64  
 1   Sex         250000 non-null  object 
 2   Age         250000 non-null  int64  
 3   Height      250000 non-null  float64
 4   Weight      250000 non-null  float64
 5   Duration    250000 non-null  float64
 6   Heart_Rate  250000 non-null  float64
 7   Body_Temp   250000 non-null  float64
dtypes: float64(5), int64(2), object(1)
memory usage: 15.3+ MB


In [36]:
# Summary statistics for every column; include='all' also reports
# unique/top/freq for the non-numeric `Sex` column.
train.describe(include='all')

Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
count,750000.0,750000,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0
unique,,2,,,,,,,
top,,female,,,,,,,
freq,,375721,,,,,,,
mean,374999.5,,41.420404,174.697685,75.145668,15.421015,95.483995,40.036253,88.282781
std,216506.495284,,15.175049,12.824496,13.982704,8.354095,9.449845,0.779875,62.395349
min,0.0,,20.0,126.0,36.0,1.0,67.0,37.1,1.0
25%,187499.75,,28.0,164.0,63.0,8.0,88.0,39.6,34.0
50%,374999.5,,40.0,174.0,74.0,15.0,95.0,40.3,77.0
75%,562499.25,,52.0,185.0,87.0,23.0,103.0,40.7,136.0


# Pre-processing
- Dummy variables for categorical variables
- Split data into train and validation sets

In [37]:
# Dummy variables for categorical variables
# One-hot encode the non-numeric columns (per train.info(), only `Sex`).
train_dummies = pd.get_dummies(train)
test_dummies = pd.get_dummies(test)

# The two frames are encoded independently, so a category present in only one
# of them would give the test frame a different set (or order) of columns than
# the model is trained on. Align the test frame to the training feature
# columns: missing dummy columns are added as all-zero, extras are dropped.
feature_columns = train_dummies.columns.drop('Calories')
test_dummies = test_dummies.reindex(columns=feature_columns, fill_value=0)

# NOTE(review): `id` is kept among the features because the prediction cell
# reads `test_dummies['id']` and predicts on `test_dummies` directly. It looks
# like a plain row identifier and probably carries no signal -- consider
# dropping it from the features (here and at prediction time together).

# Split data into training and validation sets (70% / 30%, fixed seed)
X_train, X_val, y_train, y_val = train_test_split(
    train_dummies[feature_columns],
    train_dummies['Calories'],
    test_size=0.3,
    random_state=42,
)

X_train.head()

Unnamed: 0,id,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Sex_female,Sex_male
6036,6036,30,164.0,68.0,29.0,110.0,40.8,True,False
617631,617631,58,183.0,83.0,14.0,92.0,40.3,False,True
580701,580701,34,175.0,65.0,4.0,80.0,38.9,True,False
696804,696804,74,184.0,93.0,29.0,110.0,40.7,False,True
316225,316225,26,161.0,61.0,29.0,109.0,40.8,True,False


# Train a random forest regressor and calculate the validation RMSE

In [21]:
# Create model
# A fixed random_state keeps the forest reproducible between runs; n_jobs=-1
# builds the trees in parallel on all available cores.
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# Make predictions on validation set
y_pred = model.predict(X_val)

# Calculate RMSE
# Take the square root of the MSE explicitly: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and is rejected by
# newer releases, whereas this form works on every version.
rmse = mean_squared_error(y_val, y_pred) ** 0.5

rmse


14.697819060000004

# Predict for test data

In [28]:
# Predict on the test set. Select the feature columns by the names (and in the
# order) the model recorded when it was fitted, so a test frame with extra or
# re-ordered columns cannot be silently mis-scored; a missing column raises a
# KeyError naming it.
y_test = model.predict(test_dummies[model.feature_names_in_])

# Build the submission in the same layout as sample_submission (id, Calories)
# and write it without the DataFrame index.
submission = pd.DataFrame({'id': test_dummies['id'], 'Calories': y_test})
submission.to_csv('submission.csv', index=False)

submission

Unnamed: 0,id,Calories
0,750000,26.85
1,750001,107.92
2,750002,87.48
3,750003,128.48
4,750004,76.00
...,...,...
249995,999995,26.01
249996,999996,9.10
249997,999997,72.82
249998,999998,168.68
