In [1]:
import numpy as np
import pandas as pd
# Notebook display settings: no cap on the number of columns shown, and a
# column-width limit of 0 (presumably to avoid truncating long cell text —
# confirm against the installed pandas version).
for _option, _value in (('display.max_columns', None), ('display.max_colwidth', 0)):
    pd.set_option(_option, _value)


from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score,mean_squared_error

import xgboost as xgb


import os
# Print the full path of every file found under /kaggle/input, in os.walk order.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)


/kaggle/input/playground-series-s5e5/sample_submission.csv
/kaggle/input/playground-series-s5e5/train.csv
/kaggle/input/playground-series-s5e5/test.csv


In [2]:
# Directory containing the competition CSVs that the loading cell reads
# (train.csv, test.csv, sample_submission.csv).
source_dir = '/kaggle/input/playground-series-s5e5/'

In [3]:
# Read the three competition files from source_dir, in the order
# train -> test -> sample submission.
df_train, df_test, df_submission = (
    pd.read_csv(os.path.join(source_dir, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Remove the 'id' column from the train and test frames; df_submission is
# left as loaded.
df_train = df_train.drop(columns='id')
df_test = df_test.drop(columns='id')

In [4]:
# (rows, columns) of the train frame followed by the test frame.
tuple(frame.shape for frame in (df_train, df_test))

((750000, 8), (250000, 7))

In [5]:
# Preview the first five training rows (head() defaults to n=5).
df_train.head()

Unnamed: 0,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,male,36,189.0,82.0,26.0,101.0,41.0,150.0
1,female,64,163.0,60.0,8.0,85.0,39.7,34.0
2,female,51,161.0,64.0,7.0,84.0,39.8,29.0
3,male,20,192.0,90.0,25.0,105.0,40.7,140.0
4,female,38,166.0,61.0,25.0,102.0,40.6,146.0


In [6]:
# Preview the first five test rows (head() defaults to n=5).
df_test.head()

Unnamed: 0,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
0,male,45,177.0,81.0,7.0,87.0,39.8
1,male,26,200.0,97.0,20.0,101.0,40.5
2,female,29,188.0,85.0,16.0,102.0,40.4
3,female,39,172.0,73.0,20.0,107.0,40.6
4,female,30,173.0,67.0,16.0,94.0,40.5


In [7]:
# Count missing values per column in the training frame.
df_train.isnull().sum()

Sex           0
Age           0
Height        0
Weight        0
Duration      0
Heart_Rate    0
Body_Temp     0
Calories      0
dtype: int64

In [8]:
# Count missing values per column in the test frame.
df_test.isnull().sum()

Sex           0
Age           0
Height        0
Weight        0
Duration      0
Heart_Rate    0
Body_Temp     0
dtype: int64

In [9]:
# Learn the Sex -> integer mapping from the training frame only, then apply
# that same mapping to both frames so their codes agree.
sex_lbl_enc = LabelEncoder().fit(df_train['Sex'])

df_train = df_train.assign(Sex=sex_lbl_enc.transform(df_train['Sex']))
df_test = df_test.assign(Sex=sex_lbl_enc.transform(df_test['Sex']))


In [10]:
# First five training rows after the Sex column has been label-encoded.
df_train.iloc[:5]

Unnamed: 0,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,1,36,189.0,82.0,26.0,101.0,41.0,150.0
1,0,64,163.0,60.0,8.0,85.0,39.7,34.0
2,0,51,161.0,64.0,7.0,84.0,39.8,29.0
3,1,20,192.0,90.0,25.0,105.0,40.7,140.0
4,0,38,166.0,61.0,25.0,102.0,40.6,146.0


In [11]:
# First five test rows after the Sex column has been label-encoded.
df_test.iloc[:5]

Unnamed: 0,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
0,1,45,177.0,81.0,7.0,87.0,39.8
1,1,26,200.0,97.0,20.0,101.0,40.5
2,0,29,188.0,85.0,16.0,102.0,40.4
3,0,39,172.0,73.0,20.0,107.0,40.6
4,0,30,173.0,67.0,16.0,94.0,40.5


In [12]:
# Column labels of the training frame (DataFrame.keys() returns the columns Index).
df_train.keys()

Index(['Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp',
       'Calories'],
      dtype='object')

In [13]:
# Target: log1p of Calories (the submission cell inverts this with expm1).
calories = df_train["Calories"]
y = np.log1p(calories)

# Features: every column except the target, as a plain NumPy array.
feature_columns = [col for col in df_train.columns if col != "Calories"]
X = np.array(df_train[feature_columns])

In [14]:
# Standardize the features. The scaler is fitted on all training rows, i.e.
# before the train/hold-out split performed in the next cell.
scaler = StandardScaler()
X_scale = scaler.fit_transform(X)


In [15]:
# Hold out 33% of the scaled training rows for evaluation; the fixed
# random_state keeps the split reproducible.
split_parts = train_test_split(X_scale, y, random_state=42, test_size=0.33)
X_train, X_test, y_train, y_test = split_parts

In [16]:
# Baseline: ordinary least squares on the scaled features (log1p target).
model = LinearRegression().fit(X_train, y_train)

# MSE in log space on the training split and on the held-out split.
train_mse = mean_squared_error(y_train, model.predict(X_train))
holdout_mse = mean_squared_error(y_test, model.predict(X_test))
train_mse, holdout_mse

(0.03230969054184837, 0.032487890469909804)

In [17]:
# Gradient-boosted trees with squared-error objective.
xgb_params = {
    'objective': 'reg:squarederror',
    'n_estimators': 100,
    'learning_rate': 0.1,
}
model = xgb.XGBRegressor(**xgb_params)

# Fit on the training split.
model.fit(X_train, y_train)

# MSE in log space on the training split and on the held-out split.
train_mse = mean_squared_error(y_train, model.predict(X_train))
holdout_mse = mean_squared_error(y_test, model.predict(X_test))
train_mse, holdout_mse

(0.003494728440652229, 0.0038627388402484625)

In [18]:
# Random forest: 100 trees capped at depth 12, seeded for reproducibility,
# using all available cores.
rf_params = {
    'n_estimators': 100,
    'max_depth': 12,
    'random_state': 42,
    'n_jobs': -1,
}
model = RandomForestRegressor(**rf_params)

# Fit on the training split.
model.fit(X_train, y_train)

# MSE in log space on the training split and on the held-out split.
train_mse = mean_squared_error(y_train, model.predict(X_train))
holdout_mse = mean_squared_error(y_test, model.predict(X_test))
train_mse, holdout_mse

(0.003384699360205874, 0.004241084124996743)

In [19]:
# Build the submission file from the estimator currently bound to `model`
# (the last model cell that was run).
feature_cols = ['Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']

# The model was trained on features standardized by `scaler`, so the test
# features must go through the same transform before predicting. Feeding raw
# values would put every row on the wrong side of the learned split
# thresholds and produce meaningless predictions.
X_submit = scaler.transform(np.array(df_test[feature_cols]))

# Predictions are in log1p space (the target was log1p-transformed), so
# invert with expm1 to get back to Calories.
ypred_log = model.predict(X_submit)
ypred = np.expm1(ypred_log)

df_submission['Calories'] = ypred
df_submission.to_csv("./submission.csv", index=False)