In [1]:
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [2]:
!unzip -qq "/content/gdrive/MyDrive/데이콘 Basic 칼로리 소모량 예측 AI 경진대회.zip"

In [1]:
import pandas as pd
import numpy as np
import random
import os
import gc

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

In [2]:
train = pd.read_csv('/content/open/train.csv')
test = pd.read_csv('/content/open/test.csv')

In [3]:
train

Unnamed: 0,ID,Exercise_Duration,Body_Temperature(F),BPM,Height(Feet),Height(Remainder_Inches),Weight(lb),Weight_Status,Gender,Age,Calories_Burned
0,TRAIN_0000,26.0,105.6,107.0,5.0,9.0,154.3,Normal Weight,F,45,166.0
1,TRAIN_0001,7.0,103.3,88.0,6.0,6.0,224.9,Overweight,M,50,33.0
2,TRAIN_0002,7.0,103.3,86.0,6.0,3.0,218.3,Overweight,M,29,23.0
3,TRAIN_0003,17.0,104.0,99.0,5.0,6.0,147.7,Normal Weight,F,33,91.0
4,TRAIN_0004,9.0,102.7,88.0,5.0,10.0,169.8,Normal Weight,M,38,32.0
...,...,...,...,...,...,...,...,...,...,...,...
7495,TRAIN_7495,22.0,105.1,104.0,4.0,10.0,112.4,Normal Weight,F,75,151.0
7496,TRAIN_7496,20.0,105.3,104.0,5.0,8.0,147.7,Normal Weight,F,21,114.0
7497,TRAIN_7497,8.0,103.1,90.0,6.0,2.0,202.8,Overweight,M,57,41.0
7498,TRAIN_7498,12.0,104.4,97.0,5.0,9.0,167.6,Overweight,M,35,57.0


In [4]:
test

Unnamed: 0,ID,Exercise_Duration,Body_Temperature(F),BPM,Height(Feet),Height(Remainder_Inches),Weight(lb),Weight_Status,Gender,Age
0,TEST_0000,26.0,105.1,107.0,5.0,1.0,114.6,Normal Weight,F,45
1,TEST_0001,29.0,104.9,111.0,6.0,2.0,198.4,Overweight,M,21
2,TEST_0002,11.0,104.0,90.0,5.0,9.0,169.8,Normal Weight,M,58
3,TEST_0003,24.0,105.4,108.0,5.0,12.0,196.2,Overweight,M,35
4,TEST_0004,29.0,106.0,116.0,5.0,7.0,147.7,Normal Weight,F,67
...,...,...,...,...,...,...,...,...,...,...
7495,TEST_7495,28.0,105.6,106.0,5.0,12.0,178.6,Normal Weight,M,51
7496,TEST_7496,4.0,102.0,84.0,5.0,10.0,183.0,Overweight,M,25
7497,TEST_7497,24.0,105.1,97.0,5.0,2.0,130.1,Normal Weight,F,42
7498,TEST_7498,8.0,103.1,83.0,5.0,5.0,134.5,Normal Weight,F,48


In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ID                        7500 non-null   object 
 1   Exercise_Duration         7500 non-null   float64
 2   Body_Temperature(F)       7500 non-null   float64
 3   BPM                       7500 non-null   float64
 4   Height(Feet)              7500 non-null   float64
 5   Height(Remainder_Inches)  7500 non-null   float64
 6   Weight(lb)                7500 non-null   float64
 7   Weight_Status             7500 non-null   object 
 8   Gender                    7500 non-null   object 
 9   Age                       7500 non-null   int64  
 10  Calories_Burned           7500 non-null   float64
dtypes: float64(7), int64(1), object(3)
memory usage: 644.7+ KB


In [8]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ID                        7500 non-null   object 
 1   Exercise_Duration         7500 non-null   float64
 2   Body_Temperature(F)       7500 non-null   float64
 3   BPM                       7500 non-null   float64
 4   Height(Feet)              7500 non-null   float64
 5   Height(Remainder_Inches)  7500 non-null   float64
 6   Weight(lb)                7500 non-null   float64
 7   Weight_Status             7500 non-null   object 
 8   Gender                    7500 non-null   object 
 9   Age                       7500 non-null   int64  
dtypes: float64(6), int64(1), object(3)
memory usage: 586.1+ KB


In [5]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

In [6]:
# 독립변수로 설정할 train_x에서는 종속변수를 제거합니다. 또한 분석에 활용하지 않는 ID 데이터를 제거합니다.
train_x = train.drop(['ID', 'Calories_Burned'], axis = 1)
# train_y 변수를 종속변수로 사용하기 위해 Calories_Burned 데이터를 지정하였습니다.
train_y = train['Calories_Burned']

# train_x 데이터와 마찬가지로 분석에 활용하지 않는 ID 데이터를 제거합니다.
test_x = test.drop('ID', axis = 1)

In [7]:
ordinal_features = ['Weight_Status', 'Gender']

for feature in ordinal_features:
    le = LabelEncoder()
    le = le.fit(train_x[feature])
    train_x[feature] = le.transform(train_x[feature])

    # train데이터에서 존재하지 않았던 값이 test 데이터에 존재할 수도 있습니다.
    # 따라서 test 데이터를 바로 변형시키지 않고 고윳값을 확인후 test 데이터를 변환합니다.
    for label in np.unique(test_x[feature]):
        if label not in le.classes_:
            le.classes_ = np.append(le.classes_, label)
    test_x[feature] = le.transform(test_x[feature])

In [12]:
!pip install PyCaret

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting PyCaret
  Downloading pycaret-3.0.0-py3-none-any.whl (481 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m481.8/481.8 KB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tbats>=1.1.0
  Downloading tbats-1.1.2-py3-none-any.whl (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 KB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Collecting joblib>=1.2.0
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 KB[0m [31m36.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting schemdraw>=0.14
  Downloading schemdraw-0.16-py3-none-any.whl (105 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.8/105.8 KB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
Collecting kaleido>=0.2.1
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)

In [9]:
from pycaret.regression import *

In [16]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp39-none-manylinux1_x86_64.whl (76.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.1.1


In [17]:

import pandas as pd
import torch
import torch.nn as nn
import xgboost
import lightgbm
import catboost


In [18]:
train_x['target'] = train_y

In [19]:
reg = setup(data=train_x, target='target')

Unnamed: 0,Description,Value
0,Session id,559
1,Target,target
2,Target type,Regression
3,Original data shape,"(7500, 10)"
4,Transformed data shape,"(7500, 10)"
5,Transformed train set shape,"(5250, 10)"
6,Transformed test set shape,"(2250, 10)"
7,Numeric features,9
8,Preprocess,True
9,Imputation type,simple


In [20]:
best_model = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,1.6156,6.6348,2.5547,0.9983,0.0529,0.0307,0.237
xgboost,Extreme Gradient Boosting,1.8837,8.2148,2.8577,0.9979,0.0473,0.0317,0.406
et,Extra Trees Regressor,2.0681,11.3231,3.3504,0.9972,0.0497,0.033,1.463
rf,Random Forest Regressor,2.5066,16.2094,4.0129,0.9959,0.0595,0.04,1.651
gbr,Gradient Boosting Regressor,2.8983,16.8501,4.0998,0.9958,0.1158,0.0686,0.541
dt,Decision Tree Regressor,4.608,51.8869,7.1814,0.9869,0.0907,0.0644,0.136
knn,K Neighbors Regressor,6.8503,89.4577,9.4421,0.9775,0.2244,0.1747,0.072
ada,AdaBoost Regressor,8.9338,128.4207,11.3229,0.9677,0.381,0.3756,0.545
br,Bayesian Ridge,8.3707,128.8702,11.3341,0.9677,0.4011,0.2943,0.058
lar,Least Angle Regression,8.3711,128.8721,11.3341,0.9677,0.4011,0.2941,0.105


Processing:   0%|          | 0/81 [00:00<?, ?it/s]

Root Mean Squared Error

In [29]:
# 독립변수로 설정할 train_x에서는 종속변수를 제거합니다. 또한 분석에 활용하지 않는 ID 데이터를 제거합니다.
train_x = train.drop(['ID', 'Calories_Burned'], axis = 1)
# train_y 변수를 종속변수로 사용하기 위해 Calories_Burned 데이터를 지정하였습니다.
train_y = train['Calories_Burned']

# train_x 데이터와 마찬가지로 분석에 활용하지 않는 ID 데이터를 제거합니다.
test_x = test.drop('ID', axis = 1)

In [30]:
ordinal_features = ['Weight_Status', 'Gender']

for feature in ordinal_features:
    le = LabelEncoder()
    le = le.fit(train_x[feature])
    train_x[feature] = le.transform(train_x[feature])

    # train데이터에서 존재하지 않았던 값이 test 데이터에 존재할 수도 있습니다.
    # 따라서 test 데이터를 바로 변형시키지 않고 고윳값을 확인후 test 데이터를 변환합니다.
    for label in np.unique(test_x[feature]):
        if label not in le.classes_:
            le.classes_ = np.append(le.classes_, label)
    test_x[feature] = le.transform(test_x[feature])

In [31]:
from catboost import CatBoostRegressor

In [32]:
model = CatBoostRegressor(random_state = 42)

In [33]:
model.fit(train_x, train_y)

Learning rate set to 0.056291
0:	learn: 59.7438295	total: 12.8ms	remaining: 12.8s
1:	learn: 56.7606447	total: 30.2ms	remaining: 15.1s
2:	learn: 53.9317721	total: 47ms	remaining: 15.6s
3:	learn: 51.2574391	total: 51.1ms	remaining: 12.7s
4:	learn: 48.7235503	total: 71.7ms	remaining: 14.3s
5:	learn: 46.3460342	total: 87.5ms	remaining: 14.5s
6:	learn: 44.0449341	total: 93.2ms	remaining: 13.2s
7:	learn: 41.9007806	total: 108ms	remaining: 13.3s
8:	learn: 39.8807253	total: 130ms	remaining: 14.4s
9:	learn: 37.9646220	total: 143ms	remaining: 14.2s
10:	learn: 36.1060691	total: 168ms	remaining: 15.1s
11:	learn: 34.4063165	total: 170ms	remaining: 14s
12:	learn: 32.8490843	total: 175ms	remaining: 13.3s
13:	learn: 31.3472204	total: 180ms	remaining: 12.7s
14:	learn: 29.8969888	total: 184ms	remaining: 12.1s
15:	learn: 28.4827896	total: 200ms	remaining: 12.3s
16:	learn: 27.1650134	total: 203ms	remaining: 11.7s
17:	learn: 25.9016165	total: 208ms	remaining: 11.4s
18:	learn: 24.7002362	total: 213ms	remain

<catboost.core.CatBoostRegressor at 0x7f6fff260940>

In [34]:
train_x.shape

(7500, 9)

In [35]:
test_x.shape

(7500, 9)

In [36]:
preds = model.predict(test_x)

In [38]:
submission = pd.read_csv('/content/open/sample_submission.csv')

In [39]:
submission

Unnamed: 0,ID,Calories_Burned
0,TEST_0000,0
1,TEST_0001,0
2,TEST_0002,0
3,TEST_0003,0
4,TEST_0004,0
...,...,...
7495,TEST_7495,0
7496,TEST_7496,0
7497,TEST_7497,0
7498,TEST_7498,0


In [40]:
submission['Calories_Burned'] = preds

In [42]:
submission

Unnamed: 0,ID,Calories_Burned
0,TEST_0000,172.301054
1,TEST_0001,189.117381
2,TEST_0002,53.231728
3,TEST_0003,161.120447
4,TEST_0004,225.215802
...,...,...
7495,TEST_7495,196.198983
7496,TEST_7496,9.324115
7497,TEST_7497,129.936597
7498,TEST_7498,31.978740


In [41]:
submission.to_csv('./submit.csv', index = False)