In [4]:
! pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.3-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.3-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.1.3


In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import Pipeline

In [10]:
# Read cal_ex.csv (resolved relative to the current working directory) into a DataFrame.
cal_ex = pd.read_csv(filepath_or_buffer='cal_ex.csv')

In [12]:
# Preview the first five rows to confirm the columns loaded as expected.
cal_ex.head(n=5)

Unnamed: 0,User_ID,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,14733363,0,68,190.0,94.0,29.0,105.0,40.8,231.0
1,14861698,1,20,166.0,60.0,14.0,94.0,40.3,66.0
2,11179863,0,69,179.0,79.0,5.0,88.0,38.7,26.0
3,16180408,1,34,179.0,71.0,13.0,100.0,40.5,71.0
4,17771927,1,27,154.0,58.0,10.0,81.0,39.8,35.0


In [15]:
# Predictor columns for the regression; 'Calories' is the target.
xvals = ['Gender', 'Age', 'Heart_Rate', 'Body_Temp']

X = cal_ex.loc[:, xvals]
y = cal_ex.loc[:, 'Calories']

In [17]:
# Sanity-check the feature matrix: only the selected predictor columns should appear.
X.head(n=5)

Unnamed: 0,Gender,Age,Heart_Rate,Body_Temp
0,0,68,105.0,40.8
1,1,20,94.0,40.3
2,0,69,88.0,38.7
3,1,34,100.0,40.5
4,1,27,81.0,39.8


In [19]:
# Sanity-check the target series.
y.head(n=5)

0    231.0
1     66.0
2     26.0
3     71.0
4     35.0
Name: Calories, dtype: float64

In [21]:
# Hold out a test split using train_test_split's default proportions; the fixed
# random_state makes the partition identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1234,
)

In [23]:
# (rows, columns) of the training and test feature matrices, in that order.
tuple(split.shape for split in (X_train, X_test))

((11250, 4), (3750, 4))

In [31]:
# Lengths of the training and test targets; they should match the feature splits.
tuple(split.shape for split in (y_train, y_test))

((11250,), (3750,))

In [33]:
# Learn the scaling statistics from the training split only, then apply that
# same fitted transform to both splits so the test data never influences it.
sc = StandardScaler().fit(X_train)
X_train_sc = sc.transform(X_train)
X_test_sc = sc.transform(X_test)

In [41]:
# Baseline comparison: fit each regressor on the standardised training data and
# report R^2 ("Score") and RMSE ("Error") on both splits.
# The stochastic learners are seeded (same value as the train/test split) so
# the printed numbers are reproducible across kernel restarts.
models = [
    LinearRegression(),
    Lasso(),
    Ridge(),
    RandomForestRegressor(random_state=1234),
    XGBRegressor(random_state=1234),
]

# Iterate over the estimators themselves rather than a hard-coded index range,
# so adding or removing a model from the list cannot desynchronise the loop.
for model in models:
    model.fit(X_train_sc, y_train)
    print(f'{model}: ')

    # Predict once per split and derive both metrics from those predictions.
    # r2_score on the predictions is the value a regressor's .score() returns.
    training_preds = model.predict(X_train_sc)
    training_score = metrics.r2_score(y_train, training_preds)
    print('Training Score: ', training_score)
    print('Training Error: ', metrics.root_mean_squared_error(y_train, training_preds))

    testing_preds = model.predict(X_test_sc)
    testing_score = metrics.r2_score(y_test, testing_preds)
    print('Testing Score: ', testing_score)
    print('Testing Error: ', metrics.root_mean_squared_error(y_test, testing_preds))

    print()

LinearRegression(): 
Training Score:  0.8704968148364643
Training Error:  22.484948206805676
Testing Score:  0.8674969066453765
Testing Error:  22.69370072105448

Lasso(): 
Training Score:  0.8697136741098095
Training Error:  22.55283200995234
Testing Score:  0.8678305790260078
Testing Error:  22.665108806231533

Ridge(): 
Training Score:  0.8704968107299644
Training Error:  22.48494856330052
Testing Score:  0.8674987459855437
Testing Error:  22.693543209335008

RandomForestRegressor(): 
Training Score:  0.9871910690560646
Training Error:  7.071444034595549
Testing Score:  0.9393397531386501
Testing Error:  15.354808387945319

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_c

In [45]:
# Two-step pipeline: expand the inputs with degree-2 polynomial terms (without
# the constant bias column), then fit an XGBoost regressor with default settings.
# The step names are referenced later as '<step>__<param>' keys, so keep them stable.
pipe = Pipeline(
    steps=[
        ('PolyFeat', PolynomialFeatures(degree=2, include_bias=False)),
        ('XGBoost', XGBRegressor()),
    ]
)

pipe.fit(X=X_train_sc, y=y_train)

In [55]:
# List every tunable parameter, including the nested '<step>__<param>' names
# that a parameter grid can target.
pipe.get_params(deep=True)

{'memory': None,
 'steps': [('PolyFeat', PolynomialFeatures(include_bias=False)),
  ('XGBoost',
   XGBRegressor(base_score=None, booster=None, callbacks=None,
                colsample_bylevel=None, colsample_bynode=None,
                colsample_bytree=None, device=None, early_stopping_rounds=None,
                enable_categorical=False, eval_metric=None, feature_types=None,
                gamma=None, grow_policy=None, importance_type=None,
                interaction_constraints=None, learning_rate=None, max_bin=None,
                max_cat_threshold=None, max_cat_to_onehot=None,
                max_delta_step=None, max_depth=None, max_leaves=None,
                min_child_weight=None, missing=nan, monotone_constraints=None,
                multi_strategy=None, n_estimators=None, n_jobs=None,
                num_parallel_tree=None, random_state=None, ...))],
 'verbose': False,
 'PolyFeat': PolynomialFeatures(include_bias=False),
 'XGBoost': XGBRegressor(base_score=None, booster

In [99]:
%%time
pgrid = {
    'XGBoost__learning_rate': [None, 0.1, 0.2, 0.3, 0.4, 0.5],
    'XGBoost__max_depth': [None, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
}
gs = GridSearchCV(pipe, param_grid=pgrid, cv = 5, n_jobs=2)
gs.fit(X_train_sc, y_train)

CPU times: user 5.17 s, sys: 16.6 s, total: 21.8 s
Wall time: 41.7 s


In [101]:
# Parameter combination with the best mean cross-validated score in the search.
gs.best_params_

{'XGBoost__learning_rate': 0.1, 'XGBoost__max_depth': 4}

In [103]:
# Score of the refit best estimator on the training split, then on the held-out
# test split; a large gap between the two would indicate overfitting.
tuple(
    gs.score(features, target)
    for features, target in ((X_train_sc, y_train), (X_test_sc, y_test))
)

(0.9541369836834978, 0.948630523726958)