In [1]:
# --- Imports: standard library first, then third-party ---
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# --- Notebook-wide configuration ---
sns.set()  # switch matplotlib over to seaborn's default theme
# NOTE: this silences every warning for the rest of the session.
warnings.filterwarnings('ignore')
# Shared seed, passed as `random_state` to the splits and models below.
rand_state = 1000

In [2]:
#import data and see what we're working with

In [3]:
# Load the bike-share data and preview the first rows.
df = pd.read_csv("bikeshare.csv")
# Keep an independent snapshot of the untouched data. A plain `df_raw = df`
# would only create a second name for the same object, so the later in-place
# edits to `df` would silently change `df_raw` as well.
df_raw = df.copy()
df.head()

In [67]:
# Build the modelling frame by removing the `dteday` column.
# The frame is derived from `df_raw` and `df` is rebound (no `inplace=True`),
# so this cell can be re-run without raising a KeyError and it never alters
# the frame that `df_raw` refers to.
df = df_raw.drop(columns=['dteday'])

In [68]:
from sklearn.model_selection import train_test_split

# Regression task: the target is `cnt`, the features are every other column.
# NOTE(review): all remaining columns are used as features - confirm that none
# of them is derived from `cnt`.
# `drop` is not in-place here, so `df` itself still contains `cnt`.
X, y = df.drop(columns=['cnt']), df['cnt']

# Hold out 30% of the rows for testing; the split is seeded with `rand_state`.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=rand_state,
)

In [69]:
from sklearn.ensemble import RandomForestRegressor

In [70]:
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor

In [71]:
from xgboost import XGBRegressor

In [72]:
%%time 
# Fitting RF classifier to the Training set
RF_regression = RandomForestRegressor(random_state=rand_state)
RF_regression.fit(X_train, y_train)

Wall time: 5.09 s


RandomForestRegressor(random_state=1000)

In [73]:
%%time
# Fitting AdaBoost classifier to the Training set
AdB_regression = AdaBoostRegressor(random_state=rand_state)
AdB_regression.fit(X_train, y_train)

Wall time: 1.01 s


AdaBoostRegressor(random_state=1000)

In [74]:
%%time
# Fitting Gradient Boosting classifier to the Training set
GBM_regression = GradientBoostingRegressor(random_state=rand_state)
GBM_regression.fit(X_train, y_train)

Wall time: 1.12 s


GradientBoostingRegressor(random_state=1000)

In [75]:
%%time
# Fitting XGBoost classifier to the Training set
XGB_regression = XGBRegressor(random_state=rand_state)
XGB_regression.fit(X_train, y_train)

Wall time: 778 ms


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=1000,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [76]:
# R^2 of the random-forest regressor on the held-out test split, to 4 decimals.
rf_r2 = RF_regression.score(X_test, y_test)
np.round(rf_r2, decimals=4)

0.9488

In [77]:
# R^2 of the AdaBoost regressor on the held-out test split, to 4 decimals.
adb_r2 = AdB_regression.score(X_test, y_test)
np.round(adb_r2, decimals=4)

0.6656

In [78]:
# R^2 of the gradient-boosting regressor on the held-out test split, to 4 decimals.
gbm_r2 = GBM_regression.score(X_test, y_test)
np.round(gbm_r2, decimals=4)

0.8563

In [79]:
# R^2 of the XGBoost regressor on the held-out test split, to 4 decimals.
xgb_r2 = XGB_regression.score(X_test, y_test)
np.round(xgb_r2, decimals=4)

0.9516

Fastest to slowest models: XGBoost, AdaBoost, GBM, Random Forest

Best to worst R^2 scores: XGBoost, Random Forest, GBM, AdaBoost

We have a clear winner with XGBoost! The R^2 for the XGB model is only slightly better than the next best, Random Forest, but it's more than 6 times faster.

In [100]:
# Binary target for the classification task: 1 where `cnt` exceeds 500, else 0.
is_overloaded = df['cnt'] > 500
df['overload'] = np.where(is_overloaded, 1, 0)

In [101]:
from sklearn.ensemble import RandomForestClassifier # we will be using RF as our benchmark.

In [102]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

In [103]:
from xgboost import XGBClassifier

In [104]:
from sklearn.model_selection import train_test_split

# Classification task: the target is the `overload` flag. `cnt` is removed from
# the features as well, because `overload` is computed directly from it.
# NOTE: this rebinds X, y and the train/test splits used by the regression
# cells above; re-running those cells after this one would make them operate
# on the classification data.
X, y = df.drop(columns=['overload', 'cnt']), df['overload']

# Same 70/30 seeded split as for the regression task.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=rand_state,
)

In [105]:
%%time 
# Fitting RF classifier to the Training set
RF_classifier = RandomForestClassifier(random_state=rand_state)
RF_classifier.fit(X_train, y_train)

Wall time: 1.17 s


RandomForestClassifier(random_state=1000)

In [106]:
%%time
# Fitting AdaBoost classifier to the Training set
AdB_classifier = AdaBoostClassifier(random_state=rand_state)
AdB_classifier.fit(X_train, y_train)

Wall time: 454 ms


AdaBoostClassifier(random_state=1000)

In [107]:
%%time
# Fitting Gradient Boosting classifier to the Training set
GBM_classifier = GradientBoostingClassifier(random_state=rand_state)
GBM_classifier.fit(X_train, y_train)

Wall time: 1.56 s


GradientBoostingClassifier(random_state=1000)

In [108]:
%%time
# Fitting XGBoost classifier to the Training set
XGB_classifier = XGBClassifier(random_state=rand_state)
XGB_classifier.fit(X_train, y_train)

Wall time: 825 ms


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1,
              random_state=1000, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [109]:
# Predict class labels for the test split with each fitted classifier.
# (Only hard labels are produced here; no class probabilities are computed.)
y_hat_RF, y_hat_AdB, y_hat_GBM, y_hat_XGB = (
    clf.predict(X_test)
    for clf in (RF_classifier, AdB_classifier, GBM_classifier, XGB_classifier)
)

In [110]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score

In [112]:
# Print the test-set F1 score of every classifier, one line per model.
f1_report = (
    ('RF  f1 = {}', y_hat_RF),
    ('AdB f1 = {}', y_hat_AdB),
    ('GBM f1 = {}', y_hat_GBM),
    ('XGB f1 = {}', y_hat_XGB),
)
for template, y_hat in f1_report:
    print(template.format(f1_score(y_test, y_hat)))

RF  f1 = 0.7921348314606741
AdB f1 = 0.603318250377074
GBM f1 = 0.7780979827089336
XGB f1 = 0.864935064935065


Fastest to slowest models: AdaBoost, XGBoost, Random Forest, GBM

Best to worst f1 scores: XGBoost, Random Forest, GBM, AdaBoost

A good model is definitely more important than execution speed, so XGBoost is the winner again! AdaBoost is faster than XGBoost (454 ms vs. 825 ms), but it came back with the worst f1 score by a wide margin (about 0.60 vs. 0.86).