In [27]:
import pandas as pd
import numpy as np
import datetime
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor


In [2]:
df = pd.read_csv('data/bike-sharing-demand/train.csv')
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [3]:
df.shape

(10886, 12)

In [4]:
df.isnull().sum()

datetime      0
season        0
holiday       0
workingday    0
weather       0
temp          0
atemp         0
humidity      0
windspeed     0
casual        0
registered    0
count         0
dtype: int64

In [5]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
season,10886.0,2.506614,1.116174,1.0,2.0,3.0,4.0,4.0
holiday,10886.0,0.028569,0.166599,0.0,0.0,0.0,0.0,1.0
workingday,10886.0,0.680875,0.466159,0.0,0.0,1.0,1.0,1.0
weather,10886.0,1.418427,0.633839,1.0,1.0,1.0,2.0,4.0
temp,10886.0,20.23086,7.79159,0.82,13.94,20.5,26.24,41.0
atemp,10886.0,23.655084,8.474601,0.76,16.665,24.24,31.06,45.455
humidity,10886.0,61.88646,19.245033,0.0,47.0,62.0,77.0,100.0
windspeed,10886.0,12.799395,8.164537,0.0,7.0015,12.998,16.9979,56.9969
casual,10886.0,36.021955,49.960477,0.0,4.0,17.0,49.0,367.0
registered,10886.0,155.552177,151.039033,0.0,36.0,118.0,222.0,886.0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    10886 non-null  object 
 1   season      10886 non-null  int64  
 2   holiday     10886 non-null  int64  
 3   workingday  10886 non-null  int64  
 4   weather     10886 non-null  int64  
 5   temp        10886 non-null  float64
 6   atemp       10886 non-null  float64
 7   humidity    10886 non-null  int64  
 8   windspeed   10886 non-null  float64
 9   casual      10886 non-null  int64  
 10  registered  10886 non-null  int64  
 11  count       10886 non-null  int64  
dtypes: float64(3), int64(8), object(1)
memory usage: 1020.7+ KB


In [7]:
df.datetime = pd.to_datetime(df.datetime)

In [8]:
df["yr"] = df.datetime.dt.year
df["day"] = df.datetime.dt.day
df['month'] = df.datetime.dt.month
df["weekday"] = df.datetime.dt.day_of_week
df["hour"] = df.datetime.dt.hour
df["yearday"] = df.datetime.dt.day_of_year
df.drop("datetime", inplace=True, axis=1)

In [9]:
df.head()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,yr,day,month,weekday,hour,yearday
0,1,0,0,1,9.84,14.395,81,0.0,3,13,16,2011,1,1,5,0,1
1,1,0,0,1,9.02,13.635,80,0.0,8,32,40,2011,1,1,5,1,1
2,1,0,0,1,9.02,13.635,80,0.0,5,27,32,2011,1,1,5,2,1
3,1,0,0,1,9.84,14.395,75,0.0,3,10,13,2011,1,1,5,3,1
4,1,0,0,1,9.84,14.395,75,0.0,0,1,1,2011,1,1,5,4,1


In [11]:
df.drop(["casual", "registered"], axis=1)

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,yr,day,month,weekday,hour,yearday
0,1,0,0,1,9.84,14.395,81,0.0000,16,2011,1,1,5,0,1
1,1,0,0,1,9.02,13.635,80,0.0000,40,2011,1,1,5,1,1
2,1,0,0,1,9.02,13.635,80,0.0000,32,2011,1,1,5,2,1
3,1,0,0,1,9.84,14.395,75,0.0000,13,2011,1,1,5,3,1
4,1,0,0,1,9.84,14.395,75,0.0000,1,2011,1,1,5,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10881,4,0,1,1,15.58,19.695,50,26.0027,336,2012,19,12,2,19,354
10882,4,0,1,1,14.76,17.425,57,15.0013,241,2012,19,12,2,20,354
10883,4,0,1,1,13.94,15.910,61,15.0013,168,2012,19,12,2,21,354
10884,4,0,1,1,13.94,17.425,61,6.0032,129,2012,19,12,2,22,354


In [12]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [13]:
X.head()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,yr,day,month,weekday,hour
0,1,0,0,1,9.84,14.395,81,0.0,3,13,16,2011,1,1,5,0
1,1,0,0,1,9.02,13.635,80,0.0,8,32,40,2011,1,1,5,1
2,1,0,0,1,9.02,13.635,80,0.0,5,27,32,2011,1,1,5,2
3,1,0,0,1,9.84,14.395,75,0.0,3,10,13,2011,1,1,5,3
4,1,0,0,1,9.84,14.395,75,0.0,0,1,1,2011,1,1,5,4


In [14]:
y.head()

0    1
1    1
2    1
3    1
4    1
Name: yearday, dtype: int64

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=100)

In [20]:
xg_regressor = XGBRegressor()

In [21]:
xg_regressor.fit(X_train, y_train)

In [23]:
y_preds = xg_regressor.predict(X_test)

In [25]:
mse = mean_squared_error(y_test, y_preds)

In [26]:
mse

0.0029746988131502186

In [31]:
np.sqrt(mse)

0.05454079952797006

In [29]:
model = XGBRegressor(objective="reg:squarederror")
scores = cross_val_score(model, X, y, 
                         scoring="neg_mean_squared_error", cv=10)

In [30]:
scores

array([-0.14673416, -0.73540433, -0.75135861, -0.52729435, -0.62159375,
       -0.0730154 , -0.41831373, -0.55502902, -0.89691932, -0.80611393])

In [33]:
min(np.sqrt(-scores))

0.27021362284268424