# Importing Libraries

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_log_error
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings("ignore")

# Reading Data

In [2]:
# Read the data
X = pd.read_csv('./train.csv', index_col='datetime')
X_test = pd.read_csv('./test.csv', index_col='datetime')

In [3]:
# First 5 rows of X
X.head()

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [4]:
# First 5 rows of X_test
X_test.head()

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027
2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0
2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0
2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014
2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014


In [5]:
cols_with_missing = [col for col in X.columns
                     if X[col].isnull().any()]
cols_with_missing

[]

there is no missing values in any of the variables!

In [6]:
# Assign the variable count to y
y = X['count']
# Remove the variables we do not want to be in X
X.drop(['casual','registered','count'], axis=1, inplace=True)

# Feature Engineering

## Adding year, month and hour variables

In [7]:
X['year'] = X.index.str.slice(0, 4).astype(np.int64)
X['month'] = X.index.str.slice(5, 7).astype(np.int64)
X['hour'] = X.index.str.slice(11, 13).astype(np.int64)

X_test['year'] = X_test.index.str.slice(0, 4).astype(np.int64)
X_test['month'] = X_test.index.str.slice(5, 7).astype(np.int64)
X_test['hour'] = X_test.index.str.slice(11, 13).astype(np.int64)

## Encoding the variables month and hour

In [8]:
X['month_sin'] = np.sin((2*np.pi*X['month'])/12)
X['month_cos'] = np.cos((2*np.pi*X['month'])/12)
X['hour_sin'] = np.sin((2*np.pi*(X['hour']+1))/24)
X['hour_cos'] = np.cos((2*np.pi*(X['hour']+1))/24)
X.drop(['month','hour'], axis=1, inplace=True)                    
                       
X_test['month_sin'] = np.sin((2*np.pi*X_test['month'])/12)
X_test['month_cos'] = np.cos((2*np.pi*X_test['month'])/12)
X_test['hour_sin'] = np.sin((2*np.pi*(X_test['hour']+1))/24)
X_test['hour_cos'] = np.cos((2*np.pi*(X_test['hour']+1))/24)
X_test.drop(['month','hour'], axis=1, inplace=True )

## Encoding the variable season

In [9]:
X['season_sin'] = np.sin((2*np.pi*X['season'])/4)
X['season_cos'] = np.cos((2*np.pi*X['season'])/4)
X.drop(['season'], axis=1, inplace=True)                    
                       
X_test['season_sin'] = np.sin((2*np.pi*X_test['season'])/4)
X_test['season_cos'] = np.cos((2*np.pi*X_test['season'])/4)
X_test.drop(['season'], axis=1, inplace=True )

In [10]:
# # Create season dictionary 
# season_dictionary ={1 : 'spring', 2 : 'summer', 3 : 'fall', 4: 'winter'} 
  
# # Convert season values to words
# X['season'] = X['season'].map(season_dictionary) 
# X_test['season'] = X_test['season'].map(season_dictionary) 

# # OneHot Encode season
# X = pd.get_dummies(X, prefix_sep="_", columns=['season'])
# X_test = pd.get_dummies(X_test, prefix_sep="_", columns=['season'])
# for col in ['season_fall','season_spring','season_summer','season_winter']:
#     X[col] = X[col].astype('int64')
#     X_test[col] = X_test[col].astype('int64')

In [11]:
X.head()

Unnamed: 0_level_0,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month_sin,month_cos,hour_sin,hour_cos,season_sin,season_cos
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2011-01-01 00:00:00,0,0,1,9.84,14.395,81,0.0,2011,0.5,0.866025,0.258819,0.965926,1.0,6.123234000000001e-17
2011-01-01 01:00:00,0,0,1,9.02,13.635,80,0.0,2011,0.5,0.866025,0.5,0.866025,1.0,6.123234000000001e-17
2011-01-01 02:00:00,0,0,1,9.02,13.635,80,0.0,2011,0.5,0.866025,0.707107,0.707107,1.0,6.123234000000001e-17
2011-01-01 03:00:00,0,0,1,9.84,14.395,75,0.0,2011,0.5,0.866025,0.866025,0.5,1.0,6.123234000000001e-17
2011-01-01 04:00:00,0,0,1,9.84,14.395,75,0.0,2011,0.5,0.866025,0.965926,0.258819,1.0,6.123234000000001e-17


# Modeling

## Splitting data

In [12]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)

## Random Forest

### Tuning a Random Forest Model with CV Grid Search

In [13]:
# Choose the type of classifier. 
rf_model = RandomForestRegressor()

# Choose some parameter combinations to try
rf_parameters = {'n_estimators': [10, 25, 50, 100, 200, 300], 
                 'max_features': ['log2', 'sqrt','auto'],
                 'criterion': ['mse'],
                 'max_depth': [2, 3, 5, 10], 
                 'min_samples_split': [2, 3, 5],
                 'min_samples_leaf': [1,5,8]
                }
              

# Run the grid search
rf_grid_obj = GridSearchCV(rf_model, rf_parameters, cv=3, scoring='neg_mean_squared_log_error')
rf_grid_obj = rf_grid_obj.fit(X_train, y_train)

# Set the rf_model to the best combination of parameters
rf_model = rf_grid_obj.best_estimator_

In [14]:
rf_model

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=300,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [15]:
# RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
#                       max_features='auto', max_leaf_nodes=None,
#                       min_impurity_decrease=0.0, min_impurity_split=None,
#                       min_samples_leaf=1, min_samples_split=2,
#                       min_weight_fraction_leaf=0.0, n_estimators=300,
#                       n_jobs=None, oob_score=False, random_state=None,
#                       verbose=0, warm_start=False)

### Fitting the best model to the training data and validating

In [16]:
# Fit the best algorithm to the data. 
rf_model.fit(X_train, y_train)

# Get the score of the model
rf_predictions = rf_model.predict(X_valid)
print(np.sqrt(mean_squared_log_error(y_valid, rf_predictions)))

0.37531060384426185


## XGBoost

### Tuning an XGBoost Model with CV Grid Search

In [None]:
# Choose the type of classifier. 
xgb_model = XGBRegressor()

# Choose some parameter combinations to try
xgb_parameters = {'n_estimators': [100, 200, 500, 1000, 5000], 
                  'max_depth':[3, 5, 7, 9, 10, 15],
                  'objective':['reg:squarederror']}

# Run the grid search
xgb_grid_obj = GridSearchCV(xgb_model, xgb_parameters, cv=3, scoring='neg_mean_squared_error')
xgb_grid_obj = xgb_grid_obj.fit(X_train, y_train)

# Set the rf_model to the best combination of parameters
xgb_model = xgb_grid_obj.best_estimator_

In [19]:
xgb_model

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=7, min_child_weight=1, missing=None, n_estimators=200,
             n_jobs=1, nthread=None, objective='reg:squarederror',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=1, verbosity=1)

In [None]:
# XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
#              colsample_bynode=1, colsample_bytree=1, gamma=0,
#              importance_type='gain', learning_rate=0.1, max_delta_step=0,
#              max_depth=7, min_child_weight=1, missing=None, n_estimators=200,
#              n_jobs=1, nthread=None, objective='reg:squarederror',
#              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
#              seed=None, silent=None, subsample=1, verbosity=1)

### Fitting the best model to the training data and validating

In [20]:
# Fit the best algorithm to the data. 
xgb_model.fit(X_train, y_train)

# Get the score of the model
xgb_predictions = xgb_model.predict(X_valid)
xgb_predictions = np.clip(xgb_predictions, y_train.min(), y_train.max())
print(np.sqrt(mean_squared_log_error(y_valid, xgb_predictions)))

0.39084634749963965


## Combining Models

In [21]:
print(np.sqrt(mean_squared_log_error(y_valid, 
                                     np.mean(np.array([rf_predictions, 
                                                       xgb_predictions]), axis=0 ))))

0.3484003009651389


# Make Test Predictions

In [24]:
# Fitting the rf_model to X
rf_model.fit(X, y)
# Fitting the xgb_model to X
xgb_model.fit(X, y)
# Make predictions using the rf_model
rf_preds_test = rf_model.predict(X_test)
# Make predictions using the xgb_model
xgb_preds_test = xgb_model.predict(X_test)
xgb_preds_test = np.clip(xgb_preds_test, y.min(), y.max())
# Combining predictions
preds_test = np.mean(np.array([rf_preds_test, xgb_preds_test]), axis=0 )

In [25]:
# Save predictions in format used for competition scoring
output = pd.DataFrame({'datetime': X_test.index,
                       'count': preds_test})
output.to_csv('submission.csv', index=False)