This kernel borrows some ideas from Vivek Srinivasan's kernel "EDA & Ensemble Model (Top 10 Percentile)".

In [None]:
# Import the necessary libraries
import numpy as np
import pandas as pd
import os
import time
import warnings
import os
from six.moves import urllib
import matplotlib
from datetime import datetime
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

In [None]:
#Add All the Models Libraries

# Scalers
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

# Models

from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_log_error,mean_squared_error, r2_score,mean_absolute_error

#regression
from sklearn.linear_model import LinearRegression,Ridge,Lasso,RidgeCV
from sklearn.ensemble import RandomForestRegressor,BaggingRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

from sklearn.model_selection import train_test_split #training and testing data split
from sklearn import metrics #accuracy measure
from sklearn.metrics import confusion_matrix #for confusion matrix
from scipy.stats import reciprocal, uniform

# Cross-validation
from sklearn.model_selection import KFold #for K-fold cross validation
from sklearn.model_selection import cross_val_score #score evaluation
from sklearn.model_selection import cross_val_predict #prediction
from sklearn.model_selection import cross_validate

# GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

#Common data processors
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from scipy import sparse

#Accuracy Score
from sklearn.metrics import accuracy_score

In [None]:
# to make this notebook's output stable across runs
np.random.seed(123)

# To plot pretty figures
%matplotlib inline
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

In [None]:
# Read the train and test CSVs. The two frames are merged further down for
# feature engineering and split again just before the data pipeline is applied.
TrainFile, TestFile = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/train.csv", "../input/test.csv")
)

In [None]:
# Column dtypes and non-null counts of the training frame
TrainFile.info()

In [None]:
# Column dtypes and non-null counts of the test frame
TestFile.info()

In [None]:
# Summary statistics of the numeric training columns
TrainFile.describe()

In [None]:
# Summary statistics of the numeric test columns
TestFile.describe()

In [None]:
# (rows, columns) of the training frame
TrainFile.shape

In [None]:
# (rows, columns) of the test frame
TestFile.shape

In [None]:
# Peek at the first two training rows
TrainFile.head(2)

In [None]:
# Peek at the first two test rows
TestFile.head(2)

Create New Columns from the Datetime Column

In [None]:
# Stack train and test into one frame so the feature engineering below is
# applied to both at once. pd.concat replaces DataFrame.append, which was
# removed in pandas 2.0; ignore_index=True rebuilds a fresh 0..n-1 index,
# which is what the former reset_index() + drop('index') pair achieved.
DataFile = pd.concat([TrainFile, TestFile], sort=False, ignore_index=True)

In [None]:
# (rows, columns) of the combined train + test frame
DataFile.shape

In [None]:
# Derive the calendar features from the raw timestamp with a single vectorized
# parse instead of four row-wise apply()/strptime passes and a throwaway
# 'date' column.
timestamps = pd.to_datetime(DataFile["datetime"])

DataFile["hour"] = timestamps.dt.hour.astype("int")
# Year stays a string label, exactly as the original string split produced it.
DataFile["year"] = timestamps.dt.year.astype(str)
DataFile["month"] = timestamps.dt.month.astype("int")
# Same convention as datetime.weekday(): Monday=0 ... Sunday=6.
DataFile["weekday"] = timestamps.dt.weekday.astype("int")

# The raw timestamp is no longer needed once the parts are extracted.
DataFile = DataFile.drop(columns=["datetime"])

Convert Season and Weather into Categorical Values

In [None]:
# Replace the numeric season codes with readable labels
DataFile["season"] = DataFile["season"].map(
    {1: "Spring", 2: "Summer", 3: "Fall", 4: "Winter"}
)

In [None]:
# Replace the numeric weather codes (1-4) with their textual descriptions
DataFile["weather"] = DataFile["weather"].map({
    1: " Clear + Few clouds + Partly cloudy + Partly cloudy",
    2: " Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist ",
    3: " Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds",
    4: " Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog ",
})

In [None]:
# Columns that hold discrete labels rather than continuous measurements
categoryVariableList = ["hour","weekday","month","season","weather","holiday","workingday","year"]

# Cast all of them to pandas' category dtype in one astype call
DataFile = DataFile.astype({column: "category" for column in categoryVariableList})

In [None]:
import seaborn as sn

# Correlate only the numeric columns: the frame also holds string/category
# columns, which DataFrame.corr() refuses to process on pandas >= 2.0.
correlation_matrix = DataFile.select_dtypes(include="number").corr()

# Boolean mask hiding the strict upper triangle (a mirror image of the lower
# one); the diagonal stays visible. The previous mask reused the correlation
# values themselves as truthiness, so a coefficient of exactly 0 above the
# diagonal would have been left unmasked.
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool), k=1)

figure, ax = plt.subplots(figsize=(20, 10))
sn.heatmap(data=correlation_matrix, mask=mask, square=True, annot=True, cbar=True, ax=ax);

In [None]:
# Remove 'atemp' because of its correlation with the other features
DataFile = DataFile.drop(columns='atemp')

In [None]:
## Exploratory Outlier Analysis - Idea from https://www.kaggle.com/viveksrinivasan/eda-ensemble-model-top-10-percentile
# 2x2 grid of box plots of `count`: overall, per humidity value, per windspeed
# value and per hour of the day.
fig, axes = plt.subplots(nrows=2, ncols=2)
fig.set_size_inches(20, 15)
sn.boxplot(data=DataFile, y='count', orient='v', ax=axes[0][0])
sn.boxplot(data=DataFile, x='humidity', y='count', orient='v', ax=axes[0][1])
sn.boxplot(data=DataFile, x='windspeed', y='count', orient='v', ax=axes[1][0])
sn.boxplot(data=DataFile, x='hour', y='count', orient='v', ax=axes[1][1])

axes[0][0].set(ylabel='count', title="Box Plot On Count")
axes[0][1].set(xlabel='humidity', ylabel='Count', title="Box Plot On Count for Humidity range")
axes[1][0].set(xlabel='windspeed', ylabel='Count', title="Box Plot On Count for different wind speeds")
# The bottom-right panel plots `hour`, so label it accordingly (it used to be
# labelled and titled as if it showed the working-day flag).
axes[1][1].set(xlabel='hour', ylabel='Count', title="Box Plot On Count Across Hour Of The Day");

In [None]:
# Visualizations : Check the counts vs Season, Weather, Hour, Weekday - idea taken from https://www.kaggle.com/viveksrinivasan/eda-ensemble-model-top-10-percentile
# Three stacked panels, each showing the mean value per hour of the day.
fig, (ax1, ax2, ax3) = plt.subplots(nrows=3)
fig.set_size_inches(12, 30)
sortOrder = ["January","February","March","April","May","June","July","August","September","October","November","December"]
hueOrder = ["Sunday","Monday","Tuesday","Wednesday","Thursday","Friday","Saturday"]

# Panel 1: mean count per hour, one line per season
hourAggregated = DataFile.groupby(["hour","season"], sort=True)["count"].mean().reset_index()
sn.pointplot(
    x=hourAggregated["hour"],
    y=hourAggregated["count"],
    hue=hourAggregated["season"],
    data=hourAggregated,
    join=True,
    ax=ax1,
)
ax1.set(
    xlabel='Hour Of The Day',
    ylabel='Users Count',
    title="Average Users Count By Hour Of The Day Across Season",
    label='big',
)

# Panel 2: mean count per hour, one line per weather label
hourAggregated = DataFile.groupby(["hour","weather"], sort=True)["count"].mean().reset_index()
sn.pointplot(
    x=hourAggregated["hour"],
    y=hourAggregated["count"],
    hue=hourAggregated["weather"],
    data=hourAggregated,
    join=True,
    ax=ax2,
)
ax2.set(
    xlabel='Hour Of The Day',
    ylabel='Users Count',
    title="Average Users Count By Hour Of The Day Across Weather",
    label='big',
)

# Panel 3: reshape casual/registered to long form, then mean value per hour
# and user type
hourTransformed = DataFile[["hour","casual","registered"]].melt(
    id_vars=['hour'], value_vars=['casual', 'registered']
)
hourAggregated = hourTransformed.groupby(["hour","variable"], sort=True)["value"].mean().reset_index()

sn.pointplot(
    x=hourAggregated["hour"],
    y=hourAggregated["value"],
    hue=hourAggregated["variable"],
    hue_order=["casual","registered"],
    data=hourAggregated,
    join=True,
    ax=ax3,
)
ax3.set(
    xlabel='Hour Of The Day',
    ylabel='Users Count',
    title="Average Users Count By Hour Of The Day Across User Type",
    label='big',
)

In [None]:
# The per-user-type counts have been plotted above and are not selected as
# features by the pipelines below, so remove them.
DataFile = DataFile.drop(columns=['casual', 'registered'])

In [None]:
# Split the combined frame back into its two parts.
# Test rows are those without a `count`; they are kept in full, minus the
# empty target column.
test_set = DataFile[DataFile["count"].isnull()].drop("count", axis=1)

# Training rows are those whose `count` lies within three standard deviations
# of the mean. Rows with a missing `count` fail the comparison and are
# therefore excluded as well.
train_set = DataFile[
    (DataFile["count"] - DataFile["count"].mean()).abs() <= 3 * DataFile["count"].std()
]

In [None]:
# (rows, columns) of the test split
test_set.shape

In [None]:
# (rows, columns) of the training split after the outlier filter
train_set.shape

In [None]:
# Missing values per column in the training split, as an absolute count and as
# a percentage of the rows, largest first.
obs = train_set.isnull().sum().sort_values(ascending=False)
percent = (obs / len(train_set) * 100).round(2)
pd.concat([obs, percent], axis=1, keys=['Number of Observations', 'Percent'])

In [None]:
# Missing values per column in the test split, as an absolute count and as a
# percentage of the rows, largest first.
obs = test_set.isnull().sum().sort_values(ascending=False)
percent = (obs / len(test_set) * 100).round(2)
pd.concat([obs, percent], axis=1, keys=['Number of Observations', 'Percent'])

One Hot Encoding

In [None]:
# The CategoricalEncoder class will allow us to convert categorical attributes to one-hot vectors.

class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """Encode categorical features as one-hot vectors or ordinal integers.

    Parameters
    ----------
    encoding : {'onehot', 'onehot-dense', 'ordinal'}, default='onehot'
        'onehot' returns a sparse CSR matrix, 'onehot-dense' a dense array and
        'ordinal' one integer-coded column per input feature.
    categories : 'auto' or list of array-likes, default='auto'
        With 'auto' the categories of each feature are learned from the data
        passed to ``fit``; otherwise ``categories[i]`` lists the values
        allowed in column ``i``.
    dtype : number type, default=np.float64
        dtype of the transformed output.
    handle_unknown : {'error', 'ignore'}, default='error'
        'error' raises ``ValueError`` on a value outside the known categories;
        'ignore' (one-hot encodings only) leaves the one-hot columns of that
        feature at zero for the affected rows.

    Attributes
    ----------
    categories_ : list of arrays
        The categories of each feature, in the order used for encoding.
    """

    def __init__(self, encoding='onehot', categories='auto', dtype=np.float64,
                 handle_unknown='error'):
        self.encoding = encoding
        self.categories = categories
        self.dtype = dtype
        self.handle_unknown = handle_unknown

    def fit(self, X, y=None):
        """Fit the CategoricalEncoder to X.
        Parameters
        ----------
        X : array-like, shape [n_samples, n_feature]
            The data to determine the categories of each feature.
        Returns
        -------
        self
        Raises
        ------
        ValueError
            If ``encoding`` or ``handle_unknown`` is invalid, if the two are
            incompatible, or if explicit ``categories`` were given, X contains
            a value outside them and ``handle_unknown='error'``.
        """

        if self.encoding not in ['onehot', 'onehot-dense', 'ordinal']:
            template = ("encoding should be either 'onehot', 'onehot-dense' "
                        "or 'ordinal', got %s")
            # Report the offending `encoding` value (this message used to
            # interpolate `handle_unknown` by mistake).
            raise ValueError(template % self.encoding)

        if self.handle_unknown not in ['error', 'ignore']:
            template = ("handle_unknown should be either 'error' or "
                        "'ignore', got %s")
            raise ValueError(template % self.handle_unknown)

        if self.encoding == 'ordinal' and self.handle_unknown == 'ignore':
            raise ValueError("handle_unknown='ignore' is not supported for"
                             " encoding='ordinal'")

        # The builtin `object` replaces the `np.object` alias, which was
        # removed in NumPy 1.24.
        X = check_array(X, dtype=object, accept_sparse='csc', copy=True)
        n_samples, n_features = X.shape

        # One LabelEncoder per column holds that column's category list.
        self._label_encoders_ = [LabelEncoder() for _ in range(n_features)]

        for i in range(n_features):
            le = self._label_encoders_[i]
            Xi = X[:, i]
            if self.categories == 'auto':
                le.fit(Xi)
            else:
                valid_mask = np.isin(Xi, self.categories[i])
                if not np.all(valid_mask):
                    if self.handle_unknown == 'error':
                        diff = np.unique(Xi[~valid_mask])
                        msg = ("Found unknown categories {0} in column {1}"
                               " during fit".format(diff, i))
                        raise ValueError(msg)
                # Explicit categories: install them (sorted) directly instead
                # of learning them from the data.
                le.classes_ = np.array(np.sort(self.categories[i]))

        self.categories_ = [le.classes_ for le in self._label_encoders_]

        return self

    def transform(self, X):
        """Transform X using one-hot encoding.
        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            The data to encode.
        Returns
        -------
        X_out : sparse matrix or a 2-d array
            Transformed input.
        Raises
        ------
        ValueError
            If X contains a value outside the fitted categories and
            ``handle_unknown='error'``.
        """
        # Builtin object/int/bool replace the np.object/np.int/np.bool
        # aliases, which were removed in NumPy 1.24.
        X = check_array(X, accept_sparse='csc', dtype=object, copy=True)
        n_samples, n_features = X.shape
        X_int = np.zeros_like(X, dtype=int)
        X_mask = np.ones_like(X, dtype=bool)

        for i in range(n_features):
            valid_mask = np.isin(X[:, i], self.categories_[i])

            if not np.all(valid_mask):
                if self.handle_unknown == 'error':
                    diff = np.unique(X[~valid_mask, i])
                    msg = ("Found unknown categories {0} in column {1}"
                           " during transform".format(diff, i))
                    raise ValueError(msg)
                else:
                    # Set the problematic rows to an acceptable value and
                    # continue. The rows are marked in `X_mask` and will be
                    # removed later.
                    X_mask[:, i] = valid_mask
                    X[:, i][~valid_mask] = self.categories_[i][0]
            X_int[:, i] = self._label_encoders_[i].transform(X[:, i])

        if self.encoding == 'ordinal':
            return X_int.astype(self.dtype, copy=False)

        mask = X_mask.ravel()
        # Offsets of each feature's first one-hot column in the output.
        n_values = [cats.shape[0] for cats in self.categories_]
        n_values = np.array([0] + n_values)
        indices = np.cumsum(n_values)

        # Masked-out (unknown) entries contribute no 1 to the output.
        column_indices = (X_int + indices[:-1]).ravel()[mask]
        row_indices = np.repeat(np.arange(n_samples, dtype=np.int32),
                                n_features)[mask]
        data = np.ones(n_samples * n_features)[mask]

        out = sparse.csc_matrix((data, (row_indices, column_indices)),
                                shape=(n_samples, indices[-1]),
                                dtype=self.dtype).tocsr()
        if self.encoding == 'onehot-dense':
            return out.toarray()
        else:
            return out

In [None]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that picks the given columns out of its input.

    Parameters
    ----------
    attribute_names : list of str
        Column labels to keep; used to index the object passed to
        ``transform``.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the instance unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the configured column labels."""
        selected = X[self.attribute_names]
        return selected

In [None]:
# Split the training frame into the target (y) and the features (X).

# Target: the `count` column, cast to integers
train_set_y = train_set["count"].astype(int)

# Features: every remaining column
train_set_X = train_set.drop(columns="count")

In [None]:
# One pipeline per feature group; each starts by selecting its own columns.

# Text-labelled features -> dense one-hot columns
cat_pipeline = Pipeline(steps=[
    ("selector", DataFrameSelector(["season", "weather"])),
    ("cat_encoder", CategoricalEncoder(encoding="onehot-dense")),
])

# Continuous measurements -> min-max scaled
num_pipeline = Pipeline(steps=[
    ("selector", DataFrameSelector(["temp", "humidity", "windspeed"])),
    ("scaler", MinMaxScaler()),
])

# Remaining calendar/flag columns are passed through as they are
no_pipeline = Pipeline(steps=[
    ("selector", DataFrameSelector(["holiday", "workingday", "hour", "year", "month", "weekday"])),
])

In [None]:
# Concatenate the outputs of the three feature pipelines side by side
full_pipeline = FeatureUnion(
    transformer_list=[
        ("cat_pipeline", cat_pipeline),
        ("num_pipeline", num_pipeline),
        ("no_pipeline", no_pipeline),
    ]
)

# Fit the transformers on the training features, then reuse them on the test set
final_train_X = full_pipeline.fit_transform(train_set_X)
final_test_X = full_pipeline.transform(test_set)

# The models are trained on log1p(count) rather than on the raw count
final_train_y = np.log1p(train_set_y)

In [None]:
#pd.DataFrame(final_train_y).to_csv("test.csv")
#final_train_y

In [None]:
#Random Forest Regressor.
forest_class = RandomForestRegressor(random_state = 42)

n_estimators = [500]
# 1.0 (use all features) replaces "auto": for regressors "auto" meant exactly
# that, and the string was removed in scikit-learn 1.3.
max_features = [1.0, 'sqrt', 'log2']

param_grid_forest = {'n_estimators' : n_estimators, 'max_features' : max_features}

# final_train_y is already log1p(count), so the plain MSE on it equals the
# squared log error of the raw counts. Scoring with
# 'neg_mean_squared_log_error' would apply the log a second time.
rand_search_forest = GridSearchCV(forest_class, param_grid_forest, cv = 4,
                                  scoring='neg_mean_squared_error', refit = True, n_jobs = -1, verbose=2)

rand_search_forest.fit(final_train_X, final_train_y)

In [None]:
# Best forest found by the grid search (refit=True, so it was retrained on the
# whole training set)
random_estimator = rand_search_forest.best_estimator_

# RMSE between predicted and actual log1p(count), i.e. the RMSLE of the counts.
# NOTE: this is measured on the training data itself, not on held-out rows.
y_pred_rf = random_estimator.predict(final_train_X)
rf_msle = mean_squared_error(y_true=final_train_y, y_pred=y_pred_rf)
rf_rmsle = np.sqrt(rf_msle)
rf_rmsle

In [None]:
#ADA Regressor

ada_boost = AdaBoostRegressor(random_state = 42)

n_estimators = [500]

param_grid_ada = {'n_estimators' : n_estimators}

# final_train_y is already log1p(count), so the plain MSE on it equals the
# squared log error of the raw counts. Scoring with
# 'neg_mean_squared_log_error' would apply the log a second time.
rand_search_ada = GridSearchCV(ada_boost, param_grid_ada, cv = 4, scoring='neg_mean_squared_error',
                               refit = True, n_jobs = -1, verbose = 2)

rand_search_ada.fit(final_train_X, final_train_y)

In [None]:
# Best AdaBoost model found by the grid search (refit on the whole training set)
ada_estimator = rand_search_ada.best_estimator_

# RMSE between predicted and actual log1p(count), i.e. the RMSLE of the counts.
# NOTE: this is measured on the training data itself, not on held-out rows.
y_pred_ada = ada_estimator.predict(final_train_X)
ada_msle = mean_squared_error(y_true=final_train_y, y_pred=y_pred_ada)
ada_rmsle = np.sqrt(ada_msle)
ada_rmsle

In [None]:
# Support vector regressor with scikit-learn's default hyper-parameters
# (no grid search is run for this model)

SVR_Reg = SVR()

# Train on the transformed features against the log1p-transformed target
SVR_Reg.fit(final_train_X, final_train_y)

In [None]:
# RMSE between predicted and actual log1p(count), i.e. the RMSLE of the counts.
# NOTE: this is measured on the training data itself, not on held-out rows.
y_pred_svr = SVR_Reg.predict(final_train_X)
svr_msle = mean_squared_error(y_true=final_train_y, y_pred=y_pred_svr)
svr_rmsle = np.sqrt(svr_msle)
svr_rmsle

In [None]:
# Gradient boosting regressor, tuned over the number of boosting stages
GB_Classifier = GradientBoostingRegressor(random_state = 42)

n_estimators = [50,500]

param_grid_grad_boost_class = {'n_estimators' : n_estimators}

# final_train_y is already log1p(count), so the plain MSE on it equals the
# squared log error of the raw counts. Scoring with
# 'neg_mean_squared_log_error' would apply the log a second time.
rand_search_grad_boost_class = GridSearchCV(GB_Classifier, param_grid_grad_boost_class, cv = 4,
                                            scoring='neg_mean_squared_error',
                                            refit = True, n_jobs = -1, verbose = 2)

rand_search_grad_boost_class.fit(final_train_X, final_train_y)

In [None]:
# Predict with the refit best gradient-boosting model held by the grid search.
# RMSE between predicted and actual log1p(count), i.e. the RMSLE of the counts.
# NOTE: this is measured on the training data itself, not on held-out rows.
y_pred_gb = rand_search_grad_boost_class.predict(final_train_X)
gb_msle = mean_squared_error(y_true=final_train_y, y_pred=y_pred_gb)
gb_rmsle = np.sqrt(gb_msle)
gb_rmsle

Random Forest gives the best prediction (the lowest RMSLE on the training set among the models above), so it is used for the submission.

In [None]:
# Predict log1p(count) for the test rows with the tuned random forest.
pred = random_estimator.predict(final_test_X)

# The target was transformed with np.log1p, so it must be inverted with
# np.expm1; np.exp would add 1 to every predicted count.
d = {'datetime': TestFile['datetime'], 'count': np.expm1(pred)}
ans = pd.DataFrame(d)
ans.to_csv('answer.csv', index=False)