In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

In [None]:
import os

# Show which files are present in the input directory.
input_files = os.listdir("../input")
print(input_files)

In [None]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Widen pandas' text rendering so wide frames are displayed in full.
display_options = {
    'display.width': 500,
    'display.max_columns': 100,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [None]:
# Location of the training CSV, relative to the working directory.
path = "../input/train.csv"
# Read the training data into a DataFrame.
bike_df = pd.read_csv(path)
# Preview the first rows.
bike_df.head()

# ***Check for Missing Values***

In [None]:
# Print each column's dtype and non-null count to spot missing values.
bike_df.info()

In [None]:
# Pairwise correlations between the numeric columns. Selecting the numeric
# columns explicitly keeps this working on pandas >= 2.0, where
# DataFrame.corr() no longer drops non-numeric columns (such as the
# still-unparsed `datetime` column) and raises instead.
bike_df.select_dtypes(include='number').corr()

In [None]:
# Draw seaborn's pairwise-relationship grid for the DataFrame.
sns.pairplot(bike_df)

# ***Make RMSLE Score***

In [None]:
def RMSLE(y_true, y_pred):
    """Root mean squared logarithmic error between targets and predictions.

    Computes ``sqrt(mean((log1p(y_pred) - log1p(y_true)) ** 2))``.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values. Negative predictions are treated as 0 before the
        logarithm is taken. The caller's object is never modified.

    Returns
    -------
    float
        The RMSLE; lower is better.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not hold the same number of values.
    """
    actual = np.asarray(y_true, dtype=float).ravel()
    # np.clip returns a new array, so the predictions passed in stay intact
    # (the previous boolean-mask assignment overwrote them in place).
    predicted = np.clip(np.asarray(y_pred, dtype=float).ravel(), 0, None)
    if actual.size != predicted.size:
        raise ValueError(
            "y_true and y_pred must contain the same number of values")
    log_error = np.log1p(predicted) - np.log1p(actual)
    return np.sqrt(np.mean(np.square(log_error)))

# RMSLE is an error metric (lower is better). greater_is_better=False makes
# the scorer report the negated value, so that model-selection tools, which
# maximize the score, prefer the model with the smallest error.
rmsle_score = make_scorer(RMSLE, greater_is_better=False)

# ***Preparation of Data***

In [None]:
# Lookup tables used by dataPreparation to turn integer codes into labels.
# pandas' ``dt.dayofweek`` numbers the week from Monday (0) to Sunday (6).
_DAY_NAMES = {
    0: 'Monday',
    1: 'Tuesday',
    2: 'Wednesday',
    3: 'Thursday',
    4: 'Friday',
    5: 'Saturday',
    6: 'Sunday'
}
_MONTH_NAMES = {
    1: 'January',
    2: 'February',
    3: 'March',
    4: 'April',
    5: 'May',
    6: 'June',
    7: 'July',
    8: 'August',
    9: 'September',
    10: 'October',
    11: 'November',
    12: 'December'
}
_YEAR_CODES = {2011: 0, 2012: 1}
_YES_NO = {0: 'No', 1: 'Yes'}
_WEATHER_LABELS = {
    1:
    'Clear, Few clouds, Partly cloudy, Partly cloudy',
    2:
    'Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist',
    3:
    'Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds',
    4:
    'Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog'
}
_SEASON_NAMES = {1: 'spring', 2: 'summer', 3: 'fall', 4: 'winter'}


def dataPreparation(bike_df):
    """Derive calendar features and replace integer codes with labels.

    The input frame is left untouched; all changes are made on a copy.

    Parameters
    ----------
    bike_df : pandas.DataFrame
        Frame with a ``datetime`` column parseable by ``pd.to_datetime`` and
        integer-coded ``workingday``, ``holiday``, ``weather`` and ``season``
        columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``bike_df`` in which ``datetime`` is parsed, the columns
        ``hour`` (int), ``day`` (weekday name), ``month`` (month name) and
        ``year`` (0 for 2011, 1 for 2012) are added, and the coded columns
        hold their text labels. Codes missing from a lookup table become NaN.
    """
    # Work on a copy so the caller's frame is not modified.
    prepared = bike_df.copy()

    stamps = pd.to_datetime(prepared['datetime'])
    prepared['datetime'] = stamps
    prepared['hour'] = stamps.dt.hour.astype('int')
    prepared['day'] = stamps.dt.dayofweek.map(_DAY_NAMES)
    prepared['month'] = stamps.dt.month.map(_MONTH_NAMES)
    prepared['year'] = stamps.dt.year.map(_YEAR_CODES)

    for flag_column in ('workingday', 'holiday'):
        prepared[flag_column] = prepared[flag_column].map(_YES_NO)
    prepared['weather'] = prepared['weather'].map(_WEATHER_LABELS)
    prepared['season'] = prepared['season'].map(_SEASON_NAMES)

    return prepared

In [None]:
# dataPreparation maps integer codes to labels, so applying it to an already
# prepared frame would map the labels again and produce NaN. Skip the call
# when the derived `hour` column exists, so re-running this cell is harmless.
if 'hour' not in bike_df.columns:
    bike_df = dataPreparation(bike_df)
bike_df.head()

# ***Divide Target and Features***

In [None]:
# Response variable: the `count` column.
target = bike_df['count']

# Predictor columns, in the order they are handed to the model pipeline.
feature_columns = [
    'year', 'month', 'day', 'hour',
    'season', 'holiday', 'workingday', 'weather',
    'temp', 'humidity', 'windspeed',
]
features = bike_df.loc[:, feature_columns]

   ### ***Preprocessing of categorical features***

In [None]:
# Columns stored with object dtype are treated as categorical.
categorical_features = [
    column for column, dtype in features.dtypes.items() if dtype == 'object'
]
print(categorical_features)

# One-hot encode them; categories not seen during fit are ignored.
categorical_transformer = Pipeline(steps=[
    ('cat', OneHotEncoder(handle_unknown='ignore')),
])

   ### ***Preprocessing of numerical features***

In [None]:
# Every column that is not object-typed starts out as numerical.
numerical_features = [
    column for column, dtype in features.dtypes.items() if dtype != 'object'
]
print(numerical_features)

# `year` and `hour` are excluded from robust scaling.
for unscaled_column in ('year', 'hour'):
    numerical_features.remove(unscaled_column)

numerical_transformer = Pipeline(steps=[('robust', RobustScaler())])
print(numerical_features)

### ***Preprocessing: combine the numerical and categorical transformers***

In [None]:
# Scale the numerical columns and one-hot encode the categorical ones.
# `year` and `hour` are in neither list (they are removed from the numerical
# list and are not object-typed), so with ColumnTransformer's default
# remainder='drop' they would be silently discarded before reaching the
# regressor. Pass them through unchanged instead.
preprocessing = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='passthrough',
)

In [None]:
# Side-by-side histograms of the target, raw and log-transformed, on a
# shared y axis.
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(10, 4))
panels = [
    (ax1, target, 'Distribution Before Log Transformation'),
    (ax2, np.log(target), 'Distribution After Log Transformation'),
]
for axis, values, title in panels:
    axis.hist(values, bins=30)
    axis.set_title(title)

In [None]:
# Hold out 20% of the rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42421,
)

In [None]:
# Preprocessing followed by a seeded random-forest regressor.
forest = RandomForestRegressor(n_estimators=80, random_state=42421)
reg = Pipeline(steps=[
    ('pre', preprocessing),
    ('reg', forest),
])

In [None]:
# Hyper-parameter grid for the pipeline's 'reg' step (the `reg__` prefix
# routes each entry to that step).
param_grid = dict(
    reg__n_estimators=range(80, 110, 5),
    reg__max_features=[30, 31],
    reg__bootstrap=[True, False],
)

In [None]:
%%time
reg_grid = GridSearchCV(reg, param_grid, cv=5 ,iid=False, n_jobs=-1)
reg_grid.fit(X_train, y_train)
print(reg_grid.best_estimator_)
print(reg_grid.best_score_)

In [None]:
# Show the 'reg' step of the best pipeline found by the grid search.
print(reg_grid.best_estimator_.named_steps['reg'])

In [None]:
# Score the tuned pipeline on the held-out rows, with predictions rounded
# to whole numbers.
holdout_predictions = reg_grid.predict(X_test)
y_pred = np.round(holdout_predictions)
RMSLE(y_true=y_test, y_pred=y_pred)

In [None]:
# Read the test set and run it through the same preparation as the
# training data.
path = "../input/test.csv"
test = dataPreparation(pd.read_csv(path))

# Keep the timestamps for the submission file before the column selection
# below drops them.
years = test['datetime']

test_columns = [
    'year', 'month', 'day', 'hour',
    'season', 'holiday', 'workingday', 'weather',
    'temp', 'humidity', 'windspeed',
]
test = test.loc[:, test_columns]

In [None]:
# Re-run the grid search on all training rows, this time fitting the
# natural logarithm of the target.
log_target = np.log(target)
reg_grid.fit(features, log_target)

In [None]:
# Undo the log transform (e ** prediction), round to whole numbers, and
# score against the training target.
log_predictions = reg_grid.predict(features)
y_pred = np.round(np.power(np.e, log_predictions))
RMSLE(y_true=target, y_pred=y_pred)

In [None]:
# Predict on the test set, undo the log transform and round to whole numbers.
test_log_predictions = reg_grid.predict(test)
submission = pd.Series(
    np.round(np.power(np.e, test_log_predictions)),
    name='count',
)
submission.head()

In [None]:
# Put the timestamps next to the predicted counts and write the result
# without the index column.
submission = pd.concat(objs=[years, submission], axis='columns')
submission.to_csv(path_or_buf='submission.csv', index=False)