In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's dark-grid theme to every plot drawn below.
sns.set(style='darkgrid')

# Read the training set from the input folder and preview its first rows.
train = pd.read_csv('../input/train.csv')
train.head()

# Data Exploration
Checking whether there are any missing values.

In [None]:
# Number of missing entries in each column.
train.isna().sum()

In [None]:
# Distinct codes that appear in the 'season' column.
train['season'].unique()

In [None]:
# How many rows fall under each 'weather' code.
train['weather'].value_counts()

In [None]:
# How many rows fall under each 'holiday' value.
train['holiday'].value_counts()

In [None]:
# Average 'count' for each season (seaborn's barplot shows the mean by default).
sns.barplot(data=train, x='season', y='count')

In [None]:
# Average 'count' for each weather code (seaborn's barplot shows the mean by default).
sns.barplot(data=train, x='weather', y='count')

In [None]:
# Mean 'count' per 'holiday' value, ordered from lowest to highest mean.
train.groupby('holiday')[['count']].mean().sort_values('count')

In [None]:
# Mean 'count' per 'season' value, ordered from lowest to highest mean.
train.groupby('season')[['count']].mean().sort_values('count')

We have a datetime column here, so it is better to break it into hour, day (of the week), month and year, and store each as a separate column.
Further, the year column contains just two distinct years, 2011 and 2012, so I used map() to convert them to 0 and 1 respectively, because most of the other columns (season, holiday, weather and workingday) are already small integer codes such as 0, 1, 2, 3.

In [None]:
# Parse the timestamp strings once and derive the calendar features with the
# vectorised `.dt` accessor, instead of re-parsing the column and looping in
# Python for every feature.
train_timestamps = pd.to_datetime(train['datetime'])
train['hour'] = train_timestamps.dt.hour
# NOTE: despite its name, 'day' holds the day of the week (Monday=0 ... Sunday=6).
train['day'] = train_timestamps.dt.dayofweek
train['month'] = train_timestamps.dt.month
# Encode the two years as 0/1; any year missing from the mapping becomes NaN.
train['year'] = train_timestamps.dt.year.map({2011: 0, 2012: 1})

It's time to split our training data into features and target.

In [None]:
# Features: every column except the first one (presumably the raw datetime
# column -- confirm against the CSV layout).
X = train.iloc[:, 1:]
# Target: the 'count' column.
y = train['count']

The training data shows that the sum of the registered and casual columns yields count.
Keeping these two columns as features is therefore unnecessary, and it would leak the target into the model; machine learning models work better when the dataset is free of such columns.

In [None]:
# If count == casual + registered, every point lies on the line y = x.
plt.scatter(train['casual'] + train['registered'], train['count'])
plt.show()

In [None]:
# 'registered' and 'casual' sum to the target and 'count' is the target itself,
# so none of them may be used as a feature.
# X is rebuilt from `train` (instead of dropping from X in place) so that this
# cell can be re-run without raising KeyError on already-removed columns.
X = train.iloc[:, 1:].drop(columns=['registered', 'casual', 'count'])

# Applying machine learning models
Splitting into training and test sets using scikit-learn's train_test_split function.

In [None]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20; the function
# lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=0)

Features on larger scales can unduly influence the model.
We want features on a similar scale.
scikit-learn's preprocessing module provides the StandardScaler class to scale our data.

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply that
# same transformation to both splits.
scl = StandardScaler()
scl.fit(X_train)
X_train_std = scl.transform(X_train)
X_test_std = scl.transform(X_test)

I used a RandomForestRegressor model, fitting it on the training data and predicting on the test data.

In [None]:
from sklearn.ensemble import RandomForestRegressor

# The criterion is left at its default (squared error). The explicit spelling
# 'mse' was removed in scikit-learn 1.2 and raises an error there, while the
# default means the same thing in old and new releases alike.
forest = RandomForestRegressor(n_estimators=400, random_state=1, n_jobs=-1)
forest.fit(X_train_std, y_train)

# Predictions on both splits, used by the evaluation that follows.
y_train_pred = forest.predict(X_train_std)
y_test_pred = forest.predict(X_test_std)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Root Mean Squared Logarithmic Error (RMSLE) is the accuracy criterion for this problem.
rmsle_train = np.sqrt(mean_squared_error(np.log(y_train + 1), np.log(y_train_pred + 1)))
rmsle_test = np.sqrt(mean_squared_error(np.log(y_test + 1), np.log(y_test_pred + 1)))
# R^2 is reported as well, computed on the untransformed values.
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)

print('RMSLE train: %.3f' % rmsle_train)
print('RMSLE test: %.3f' % rmsle_test)
print('R2 train: %.3f' % r2_train)
print('R2 test: %.3f' % r2_test)

Further, I tried a DecisionTreeRegressor model, but its performance was no better than that of the RandomForestRegressor model.

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Seed the tree so its scores are reproducible from run to run; an unseeded
# tree can differ between runs because its split search is randomised.
clf = DecisionTreeRegressor(random_state=1)
clf.fit(X_train_std, y_train)
y_train_pred2 = clf.predict(X_train_std)
y_test_predd = clf.predict(X_test_std)

# Root Mean Squared Logarithmic Error (RMSLE) is the accuracy criterion for this problem.
print('RMSLE train: %.3f' % np.sqrt(mean_squared_error(np.log(y_train + 1), np.log(y_train_pred2 + 1))))
print('RMSLE test: %.3f' % np.sqrt(mean_squared_error(np.log(y_test + 1), np.log(y_test_predd + 1))))
print('R2 train: %.3f' % r2_score(y_train, y_train_pred2))
print('R2 test: %.3f' % r2_score(y_test, y_test_predd))

# The same approach is applied to the competition's test data.

In [None]:
# Read the competition test set from the input folder and preview its first rows.
test = pd.read_csv('../input/test.csv')
test.head()

Similarly, I converted the datetime column to hour, day, month and year.

In [None]:
# Parse the timestamp strings once and derive the calendar features with the
# vectorised `.dt` accessor, instead of re-parsing the column and looping in
# Python for every feature.
test_timestamps = pd.to_datetime(test['datetime'])
test['hour'] = test_timestamps.dt.hour
# NOTE: despite its name, 'day' holds the day of the week (Monday=0 ... Sunday=6).
test['day'] = test_timestamps.dt.dayofweek
test['month'] = test_timestamps.dt.month
# Encode the two years as 0/1; any year missing from the mapping becomes NaN.
test['year'] = test_timestamps.dt.year.map({2011: 0, 2012: 1})

In [None]:
# Pick the model's feature columns by name (taken from the training feature
# frame X) rather than by position. This guarantees the same columns in the
# same order as during training, and still works if `test` has gained extra
# columns by the time the cell is re-run.
# NOTE(review): this reuses the name X_test, replacing the hold-out split
# created by train_test_split earlier in the notebook.
X_test = test[X.columns]

Similarly, I used the same standard scaler (already fitted on the training data) for the test data.

In [None]:
# Scale with the scaler already fitted on the training split. The input is
# taken straight from `test` (feature columns selected by name) instead of
# from X_test itself, so re-running this cell cannot scale already-scaled data.
X_test = scl.transform(test[X.columns])

In [None]:
# Random-forest predictions for the competition test rows.
# NOTE(review): this reuses the name y_test, replacing the hold-out target from
# train_test_split; the evaluation cells cannot be re-run correctly after it.
y_test = forest.predict(X=X_test)

In [None]:
# Work on a copy: a plain assignment would only alias `test`, so adding the
# prediction column to df_submit would silently add it to `test` as well.
df_submit = test.copy()

In [None]:
# Store the predictions rounded to the nearest whole number (y_test holds the
# model's predictions for the competition test rows at this point).
df_submit['count'] = np.round(y_test, decimals=0)

In [None]:
# Keep only 'datetime' and 'count'. Selecting the two columns leaves the same
# frame as dropping the twelve feature columns, but the cell can be re-run
# without raising KeyError on columns that are already gone.
df_submit = df_submit[['datetime', 'count']]

In [None]:
# Preview the first five rows of the submission frame.
df_submit.head(5)

In [None]:
# Write the submission file without the DataFrame index column.
df_submit.to_csv(path_or_buf='bike2.csv', index=False)