In [None]:
import os
# Walk /kaggle/input recursively and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error

In [None]:
# Read the train and test CSV files into DataFrames.
TRAIN_CSV = '../input/bike-sharing-demand/train.csv'
TEST_CSV = '../input/bike-sharing-demand/test.csv'

train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

# Data Exploration

In [None]:
# Preview the first five rows of the training data (5 is head()'s default).
train.head(n=5)

In [None]:
# Show the dtype of every column in the training frame.
train.dtypes

In [None]:
# Print a summary of the training frame.
train.info()

In [None]:
# Print a summary of the test frame.
test.info()

In [None]:
# 'casual' and 'registered' are removed from the training data because
# the test data does not contain them.
train = train.drop(columns=['casual', 'registered'])

In [None]:
# Number of missing values in each column
train.isna().sum()

In [None]:
# How many rows fall into each distinct value of the season column
train['season'].value_counts()

In [None]:
# Count plot of rows per season value.
# catplot is the current name of the deprecated factorplot.
sns.catplot(x='season', data=train, kind='count')

In [None]:
# How many rows fall into each distinct value of the holiday column
train['holiday'].value_counts()

In [None]:
# Count plot of rows per holiday value.
# catplot is the current name of the deprecated factorplot.
sns.catplot(x='holiday', data=train, kind='count')

In [None]:
# How many rows fall into each distinct value of the workingday column
train['workingday'].value_counts()

In [None]:
# Count plot of rows per workingday value.
# catplot is the current name of the deprecated factorplot.
sns.catplot(x='workingday', data=train, kind='count')

In [None]:
# How many rows fall into each distinct value of the weather column
train['weather'].value_counts()

In [None]:
# Count plot of rows per weather value.
# catplot is the current name of the deprecated factorplot.
sns.catplot(x='weather', data=train, kind='count')

In [None]:
# Descriptive statistics for the training frame.
train.describe()

In [None]:
# Histograms of temp, atemp, windspeed and humidity on a 2x2 grid.
fig, axes = plt.subplots(2, 2)
panels = [
    ("temp", "Temperature"),
    ("atemp", "atemp"),
    ("windspeed", "Windspeed"),
    ("humidity", "Humidity"),
]
# axes.flat visits the grid row by row: [0,0], [0,1], [1,0], [1,1].
for ax, (column, title) in zip(axes.flat, panels):
    ax.hist(x=column, data=train)
    ax.set_title(title)
fig.set_size_inches(10, 10)

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap.
# The raw 'datetime' strings are still in the frame at this point; newer pandas
# no longer drops non-numeric columns silently in corr() and raises instead,
# so the numeric columns are selected explicitly.
cor_mat = train.select_dtypes(include='number').corr()
fig, ax = plt.subplots(figsize=(9, 9))
sns.heatmap(data=cor_mat, annot=True, ax=ax)

**We conclude that:**
1. `temp` and `atemp` are highly correlated with each other, so they have nearly the same correlation with `count`.
2. There is a negative correlation between `humidity` and `count`.

# Feature Engineering

In [None]:
# One-hot encode the season column in both train and test.
# The two frames are encoded separately, so the test dummies are re-indexed to
# the train dummy columns: a season level absent from test becomes an all-zero
# column, and a level unseen in train is dropped. This keeps the feature set
# identical between the two frames.
season_train = pd.get_dummies(train['season'], prefix='season')
season_test = pd.get_dummies(test['season'], prefix='season').reindex(
    columns=season_train.columns, fill_value=0
)

train = pd.concat([train, season_train], axis=1)
test = pd.concat([test, season_test], axis=1)


In [None]:
# One-hot encode the weather column in both train and test.
# The two frames are encoded separately, so the test dummies are re-indexed to
# the train dummy columns: a weather level absent from test becomes an all-zero
# column, and a level unseen in train is dropped. This keeps the feature set
# identical between the two frames.
weather_train = pd.get_dummies(train['weather'], prefix='weather')
weather_test = pd.get_dummies(test['weather'], prefix='weather').reindex(
    columns=weather_train.columns, fill_value=0
)

train = pd.concat([train, weather_train], axis=1)
test = pd.concat([test, weather_test], axis=1)

In [None]:
# Remove the raw season and weather columns from both frames.
encoded_sources = ['season', 'weather']

train = train.drop(columns=encoded_sources)
test = test.drop(columns=encoded_sources)


In [None]:
# Derive hour, day-of-week, month and year features from the train datetime column.
# The column is parsed once and the vectorised .dt accessors are used, instead of
# re-parsing it and looping over every timestamp for each feature.
train_timestamps = pd.to_datetime(train['datetime'])
train["hour"] = train_timestamps.dt.hour
train["day"] = train_timestamps.dt.dayofweek  # day of the week, Monday=0
train["month"] = train_timestamps.dt.month
# Encode the year as 0/1; a year other than 2011 or 2012 would map to NaN.
train['year'] = train_timestamps.dt.year.map({2011:0, 2012:1})

In [None]:
# Derive hour, day-of-week, month and year features from the test datetime column.
# The column is parsed once and the vectorised .dt accessors are used, instead of
# re-parsing it and looping over every timestamp for each feature.
test_timestamps = pd.to_datetime(test['datetime'])
test["hour"] = test_timestamps.dt.hour
test["day"] = test_timestamps.dt.dayofweek  # day of the week, Monday=0
test["month"] = test_timestamps.dt.month
# Encode the year as 0/1; a year other than 2011 or 2012 would map to NaN.
test['year'] = test_timestamps.dt.year.map({2011:0, 2012:1})

In [None]:
# The datetime column has been expanded into hour/day/month/year,
# so remove it from both frames.
train = train.drop(columns='datetime')
test = test.drop(columns='datetime')

In [None]:
# Bar plot of count by hour.
# catplot is the current name of the deprecated factorplot, and the figure-size
# keyword `size` was renamed to `height`.
sns.catplot(x="hour", y="count", data=train, kind='bar', height=8)

**We conclude that:**
Counts are higher during the hour intervals 7-10 and 15-19.

# Model

In [None]:
# Hold out 25% of the training rows for validation, with a fixed seed.
feature_frame = train.drop(columns='count')
target = train['count']
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame, target, test_size=0.25, random_state=42
)

In [None]:
# Fit a random forest on the training split and predict the held-out split.
# random_state is fixed so the validation score is reproducible across runs.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)
pred = model.predict(X_test)

In [None]:
# RMSLE on the held-out split.
# scikit-learn metrics take (y_true, y_pred) in that order; the value is the same
# either way for this symmetric metric, but the conventional order is used.
print('Validation rmsle = ', np.sqrt(mean_squared_log_error(y_test, pred)))

In [None]:
# Refit the model on all training rows, then predict the test set.
# The test frame is indexed by the training feature columns so both frames
# present exactly the same features in the same order; a missing column fails
# here with a KeyError instead of silently misaligning features.
full_features = train.drop('count',axis=1)
model.fit(full_features,train['count'])
predictions = model.predict(test[full_features.columns])

# Submission

In [None]:
# Build the submission file: one row per test timestamp with its predicted count.
# Only the datetime column is re-read (it was dropped from `test` earlier), and it
# goes into its own variable so the engineered `test` frame is not overwritten and
# the prediction cell can still be re-run.
submission_times = pd.read_csv('../input/bike-sharing-demand/test.csv', usecols=['datetime'])
submission = pd.DataFrame({'datetime': submission_times['datetime'], 'count': predictions})
submission.to_csv('submission.csv',index=False)