In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#Importing Models
from sklearn.ensemble import RandomForestRegressor,BaggingRegressor,AdaBoostRegressor
#importing Preprocess 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
#importing Evaluation Metrics
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import accuracy_score
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the train and test CSVs, parsing the 'datetime' column as timestamps.
train_data, test_data = (
    pd.read_csv(csv_path, parse_dates=['datetime'])
    for csv_path in ('/kaggle/input/bike-sharing-demand/train.csv',
                     '/kaggle/input/bike-sharing-demand/test.csv')
)

In [None]:
# Preview the first rows of the training frame.
train_data.head()

In [None]:
# Show the dtype pandas assigned to each training column.
train_data.dtypes

In [None]:
# Count missing values per column of the training frame.
train_data.isna().sum()

In [None]:
# Print the distinct codes present in each of the categorical columns.
for column in ('season', 'holiday', 'weather', 'workingday'):
    print('unique values for {} column:\n {}'.format(column, train_data[column].unique()))

In [None]:
# Line plot of rental count against humidity; the confidence interval is disabled (ci=None).
humidity_grid = sns.relplot(
    data=train_data,
    x='humidity',
    y='count',
    kind='line',
    ci=None,
)
plt.show()

In [None]:
# Bar chart of the number of rows per value of the 'holiday' column.
# The column is named via `x=` rather than passed positionally: in recent
# seaborn releases the only positional parameter of countplot is `data`, so a
# bare Series would no longer be interpreted as the x variable.
sns.countplot(x='holiday', data=train_data)
plt.show()

In [None]:
# Bar chart of the number of rows per 'weather' code.
sns.catplot(x='weather',data=train_data,kind='count')
# NOTE(review): the legend that used to sit here (1 spring, 2 summer, 3 fall,
# 4 winter) describes the 'season' column, not 'weather'. Presumed weather
# coding per the competition data description - TODO confirm:
# 1-> clear / few clouds
# 2-> mist / cloudy
# 3-> light snow or light rain
# 4-> heavy rain, ice pellets, snow or fog

In [None]:
# Box plot of the listed columns in a single wide-form figure.
box_columns = [
    'datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
    'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count',
]
sns.catplot(data=train_data[box_columns], kind='box')
# Tilt the column labels so they do not overlap, then enlarge the figure.
plt.xticks(rotation=45)
fig = plt.gcf()
fig.set_size_inches(10, 10)

In [None]:
# Correlation matrix of the numeric training columns.
# Only numeric columns are correlated: the frame still holds the parsed
# 'datetime' column here, and DataFrame.corr() cannot cast it to float when
# numeric_only defaults to False.
cor_mat = train_data.select_dtypes(include='number').corr()
# Explicit boolean mask: True hides a cell. Hiding the strict upper triangle
# leaves the lower triangle and the diagonal visible. (Building the mask from
# the correlation values themselves would un-hide any cell whose correlation
# happens to be exactly 0.)
mask = np.triu(np.ones(cor_mat.shape, dtype=bool), k=1)
fig = plt.gcf()
fig.set_size_inches(30, 12)
sns.heatmap(data=cor_mat, mask=mask, square=True, annot=True, cbar=True)

In [None]:
# Number of training rows whose humidity reading is exactly 0.
# Uses the vectorised Series.sum() instead of Python's builtin sum(), which
# would iterate over the Series element by element.
(train_data['humidity'] == 0).sum()

In [None]:
# Report the dimensions of the training frame.
n_rows, n_cols = train_data.shape
print('Data has {} rows and {} columns'.format(n_rows, n_cols))

# Preprocessing the train and test data

In [None]:
# One-hot encode the 'weather' and 'season' columns of both frames.
# Each column is first cast to a categorical dtype whose levels are the union
# of the values seen in train and test. get_dummies then emits one indicator
# column per level in BOTH frames, so a level that is absent from one frame
# can no longer produce mismatched feature columns between train and test.
dummy_cols = ['weather', 'season']
shared_dtypes = {
    col: pd.CategoricalDtype(sorted(set(train_data[col]).union(test_data[col])))
    for col in dummy_cols
}
train_data = pd.get_dummies(data=train_data.astype(shared_dtypes), columns=dummy_cols, prefix=dummy_cols)
test_data = pd.get_dummies(data=test_data.astype(shared_dtypes), columns=dummy_cols, prefix=dummy_cols)

In [None]:
# Derive calendar features from the parsed 'datetime' column of both frames.
# The vectorised `.dt` accessor replaces the per-row list comprehensions, and
# a single loop over the two frames keeps train and test features in lock-step.
for frame in (train_data, test_data):
    frame['hour'] = frame['datetime'].dt.hour
    frame['year'] = frame['datetime'].dt.year
    frame['month'] = frame['datetime'].dt.month
    # NOTE: 'day' holds the day of the week (Monday=0 ... Sunday=6),
    # not the day of the month.
    frame['day'] = frame['datetime'].dt.weekday

In [None]:
# Recode the calendar year as 0 (2011) / 1 (2012) in both frames.
year_codes = {2011: 0, 2012: 1}
for frame in (train_data, test_data):
    frame['year'] = frame['year'].map(year_codes)

In [None]:
# Drop the raw 'datetime' column from the training frame only. The test frame
# keeps it because the submission file needs it; it is removed from the test
# features at prediction time instead.
# Rebinding (rather than inplace=True) together with errors='ignore' makes this
# cell safe to re-run: a second execution no longer raises KeyError.
train_data = train_data.drop(columns='datetime', errors='ignore')


In [None]:
# List the training columns after preprocessing.
train_data.columns

# Splitting the data into X and y

In [None]:
# Target is the total rental count; 'casual' and 'registered' are excluded
# from the features together with the target itself.
non_feature_cols = ['casual', 'registered', 'count']
y = train_data['count']
X = train_data.drop(columns=non_feature_cols)

In [None]:
# Hold out a validation set from the training data; random_state=1 fixes the
# shuffle so the split is the same on every run.
train_X,val_X,train_y,val_y=train_test_split(X,y,random_state=1)

In [None]:
# Compare three ensemble regressors (RandomForest, AdaBoost, Bagging) by
# validation RMSLE.
# Each estimator is seeded so the comparison table is reproducible across runs.
models=[
    RandomForestRegressor(random_state=1),
    AdaBoostRegressor(random_state=1),
    BaggingRegressor(random_state=1),
]
# Derive the display names from the estimators so the two lists cannot drift apart.
models_names=[type(model).__name__ for model in models]
rmsle=[]
for model in models:
    model.fit(train_X,train_y)
    y_pred=model.predict(val_X)
    # scikit-learn metrics take (y_true, y_pred) in that order.
    rmsle.append(np.sqrt(mean_squared_log_error(val_y,y_pred)))
dic={'Models': models_names,'Rmsle':rmsle}
model_df=pd.DataFrame(dic)
model_df

    


In [None]:
# Random forest with hand-picked hyper-parameters, scored by validation RMSLE.
# random_state makes the printed score reproducible.
# NOTE: despite the `_clf` suffix this is a regressor; the name is kept so any
# other cell that refers to it keeps working.
rf_clf=RandomForestRegressor(n_estimators=450,max_depth=20,n_jobs=-1,random_state=1)
rf_clf.fit(train_X,train_y)
preds=rf_clf.predict(val_X)
# scikit-learn metrics take (y_true, y_pred) in that order.
print(np.sqrt(mean_squared_log_error(val_y,preds)))

In [None]:
# The tuning brought no noticeable improvement, so refit the same random
# forest on the whole training set and predict the test set.
# random_state makes the submission reproducible.
my_model=RandomForestRegressor(n_estimators=450,max_depth=20,n_jobs=-1,random_state=1)
my_model.fit(X,y)

# Align the test features with the training columns (same set, same order).
# An indicator column present in X but absent from the test frame is filled
# with 0; a test-only column is dropped, since the model never saw it.
test_X=test_data.drop(columns='datetime').reindex(columns=X.columns,fill_value=0)
final_preds=my_model.predict(test_X)


In [None]:
# Pair each test timestamp with its predicted count and write the submission
# file without the index column.
submission_columns = {
    'datetime': test_data['datetime'],
    'count': final_preds,
}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)