In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load Datasets

In [None]:
# Locations of the competition CSV files inside the Kaggle input directory.
train_set_path = '/kaggle/input/bike-sharing-demand/train.csv'
test_set_path = '/kaggle/input/bike-sharing-demand/test.csv'

# Read both files, train first and then test.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_set_path, test_set_path)
)

In [None]:
# Preview the first rows of the training data.
train_df.head()

In [None]:
# Preview the first rows of the test data.
test_df.head()

# EDA

In [None]:
# Summary statistics (count, mean, std, quantiles) of the training columns.
train_df.describe()

In [None]:
# Histogram of the 'count' column, which is the column the model is trained to predict.
train_df['count'].hist()

In [None]:
# Pairwise plots of the raw columns together with the 'count' column.
pairplot_columns = [
    'weather', 'workingday', 'holiday', 'season',
    'temp', 'atemp', 'humidity', 'windspeed', 'count',
]
sns.pairplot(train_df[pairplot_columns])

In [None]:
# Distribution of the raw 'count' values (histogram with a KDE overlay).
# sns.distplot is deprecated; histplot with kde=True, density scaling and
# cut=3 is the equivalent call in seaborn's current API.
sns.histplot(train_df['count'], kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# Train on log(count + 1) rather than the raw count.
# np.log1p(x) is the dedicated, numerically stable form of np.log(x + 1).
y_train = np.log1p(train_df['count'])

In [None]:
# Distribution of the log-transformed target (histogram with a KDE overlay).
# sns.distplot is deprecated; histplot with kde=True, density scaling and
# cut=3 is the equivalent call in seaborn's current API.
sns.histplot(y_train, kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# Column dtypes and non-null counts of the training frame.
train_df.info()

# Feature Engineering

In [None]:
def create_date_columns(df, col_name):
    """Parse a timestamp column and add calendar features derived from it.

    The parsed timestamps are stored in ``df['datetime']`` and the columns
    ``hour``, ``day``, ``month``, ``year`` and ``day_number`` are added.
    ``df`` is modified in place and is also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the timestamp column.
    col_name : str
        Name of the column holding the timestamps (strings or datetimes).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the columns listed above added.
    """
    # Derive every feature from the parsed values rather than from
    # df[col_name]: when col_name is not 'datetime' that column still holds
    # the raw strings, which have no .hour / .day / ... attributes.
    parsed = pd.to_datetime(df[col_name])
    df['datetime'] = parsed
    # Vectorised .dt accessors replace the per-row .apply(lambda ...) calls.
    df['hour'] = parsed.dt.hour
    df['day'] = parsed.dt.day
    df['month'] = parsed.dt.month
    df['year'] = parsed.dt.year
    df['day_number'] = parsed.dt.weekday  # Monday=0 ... Sunday=6
    return df

In [None]:
# Add hour / day / month / year / day_number columns derived from 'datetime'.
train_df = create_date_columns(train_df, 'datetime')

In [None]:
# Preview the training frame again to check the newly added date columns.
train_df.head()

In [None]:
# Bar plot of 'count' by day of week (day_number: Monday=0 ... Sunday=6).
sns.barplot(x='day_number', y='count', data=train_df)

In [None]:
# Bar plot of 'count' by hour of day.
sns.barplot(x='hour', y='count', data=train_df)

In [None]:
# Bar plot of 'count' by calendar month.
sns.barplot(x='month', y='count', data=train_df)

In [None]:
# Box plot showing the spread of 'count' within each hour of the day.
sns.boxplot(data=train_df, x='hour', y='count')

In [None]:
# Total rentals for each hour of the day.
# Select 'count' before aggregating: summing every column would also try to
# sum the parsed 'datetime' column, which pandas 2.x rejects with a TypeError.
count_per_hour = train_df.groupby('hour')['count'].sum()
count_per_hour

In [None]:
# Line plot of the per-hour totals held in count_per_hour.
sns.relplot(data = count_per_hour, kind='line')

In [None]:
# Columns fed to the model: the raw columns plus two date-derived features.
selected_features = [
    'season',
    'holiday',
    'workingday',
    'weather',
    'temp',
    'atemp',
    'windspeed',
    'humidity',
    'hour',
    'day_number',
]

# Pipeline

In [None]:
def pipline(df, selected_features):
    """Build the model input frame from a raw data frame.

    Runs ``create_date_columns`` on the 'datetime' column of ``df`` (which
    adds the date-derived columns to ``df`` itself) and returns only the
    columns named in ``selected_features``.
    """
    with_dates = create_date_columns(df, 'datetime')
    feature_frame = with_dates[selected_features]
    return feature_frame

In [None]:
# Feature matrix for training: the selected feature columns of train_df.
X_train = pipline(train_df, selected_features)

In [None]:
# Preview the first rows of the feature matrix.
X_train.head()

# Split Dataset for training

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. A fixed random_state makes the
# split, and therefore the scores computed from it, repeatable across runs.
# NOTE: this overwrites X_train / y_train with the training portion, so
# re-running this cell on its own would split the already-reduced data again.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Evaluation Metric

In [None]:
def rmsle(real, pred):
    """Root mean squared logarithmic error between observed and predicted values.

    Computes ``sqrt(mean((log(pred + 1) - log(real + 1)) ** 2))``.

    Parameters
    ----------
    real : array-like
        Observed values.
    pred : array-like
        Predicted values, compared with ``real`` element by element.

    Returns
    -------
    float
        The RMSLE; 0.0 when both inputs are identical.
    """
    # asarray lets plain lists/tuples through (list + 1 raises TypeError) and
    # makes pandas inputs compare by position rather than by index label.
    real = np.asarray(real, dtype=float)
    pred = np.asarray(pred, dtype=float)
    # log1p(x) == log(x + 1), but stays accurate for x close to zero.
    return np.sqrt(np.mean((np.log1p(pred) - np.log1p(real)) ** 2))

# Model selection

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Baseline: random forest with default hyperparameters. random_state is fixed
# so the fitted forest is the same on every run.
model = RandomForestRegressor(random_state=42)

In [None]:
# Fit the model on the training portion of the split.
model.fit(X_train, y_train)

In [None]:
# Predictions for the training and validation splits, in that order.
y_hat_train, y_hat_valid = (
    model.predict(features) for features in (X_train, X_valid)
)

In [None]:
# Number of predictions for the training and validation splits, one per line.
print(len(y_hat_train), len(y_hat_valid), sep='\n')

In [None]:
# Undo the log(count + 1) transform on targets and predictions, then score
# each split with rmsle.
train_score = rmsle(np.exp(y_train) - 1, np.exp(y_hat_train) - 1)
validation_score = rmsle(np.exp(y_valid) - 1, np.exp(y_hat_valid) - 1)
print(f"train_score: {train_score}")
print(f"validation_score: {validation_score}")

In [None]:
# Larger forest (500 trees) with tree depth capped at 15. random_state is
# fixed so the fitted forest, and its scores, are repeatable across runs.
model = RandomForestRegressor(n_estimators=500, max_depth=15, random_state=42)
model.fit(X_train, y_train)

In [None]:
# Predictions of the refitted model for the training and validation splits.
y_hat_train, y_hat_valid = (
    model.predict(features) for features in (X_train, X_valid)
)

In [None]:
# Undo the log(count + 1) transform on targets and predictions, then score
# each split with rmsle.
train_score = rmsle(np.exp(y_train) - 1, np.exp(y_hat_train) - 1)
validation_score = rmsle(np.exp(y_valid) - 1, np.exp(y_hat_valid) - 1)
print(f"train_score: {train_score}")
print(f"validation_score: {validation_score}")

# Best Fit model

In [None]:
# Refit the 500-tree, depth-15 configuration on the full training set
# (no validation hold-out). random_state is fixed so the submission is
# reproducible from a fresh run.
best_model = RandomForestRegressor(n_estimators=500, max_depth=15, random_state=42)
X_train = pipline(train_df, selected_features)
y_train = np.log(train_df['count'] + 1)  # log(count + 1) target transform
best_model.fit(X_train, y_train)

In [None]:
# Feature matrix for the test set, built with the same pipeline as training.
X_test = pipline(test_df, selected_features)

In [None]:
# Test-set predictions; best_model was fit on log(count + 1), so these are on that scale.
y_hat = best_model.predict(X_test)

# Submit the answer

In [None]:
# Map predictions back from log(count + 1) to counts and store them on test_df.
test_df['count'] = np.exp(y_hat) - 1

# The submission file holds only the timestamp and the predicted count.
submission_columns = ['datetime', 'count']
final_df = test_df.loc[:, submission_columns].copy()
final_df.to_csv('submission.csv', index=False)

# END