In [1]:
# importing libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Read the training CSV into a DataFrame and preview its first rows.
train_csv_path = "/kaggle/input/seoul-bike-rental-ai-pro-iti/train.csv"
train_data = pd.read_csv(train_csv_path)
train_data.head()


In [3]:
# Show the dtype pandas inferred for each column of the training frame.
train_data.dtypes

In [4]:
# Report the (rows, columns) shape of the training frame.
shape_message = "The shape of the dataset is {}.\n\n"
print(shape_message.format(train_data.shape))

In [5]:
# Shuffle the rows (frac=1 samples every row, in random order) and rebuild a
# clean 0..n-1 index. The fixed random_state makes the shuffle identical on
# every Restart & Run All, so downstream results are reproducible.
train_data = train_data.sample(frac=1, random_state=42).reset_index(drop=True)
train_data.head()

In [6]:
# Summary statistics, transposed so each row describes one column.
summary_stats = train_data.describe()
summary_stats.transpose()

In [7]:
# Number of missing (NaN) values in each column of the DataFrame
missing_mask = train_data.isnull()
missing_mask.sum()


In [8]:
# Number of rows that are exact duplicates of an earlier row
duplicate_row_count = train_data.duplicated().sum()
print(duplicate_row_count)

# Correlations between variables

In [9]:
# Pearson correlation heatmap.
# The frame still contains text columns at this point, and recent pandas
# versions raise instead of silently skipping them in corr(), so the
# correlation is computed on the numeric columns only.
plt.figure(figsize=(30, 30))
res = sns.heatmap(
    train_data.select_dtypes(include="number").corr("pearson"),
    vmin=-1,
    vmax=1,
    cmap='PiYG',
    annot=True,
    annot_kws={"fontsize": 25},
    square=True,
)
# Enlarge the tick labels so they stay readable on the 30x30 inch figure.
res.set_xticklabels(res.get_xmajorticklabels(), fontsize=20)
res.set_yticklabels(res.get_ymajorticklabels(), fontsize=20);

Temperature and Dew point temperature are very highly correlated, so we can drop the Dew point temperature column.

In [10]:
# Correlation of every numeric column with the target `y`, strongest first.
# Only numeric columns are used because corr() cannot handle the text columns
# still present in the frame, and only the `y` column of the matrix is needed.
df1Corr = (
    train_data.select_dtypes(include="number")
    .corr()["y"]
    .sort_values(ascending=False)
    .rename("Correlation to the target")
    .to_frame()
)
# Shade the cells from light to red according to the correlation value.
df1Corr.style.background_gradient(cmap=sns.light_palette("red", as_cmap=True))

In [11]:
# Label each row 'Night' when Hour is after 20 or before 5, otherwise 'Day'.
hour_of_day = train_data['Hour']
is_night = (hour_of_day > 20) | (hour_of_day < 5)
train_data['label_day_night'] = is_night.map({True: 'Night', False: 'Day'})
train_data

In [12]:
# Parse the day/month/year strings in 'Date' into datetime values.
parsed_dates = pd.to_datetime(train_data['Date'], format="%d/%m/%Y")
train_data['Date'] = parsed_dates

In [13]:
# Derive the weekday name and the month number from the parsed 'Date' column.
date_parts = train_data['Date'].dt
train_data['WeekDay'] = date_parts.day_name()
train_data['Month'] = date_parts.month
train_data

In [14]:
# Display the 'Date' column to check the conversion.
train_data.loc[:, "Date"]

In [15]:
# Rentals over time. The rows were shuffled earlier in the notebook, so they
# are put back in chronological order first; otherwise the line would jump
# back and forth between random dates.
train_data.sort_values(['Date', 'Hour']).plot(x='Date', y='y', kind='line')


In [16]:
# Compare rentals between day and night. A line through every individual row
# against a two-valued label carries no information, so the mean of `y` per
# label is drawn as a bar chart instead.
train_data.groupby('label_day_night')['y'].mean().plot(
    kind='bar', title='Mean y by label_day_night'
)

In [17]:
# Share of total rentals by day/night label.
# `y` is selected before summing so that only the target is aggregated;
# summing every column first would also try to add up the datetime 'Date'
# and the text columns, which is wasteful and can raise in recent pandas.
train_data.groupby('label_day_night')['y'].sum().plot.pie()


In [18]:
# Share of total rentals by weekday.
# `y` is selected before summing so that only the target is aggregated;
# summing every column first would also try to add up the datetime 'Date'
# and the text columns, which is wasteful and can raise in recent pandas.
train_data.groupby('WeekDay')['y'].sum().plot.pie()

In [19]:
# Total rentals per season, largest first, as a one-column DataFrame.
# `y` is selected before summing so the aggregation never touches the
# datetime 'Date' column or the text columns.
df1Seasons = (
    train_data.groupby('Seasons')['y']
    .sum()
    .sort_values(ascending=False)
    .to_frame()
)
df1Seasons

In [20]:
# How many rows carry each day/night label.
day_night_labels = train_data["label_day_night"]
day_night_labels.value_counts()


In [21]:
# How many rows fall on each weekday.
weekday_names = train_data["WeekDay"]
weekday_names.value_counts()

In [22]:
# Encoding
# Map each categorical text column to integer codes. The encoded column is
# assigned back to the frame: calling replace(..., inplace=True) on a column
# selection is chained in-place mutation, which pandas warns about and which
# leaves the frame untouched once Copy-on-Write is active.
category_codes = {
    'Holiday': {"Holiday": 0, "No Holiday": 1},
    'label_day_night': {"Night": 0, "Day": 1},
    'WeekDay': {"Saturday": 1, "Sunday": 2, "Monday": 3, "Tuesday": 4, "Wednesday": 5, "Thursday": 6, "Friday": 7},
    'Functioning Day': {"No": 0, "Yes": 1},
    'Seasons': {"Autumn": 1, "Spring": 2, "Summer": 3, "Winter": 4},
}
for column, codes in category_codes.items():
    # Values missing from `codes` (for example integers left by a previous run
    # of this cell) pass through unchanged, so re-running the cell is harmless.
    train_data[column] = train_data[column].map(
        lambda value, codes=codes: codes.get(value, value)
    )

train_data


In [23]:
# Columns fed to the model; the target is the 'y' column.
features_columns = [
    'Hour',
    'Temperature(�C)',
    'Humidity(%)',
    'Wind speed (m/s)',
    'Visibility (10m)',
    'Solar Radiation (MJ/m2)',
    'Rainfall(mm)',
    'Snowfall (cm)',
    'Seasons',
    'Holiday',
    'Functioning Day',
    'label_day_night',
    'WeekDay',
    "Month",
]
y = train_data['y']
X = train_data.loc[:, features_columns]
X

In [24]:
# Data splitting: 80% of the rows for training, 20% held out for validation.
# The fixed random_state makes the split repeatable.
from sklearn.model_selection import train_test_split
split_parts = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=42)
train_X, val_X, train_y, val_y = split_parts

# Specify the model

In [25]:
from sklearn.ensemble import ExtraTreesRegressor
# Specify the model. Extra-trees draws random split thresholds, so a fixed
# random_state is needed for the fitted model (and every metric computed from
# it) to be the same on each run.
model = ExtraTreesRegressor(random_state=42)
model.fit(train_X, train_y)

In [26]:
# Run the fitted model on every validation row and show the predictions
val_predictions = model.predict(X=val_X)
val_predictions


In [27]:
# For a regressor, score() returns the coefficient of determination (R^2),
# not an accuracy, and the fitted model is an ExtraTreesRegressor.
print("The R^2 score of the ExtraTreesRegressor on the validation set is ", (model.score(val_X, val_y)))

In [28]:
from sklearn.metrics import mean_absolute_error
# Mean absolute error between the validation targets and the predictions.
val_mae = mean_absolute_error(y_true=val_y, y_pred=val_predictions)
val_mae

In [29]:
from sklearn.metrics import mean_squared_log_error
# Root mean squared logarithmic error: square root of the MSLE on the validation set.
msle = mean_squared_log_error(y_true=val_y, y_pred=val_predictions)
err = np.sqrt(msle)
print(err)