# Let's start with importing essential libraries.

In [None]:
import numpy as np
import pandas as pd

import plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import warnings
warnings.filterwarnings("ignore")

# Now, let's dive into our data.

In [None]:
# Load the merged London bike-sharing CSV into the working DataFrame.
csv_path = "../input/london-bike-sharing-dataset/london_merged.csv"
df = pd.read_csv(csv_path)

In [None]:
# Peek at the first five rows of the raw data.
df.head(n=5)

## See if there are any duplicated or NaN values

In [None]:
# Tally fully duplicated rows (True) against unique rows (False);
# the first occurrence of each row is not flagged.
df.duplicated(keep="first").value_counts()

In [None]:
# Number of missing values in each column.
df.isna().sum()

- Not a single missing value! PERFECT!

## Now, Let's plot the distribution of various discrete features such as season, holiday, weekend and weathercode.

In [None]:
# Bar chart of how many records fall in each season code.
season_counts = df['season'].value_counts()

fig = px.bar(
    x=season_counts.index,
    y=season_counts.values,
    title='Seasons',
    labels={'y': 'Count', 'x': 'Seasons'},
)
fig.update_layout(xaxis={'categoryorder': 'total descending'})
fig.show()

### It seems the season column is distributed evenly. Let's check the value counts of this column for clearer info.

In [None]:
# Exact record count per season code.
df["season"].value_counts()

- Values almost equal as expected.

### Now let's check *`is_holiday`* column.

In [None]:
# Distribution of the `is_holiday` flag: count how many records fall on
# holidays versus normal days. (This cell previously grouped by `is_weekend`,
# which did not match the section it sits in.)
holiday = (
    df['is_holiday']
    .replace({0: 'Normal Day', 1: 'Holiday'})
    .value_counts()
    .rename_axis('Holiday')
    .reset_index(name='Count')
)

fig = px.bar(holiday, x='Holiday', y='Count', color='Holiday', title='Holidays')
fig.update_layout(xaxis={'categoryorder': 'total descending'})
fig.show()

- As expected highly 'not a holiday' distribution. Most likely *`is_weekend`* column is also in the same situation. Let's check.

### Now, look at *`weather_code`* column.

In [None]:
# Share of records per weather condition.
# Map each numeric code to its label BEFORE counting: value_counts() orders
# by frequency, so pairing its values with a fixed, code-ordered label list
# would mislabel slices. The original list also lacked a comma between
# 'Cloudy' and 'Rain', silently merging them into one label.
# NOTE(review): code -> label mapping follows the dataset description
# (1, 2, 3, 4, 7, 10, 26, 94) -- confirm against the data source.
weather_labels = {
    1: 'Clear',
    2: 'Scattered Clouds',
    3: 'Broken Clouds',
    4: 'Cloudy',
    7: 'Rain',
    10: 'Rain With Thunderstorm',
    26: 'Snowfall',
    94: 'Freezing Fog',
}
# Cast to int so float-typed codes match the integer keys; the null check
# earlier in the notebook shows there are no missing values to break the cast.
weather_counts = df['weather_code'].astype(int).map(weather_labels).value_counts()

fig = px.pie(values=weather_counts.values, names=weather_counts.index)
fig.show()

### Let's transform `timestamp` column to `datetime` in type, and set it as index.

In [None]:
# Parse the timestamp strings into datetime values.
df = df.assign(timestamp=pd.to_datetime(df["timestamp"]))

In [None]:
# Use the parsed timestamp as the row index.
df = df.set_index(keys="timestamp")

In [None]:
# Confirm the timestamp is now the index.
df.head(n=5)

### Now it is time to do some feature engineering. Let's extract new columns (day of the week, day of the month, hour, month, season, year etc.) from the new index.

In [None]:
# Derive calendar features from the datetime index: a 'YYYY-MM' label via
# strftime(), plus year, month, day of month, weekday number and hour.
ts_index = df.index

df["year_month"] = ts_index.strftime('%Y-%m')
for column_name, index_attr in (
    ("year", "year"),
    ("month", "month"),
    ("day_of_month", "day"),
    ("day_of_week", "weekday"),
    ("hour", "hour"),
):
    df[column_name] = getattr(ts_index, index_attr)
df.head()

### Everything seems perfect. Now, let's visualize the correlation with a heatmap.

In [None]:
# Correlation heat map over the numeric columns only.
# `year_month` is a string column; on pandas >= 2.0 a bare df.corr() raises
# on it, so non-numeric columns are excluded explicitly.
corr_matrix = df.select_dtypes(include="number").corr()

fig = px.imshow(corr_matrix, title="Correlation Heat Map")
fig.show()

### For better understanding, let's see the correlation between our target variable which is *`cnt`* and the others.

In [None]:
# Single-column heat map: correlation of every numeric feature with `cnt`.
# Non-numeric columns (the string `year_month`) are dropped first because
# df.corr() raises on them with pandas >= 2.0.
cnt_corr = df.select_dtypes(include="number").corr()[["cnt"]]

fig = px.imshow(cnt_corr, title="Correlation Heat Map")
fig.show()

#### We can see that the count of new bike shares (*`cnt`*) is positively correlated with the *`t1`*, *`t2`* and *`hour`* columns. The *`hum`* column, which gives the humidity as a percentage, has a fairly strong negative correlation with *`cnt`*.

### For more clear understanding, let's visualize the correlation of the target variable and the other features with barplot

In [None]:
# Horizontal bar chart of each numeric feature's correlation with `cnt`.
# The correlation matrix is computed once, on numeric columns only: the
# string `year_month` column makes df.corr() raise on pandas >= 2.0.
cnt_corr = df.select_dtypes(include="number").corr()["cnt"]

fig = px.bar(y=cnt_corr.index, x=cnt_corr.values, title="Correlation (CNT)",
             labels={"y": "Variables", "x": "Values"})
fig.update_layout(yaxis={'categoryorder': 'total descending'})
fig.show()

### Now it is time to plot bike shares over time with lineplot.

In [None]:
# Line plot of the raw `cnt` series against the timestamp index.
fig = go.Figure()
fig.add_trace(go.Scatter(x=df.index, y=df["cnt"]))
fig.update_layout(
    title="Bike Shares Over Time",
    xaxis_title="Date",
    yaxis_title="Count of Bike Shares",
)
fig.show()

- There are days with an unusually high count of new bike shares. Let's find out which days they are.

In [None]:
# Rows where the share count exceeds 7000.
df.loc[df["cnt"] > 7000]

In [None]:
# On 2015-07-09 and 2015-08-06 the count of new bike shares spikes. There must be something special about those days.
# This is a great example of gaining insights through visualization.

### It is time to plot bike shares by month and by year_month to understand the relationship between bike shares and months.

In [None]:
# Sum every column per 'YYYY-MM' bucket, keeping the bucket as a regular column.
year_month = df.groupby("year_month", as_index=False).sum()

In [None]:
# Line plot of total shares for each year-month bucket.
fig = go.Figure()
fig.add_trace(go.Scatter(x=year_month["year_month"], y=year_month["cnt"]))
fig.update_layout(
    title="Bike Shares by Month",
    xaxis_title="Date",
    yaxis_title="Count of Bike Shares",
)
fig.show()

In [None]:
# As expected, in summer bike shares is increasing. Let's see this relation better by different plot.

In [None]:
# Average share count per calendar month, as a line chart.
# Only `cnt` is aggregated: taking the mean of every column fails on the
# string `year_month` column with pandas >= 2.0, and the original recomputed
# the same aggregate three times.
monthly_mean = df.groupby("month")["cnt"].mean().reset_index()
px.line(monthly_mean, x="month", y="cnt")

In [None]:
# Average share count per calendar month, as a bar chart.
# Only `cnt` is aggregated: taking the mean of every column fails on the
# string `year_month` column with pandas >= 2.0, and the original recomputed
# the same aggregate three times.
monthly_mean = df.groupby("month")["cnt"].mean().reset_index()
px.bar(monthly_mean, x="month", y="cnt")

In [None]:
# In those two plots, we can clearly see the bike share difference by months. Bike share leans to increase in summer.

### What about correlation between bike shares and hours? It would be great to see the difference when it is a holiday too right! Let's plot  bike shares by hours.

In [None]:
# Average hourly share count, one line for normal days and one for holidays.
fig = go.Figure()

for holiday_flag, trace_name in ((0, "Not Holiday"), (1, "Holiday")):
    # Aggregate only `cnt`: a mean over all columns fails on the string
    # `year_month` column with pandas >= 2.0. Computing it once per flag
    # also replaces the original's duplicated groupby per trace.
    hourly_mean = (
        df.loc[df["is_holiday"] == holiday_flag]
        .groupby("hour")["cnt"]
        .mean()
    )
    fig.add_trace(go.Scatter(x=hourly_mean.index, y=hourly_mean, name=trace_name))

fig.update_layout(title="Bike Shares in Holidays By Hour",
                  xaxis_title="Hour", yaxis_title="Count of Bike Shares")
fig.show()

In [None]:
# We can clearly see that when it is not a holiday, bike shares tend to peak around 8AM and 7PM.
# This suggests people use bikes to commute to work.

In [None]:
# The plot of differences by season also confirms our conclusion. People tend to use bikes more when it is spring.

### Let's plot bike shares by day of week to understand better.

In [None]:
# Total share count per weekday, shown as a bar chart.
# Only `cnt` is summed; summing every column would also concatenate the
# string `year_month` values for no benefit.
weekday_names = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

day_of_week = df.groupby("day_of_week")[["cnt"]].sum()
# Look names up by weekday number (0 = Monday, matching DatetimeIndex.weekday)
# instead of assigning a fixed 7-item list, so a missing weekday cannot
# shift the labels or raise a length mismatch.
day_of_week["days"] = [weekday_names[day_number] for day_number in day_of_week.index]
day_of_week = day_of_week.set_index("days")


fig = px.bar(x=day_of_week.index, y=day_of_week["cnt"],
             color=day_of_week.index, title="Bike Shares by Day",
             labels={"x": "Days", "y": "Count of Bike Shares"})
fig.show()

In [None]:
# People use bike in weekdays more than weekends.

### Let's see the difference by seasons.

In [None]:
# Average share count per weekday, one line per season.
# NOTE(review): season code -> name mapping (0 spring, 1 summer, 2 fall,
# 3 winter) is taken from the dataset description -- confirm.
season_names = {0: "Spring", 1: "Summer", 2: "Fall", 3: "Winter"}

fig = go.Figure()

for season_code, season_label in season_names.items():
    # Aggregate only `cnt`: a mean over all columns fails on the string
    # `year_month` column with pandas >= 2.0.
    weekday_mean = (
        df.loc[df["season"] == season_code]
        .groupby("day_of_week")["cnt"]
        .mean()
    )
    # Name each trace so the legend identifies the season.
    fig.add_trace(go.Scatter(x=weekday_mean.index, y=weekday_mean, name=season_label))


# The x-axis is day of week, so the title says so (it previously read "By Hour").
fig.update_layout(title="Bike Shares in Seasons By Day of Week",
                  xaxis_title="Day of Week", yaxis_title="Count of Bike Shares")
fig.show()

### Plot bike shares by day of month

In [None]:
# Average share count per day of the month, truncated to whole numbers.
# Select `cnt` before aggregating: a mean over all columns fails on the
# string `year_month` column with pandas >= 2.0.
day_of_month = df.groupby("day_of_month")[["cnt"]].mean().astype("int")

fig = px.line(x=day_of_month.index, y=day_of_month.cnt, title="Bike Shares by Day of Month",
              labels={"x": "Day of Month", "y": "Count of Bike Shares"})
fig.show()

### It is time to plot bike shares by year and by seasons.

In [None]:
# Average share count per calendar year, as a bar chart.
# Only `cnt` is aggregated, and only once: a mean over all columns fails on
# the string `year_month` column with pandas >= 2.0. The original's leading
# bare expression was discarded (not the cell's last statement), so it is dropped.
yearly_mean = df.groupby("year")["cnt"].mean()

fig = px.bar(x=yearly_mean.index, y=yearly_mean,
             title="Bike Share by Year",
             labels={"y": "Count of Bike Shares", "x": "Year"})
fig.show()

In [None]:
# It seems like bike shares dropped heavily in 2017, but that is not true: our data does not contain
# all information about 2017. This plot may mislead us.

In [None]:
# Histogram of `cnt` summed per season code, coloured by season.
fig = px.histogram(
    data_frame=df,
    x="season",
    y="cnt",
    color="season",
)
fig.show()

In [None]:
# We can clearly see from this plot that people use bike most in summer.

### Now, let's visualize the distribution of bike shares by weekday/weekend with barplot

In [None]:
# Average share count on weekdays versus weekends.
# (This cell previously grouped by `is_holiday`, which did not match the
# weekday/weekend section it sits in.)
weekend = (
    df.groupby('is_weekend')['cnt']
    .mean()
    .reset_index()
    .rename(columns={'is_weekend': 'Weekend', 'cnt': 'Number of Bike Shared'})
)
weekend['Weekend'] = weekend['Weekend'].replace({0: 'Weekday', 1: 'Weekend'})

fig = px.bar(weekend, x='Weekend', y='Number of Bike Shared', color='Weekend')
fig.update_layout(xaxis={'categoryorder': 'total descending'})
fig.show()

In [None]:
# People use bikes more on weekdays than on weekends. Earlier we saw that bike usage increases at 7AM and also at 5PM.
# This suggests that people use bikes when going to work and also when coming back home.

### Visualize the continuous variables with scatterplot

In [None]:
# Scatter of the `t1` column against the `hum` column, coloured by season code.
fig = px.scatter(
    x=df.t1,
    y=df.hum,
    color=df.season,
)
fig.show()

In [None]:
# Scatter of the `t1` column against `wind_speed`, coloured by season code.
fig = px.scatter(
    x=df.t1,
    y=df.wind_speed,
    color=df.season,
)
fig.show()

We have come to the end of another great analysis. It was really enjoyable, and it was a pleasure to work with this dataset. I would like to thank the dataset contributor for this data. I hope you enjoyed it too. If you liked my EDA on this dataset, feel free to check out my other notebooks as well. Looking forward to your feedback. Thanks a lot.

Have a great day.