# Let's start by importing the essential libraries.

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.plotting import register_matplotlib_converters
from pylab import rcParams
sns.set_style("darkgrid")

import warnings
warnings.filterwarnings("ignore")

# Now, let's dive into our data.

In [None]:
# Load the London bike-sharing records from the Kaggle input directory.
df = pd.read_csv(filepath_or_buffer="../input/london-bike-sharing-dataset/london_merged.csv")

In [None]:
# Peek at the first five rows to get a feel for the columns.
df.head(n=5)

## See if there are any duplicated or NaN values

In [None]:
# Tally rows that repeat an earlier row (True) versus first occurrences (False).
df.duplicated(keep="first").value_counts()

In [None]:
# Count missing entries per column (isna is the canonical spelling of isnull).
df.isna().sum()

In [None]:
# Not a single missing value! PERFECT!

## Now, let's plot the distribution of the discrete features: season, is_holiday, is_weekend and weather_code.

In [None]:
# Number of rows recorded for each season value.
sns.countplot(data=df, x="season")

### The season column looks evenly distributed. Let's check the value counts of this column for a clearer picture.

In [None]:
# Exact row count behind each bar of the season plot.
df["season"].value_counts()

### The values are almost equal, as expected.

### Now let's check *`is_holiday`* column.

In [None]:
# Number of rows for each is_holiday value.
sns.countplot(data=df, x="is_holiday")

### As expected, the vast majority of rows are not holidays. The *`is_weekend`* column is most likely skewed in the same way. Let's check.

In [None]:
# Number of rows for each is_weekend value.
sns.countplot(data=df, x="is_weekend")

### Now, look at *`weather_code`* column.

In [None]:
# Number of rows for each weather_code value.
sns.countplot(data=df, x="weather_code")

### Let's convert the `timestamp` column to `datetime` type and set it as the index.

In [None]:
# Convert the timestamp column to datetime values.
df["timestamp"] = pd.to_datetime(df.timestamp)

In [None]:
# Move the timestamp column into the index so the frame is time-indexed.
df = df.set_index(keys="timestamp")

In [None]:
# Confirm that the timestamp is now the index.
df.head(n=5)

### Now it is time for some feature engineering. Let's extract new columns (year-month, year, month, day of the month, day of the week and hour) from the new index.

In [None]:
# Derive calendar features from the datetime index. Only the "YYYY-MM" label
# needs strftime(); the other parts are plain attributes of the index.
df["year_month"] = df.index.strftime("%Y-%m")

calendar_parts = {
    "year": "year",
    "month": "month",
    "day_of_month": "day",
    "day_of_week": "weekday",
    "hour": "hour",
}
for column, index_attribute in calendar_parts.items():
    df[column] = getattr(df.index, index_attribute)
df.head()

### Everything seems perfect. Now, let's visualize the correlation with a heatmap.

In [None]:
# Correlation heatmap over the numeric columns only. The string-valued
# year_month column cannot be correlated, and pandas >= 2.0 raises on it
# instead of silently skipping it, so restrict to numeric dtypes explicitly.
plt.figure(figsize=(12, 8), dpi=150)
sns.heatmap(df.select_dtypes(include="number").corr(), annot=True)

### For better understanding, let's see the correlation between our target variable which is *`cnt`* and the others.

In [None]:
# Correlation of every numeric feature with the target column cnt, strongest
# positive first. Non-numeric columns (year_month) are excluded up front
# because pandas >= 2.0 raises on them, and the target's self-correlation is
# dropped by label rather than by position so the result does not depend on
# cnt happening to sort first.
plt.figure(figsize=(2, 4), dpi=150)
sns.heatmap(
    df.select_dtypes(include="number")
    .corr()[["cnt"]]
    .drop(index="cnt")
    .sort_values(by="cnt", ascending=False),
    annot=True,
)

#### The count of new bike shares (*`cnt`*) has a positive correlation with the *`t1`*, *`t2`* and *`hour`* columns. The *`hum`* column, which gives the humidity in percent, has a fairly strong negative correlation with *`cnt`*.

### For a clearer picture, let's visualize the correlation between the target variable and the other features with a bar plot.

In [None]:
# Horizontal bar chart of each numeric feature's correlation with cnt.
# - Only numeric columns are correlated (pandas >= 2.0 raises on the string
#   year_month column).
# - cnt's correlation with itself is dropped, as in the heatmap cell.
# - DataFrame.plot opens a new figure unless it is handed an Axes, which would
#   leave the sized figure below empty; passing ax= draws into it instead.
plt.figure(figsize=(12, 8), dpi=150)
(
    df.select_dtypes(include="number")
    .corr()[["cnt"]]
    .drop(index="cnt")
    .sort_values(by="cnt")
    .plot(kind="barh", legend=False, ax=plt.gca())
);

### Now it is time to plot bike shares over time with lineplot.

In [None]:
# Bike-share count along the full time index.
plt.figure(dpi=100, figsize=(12, 4))
sns.lineplot(data=df, x=df.index, y="cnt")

In [None]:
# There are days with an unusually high count of new bike shares. Let's find out which days they are.

In [None]:
# Rows whose bike-share count exceeds 7000.
df.loc[df["cnt"] > 7000]

In [None]:
# On 2015-07-09 and 2015-08-06 the count of new bike shares spikes. There must be something special about those days.
# This is a great example of gaining insights through visualization.

### It is time to plot bike shares by month and by year_month to understand the relationship between bike shares and months.

In [None]:
# Column totals per "YYYY-MM" label, with the label kept as a regular column.
year_month = df.groupby("year_month", as_index=False).sum()

In [None]:
# Monthly total of bike shares; the labels are rotated so they do not overlap.
plt.figure(dpi=150, figsize=(16, 4))
sns.lineplot(data=year_month, x="year_month", y="cnt")
plt.xticks(rotation=90);

In [None]:
# As expected, bike shares increase in summer. Let's see this relationship better with a different plot.

In [None]:
# Point estimate of cnt for each calendar month.
plt.figure(dpi=150, figsize=(12, 4))
sns.pointplot(data=df, x="month", y="cnt");

In [None]:
# Same month-by-month comparison, drawn as bars.
plt.figure(dpi=100, figsize=(8, 4))
sns.barplot(data=df, x="month", y="cnt");

In [None]:
# In those two plots, we can clearly see how bike shares differ by month. Bike shares tend to increase in summer.

### What about the relationship between bike shares and hours? It would be great to see the difference when it is a holiday too, right? Let's plot bike shares by hour.

In [None]:
# Hourly profile of cnt, one line per is_holiday value.
plt.figure(dpi=100, figsize=(12, 4))
sns.lineplot(x="hour", y="cnt", hue="is_holiday", data=df)

In [None]:
# We can clearly see that when it is not a holiday, bike shares tend to increase at 8AM and 7PM.
# This suggests that people use bikes when going to work.

In [None]:
# Hourly profile of cnt, one line per season value.
plt.figure(dpi=100, figsize=(12, 4))
sns.lineplot(x="hour", y="cnt", hue="season", data=df)

In [None]:
# The plot split by season also confirms our conclusion. People tend to use bikes more when it is spring.

### Now let's see all these plots in a single figure.

In [None]:
# Four stacked hourly point plots: overall, then split by holiday flag,
# weekend flag and season. hue=None draws the unsplit overall profile.
fig, axs = plt.subplots(nrows=4, ncols=1, figsize=(12, 10), dpi=100)
for panel, split_by in zip(axs, (None, "is_holiday", "is_weekend", "season")):
    sns.pointplot(data=df, x="hour", y="cnt", hue=split_by, ax=panel)
plt.tight_layout()

### Let's plot bike shares by day of week to understand better.

In [None]:
# cnt per day of the week, coloured by the is_weekend flag.
plt.figure(dpi=100, figsize=(6, 4))
sns.barplot(data=df, x="day_of_week", y="cnt", hue="is_weekend")

In [None]:
# People use bikes on weekdays more than on weekends.

### Let's see the difference by seasons.

In [None]:
# Day-of-week point plots: overall on top, split by season underneath.
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(12, 6), dpi=100)
for panel, split_by in zip(axs, (None, "season")):
    sns.pointplot(data=df, x="day_of_week", y="cnt", hue=split_by, ax=panel)
plt.tight_layout()

### Plot bike shares by day of month

In [None]:
# Mean bike-share count for each day of the month, truncated to whole numbers.
# cnt is selected before aggregating: averaging every column would hit the
# string-valued year_month column, which pandas >= 2.0 refuses to average,
# and only cnt is needed anyway.
day_of_month = df.groupby("day_of_month")[["cnt"]].mean().astype("int")

In [None]:
# Average cnt against the day of the month.
plt.figure(dpi=150, figsize=(16, 4))
sns.lineplot(data=day_of_month, x=day_of_month.index, y=day_of_month["cnt"])

### It is time to plot bike shares by year and by seasons.

In [None]:
# cnt compared across calendar years.
plt.figure(dpi=150)
sns.barplot(data=df, x="year", y="cnt")

In [None]:
# It looks as if bike shares dropped heavily in 2017, but that is not true: our data does not cover
# the whole of 2017, so this plot may mislead us.

In [None]:
# cnt compared across seasons.
plt.figure(dpi=150)
sns.barplot(data=df, x="season", y="cnt")

In [None]:
# We can clearly see from this plot that people use bikes the most in summer.

### Now, let's visualize the distribution of bike shares by weekday/weekend with a pie chart and a bar plot.

In [None]:
# Share of rows flagged as weekend versus weekday, as a pie chart.
plt.figure(dpi=150)
df["is_weekend"].value_counts().plot.pie()

In [None]:
# Row count per is_weekend value, with each bar labelled
# "%<share of all rows> - <row count>".
plt.figure(figsize=(8, 4))
graph = sns.countplot(x="is_weekend", data=df)

total_rows = len(df)  # loop-invariant denominator for the percentage
for p in graph.patches:
    height = p.get_height()
    graph.annotate(
        f"%{round(height / total_rows * 100, 1)} - {height}",
        # Centre the label on the bar using its actual width instead of a
        # hard-coded 0.4 offset, so it stays centred for any bar width.
        (p.get_x() + p.get_width() / 2, height),
        ha="center",
        va="bottom",
        color="black",
        fontsize=12,
    )

In [None]:
# People use bikes on weekdays more than on weekends. Earlier we saw that bike usage increases at 7AM and also at 5PM.
# This indicates that people use bikes when going to work and also when coming back home.

### Plotting the distribution of weather code by seasons

In [None]:
# Row count for each weather_code value across the whole data set.
plt.figure(figsize=(8, 4))
sns.countplot(x="weather_code", data=df)

In [None]:
# The same weather_code counts, one panel per season value.
sns.catplot(kind="count", data=df, x="weather_code", col="season")

### Visualize all the continuous variables with histograms and scatter plots

In [None]:
# 2x2 grid of 10-bin histograms, one per continuous column.
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(12, 8), dpi=200)

# (column, panel title, bar colour); None keeps seaborn's default colour.
histogram_panels = (
    ("t1", "t1", "orange"),
    ("t2", "t2", None),
    ("wind_speed", "Wind Speed", "green"),
    ("hum", "Humidity", "black"),
)
# ax.flat walks the grid row by row, matching the order of the table above.
for panel, (column, title, colour) in zip(ax.flat, histogram_panels):
    sns.histplot(x=df[column], ax=panel, bins=10, color=colour)
    panel.set_title(title)
    panel.set_xlabel("")  # the title already names the variable

plt.tight_layout()

In [None]:
# t1 against humidity (top) and against wind speed (bottom); points are
# coloured by their season value.
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(20, 10), dpi=200)
for panel, y_column in zip(ax, ("hum", "wind_speed")):
    panel.scatter(x=df["t1"], y=df[y_column], c=df["season"])
plt.tight_layout();