In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
import warnings
from sklearn.model_selection import train_test_split


In [None]:
# Read the training CSV into `df`; the bare name on the last line renders the
# frame as this cell's output.
df = pd.read_csv(filepath_or_buffer="../input/bike-sharing-demand/train.csv")
df

* datetime - hourly date + timestamp  
* season -  1 = spring, 2 = summer, 3 = fall, 4 = winter 
* holiday - whether the day is considered a holiday
* workingday - whether the day is neither a weekend nor holiday

* weather - weather category:
    * 1: Clear, Few clouds, Partly cloudy
    * 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
    * 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
    * 4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog

* temp - temperature in Celsius
* atemp - "feels like" temperature in Celsius
* humidity - relative humidity
* windspeed - wind speed
* casual - number of non-registered user rentals initiated
* registered - number of registered user rentals initiated
* count - number of total rentals

In [None]:
# Plot the correlation heatmap of the numeric columns.
# `datetime` is still a string column at this point, so restrict the frame to
# numeric dtypes explicitly: pandas >= 2.0 no longer drops non-numeric columns
# silently inside DataFrame.corr() and raises instead.
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(df.select_dtypes(include="number").corr(), annot=True, ax=ax)
# atemp and temp are highly correlated --> .98

In [None]:
# temp and atemp are almost perfectly correlated (.98), so only atemp is kept.
df = df.drop(columns=["temp"])
df

In [None]:
# The holiday flag is dropped because the workingday column is considered to
# cover it already.
df = df.drop(columns="holiday")

In [None]:
# Box plot of the rental counts, used to eyeball outliers.
# Pass the Series by keyword: a bare positional vector is deprecated in seaborn
# and is interpreted differently across releases (as `x` in 0.11, as `data`
# from 0.12 on), which silently flips the plot orientation.
sns.boxplot(x=df["count"])

In [None]:
# Summary statistics of the frame, rendered as this cell's output.
df.describe()

In [None]:
# Calculate the upper and lower outlier fences for `count` using the
# 1.5 * IQR rule.
# The quartiles are read from the data instead of being typed in by hand (the
# original hard-coded Q1 = 40 and Q3 = 270), so the fences stay correct if the
# input data or an earlier cleaning step changes.
# NOTE: run this before the filtering cell; re-running it afterwards would
# recompute the quartiles on the already-trimmed frame.
q1, q3 = df["count"].quantile([0.25, 0.75])
iqr = q3 - q1
upper_limit = q3 + 1.5 * iqr
lower_limit = q1 - 1.5 * iqr
print(lower_limit, " ", upper_limit)

In [None]:
# Keep only the rows whose count lies inside [lower_limit, upper_limit]
# (both ends inclusive); everything else is treated as an outlier.
df = df[df["count"].between(lower_limit, upper_limit)]
df

In [None]:
# Count against "feels like" temperature: one column of panels per season and
# one row of panels per weather category.
# Observation: as temp increases the count also increases.
#
# season  --> 1 = spring, 2 = summer, 3 = fall, 4 = winter
# weather --> 1: Clear, Few clouds, Partly cloudy
#             2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
#             3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
#             4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog
sns.relplot(
    data=df,
    x="atemp",
    y="count",
    col="season",
    row="weather",
    palette="deep",
    kind="line",
)

In [None]:
# Count against humidity: one column of panels per weather category and one
# row of panels per season.
sns.relplot(
    data=df,
    x="humidity",
    y="count",
    col="weather",
    row="season",
    palette="deep",
    kind="line",
)

In [None]:
# Derive calendar features from the raw `datetime` strings, then drop the
# source column.
#
# * Parsing is done once, vectorised, instead of row-by-row with apply/strptime.
# * `assign` builds a new frame, so nothing is written into the filtered slice
#   produced by the outlier step (that pattern triggers SettingWithCopyWarning).
# * `date` stays a 'YYYY-MM-DD' string, as before.
# * `hour` is now an integer (0-23) rather than a zero-padded string, so it
#   takes part in numeric operations such as the correlation matrix and can be
#   looked up by its numeric value.
# * `weekday` follows the datetime convention: Monday = 0 ... Sunday = 6.
parsed = pd.to_datetime(df["datetime"])
df = df.assign(
    date=parsed.dt.strftime("%Y-%m-%d"),
    hour=parsed.dt.hour,
    weekday=parsed.dt.weekday,
    month=parsed.dt.month,
).drop(columns="datetime")
df

In [None]:
# Correlation heatmap after feature engineering.
# The frame now contains string columns (e.g. `date`), so restrict it to numeric
# dtypes explicitly: pandas >= 2.0 raises in DataFrame.corr() when it meets
# non-numeric columns instead of dropping them silently.
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(df.select_dtypes(include="number").corr(), annot=True, ax=ax)
# casual and registered are highly correlated to count

In [None]:
# count vs hour
# Reshape to long form so that `casual` and `registered` share a single
# `value` column (tagged by `variable`), then average per hour and user type.
fig = plt.figure(figsize=(15, 10))
hourTransformed = df[["hour", "casual", "registered"]].melt(
    id_vars=["hour"],
    value_vars=["casual", "registered"],
)
hourAggregated = (
    hourTransformed
    .groupby(["hour", "variable"], sort=True)["value"]
    .mean()
    .reset_index()
)
sns.lineplot(
    data=hourAggregated,
    x="hour",
    y="value",
    hue="variable",
    hue_order=["casual", "registered"],
)
# peak renter is completely contributed by registered users

In [None]:
# Bar-chart view of the mean rentals per hour and user type.
# The original passed `data=hourTransformed` while taking x / y / hue from
# `hourAggregated`; the plot is built from the aggregated frame, so pass that
# frame as `data` and refer to its columns by name.
fig, ax = plt.subplots(figsize=(15, 10))
sns.barplot(data=hourAggregated, x="hour", y="value", hue="variable", ax=ax)

* registered vs casual
* correlation plot
* seasonality plot

In [None]:
# Mean rentals per month (bar plot) plus the total rentals per month (printed).
fig = plt.figure(figsize=(10, 5))
g = sns.barplot(data=df, x="month", y="count")
# Select `count` before aggregating: summing the whole frame first also
# "sums" the string columns, which is wasted work at best.
print(df.groupby("month")["count"].sum())
# count increases till 6th and 7th month and then decreases gradually
# season 1 = 1: January, 2: February, 3: March      --> it has a lower count than the other 3 seasons
# season 2 = 4: April, 5: May, 6: June              --> count peaks at the end of the season
# season 3 = 7: July, 8: August, 9: September       --> count peaks at the start of season and after JULY it starts decreasing
# season 4 = 10: October, 11: November, 12: December

In [None]:
# Hourly rental profile, one line per season.
fig = plt.figure(figsize=(20, 10))
sns.lineplot(
    data=df,
    x="hour",
    y="count",
    hue="season",
    style="season",
    palette="deep",
    markers=True,
)
# For all four seasons the maximum number of bikes are rented 7 to 9 AM, peaking at 8 AM,
# and then again between 4 PM and 6 PM, peaking at 5 PM.
# The observations made in the plot above are confirmed here (season 1 has fewer renters than the other 3).

weather:

* 1: Clear, Few clouds, Partly cloudy
* 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
* 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
* 4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog

In [None]:
# Hourly rental profile, one line per weather category.
fig = plt.figure(figsize=(20, 10))
sns.lineplot(
    data=df,
    x="hour",
    y="count",
    hue="weather",
    style="weather",
    palette="deep",
    markers=True,
)
# people do not rent in extreme cold weather
# in weather 2 there are more renters in the morning
# in weather 1 there are more renters in the evening
# in weather 3 comparatively fewer people rent the bikes

In [None]:
# Empty frame that will receive one column of hourly mean counts per weather category.
weth = pd.DataFrame([])

In [None]:
# One column per weather category (1-4): the mean count for each hour,
# computed over the rows recorded under that category.
for weather_code in (1, 2, 3, 4):
    rows_for_weather = df[df["weather"] == weather_code]
    weth[f"Weather{weather_code}"] = rows_for_weather.groupby(["hour"])["count"].mean()
weth

In [None]:
# Average of the hourly means over the morning (7-9) and evening (16-18)
# windows, per weather category.
#
# The original indexed with bare integers (`weth[x][7]`). When the hour labels
# are strings that only works through pandas' positional fallback, which is
# deprecated. Select the rows by their hour *value* instead; casting the index
# to int handles both zero-padded string labels and integer labels.
X = ["Weather1","Weather2","Weather3","Weather4"]
hour_values = weth.index.astype(int)
morning_rows = hour_values.isin([7, 8, 9])
evening_rows = hour_values.isin([16, 17, 18])
for x in X:
    # skipna=False keeps the original behaviour: a window with a missing hour
    # is reported as NaN rather than being averaged over fewer hours.
    morning_mean = weth.loc[morning_rows, x].mean(skipna=False)
    evening_mean = weth.loc[evening_rows, x].mean(skipna=False)
    print(x ,"--> {7 to 9 : ", morning_mean, "}       {16 to 18 :", evening_mean, "}")

In [None]:
# Hourly rental profile, one line per weekday, drawn without confidence bands.
fig = plt.figure(figsize=(20, 10))
sns.lineplot(
    data=df,
    x="hour",
    y="count",
    hue="weekday",
    style="weekday",
    palette="deep",
    markers=True,
    ci=None,
)
# On weekdays there are more renters from 7 to 9 AM and from 5 to 7 PM, while on weekends
# the number of renters keeps increasing after 7 AM and peaks in the afternoon,
# and after 4 PM renters start decreasing.

In [None]:
# Side-by-side bars of registered and casual rentals, one pair per row of df.
x = df["registered"]
w = df["casual"]
k = np.arange(len(x))  # bar positions, one per row
width = 0.35  # the width of the bars

# Set the size on the figure that is actually drawn on. The original called
# plt.figure(figsize=(30, 15)) and then plt.subplots(), which left an empty
# 30x15 figure behind and drew the bars on a second, default-sized one.
fig, ax = plt.subplots(figsize=(30, 15))
rects1 = ax.bar(k - width/2, x, width, label='registered')
rects2 = ax.bar(k + width/2, w, width, label='casual')
# The bars carry labels, so show the legend that uses them
# (trailing ';' suppresses the Legend repr in the cell output).
ax.legend();

In [None]:
# Count against windspeed, one panel per season.
# relplot is a figure-level function and creates its own figure, so a preceding
# plt.figure(...) only produces an extra empty figure; it has been removed.
sns.relplot(data=df, x="windspeed", y="count", col="season", kind="line")


In [None]:
print(df.groupby("season")["windspeed"].max(),"\n") # maximum windspeed for each season
print("Count for each max windspeed for wach season \n")
# For each season: group by windspeed, take the mean count of each group, and
# print the largest of those means.
# `count` is selected before aggregating so that mean() never touches the
# string columns (pandas >= 2.0 raises a TypeError on them).
# NOTE(review): the heading above suggests "count at the maximum windspeed",
# but the value printed is the maximum mean count over all windspeed values.
# Confirm which one is intended.
for season in (1, 2, 3, 4):
    highest_mean_count = (
        df.loc[df["season"] == season]
        .groupby("windspeed")["count"]
        .mean()
        .max()
    )
    print(f"Season {season}: ", highest_mean_count)

In [None]:
# Display the current state of the working frame.
df

In [None]:
# "Feels like" temperature over the day: one column of panels per season and
# one row of panels per weather category.
# relplot is a figure-level function and creates its own figure, so a preceding
# plt.figure(...) only produces an extra empty figure; it has been removed.
sns.relplot(data=df, x="hour", y="atemp", col="season", row="weather", kind="line")
# season -->  1 = spring, 2 = summer, 3 = fall, 4 = winter

# weather --> 1: Clear, Few clouds, Partly cloudy
#             2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
#             3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
#             4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog

# 1. The spring season is colder than the winter season, and that is why the number of renters is higher in the winter season.
# 2. spring temp < winter temp < summer temp < fall temp
# 3. As we saw earlier, June and July have a higher number of renters, and here the end of the season 2 graph is nearly equal to the end of the season 3 graph.
# 4. People prefer to ride bicycles at higher temperatures.

In [None]:
# Mean rentals on working days versus non-working days.
sns.catplot(data=df, y="count", x="workingday", kind="bar")
# Print the two group means. The original ended in `.sum` without calling it,
# so it printed a bound-method repr instead of a result; it also averaged every
# column, which fails on the string columns with pandas >= 2.0.
print(df.groupby("workingday")["count"].mean())
# approximately the same number of people are taking bikes

In [None]:
# Humidity over the day: one column of panels per season and one row of
# panels per weather category.
sns.relplot(
    data=df,
    x="hour",
    y="humidity",
    col="season",
    row="weather",
    kind="line",
)
# season -->  1 = spring, 2 = summer, 3 = fall, 4 = winter