# Visualization and Exploratory Data Analysis of Cycle-Share-Dataset

**Loading all the Required Libraries**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from datetime import datetime
from dateutil.parser import parse
import folium                       #This is a Python Library for visualizing geospatial data.
import re

There are 3 Datasets:
1. station.csv
2. trip.csv
3. weather.csv

In [None]:
# Loading station.csv file

df_station= pd.read_csv('../input/cycle-share-dataset/station.csv')

In [None]:
df_station.head()

In [None]:
# Analyzing the Data Types of the Fields

df_station.info()

**Handling Missing Values**

In [None]:
# Checking for missing values.

(df_station.isnull().sum()/len(df_station))*100

In [None]:
# There are missing values in the fields "modification_date" and "decommission_date"
# (70.6% and 93.1% of the values are missing, respectively),
# so both fields are dropped for visualization purposes.
# The result is reassigned (no inplace=True) and errors='ignore' is used so that
# re-running this cell after the columns are already gone does not raise a KeyError.

df_station = df_station.drop(columns=['modification_date', 'decommission_date'], errors='ignore')

In [None]:
# Listing the columns in the Dataset.

df_station.columns

In [None]:
#Analyzing the values of the Fields.

df_station.describe()

On average, there are 16 docks at every station.
The minimum number of docks at a station is 0 and the maximum is 26.

In [None]:
# install_date is loaded as an object (text) column: parse it into datetimes
# first, then keep only the calendar-date part of every value.
install_timestamps = pd.to_datetime(df_station['install_date'])
df_station['install_date'] = install_timestamps.dt.date

**VISUALIZATION**

In [None]:
# Visualizing the Locations of the Cycle Stations using folium Library.
# It's very easy to use folium.

# 1. Creating a Basic Map of Seattle by providing the respective lat & long values of the City to the 
# Map method.

map_stations=folium.Map(location=[47.608013,  -122.335167],zoom_start=12)
map_stations

In [None]:
# 2. We can add a location marker to the map by using add_to() method.

folium.Marker([47.615486,-122.318245]).add_to(map_stations)
map_stations

In [None]:
# 3. Let's add all the location data (lat & long) of all the Stations to the map.

lat = df_station.lat.values

long = df_station.long.values

name = df_station.name.values  # Station names

name=name.tolist()  # converting name (which is in array form) to list

print(type(name))

In [None]:
# Adding all the locations and stations to the map

for la,lo,nm in zip(lat,long,name):
    folium.Marker([la,lo],popup=nm).add_to(map_stations)
map_stations

In [None]:
df_station.columns

In [None]:
# Count how many stations were installed on each date to find the busiest one.

plt.figure(figsize=(16,4))
g = sns.countplot(x='install_date', data=df_station)

# Walk tick labels and bars in lockstep: tilt each date label and write the
# bar's height just above the top of the bar.
for tick_label, bar in zip(g.get_xticklabels(), g.patches):
    tick_label.set_rotation(45)
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    g.annotate(
        format(bar_height),
        (bar_center, bar_height),
        ha='center',
        va='center',
        xytext=(0, 4),
        textcoords='offset points',
        rotation='horizontal',
    )

plt.xlabel('Date of Installation', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.title('Most No of Stations Installed', fontsize=18)

On 13/10/2014, 50 Stations were installed.

In [None]:
# Let's determine the total docks added or removed from all stations.

df_station['DocksCount']= df_station['current_dockcount'] - df_station['install_dockcount']

In [None]:
# Determining total docks added and removed till present.

plt.figure(figsize=(20,6))
g=sns.barplot(x='station_id', y='DocksCount', data=df_station, order=df_station.sort_values(by='DocksCount', ascending=False).station_id)
for i,p in zip(g.get_xticklabels(),g.patches):
    i.set_rotation(90)
    g.annotate(format(p.get_height()), 
                   (p.get_x() + p.get_width() / 2., p.get_height()), 
                   ha = 'center', va = 'center', 
                   xytext = (0, 6), 
                   textcoords = 'offset points',
                   rotation='horizontal')
plt.xlabel('Station IDs', fontsize=15)
plt.ylabel('Docks Status', fontsize=15)
plt.title('Total Docks Added/ Removed', fontsize=18)

So, the highest number of docks removed was 20, from Station SLU-18, and the most docks were added to Station SLU-15.

In [None]:
# Loading trip.csv file.
# Malformed rows are skipped and reported as warnings. `on_bad_lines` replaces
# the `error_bad_lines` flag, which was deprecated in pandas 1.3 and removed in
# pandas 2.0; 'warn' matches what `error_bad_lines=False` did by default.

df_trip=pd.read_csv('../input/cycle-share-dataset/trip.csv',on_bad_lines='warn')

In [None]:
df_trip.head()

Data Fields:

1. trip_id -> Trip ID
2. starttime -> Starting time of the trip.
3. stoptime -> Ending time of the trip.
4. bikeid -> Cycle ID.
5. tripduration -> Duration of the Trip (in Seconds).
6. from_station_name -> Source Station Name.
7. to_station_name -> Destination Station Name.
8. from_station_id -> Source Station ID.
9. to_station_id -> Destination Station ID.
10. usertype -> Type of User.
11. gender -> Gender of the User.
12. birthyear -> Birth Year of the User.

In [None]:
# Let's drop the trip_id field.

df_trip=df_trip.drop('trip_id',axis=1)

In [None]:
# Analyzing the Data Types of the Fields
df_trip.info()

**Handling Missing Values**

In [None]:
# Checking for missing values.

(df_trip.isnull().sum()/len(df_trip))*100

There are missing values in the fields "gender" and "birthyear".
36.7% of the values are missing in each of these fields.

In [None]:
# Fill the missing values by carrying the previous row's value forward (ffill);
# anything still missing at the very top of the frame is then filled from the
# next valid row (bfill).
# The dedicated methods are used because fillna(method=...) is deprecated in
# recent pandas releases.

df_trip = df_trip.ffill().bfill()

In [None]:
df_trip.info()

In [None]:
# Let's convert all dates in Object types into datetypes.
df_trip['starttime']=pd.to_datetime(df_trip['starttime'])
df_trip['stoptime']=pd.to_datetime(df_trip['stoptime'])

In [None]:
# Convert birthyear to an integer dtype with a single vectorized cast
# instead of calling int() on every row.
df_trip['birthyear'] = df_trip['birthyear'].astype(int)

In [None]:
df_trip.birthyear.dtypes

In [None]:
# Age Calculation.

df_trip['Age']=df_trip['stoptime'].dt.year - df_trip['birthyear']

In [None]:
df_trip.columns

In [None]:
# Determining the Top 10 Source Stations.

plt.figure(figsize=(15,4))
g=sns.countplot(x='from_station_name', data=df_trip, order= df_trip.from_station_name.value_counts().head(10).index)
for i,p in zip(g.get_xticklabels(),g.patches):
    i.set_rotation(90)
    g.annotate(format(p.get_height()), 
                   (p.get_x() + p.get_width() / 2., p.get_height()), 
                   ha = 'center', va = 'center', 
                   xytext = (0, 4), 
                   textcoords = 'offset points',
                   rotation='horizontal')
plt.xlabel('Source Station Names', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.title('Top 10 Source Stations', fontsize=18)

In [None]:
# Determining the Top 10 Destination Stations.

plt.figure(figsize=(15,4))
g=sns.countplot(x='to_station_name', data=df_trip, order= df_trip.to_station_name.value_counts().head(10).index)
for i,p in zip(g.get_xticklabels(),g.patches):
    i.set_rotation(90)
    g.annotate(format(p.get_height()), 
                   (p.get_x() + p.get_width() / 2., p.get_height()), 
                   ha = 'center', va = 'center', 
                   xytext = (0, 4), 
                   textcoords = 'offset points',
                   rotation='horizontal')
plt.xlabel('Destination Station Names', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.title('Top 10 Destination Stations', fontsize=18)

In [None]:
plt.figure(figsize=(18,6))
g=sns.countplot(x='Age', data=df_trip)
for i,p in zip(g.get_xticklabels(),g.patches):
    g.annotate(format(p.get_height()), 
                   (p.get_x() + p.get_width() / 2., p.get_height()), 
                   ha = 'center', va = 'center', 
                   xytext = (0, 20), 
                   textcoords = 'offset points',
                   rotation='vertical')
plt.xlabel('Age', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.title('Age Analysis', fontsize=18)

Users have been using Cycle Services starting from Age 16 till the age of 85! and most users are in the range of 24 to 34 Years (Above 10000 count) with 28 being the highest.

In [None]:
# Let's check for Outliers in tripduration field.
# The series is passed explicitly as `x`: seaborn 0.12+ treats the first
# positional argument as `data`, so the bare positional call no longer means
# "plot this variable along the x-axis".

sns.boxplot(x=df_trip.tripduration)

In [None]:
# 9.4% Outliers in tripduration field.
# Converting tripduration to Minutes.

df_trip['tripduration(in Minutes)']=df_trip['tripduration'].apply(lambda x: round((x/60),2))

In [None]:
# Let's try to clip the Outliers by Identifying the Quartile Range.

Outliers_tripduration_mins = df_trip['tripduration(in Minutes)']

Outliers_tripduration_mins_Q1 = Outliers_tripduration_mins.quantile(0.25)  # 1st Quartile

Outliers_tripduration_mins_Q3 = Outliers_tripduration_mins.quantile(0.75)  # 3rd Quartile 

Outliers_tripduration_mins_IQR= Outliers_tripduration_mins_Q3-Outliers_tripduration_mins_Q1      # Inter Quartile Range

Outliers_tripduration_mins_Lowerrange=Outliers_tripduration_mins_Q1-(1.5 * Outliers_tripduration_mins_IQR)

Outliers_tripduration_mins_Upperrange=Outliers_tripduration_mins_Q3+(1.5 * Outliers_tripduration_mins_IQR)


print(Outliers_tripduration_mins_Q1, " -> 1st Quartile")
print(Outliers_tripduration_mins_Q3, " -> 3rd Quartile")
print(Outliers_tripduration_mins_IQR, " -> Inter Quartile Range")
print(Outliers_tripduration_mins_Lowerrange, " -> Outliers below the lower Range")
print(Outliers_tripduration_mins_Upperrange, " -> Outliers Above the Upper Range")

In [None]:
Outliers_tripduration_mins_outliers=df_trip[(Outliers_tripduration_mins < Outliers_tripduration_mins_Lowerrange) | 
                                       (Outliers_tripduration_mins > Outliers_tripduration_mins_Upperrange)]

print((len(Outliers_tripduration_mins_outliers)/len(df_trip.tripduration))*100,"%")

In [None]:
# 9.4% Outliers exist.
# Let's Analyze Trip duration excluding the Outliers.
# The IQR fences computed earlier in the notebook are used instead of a
# hand-copied constant, so the filter stays correct if the data changes.
# between() is inclusive on both ends, which is the exact complement of the
# "< lower or > upper" outlier definition.

TripDuration_Analysis = df_trip[
    df_trip['tripduration(in Minutes)'].between(
        Outliers_tripduration_mins_Lowerrange,
        Outliers_tripduration_mins_Upperrange,
    )
]

In [None]:
# Distribution of trip durations (in minutes) with the outliers excluded.
plt.figure(figsize=(15,6))
sns.distplot(TripDuration_Analysis['tripduration(in Minutes)'])
plt.xlabel('Trip Duration (in Minutes)', fontsize=15)
# The plotted variable is trip duration, so the y-axis is a plain density and
# the title names trip duration rather than age.
plt.ylabel('Density', fontsize=15)
plt.title('Trip Duration Analysis', fontsize=18)

Excluding the Outliers, the duration of most Cyclists ranges from 3 to 15 mins.

In [None]:
# User Types Analysis

g=sns.countplot(x='usertype',data=df_trip)
for p in g.patches:
    g.annotate(format(p.get_height()), 
                   (p.get_x() + p.get_width() / 2., p.get_height()), 
                   ha = 'center', va = 'center', 
                   xytext = (0, 4), 
                   textcoords = 'offset points')
plt.xlabel('User Type', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.title('User Type Analysis', fontsize=18)

In [None]:
# Gender Analysis

g=sns.countplot(x='gender',data=df_trip)
for p in g.patches:
    g.annotate(format(p.get_height()), 
                   (p.get_x() + p.get_width() / 2., p.get_height()), 
                   ha = 'center', va = 'center', 
                   xytext = (0, 4), 
                   textcoords = 'offset points')
plt.xlabel('Gender', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.title('Gender Analysis', fontsize=18)

In [None]:
g=sns.countplot(x='usertype',data=df_trip,hue='gender')
for p in g.patches:
    g.annotate(format(p.get_height()), 
                   (p.get_x() + p.get_width() / 2., p.get_height()), 
                   ha = 'center', va = 'center', 
                   xytext = (0, 4), 
                   textcoords = 'offset points')
plt.xlabel("User Types w.r.t Gender", fontsize=15)
plt.ylabel("Count", fontsize=15)
plt.title("User Types Analysis w.r.t Gender", fontsize=18)

Males are the highest members w.r.t both permanent & Short-term memberships.

Let's Analyse the male Users Data

In [None]:
Male_Cyclists=df_trip[df_trip.gender == "Male"]

In [None]:
# Excluding Outliers: keep only trips inside the IQR fences computed earlier
# for the whole trip dataset, instead of a hard-coded upper limit.
Male_Cyclists = Male_Cyclists[
    Male_Cyclists['tripduration(in Minutes)'].between(
        Outliers_tripduration_mins_Lowerrange,
        Outliers_tripduration_mins_Upperrange,
    )
]

# Distribution of trip durations for male cyclists.
plt.figure(figsize=(15,6))
sns.distplot(Male_Cyclists['tripduration(in Minutes)'])
plt.xlabel('Trip Duration (in Minutes)', fontsize=15)
plt.ylabel('Density', fontsize=15)
plt.title('Trip Duration Analysis (Males)', fontsize=18)

In [None]:
# Age Analysis of Male Cyclists

plt.figure(figsize=(18,6))
g=sns.countplot(x='Age', data=Male_Cyclists)
for i,p in zip(g.get_xticklabels(),g.patches):
    g.annotate(format(p.get_height()), 
                   (p.get_x() + p.get_width() / 2., p.get_height()), 
                   ha = 'center', va = 'center', 
                   xytext = (0, 20), 
                   textcoords = 'offset points',
                   rotation='vertical')
plt.xlabel('Age', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.title('Age Analysis (Males)', fontsize=18)

Most Cyclists are in the range of 23 to 35 and after that there is a decrease in the no of users.

In [None]:
# User types Analysis of Males

g=sns.countplot(x='usertype', data=Male_Cyclists)
for i,p in zip(g.get_xticklabels(),g.patches):
    g.annotate(format(p.get_height()), 
                   (p.get_x() + p.get_width() / 2., p.get_height()), 
                   ha = 'center', va = 'center', 
                   xytext = (0, 5), 
                   textcoords = 'offset points',
                   rotation='horizontal')
plt.xlabel('User Type', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.title('User Type Analysis (Males)', fontsize=18)

Permanent Memberships are more in males.

In [None]:
# Let's Analyze Age w.r.t trip duration (male cyclists, outliers excluded).
plt.figure(figsize=(18,6))
sns.barplot(x='Age', y='tripduration(in Minutes)', data=Male_Cyclists)

plt.xlabel('Age', fontsize=15)
plt.ylabel('Trip Duration (in Minutes)', fontsize=15)
plt.title('Age Vs Trip Duration (Males)', fontsize=18)

The duration of most of the Cyclists varies from 10 to 12.5 mins and decreases drastically above 72 yrs except some variations in between.

Let's Analyse the Female Users Data

In [None]:
Female_Cyclists=df_trip[df_trip.gender == "Female"]

In [None]:
# Excluding Outliers: keep only trips inside the IQR fences computed earlier
# for the whole trip dataset, instead of a hard-coded upper limit.
Female_Cyclists = Female_Cyclists[
    Female_Cyclists['tripduration(in Minutes)'].between(
        Outliers_tripduration_mins_Lowerrange,
        Outliers_tripduration_mins_Upperrange,
    )
]

# Distribution of trip durations for female cyclists.
plt.figure(figsize=(15,6))
sns.distplot(Female_Cyclists['tripduration(in Minutes)'])
plt.xlabel('Trip Duration (in Minutes)', fontsize=15)
plt.ylabel('Density', fontsize=15)
# This is the female trip-duration distribution, so the title says so.
plt.title('Trip Duration Analysis (Females)', fontsize=18)

Most of the female Cyclists' trip duration varies from 5 to 20 mins.

In [None]:
# Age Analysis of Female Cyclists

plt.figure(figsize=(20,6))
g=sns.countplot(x='Age', data=Female_Cyclists)
for i,p in zip(g.get_xticklabels(),g.patches):
    g.annotate(format(p.get_height()), 
                   (p.get_x() + p.get_width() / 2., p.get_height()), 
                   ha = 'center', va = 'center', 
                   xytext = (0, 20), 
                   textcoords = 'offset points',
                   rotation='vertical')
plt.xlabel('Age', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.title('Age Analysis (Females)', fontsize=18)

Most Female Cyclists' Age is varying from 24 to 34 after which there are ups & downs. 51 & 65 yrs of age are Cycling more.

In [None]:
# User types Analysis of Females

g=sns.countplot(x='usertype', data=Female_Cyclists)
for i,p in zip(g.get_xticklabels(),g.patches):
    g.annotate(format(p.get_height()), 
                   (p.get_x() + p.get_width() / 2., p.get_height()), 
                   ha = 'center', va = 'center', 
                   xytext = (0, 5), 
                   textcoords = 'offset points',
                   rotation='horizontal')
plt.xlabel('User Type', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.title('User Type Analysis (Females)', fontsize=18)

Permanent memberships are more w.r.t females too.

In [None]:
# Let's Analyze Age w.r.t trip duration.
plt.figure(figsize=(18,6))
sns.barplot(x='Age', y='tripduration(in Minutes)', data=Female_Cyclists)

plt.xlabel('Age', fontsize=15)
plt.ylabel('Trip Duration (in Minutes)', fontsize=15)
plt.title('Age Vs Trip Duration (Females)', fontsize=18)

Most of their trips last between 11 and 15 mins, and the longest duration (half an hour) belongs to an 85-year-old woman!

In [None]:
# Loading weather.csv file

df_weather=pd.read_csv('../input/cycle-share-dataset/weather.csv')

In [None]:
df_weather.head()

In [None]:
df_weather.columns

In [None]:
df_weather.info()

In [None]:
(df_weather.isnull().sum()/len(df_weather))*100

Handling Missing Values

There are missing values in the fields "Max_Gust_Speed_MPH" and "Events":
26.8% and 52.3% of the values are missing in these fields respectively, and there is 1 missing value in "Mean_Temperature_F".

In [None]:

df_weather[df_weather.Mean_Temperature_F.isnull()]

In [None]:
# determining the mean value in the field

df_weather.Mean_Temperature_F.mean()

In [None]:
# filling the missing value with the mean value.
df_weather['Mean_Temperature_F']=df_weather['Mean_Temperature_F'].fillna(df_weather.Mean_Temperature_F.mean())

In [None]:
df_weather['Max_Gust_Speed_MPH'].head()

In [None]:
# Gust-speed entries recorded as "-" are placeholders, not measurements:
# turn them into NaN so they are handled as missing values.

import math

df_weather['Max_Gust_Speed_MPH'] = df_weather['Max_Gust_Speed_MPH'].replace("-", math.nan)

In [None]:
df_weather['Max_Gust_Speed_MPH'].head()

In [None]:
# Fill the NaN gust speeds with the previous valid value (ffill), then fill any
# NaNs left at the start of the column from the next valid value (bfill).
# The dedicated methods are used because fillna(method=...) is deprecated in
# recent pandas releases.

df_weather['Max_Gust_Speed_MPH'] = df_weather['Max_Gust_Speed_MPH'].ffill().bfill()

In [None]:
df_weather.Events.value_counts()

In [None]:
# Rewrite the comma-separated event labels in hyphenated form, all in one
# replace call driven by a lookup table.

comma_to_hyphen = {
    'Fog , Rain': 'Fog-Rain',
    'Rain , Thunderstorm': 'Rain-Thunderstorm',
    'Rain , Snow': 'Rain-Snow',
}
df_weather.Events = df_weather.Events.replace(comma_to_hyphen)

In [None]:
df_weather.Events.value_counts()

In [None]:
# As there are a lot of nan values, let's fill them with "No-Event".

df_weather.Events = df_weather.Events.fillna("No-Event")

In [None]:
df_weather.info()

In [None]:
# Converting Date to datetype.
df_weather['Date'] = pd.to_datetime(df_weather['Date'])

In [None]:
# Converting float to int type with a vectorized cast (the fractional part is
# truncated, exactly as int() would do row by row).

df_weather['Mean_Temperature_F'] = df_weather['Mean_Temperature_F'].astype(int)

In [None]:
# Converting object to int type with a single vectorized cast instead of
# calling int() on every row.

df_weather.Max_Gust_Speed_MPH = df_weather.Max_Gust_Speed_MPH.astype(int)

In [None]:
# Converting object to String type.

df_weather.Events = df_weather.Events.astype('string')

In [None]:
# Let's make a column named "Quarter" to analyse Precipitation Data on a quarterly basis.

df_weather['Quarter']=df_weather['Date'].dt.to_period('Q')
df_weather['Quarter'].value_counts().sort_index()

In [None]:
plt.figure(figsize=(20,6))
sns.barplot(x='Quarter', y='Precipitation_In', data=df_weather)

plt.xlabel('Quarter', fontsize=15)
plt.ylabel('Precipitation', fontsize=15)
plt.title('Precipitation on a Quarterly basis', fontsize=18)

We can see that precipitation is high at the year end and the beginning of the next year. i.e., during Quarter 4 & Quarter 1.

In [None]:
# Derive "Year", "Month" and "WeekDay" columns from Date for detailed analysis.
# WeekDay is the numeric day of the week (Monday=0 ... Sunday=6).

weather_dates = df_weather['Date'].dt
df_weather['Year'] = weather_dates.year
df_weather['Month'] = weather_dates.month
df_weather['WeekDay'] = weather_dates.weekday

In [None]:
df_weather[df_weather.Month == 1]['Date'].head(30)

In [None]:
# Replace the numeric Month / WeekDay values with short names.
# Both columns are stored as *ordered* categoricals so that plots and sorts
# follow calendar order (Jan..Dec, Mon..Sun) rather than the order in which
# the values happen to appear in the data.
# They are derived from Date rather than from the existing columns, so this
# cell can be re-run without turning already-mapped names into NaN.
MONTH_NAMES = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
WEEKDAY_NAMES = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

# dt.month is 1-based; dt.weekday is 0-based with Monday=0.
month_lookup = dict(enumerate(MONTH_NAMES, start=1))
weekday_lookup = dict(enumerate(WEEKDAY_NAMES))

df_weather['Month'] = (
    df_weather['Date'].dt.month
    .map(month_lookup)
    .astype(pd.CategoricalDtype(MONTH_NAMES, ordered=True))
)
df_weather['WeekDay'] = (
    df_weather['Date'].dt.weekday
    .map(weekday_lookup)
    .astype(pd.CategoricalDtype(WEEKDAY_NAMES, ordered=True))
)

In [None]:
# Mean precipitation per year, per month and per weekday, side by side.
y_val = "Precipitation_In"

# One (x-axis column, title prefix) pair per panel, left to right.
panels = [("Year", "Yearly"), ("Month", "Monthly"), ("WeekDay", "Daily")]

fig, axes = plt.subplots(1, len(panels), figsize=(20, 6), sharey=True)
fig.suptitle('Precipitation Analysis', fontsize = 18)

# The loop variables have their own names so they do not shadow (and
# overwrite) the sequences being iterated over.
for ax, (column, period) in zip(axes, panels):
    b = sns.barplot(ax=ax, x=column, y=y_val, data=df_weather)
    b.set_xlabel(column, fontsize=15)
    b.set_ylabel(y_val, fontsize=15)
    ax.set_title(period + " Precipitation", fontsize = 15)

From the above plots, we can see that the rainfall has reduced from 2014 to 2016.
Rainfall is high at the start of the year & during the year end.
Daily rainfall is high on Fridays & Saturdays

Let's merge all 3 datasets for further analysis

In [None]:
# creating 2 new columns named "From_Station_ID" & "To_Station_ID" in "df_station.csv"

df_station['from_station_id']=df_station.station_id
df_station['to_station_id']=df_station.station_id

In [None]:
#creating a dataframe with "lat", "long" & 'from_station_id' fields

df_from_station=df_station[['lat','long','from_station_id']]

In [None]:
# Merging "df_from_station" with "df_trip".

df_trip_updated1=pd.merge(df_trip,df_from_station,on='from_station_id')

In [None]:
#creating a dataframe with "lat", "long" & 'to_station_id' fields

df_to_station=df_station[['lat','long','to_station_id']]

In [None]:
# Merging "df_to_station" with "df_trip_updated1".

df_trip_updated2=pd.merge(df_trip_updated1,df_to_station,on='to_station_id')

In [None]:
df_trip_updated2.columns

In [None]:
# converting "starttime" field to list in order to extract Date.

date_str=list(df_trip_updated2.starttime)

In [None]:
# Extracting Date from "date_str".

date_str=[datetime.strftime(x, '%Y-%m-%d') for x in date_str]

In [None]:
# Updating "df_trip_updated2".

df_trip_updated2['Date']=date_str

In [None]:
# Converting Object to datetime format.

df_trip_updated2['Date']=pd.to_datetime(df_trip_updated2['Date'])

In [None]:
# Merging "df_trip_updated2" with "df_weather".

df_merged=pd.merge(df_trip_updated2,df_weather,on='Date')

In [None]:
df_merged.columns

As we now have the whole dataset, let's start analyzing it.

In [None]:
df_merged.info()

In [None]:
df_merged.describe()

In [None]:
# Add an "Hour" column (hour of the trip's start time) for hourly analysis.

df_merged['Hour'] = df_merged['starttime'].dt.hour

In [None]:
plt.figure(figsize=(16,5))
g=sns.countplot(x='Year',data=df_merged,hue='gender')
for p in g.patches:
    g.annotate(format(p.get_height()), 
                   (p.get_x() + p.get_width() / 2., p.get_height()), 
                   ha = 'center', va = 'center', 
                   xytext = (0, 4), 
                   textcoords = 'offset points',
                   rotation='horizontal')
plt.xlabel("Year",fontsize=15)
plt.ylabel("Count",fontsize=15)
plt.title("Yearly Analysis of Cyclists W.r.t Gender",fontsize=18)

In [None]:
plt.figure(figsize=(16,5))
g=sns.countplot(x='Month',data=df_merged,hue='gender')
for p in g.patches:
    g.annotate(format(p.get_height()), 
                   (p.get_x() + p.get_width() / 2., p.get_height()), 
                   ha = 'center', va = 'center', 
                   xytext = (0, 4), 
                   textcoords = 'offset points',
                   rotation='horizontal')
plt.xlabel("Month",fontsize=15)
plt.ylabel("Count",fontsize=15)
plt.title("Monthly Analysis of Cyclists W.r.t Gender",fontsize=18)

It can be seen that there are fewer cyclists in September compared with other months. As it is winter from December to February in Seattle, the number of cyclists is low, which is expected. As it's peak summer during July & August, more cyclists hit the road.

In [None]:
plt.figure(figsize=(16,5))
g=sns.countplot(x='WeekDay',data=df_merged,hue='gender')
for p in g.patches:
    g.annotate(format(p.get_height()), 
                   (p.get_x() + p.get_width() / 2., p.get_height()), 
                   ha = 'center', va = 'center', 
                   xytext = (0, 4), 
                   textcoords = 'offset points',
                   rotation='horizontal')
plt.xlabel("WeekDay",fontsize=15)
plt.ylabel("Count",fontsize=15)
plt.title("Weekly Analysis of Cyclists W.r.t Gender",fontsize=18)

The weekdays have consistent Nos compared to the weekends.

In [None]:
plt.figure(figsize=(18,5))
g=sns.countplot(x='Hour',data=df_merged,hue='gender')
for p in g.patches:
    g.annotate(format(p.get_height()), 
                   (p.get_x() + p.get_width() / 2., p.get_height()), 
                   ha = 'center', va = 'center', 
                   xytext = (0, 20), 
                   textcoords = 'offset points',
                   rotation='vertical')
plt.xlabel("Hour",fontsize=15)
plt.ylabel("Count",fontsize=15)
plt.title("Hourly Analysis of Cyclists W.r.t Gender",fontsize=18)

There's high activity in the evening (from 4 P.M. to 6 P.M.) and in the morning (from 8 A.M. to 9 A.M.). There is some activity around midnight too!

In [None]:
# Let's categorize the Hours to "Morning", "Afternoon", "Evening", "Night" & "Mid-night" to get a better understanding.

def update_hour(hour):
    """Map an hour of the day to a named part of the day.

    Returns "Mid-night" for 0-4, "Morning" for 5-10, "Afternoon" for 11-15,
    "Evening" for 16-19 and "Night" for 20-23. A value that is not a member
    of any of these integer ranges yields None.
    """
    # Ordered lookup table: (hours covered, label for that part of the day).
    day_parts = (
        (range(0, 5), "Mid-night"),
        (range(5, 11), "Morning"),
        (range(11, 16), "Afternoon"),
        (range(16, 20), "Evening"),
        (range(20, 24), "Night"),
    )
    for hours, label in day_parts:
        if hour in hours:
            return label
    return None

In [None]:
df_merged['Time_of_day']=df_merged['Hour'].apply(update_hour)

In [None]:
plt.figure(figsize=(18,5))
g=sns.countplot(x='Time_of_day',data=df_merged,hue='gender')
for p in g.patches:
    g.annotate(format(p.get_height()), 
                   (p.get_x() + p.get_width() / 2., p.get_height()), 
                   ha = 'center', va = 'center', 
                   xytext = (0, 5), 
                   textcoords = 'offset points',
                   rotation='horizontal')
plt.xlabel("Time of the Day",fontsize=15)
plt.ylabel("Count",fontsize=15)
plt.title("Daily Analysis of Cyclists W.r.t Gender",fontsize=18)

As we can see, Most Cyclists Cycle during the Afternoons and there are Mid-night Cyclists too!.

In [None]:
plt.figure(figsize=(18,5))
g=sns.countplot(x='Events',data=df_merged,hue='gender')
for p in g.patches:
    g.annotate(format(p.get_height()), 
                   (p.get_x() + p.get_width() / 2., p.get_height()), 
                   ha = 'center', va = 'center', 
                   xytext = (0, 5), 
                   textcoords = 'offset points',
                   rotation='horizontal')
plt.xlabel("Events",fontsize=15)
plt.ylabel("Count",fontsize=15)
plt.title("Cycle Usage Analysis during different Weather Situations",fontsize=18)

There are obviously more Cyclists during no weather phenomena and also during Rains.

**If you like my Kernel, please upvote :). Please feel free to provide suggestions in the comments, which will help me improve. Thank you :)**