In [135]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar


In [136]:
bike_hourly = pd.read_csv('hourly_weather.csv')

Data Wrangling

In [137]:
#create a copy of the bike hourly data
bike_hourly_new = bike_hourly.copy()

In [138]:
bike_hourly_new.isnull().sum()

Casual                     11
Date                        0
Hour                        0
Member                     11
Total_rides                11
relativehumidity_2m (%)     0
temperature_2m (°C)         0
weathercode (wmo code)      0
windspeed_10m (km/h)        0
dtype: int64

In [139]:
# Convert the 'Date' column to datetime
bike_hourly_new['Date'] = pd.to_datetime(bike_hourly_new['Date'])

# Ensure 'Hour' is integer (if it's not already)
bike_hourly_new['Hour'] = bike_hourly_new['Hour'].astype(int)

# Convert float columns correctly
float_columns = ['Casual', 'Member', 'Total_rides', 'temperature_2m (°C)', 'windspeed_10m (km/h)']
bike_hourly_new[float_columns] = bike_hourly_new[float_columns].astype(float)

# Convert integer columns
int_columns = ['relativehumidity_2m (%)', 'weathercode (wmo code)']
bike_hourly_new[int_columns] = bike_hourly_new[int_columns].astype(int)

# Check the updated data types
print(bike_hourly_new.dtypes)


Casual                            float64
Date                       datetime64[ns]
Hour                                int64
Member                            float64
Total_rides                       float64
relativehumidity_2m (%)             int64
temperature_2m (°C)               float64
weathercode (wmo code)              int64
windspeed_10m (km/h)              float64
dtype: object


In [140]:
#change the columns to lowercase and remove spaces
bike_hourly_new.columns = bike_hourly_new.columns.str.lower().str.replace(' ', '_')
bike_hourly_new.head()

Unnamed: 0,casual,date,hour,member,total_rides,relativehumidity_2m_(%),temperature_2m_(°c),weathercode_(wmo_code),windspeed_10m_(km/h)
0,20.0,2021-01-01,0,26.0,46.0,90,2.4,1,10.2
1,23.0,2021-01-01,1,23.0,46.0,88,1.8,1,10.0
2,20.0,2021-01-01,2,28.0,48.0,87,1.1,2,10.1
3,9.0,2021-01-01,3,7.0,16.0,81,1.6,2,10.6
4,7.0,2021-01-01,4,5.0,12.0,77,1.6,2,9.9


In [141]:
#adding holiday as a column
cal = calendar()
holidays = cal.holidays(start=bike_hourly_new['date'].min(), end=bike_hourly_new['date'].max())
bike_hourly_new['holiday'] = bike_hourly_new['date'].isin(holidays)

#add month and day of the week as columns
bike_hourly_new['month'] = bike_hourly_new['date'].dt.month
bike_hourly_new['day_of_week'] = bike_hourly_new['date'].dt.dayofweek

#adding season as a column
def get_season(month):
    if month in [12, 1, 2]:
        return 'Winter'
    elif month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    else:
        return 'Fall'

bike_hourly_new['season'] = bike_hourly_new['date'].dt.month.apply(get_season)

#adding holiday as a column
cal = calendar()
holidays = cal.holidays(start=bike_hourly_new['date'].min(), end=bike_hourly_new['date'].max())
bike_hourly_new['holiday'] = bike_hourly_new['date'].isin(holidays)

#change season to a category
bike_hourly_new['season'] = bike_hourly_new['season'].astype('category')



In [142]:
#change data type from integer to category for day of the week
bike_hourly_new['day_of_week'] = bike_hourly_new['day_of_week'].astype('category')

#change data type from integer to category for month
bike_hourly_new['month'] = bike_hourly_new['month'].astype('category')

#change the day of the week to actual names
bike_hourly_new['day_of_week'] = bike_hourly_new['day_of_week'].map({0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'})

#change the month to actual names
bike_hourly_new['month'] = bike_hourly_new['month'].map({1: 'January', 2: 'February', 3: 'March', 4: 'April', 5: 'May', 6: 'June', 7: 'July', 8: 'August', 9: 'September', 10: 'October', 11: 'November', 12: 'December'})

#change the holiday column to yes or no
bike_hourly_new['holiday'] = bike_hourly_new['holiday'].map({True: 'Yes', False: 'No'})

#change the hour to a early morning, morning, afternoon, evening, or night
def get_time_of_day(hour):
    if hour in [0, 1, 2, 3, 4, 5]:
        return 'Early Morning'
    elif hour in [6, 7, 8, 9, 10, 11]:
        return 'Morning'
    elif hour in [12, 13, 14, 15, 16, 17]:
        return 'Afternoon'
    elif hour in [18, 19, 20, 21, 22, 23]:
        return 'Evening'
    
bike_hourly_new['time_of_day'] = bike_hourly_new['hour'].apply(get_time_of_day)



# Define bin edges and labels based on WMO weather codes
bins = [0, 4, 20, 40, 50, 60, 70, 80, 90, 100]
labels = [
    "Clear/Cloudy", 
    "Atmospheric Obstructions", 
    "Fog Conditions", 
    "Drizzle", 
    "Rain", 
    "Snow", 
    "Showers", 
    "Thunderstorms",
    "Extreme Weather"
]

# Apply binning to the 'weathercode_(wmo_code)' column and add as a new category
bike_hourly_new['weather_category'] = pd.cut(
    bike_hourly_new['weathercode_(wmo_code)'], 
    bins=bins, 
    labels=labels, 
    right=False
)

# Define humidity bins and labels
humidity_bins = [0, 30, 60, 80, 100]
humidity_labels = ['Low', 'Moderate', 'High', 'Very High']

# Apply binning to 'relativehumidity_2m_(%)' column
bike_hourly_new['humidity_category'] = pd.cut(
    bike_hourly_new['relativehumidity_2m_(%)'], 
    bins=humidity_bins, 
    labels=humidity_labels, 
    right=True
)

# Define windspeed bins and labels
windspeed_bins = [0, 5, 20, 39, 61, float('inf')]
windspeed_labels = ['Calm', 'Light Breeze', 'Moderate Breeze', 'Strong Breeze', 'High Wind']

# Apply binning to 'windspeed_10m_(km/h)' column
bike_hourly_new['windspeed_category'] = pd.cut(
    bike_hourly_new['windspeed_10m_(km/h)'], 
    bins=windspeed_bins, 
    labels=windspeed_labels, 
    right=True
)

# Display results
print(bike_hourly_new[['windspeed_10m_(km/h)', 'windspeed_category']].head())

# Display results
print(bike_hourly_new[['relativehumidity_2m_(%)', 'humidity_category']].head())



   windspeed_10m_(km/h) windspeed_category
0                  10.2       Light Breeze
1                  10.0       Light Breeze
2                  10.1       Light Breeze
3                  10.6       Light Breeze
4                   9.9       Light Breeze
   relativehumidity_2m_(%) humidity_category
0                       90         Very High
1                       88         Very High
2                       87         Very High
3                       81         Very High
4                       77              High


In [143]:
bike_hourly_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26280 entries, 0 to 26279
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   casual                   26269 non-null  float64       
 1   date                     26280 non-null  datetime64[ns]
 2   hour                     26280 non-null  int64         
 3   member                   26269 non-null  float64       
 4   total_rides              26269 non-null  float64       
 5   relativehumidity_2m_(%)  26280 non-null  int64         
 6   temperature_2m_(°c)      26280 non-null  float64       
 7   weathercode_(wmo_code)   26280 non-null  int64         
 8   windspeed_10m_(km/h)     26280 non-null  float64       
 9   holiday                  26280 non-null  object        
 10  month                    26280 non-null  category      
 11  day_of_week              26280 non-null  category      
 12  season                   26280 n

In [144]:
bike_hourly_new.head()

Unnamed: 0,casual,date,hour,member,total_rides,relativehumidity_2m_(%),temperature_2m_(°c),weathercode_(wmo_code),windspeed_10m_(km/h),holiday,month,day_of_week,season,time_of_day,weather_category,humidity_category,windspeed_category
0,20.0,2021-01-01,0,26.0,46.0,90,2.4,1,10.2,Yes,January,Friday,Winter,Early Morning,Clear/Cloudy,Very High,Light Breeze
1,23.0,2021-01-01,1,23.0,46.0,88,1.8,1,10.0,Yes,January,Friday,Winter,Early Morning,Clear/Cloudy,Very High,Light Breeze
2,20.0,2021-01-01,2,28.0,48.0,87,1.1,2,10.1,Yes,January,Friday,Winter,Early Morning,Clear/Cloudy,Very High,Light Breeze
3,9.0,2021-01-01,3,7.0,16.0,81,1.6,2,10.6,Yes,January,Friday,Winter,Early Morning,Clear/Cloudy,Very High,Light Breeze
4,7.0,2021-01-01,4,5.0,12.0,77,1.6,2,9.9,Yes,January,Friday,Winter,Early Morning,Clear/Cloudy,High,Light Breeze


In [145]:
#count of unique weather codes
bike_hourly_new['weathercode_(wmo_code)'].value_counts()


weathercode_(wmo_code)
0     11431
1      6003
3      3290
2      2507
51     1394
53      602
61      400
63      222
55      194
71      102
73       90
75       28
65       17
Name: count, dtype: int64

In [146]:
#idenifying the null values 
bike_hourly_new.isnull().sum()

#find the null values in the dataset
bike_hourly_new[bike_hourly_new.isnull().any(axis=1)]

#remove the null values
bike_hourly_new = bike_hourly_new.dropna()

In [147]:
#value_counts all categorical columns
print(bike_hourly_new['season'].value_counts())
print(bike_hourly_new['holiday'].value_counts())
print(bike_hourly_new['day_of_week'].value_counts())
print(bike_hourly_new['month'].value_counts())
print(bike_hourly_new['weather_category'].value_counts())
print(bike_hourly_new['humidity_category'].value_counts())
print(bike_hourly_new['windspeed_category'].value_counts())


season
Summer    6623
Spring    6620
Fall      6552
Winter    6471
Name: count, dtype: int64
holiday
No     25474
Yes      792
Name: count, dtype: int64
day_of_week
Saturday     3768
Sunday       3765
Friday       3763
Monday       3744
Wednesday    3743
Thursday     3743
Tuesday      3740
Name: count, dtype: int64
month
August       2232
October      2232
May          2231
July         2231
December     2231
January      2229
March        2229
April        2160
June         2160
September    2160
November     2160
February     2011
Name: count, dtype: int64
weather_category
Clear/Cloudy                23220
Rain                         2189
Snow                          639
Showers                       218
Atmospheric Obstructions        0
Fog Conditions                  0
Drizzle                         0
Thunderstorms                   0
Extreme Weather                 0
Name: count, dtype: int64
humidity_category
Moderate     9565
Very High    7268
High         7063
Low          2

EDA 

In [148]:
#creating the statistical summary of the dataset
bike_hourly_new[['casual', 'member', 'total_rides', 'temperature_2m_(°c)', 'windspeed_10m_(km/h)', 'relativehumidity_2m_(%)']].describe()

Unnamed: 0,casual,member,total_rides,temperature_2m_(°c),windspeed_10m_(km/h),relativehumidity_2m_(%)
count,26266.0,26266.0,26266.0,26266.0,26266.0,26266.0
mean,159.928729,247.16489,407.093619,14.41944,10.171492,62.963337
std,167.338822,225.295433,373.024902,10.281102,5.25993,22.726972
min,0.0,0.0,1.0,-14.6,0.4,5.0
25%,28.0,55.0,85.0,6.2,6.5,46.0
50%,105.0,203.0,319.0,14.8,9.2,64.0
75%,237.0,371.0,616.0,22.7,12.9,82.0
max,1206.0,1534.0,2262.0,41.5,40.1,100.0


In [167]:
#grouping the data of total rides, casual, and member by holidays in percentage and rounding to 2 decimal places
bike_hourly_new.groupby('holiday')[['total_rides', 'casual', 'member']].sum() / bike_hourly_new[['total_rides', 'casual', 'member']].sum().round(2) * 100

# Group data by 'season' and calculate percentage, rounding to 2 decimal places
bike_hourly_new.groupby('season')[['total_rides', 'casual', 'member']].sum() / bike_hourly_new[['total_rides', 'casual', 'member']].sum().round(2)

# Display the result
print(seasonal_percentage)



        total_rides  casual  member
season                             
Fall           0.30    0.27    0.31
Spring         0.25    0.27    0.24
Summer         0.32    0.35    0.29
Winter         0.14    0.11    0.16


  bike_hourly_new.groupby('season')[['total_rides', 'casual', 'member']].sum() / bike_hourly_new[['total_rides', 'casual', 'member']].sum().round(2)
