In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar


In [40]:
# Load the hourly rides-and-weather table. The path is relative, so the CSV
# must be in the notebook's working directory.
bike_hourly = pd.read_csv('hourly_weather.csv')

Data Wrangling

In [41]:
# Work on an independent copy so the wrangling steps never modify the
# `bike_hourly` frame itself.
bike_hourly_new = bike_hourly.copy()

In [52]:
# Count missing values per column.
# NOTE(review): the saved output below lists columns such as `holiday` and
# `weather_category` that are only created in later cells, and this cell's
# execution count (52) is higher than that of the cells that follow. The output
# therefore comes from an out-of-order run; on a fresh top-to-bottom run this
# cell only sees the columns read from the CSV.
bike_hourly_new.isnull().sum()

casual                     0
date                       0
hour                       0
member                     0
total_rides                0
relativehumidity_2m_(%)    0
temperature_2m_(°c)        0
weathercode_(wmo_code)     0
windspeed_10m_(km/h)       0
holiday                    0
month                      0
day_of_week                0
season                     0
weather_category           0
dtype: int64

In [42]:
# Parse the date strings into pandas timestamps
bike_hourly_new['Date'] = pd.to_datetime(bike_hourly_new['Date'])

# The hour of day is stored as a whole number
bike_hourly_new['Hour'] = bike_hourly_new['Hour'].astype(int)

# Remaining numeric columns, grouped by the dtype they should end up with
float_columns = ['Casual', 'Member', 'Total_rides', 'temperature_2m (°C)', 'windspeed_10m (km/h)']
int_columns = ['relativehumidity_2m (%)', 'weathercode (wmo code)']

# Cast the float group first, then the integer group
for column_group, target_dtype in ((float_columns, float), (int_columns, int)):
    bike_hourly_new[column_group] = bike_hourly_new[column_group].astype(target_dtype)

# Show the resulting dtype of every column
print(bike_hourly_new.dtypes)


Casual                            float64
Date                       datetime64[ns]
Hour                                int64
Member                            float64
Total_rides                       float64
relativehumidity_2m (%)             int64
temperature_2m (°C)               float64
weathercode (wmo code)              int64
windspeed_10m (km/h)              float64
dtype: object


In [43]:
# Rewrite every header in lowercase with underscores in place of spaces
bike_hourly_new.columns = bike_hourly_new.columns.map(
    lambda header: header.lower().replace(' ', '_')
)
bike_hourly_new.head()

Unnamed: 0,casual,date,hour,member,total_rides,relativehumidity_2m_(%),temperature_2m_(°c),weathercode_(wmo_code),windspeed_10m_(km/h)
0,20.0,2021-01-01,0,26.0,46.0,90,2.4,1,10.2
1,23.0,2021-01-01,1,23.0,46.0,88,1.8,1,10.0
2,20.0,2021-01-01,2,28.0,48.0,87,1.1,2,10.1
3,9.0,2021-01-01,3,7.0,16.0,81,1.6,2,10.6
4,7.0,2021-01-01,4,5.0,12.0,77,1.6,2,9.9


In [44]:
#adding holiday as a column
cal = calendar()
holidays = cal.holidays(start=bike_hourly_new['date'].min(), end=bike_hourly_new['date'].max())
bike_hourly_new['holiday'] = bike_hourly_new['date'].isin(holidays)

#add month and day of the week as columns
bike_hourly_new['month'] = bike_hourly_new['date'].dt.month
bike_hourly_new['day_of_week'] = bike_hourly_new['date'].dt.dayofweek

#adding season as a column
def get_season(month):
    """Return the season name for a month number.

    Months 12, 1 and 2 give 'Winter', 3-5 give 'Spring', 6-8 give 'Summer';
    every other value (including 9, 10 and 11) gives 'Fall'.
    """
    season_months = (
        ('Winter', [12, 1, 2]),
        ('Spring', [3, 4, 5]),
        ('Summer', [6, 7, 8]),
    )
    for season_name, months_in_season in season_months:
        if month in months_in_season:
            return season_name
    # Anything not matched above falls through to autumn
    return 'Fall'

# Derive the season label from each row's month and store it as a categorical column.
# The holiday flag is already computed at the top of this cell from the same `date`
# column, so it is not recomputed here.
bike_hourly_new['season'] = (
    bike_hourly_new['date'].dt.month.apply(get_season).astype('category')
)



In [45]:
# Weekday number (Monday = 0) and month number -> display name
day_names = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'}
month_names = {1: 'January', 2: 'February', 3: 'March', 4: 'April', 5: 'May', 6: 'June', 7: 'July', 8: 'August', 9: 'September', 10: 'October', 11: 'November', 12: 'December'}

# Build both categorical columns from `date` instead of from their own current values.
# Mapping a column that already holds names through these integer-keyed dicts finds no
# matching key and turns every value into NaN, so reading from `date` keeps this cell
# safe to run more than once.
bike_hourly_new['day_of_week'] = (
    bike_hourly_new['date'].dt.dayofweek.astype('category').map(day_names)
)
bike_hourly_new['month'] = (
    bike_hourly_new['date'].dt.month.astype('category').map(month_names)
)


# Bin edges for the WMO weather-code groups. With right=False each bin is the half-open
# range [lower, upper), so a code equal to an edge belongs to the bin that starts there
# (50 -> 50-59). The edges follow the WMO decades so that each label matches its codes:
# 5x is drizzle, 6x is rain, 7x is snow. The previous edges put 51-55 under "Rain",
# 61-65 under "Snow" and 71-75 under "Showers".
bins = [0, 4, 40, 50, 60, 70, 80, 90, 96, 100]
labels = [
    "Clear/Cloudy",              # 0-3
    "Atmospheric Obstructions",  # 4-39 (haze, dust, mist, blowing dust or snow, ...)
    "Fog Conditions",            # 40-49
    "Drizzle",                   # 50-59
    "Rain",                      # 60-69
    "Snow",                      # 70-79
    "Showers",                   # 80-89
    "Thunderstorms",             # 90-95
    "Extreme Weather"            # 96-99 (thunderstorm with hail, or heavy thunderstorm)
]

# Apply binning to the 'weathercode_(wmo_code)' column and add as a new category.
# Codes outside 0-99 fall in no bin and become NaN.
bike_hourly_new['weather_category'] = pd.cut(
    bike_hourly_new['weathercode_(wmo_code)'],
    bins=bins,
    labels=labels,
    right=False
)

In [46]:
# Summarise the frame after wrangling: each column's dtype and non-null count
bike_hourly_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26280 entries, 0 to 26279
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   casual                   26269 non-null  float64       
 1   date                     26280 non-null  datetime64[ns]
 2   hour                     26280 non-null  int64         
 3   member                   26269 non-null  float64       
 4   total_rides              26269 non-null  float64       
 5   relativehumidity_2m_(%)  26280 non-null  int64         
 6   temperature_2m_(°c)      26280 non-null  float64       
 7   weathercode_(wmo_code)   26280 non-null  int64         
 8   windspeed_10m_(km/h)     26280 non-null  float64       
 9   holiday                  26280 non-null  bool          
 10  month                    26280 non-null  category      
 11  day_of_week              26280 non-null  category      
 12  season                   26280 n

In [47]:
# Preview the first five rows, including the columns added during wrangling
bike_hourly_new.head()

Unnamed: 0,casual,date,hour,member,total_rides,relativehumidity_2m_(%),temperature_2m_(°c),weathercode_(wmo_code),windspeed_10m_(km/h),holiday,month,day_of_week,season,weather_category
0,20.0,2021-01-01,0,26.0,46.0,90,2.4,1,10.2,True,January,Friday,Winter,Clear/Cloudy
1,23.0,2021-01-01,1,23.0,46.0,88,1.8,1,10.0,True,January,Friday,Winter,Clear/Cloudy
2,20.0,2021-01-01,2,28.0,48.0,87,1.1,2,10.1,True,January,Friday,Winter,Clear/Cloudy
3,9.0,2021-01-01,3,7.0,16.0,81,1.6,2,10.6,True,January,Friday,Winter,Clear/Cloudy
4,7.0,2021-01-01,4,5.0,12.0,77,1.6,2,9.9,True,January,Friday,Winter,Clear/Cloudy


In [48]:
# Number of rows for each distinct WMO weather code, most frequent first
bike_hourly_new['weathercode_(wmo_code)'].value_counts()


weathercode_(wmo_code)
0     11431
1      6003
3      3290
2      2507
51     1394
53      602
61      400
63      222
55      194
71      102
73       90
75       28
65       17
Name: count, dtype: int64

In [51]:
# Identify the null values: number of missing entries per column.
# The intermediate results are printed explicitly because a notebook cell only
# auto-displays its final expression, and this cell ends with an assignment.
print(bike_hourly_new.isnull().sum())

# Rows that contain at least one null value
rows_with_nulls = bike_hourly_new[bike_hourly_new.isnull().any(axis=1)]
print(rows_with_nulls)

# Remove every row that contains a null value
bike_hourly_new = bike_hourly_new.dropna()