- Import essential libraries


In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Global plot styling: dark grid background with seaborn's colour-blind-friendly palette.
sns.set_theme(palette="colorblind", style="darkgrid")

**Dataset characteristics**

<p>
Both hour.csv and day.csv have the following fields, except hr which is not available in day.csv

    - instant: record index
    - dteday : date
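    - season : season (1: spring, 2: summer, 3: fall, 4: winter). Note: the rows shown below do not match this wording — January dates carry season 1 — so the notebook maps the codes as 1: winter, 2: spring, 3: summer, 4: fall.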
    - yr : year (0: 2011, 1:2012)
    - mnth : month ( 1 to 12)
    - hr : hour (0 to 23)
    - holiday : whether the day is a holiday or not (extracted from http://dchr.dc.gov/page/holiday-schedule)
    - weekday : day of the week
    - workingday : 1 if the day is neither a weekend nor a holiday, otherwise 0.
    - weathersit :
    	- 1: Clear, Few clouds, Partly cloudy
    	- 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
    	- 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
    	- 4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog
    - temp : Normalized temperature in Celsius. The values are divided by 41 (max)
    - atemp: Normalized feeling temperature in Celsius. The values are divided by 50 (max)
    - hum: Normalized humidity. The values are divided by 100 (max)
    - windspeed: Normalized wind speed. The values are divided by 67 (max)
    - casual: count of casual users
    - registered: count of registered users
    - cnt: count of total rental bikes including both casual and registered

</p>


- Import the dataset


In [12]:
# Location of the hourly CSV, given relative to the current working directory.
path = './dataset/hour.csv'
# Read the file into a DataFrame using pandas' default parsing options.
hourly_data = pd.read_csv(filepath_or_buffer=path)

# Preview the first five records.
hourly_data.head(n=5)

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


- Understanding the data


In [13]:
# Column dtypes and non-null counts; memory_usage='deep' asks pandas to measure
# the real size of object (string) columns instead of estimating it.
hourly_data.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     17379 non-null  int64  
 1   dteday      17379 non-null  object 
 2   season      17379 non-null  int64  
 3   yr          17379 non-null  int64  
 4   mnth        17379 non-null  int64  
 5   hr          17379 non-null  int64  
 6   holiday     17379 non-null  int64  
 7   weekday     17379 non-null  int64  
 8   workingday  17379 non-null  int64  
 9   weathersit  17379 non-null  int64  
 10  temp        17379 non-null  float64
 11  atemp       17379 non-null  float64
 12  hum         17379 non-null  float64
 13  windspeed   17379 non-null  float64
 14  casual      17379 non-null  int64  
 15  registered  17379 non-null  int64  
 16  cnt         17379 non-null  int64  
dtypes: float64(4), int64(12), object(1)
memory usage: 3.2 MB


In [14]:
# Report the frame's (rows, columns) dimensions.
print(f'shape of data: {hourly_data.shape}')
# isna() flags each missing cell; the first sum() counts per column, the second totals the frame.
print(f'number of missing value: {hourly_data.isna().sum().sum()}')

shape of data: (17379, 17)
number of missing value: 0


In [15]:
# Summary statistics, transposed so that each column of the data becomes one row of the table.
hourly_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
instant,17379.0,8690.0,5017.0295,1.0,4345.5,8690.0,13034.5,17379.0
season,17379.0,2.50164,1.106918,1.0,2.0,3.0,3.0,4.0
yr,17379.0,0.502561,0.500008,0.0,0.0,1.0,1.0,1.0
mnth,17379.0,6.537775,3.438776,1.0,4.0,7.0,10.0,12.0
hr,17379.0,11.546752,6.914405,0.0,6.0,12.0,18.0,23.0
holiday,17379.0,0.02877,0.167165,0.0,0.0,0.0,0.0,1.0
weekday,17379.0,3.003683,2.005771,0.0,1.0,3.0,5.0,6.0
workingday,17379.0,0.682721,0.465431,0.0,0.0,1.0,1.0,1.0
weathersit,17379.0,1.425283,0.639357,1.0,1.0,1.0,2.0,4.0
temp,17379.0,0.496987,0.192556,0.02,0.34,0.5,0.66,1.0


- Copying the original data


In [17]:
# Work on an independent deep copy so the raw `hourly_data` frame is never modified.
preprocessed_data = hourly_data.copy(deep=True)
# Show five randomly drawn rows (no seed is given, so the selection varies between runs).
preprocessed_data.sample(n=5)

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
7370,7371,2011-11-08,4,0,11,17,0,2,1,1,0.52,0.5,0.43,0.1642,40,459,499
2044,2045,2011-03-31,2,0,3,1,0,4,1,3,0.24,0.2273,0.93,0.194,1,4,5
12910,12911,2012-06-27,3,1,6,3,0,3,1,1,0.58,0.5455,0.46,0.0896,1,7,8
3897,3898,2011-06-16,2,0,6,7,0,4,1,3,0.56,0.5303,0.78,0.1642,16,172,188
11841,11842,2012-05-13,2,1,5,14,0,0,0,1,0.7,0.6515,0.48,0.1343,228,324,552


- Transform Seasons


In [18]:
# Numeric season code -> readable label. January rows in the data carry code 1,
# so code 1 is labelled winter.
season_mapping = {
    1: 'winter',
    2: 'spring',
    3: 'summer',
    4: 'fall',
}
# Map from the untouched `hourly_data` column rather than from
# `preprocessed_data['season']` itself: the cell then gives the same result
# however many times it is run, instead of raising KeyError on a second run
# once the codes have already been replaced by labels.
preprocessed_data['season'] = hourly_data['season'].map(season_mapping)
# Dict-based `.map` turns unknown codes into NaN; fail loudly instead.
assert preprocessed_data['season'].notna().all(), 'unmapped season code'
preprocessed_data.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,winter,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,winter,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,winter,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,winter,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,winter,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


- Transform Year


In [20]:
# Year flag -> calendar year, stored as a string label.
yr_mapping = {
    0: '2011',
    1: '2012',
}
# Map from the untouched `hourly_data` column so the cell can be re-run safely;
# mapping `preprocessed_data['yr']` onto itself raises KeyError on a second run.
preprocessed_data['yr'] = hourly_data['yr'].map(yr_mapping)
# Dict-based `.map` turns unknown codes into NaN; fail loudly instead.
assert preprocessed_data['yr'].notna().all(), 'unmapped yr code'

In [21]:
# Spot-check five random rows after the year transformation (unseeded, so rows vary between runs).
preprocessed_data.sample(5)

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
4481,4482,2011-07-10,summer,2011,7,15,0,0,0,1,0.84,0.7424,0.39,0.2836,142,219,361
5401,5402,2011-08-17,summer,2011,8,23,0,3,1,1,0.7,0.6515,0.65,0.2836,20,85,105
11818,11819,2012-05-12,spring,2012,5,15,0,6,0,1,0.68,0.6212,0.24,0.194,269,321,590
15739,15740,2012-10-23,fall,2012,10,0,0,2,1,1,0.46,0.4545,0.88,0.1642,5,32,37
11167,11168,2012-04-15,spring,2012,4,12,0,0,0,1,0.64,0.6212,0.44,0.2836,275,360,635


- Transform weekday


In [22]:
# Weekday code -> day name (0 is Sunday, 6 is Saturday).
weekday_mapping = {
    0: 'Sunday',
    1: 'Monday',
    2: 'Tuesday',
    3: 'Wednesday',
    4: 'Thursday',
    5: 'Friday',
    6: 'Saturday',
}
# Use `.map` like the other transformation cells, and read from the untouched
# `hourly_data` column so the cell can be re-run safely; transforming
# `preprocessed_data['weekday']` onto itself raises KeyError on a second run.
preprocessed_data['weekday'] = hourly_data['weekday'].map(weekday_mapping)
# Dict-based `.map` turns unknown codes into NaN; fail loudly instead.
assert preprocessed_data['weekday'].notna().all(), 'unmapped weekday code'

In [23]:
# Spot-check five random rows after the weekday transformation (unseeded, so rows vary between runs).
preprocessed_data.sample(5)

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
6952,6953,2011-10-22,fall,2011,10,7,0,Saturday,0,1,0.36,0.3636,0.76,0.1045,8,49,57
7463,7464,2011-11-12,fall,2011,11,14,0,Saturday,0,1,0.5,0.4848,0.36,0.2985,191,244,435
11243,11244,2012-04-18,spring,2012,4,16,0,Wednesday,1,2,0.42,0.4242,0.77,0.2239,36,199,235
15264,15265,2012-10-03,fall,2012,10,5,0,Wednesday,1,2,0.62,0.5455,0.94,0.1045,2,32,34
11462,11463,2012-04-27,spring,2012,4,19,0,Friday,1,1,0.5,0.4848,0.27,0.3881,62,379,441


- Transform weathersit


In [24]:
# Weather-situation code -> short label summarising the category descriptions
# listed in the dataset characteristics.
weather_mapping = {
    1: 'clear',
    2: 'cloudy',
    3: 'light_rain_snow',
    4: 'heavy_rain_snow',
}
# Map from the untouched `hourly_data` column so the cell can be re-run safely;
# mapping `preprocessed_data['weathersit']` onto itself raises KeyError on a second run.
preprocessed_data['weathersit'] = hourly_data['weathersit'].map(weather_mapping)
# Dict-based `.map` turns unknown codes into NaN; fail loudly instead.
assert preprocessed_data['weathersit'].notna().all(), 'unmapped weathersit code'

- The windspeed and hum columns are both normalized. Rescale them to their original ranges (windspeed × 67, hum × 100).


In [27]:
# Undo the normalisation: windspeed was divided by 67 and hum by 100 (see the
# dataset characteristics). Both columns are rebuilt from the untouched
# `hourly_data` values, so re-running the cell cannot compound the scaling the
# way an in-place `windspeed = windspeed*67` does.
preprocessed_data['windspeed'] = hourly_data['windspeed'] * 67
preprocessed_data['hum'] = hourly_data['hum'] * 100

- Visualize all columns


In [29]:
# Ten random rows with every transformation applied; random_state=123 fixes the
# draw so the same rows are shown on each run.
preprocessed_data.sample(10, random_state=123)

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
5792,5793,2011-09-03,summer,2011,9,19,0,Saturday,0,clear,0.7,0.6667,74.0,8.9981,147,148,295
7823,7824,2011-11-27,fall,2011,11,14,0,Sunday,0,clear,0.62,0.6212,43.0,31.0009,113,200,313
15426,15427,2012-10-09,fall,2012,10,23,0,Tuesday,1,cloudy,0.48,0.4697,77.0,6.0032,8,76,84
15028,15029,2012-09-23,fall,2012,9,9,0,Sunday,0,clear,0.5,0.4848,51.0,22.0028,71,205,276
12290,12291,2012-06-01,spring,2012,6,7,0,Friday,1,cloudy,0.64,0.5758,89.0,12.998,33,369,402
3262,3263,2011-05-20,spring,2011,5,20,0,Friday,1,clear,0.58,0.5455,64.0,7.0015,40,180,220
10763,10764,2012-03-29,spring,2012,3,14,0,Thursday,1,clear,0.5,0.4848,42.0,23.9994,63,175,238
12384,12385,2012-06-05,spring,2012,6,5,0,Tuesday,1,light_rain_snow,0.48,0.4697,82.0,11.0014,1,35,36
6051,6052,2011-09-14,summer,2011,9,17,0,Wednesday,1,clear,0.76,0.697,52.0,19.0012,87,512,599
948,949,2011-02-12,winter,2011,2,3,0,Saturday,0,clear,0.12,0.197,80.0,0.0,3,7,10


...To Be Continued...
