# Hands-on activity: Cleaning the Bike Sharing Dataset

---

https://archive.ics.uci.edu/dataset/275/bike+sharing+dataset 

This dataset contains the hourly and daily counts of rental bikes between the years 2011 and 2012 in the Capital Bikeshare system, together with the corresponding weather and seasonal information.



In [1]:
import pandas as pd

# Load the dataset
# The file name in the URL indicates the day-level version of the data
# (UCI_BikeSharing_day.csv); reading it requires network access.
url = "https://raw.githubusercontent.com/akumarss/UCI-bike-sharing/master/UCI_BikeSharing_day.csv"
bike_data = pd.read_csv(url)

# Display the first few rows of the dataset
bike_data.head()


Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


### Check for duplicates
Check if there are any duplicated rows in this dataset. If duplicates exist, drop them while retaining the first occurrence.

In [2]:
# Remove duplicate rows if any, keeping the first occurrence of each
# NOTE(review): this drops duplicates without reporting whether any were
# found; bike_data.duplicated().sum() would give the count if an explicit
# check is wanted.
bike_data = bike_data.drop_duplicates(keep='first')
# Display the resulting frame
bike_data

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.200000,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.229270,0.436957,0.186900,82,1518,1600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
726,727,2012-12-27,1,1,12,0,4,1,2,0.254167,0.226642,0.652917,0.350133,247,1867,2114
727,728,2012-12-28,1,1,12,0,5,1,2,0.253333,0.255046,0.590000,0.155471,644,2451,3095
728,729,2012-12-29,1,1,12,0,6,0,2,0.253333,0.242400,0.752917,0.124383,159,1182,1341
729,730,2012-12-30,1,1,12,0,0,0,1,0.255833,0.231700,0.483333,0.350754,364,1432,1796


### Check for missing values

In [3]:
# Tally the null entries in every column of bike_data
missing_values = bike_data.isna().sum()

# Print how many columns contain at least one missing value
# (a single number, not a per-column breakdown)
print(int((missing_values > 0).sum()))

0


In [4]:
# Parse the date strings, derive calendar features from the parsed dates,
# then discard the raw date column. Each step of the chain operates on the
# frame produced by the previous step.
bike_data = (
    bike_data
    .assign(dteday=lambda frame: pd.to_datetime(frame['dteday']))
    .assign(
        year=lambda frame: frame['dteday'].dt.year,
        month=lambda frame: frame['dteday'].dt.month,
        # dayofweek counts from Monday=0, so it differs from the
        # dataset's own `weekday` column (see the recorded output)
        day_of_week=lambda frame: frame['dteday'].dt.dayofweek,
        day_of_month=lambda frame: frame['dteday'].dt.day,
        # NOTE(review): 0 in every row of the recorded output; the dates
        # appear to carry no time-of-day component
        hour=lambda frame: frame['dteday'].dt.hour,
    )
    .drop(columns=['dteday'])
)

bike_data.head()

Unnamed: 0,instant,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,year,month,day_of_week,day_of_month,hour
0,1,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985,2011,1,5,1,0
1,2,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801,2011,1,6,2,0
2,3,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349,2011,1,0,3,0
3,4,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562,2011,1,1,4,0
4,5,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600,2011,1,2,5,0


### Handling categorical columns

We need to create dummy variables for the categorical columns.

In [6]:
# Summarise the frame: index range, column names, non-null counts and dtypes.
# NOTE(review): the recorded output below lists 57 columns, including dummy
# columns such as season_2, so it appears to have been captured after the
# encoding cell (In [5]) had run, even though this cell is shown before it.
bike_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 57 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   instant          731 non-null    int64  
 1   yr               731 non-null    int64  
 2   mnth             731 non-null    int64  
 3   holiday          731 non-null    int64  
 4   weekday          731 non-null    int64  
 5   workingday       731 non-null    int64  
 6   temp             731 non-null    float64
 7   atemp            731 non-null    float64
 8   hum              731 non-null    float64
 9   windspeed        731 non-null    float64
 10  casual           731 non-null    int64  
 11  registered       731 non-null    int64  
 12  cnt              731 non-null    int64  
 13  year             731 non-null    int32  
 14  month            731 non-null    int32  
 15  hour             731 non-null    int32  
 16  season_2         731 non-null    int32  
 17  season_3        

In [5]:
# Categorical features to expand into 0/1 indicator columns
columns_to_encode = ['season', 'weathersit', 'day_of_week', 'day_of_month']

# One-hot encode the listed columns, storing the indicators as integers.
# drop_first=True omits the first level of each feature so that a feature's
# indicator columns are not perfectly collinear.
bike_data = pd.get_dummies(
    data=bike_data,
    columns=columns_to_encode,
    drop_first=True,
    dtype=int,
)

bike_data.head(20)

Unnamed: 0,instant,yr,mnth,holiday,weekday,workingday,temp,atemp,hum,windspeed,...,day_of_month_22,day_of_month_23,day_of_month_24,day_of_month_25,day_of_month_26,day_of_month_27,day_of_month_28,day_of_month_29,day_of_month_30,day_of_month_31
0,1,0,1,0,6,0,0.344167,0.363625,0.805833,0.160446,...,0,0,0,0,0,0,0,0,0,0
1,2,0,1,0,0,0,0.363478,0.353739,0.696087,0.248539,...,0,0,0,0,0,0,0,0,0,0
2,3,0,1,0,1,1,0.196364,0.189405,0.437273,0.248309,...,0,0,0,0,0,0,0,0,0,0
3,4,0,1,0,2,1,0.2,0.212122,0.590435,0.160296,...,0,0,0,0,0,0,0,0,0,0
4,5,0,1,0,3,1,0.226957,0.22927,0.436957,0.1869,...,0,0,0,0,0,0,0,0,0,0
5,6,0,1,0,4,1,0.204348,0.233209,0.518261,0.089565,...,0,0,0,0,0,0,0,0,0,0
6,7,0,1,0,5,1,0.196522,0.208839,0.498696,0.168726,...,0,0,0,0,0,0,0,0,0,0
7,8,0,1,0,6,0,0.165,0.162254,0.535833,0.266804,...,0,0,0,0,0,0,0,0,0,0
8,9,0,1,0,0,0,0.138333,0.116175,0.434167,0.36195,...,0,0,0,0,0,0,0,0,0,0
9,10,0,1,0,1,1,0.150833,0.150888,0.482917,0.223267,...,0,0,0,0,0,0,0,0,0,0


### Scaling Numerical Columns:

Standard scale the numerical columns in the dataset.

According to the documentation of the dataset, temperature (`temp`, `atemp`), humidity (`hum`) and wind speed (`windspeed`) are already normalized values.
Therefore, we do not need to apply any further scaling to them.