In [17]:
# from google.colab import drive
# drive.mount('/content/drive')


In [18]:
## Pandas
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Numpy
import numpy as np

## Preprocessing
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

## Models
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier

## Regression Metrics
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

## Set global scikit-learn configuration
from sklearn import set_config

## Warnings
import warnings



In [19]:
#Load Bike Data
fpath = "/content/drive/MyDrive/CodingDojo/03-IntermediateMachineLearning/Week09/Data/bikeshare_train - bikeshare_train.csv"
df = pd.read_csv(fpath)
df.info()
df.head()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    10886 non-null  object 
 1   season      10886 non-null  int64  
 2   holiday     10886 non-null  int64  
 3   workingday  10886 non-null  int64  
 4   weather     10886 non-null  int64  
 5   temp        10886 non-null  float64
 6   atemp       10886 non-null  float64
 7   humidity    10886 non-null  int64  
 8   windspeed   10886 non-null  float64
 9   casual      10886 non-null  int64  
 10  registered  10886 non-null  int64  
 11  count       10886 non-null  int64  
dtypes: float64(3), int64(8), object(1)
memory usage: 1020.7+ KB


Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 0:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 1:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 2:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 3:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 4:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


Your task is to engineer some new features to try to improve a model's ability to predict the total number of bike share rentals during a given hour of the day.


Import the data the drop the 'casual' and 'registered' columns. These are redundant with your target, 'count'.

Transform the 'datetime' column into a datetime type and use it to create 3 new columns in the data frame containing the:

Name of the Month

Name of the Day of the Week

Hour of the Day

Make sure all 3 new columns are 'object' datatype so they can be one-hot encoded later.

Drop the 'datetime' and 'season' columns. These are now redundant.

The temperatures in the 'temp' and 'atemp' columns are in Celsius. Use `.apply()` and a Lambda function to convert them to Fahrenheit.
Create a new column, 'temp_variance,' which shows how much warmer or colder the current temperature ('temp') is than the average temperate for that day of the year ('atemp'). If the current temperature is warmer than average ('atemp'), the value in 'temp_variance' should be positive.
Drop the 'atemp' column.
Optional:

Use a predictive model of your choice and try to predict the 'count' of hourly bike-share users with both the original features and the engineered feature set you created.

Remember to drop the 'casual' and 'registered' columns from both versions before modeling.

Did these feature engineering choices improve your ability to predict the 'count'?



In [20]:
#Drop the 'casual' and 'registered' columns
df.drop(columns=['casual','registered'], inplace=True)

In [21]:
#Transform the 'datatime' column to a datetime type and use it to create 3 new colums in the dataframe
df['datetime'] = pd.to_datetime(df['datetime'])
df.info()
df['month (name)'] = df['datetime'].dt.month_name()
df['day of week (name)'] = df['datetime'].dt.day_name()
df.head()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   datetime    10886 non-null  datetime64[ns]
 1   season      10886 non-null  int64         
 2   holiday     10886 non-null  int64         
 3   workingday  10886 non-null  int64         
 4   weather     10886 non-null  int64         
 5   temp        10886 non-null  float64       
 6   atemp       10886 non-null  float64       
 7   humidity    10886 non-null  int64         
 8   windspeed   10886 non-null  float64       
 9   count       10886 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int64(6)
memory usage: 850.6 KB


Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,month (name),day of week (name)
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,16,January,Saturday
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,40,January,Saturday
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,32,January,Saturday
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,13,January,Saturday
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,1,January,Saturday


In [22]:
df['datetime'].value_counts()

2011-01-01 00:00:00    1
2012-05-01 21:00:00    1
2012-05-01 13:00:00    1
2012-05-01 14:00:00    1
2012-05-01 15:00:00    1
                      ..
2011-09-02 04:00:00    1
2011-09-02 05:00:00    1
2011-09-02 06:00:00    1
2011-09-02 07:00:00    1
2012-12-19 23:00:00    1
Name: datetime, Length: 10886, dtype: int64

In [23]:
df['datetime'].dtype

dtype('<M8[ns]')

In [24]:
#Drop the 'casual' and 'registered' columns
df.drop(columns=['datetime','season'], inplace=True)

In [25]:
ctf = lambda c:9/5*c+32

In [26]:
df['temp'] = df['temp'].apply(ctf)
df.head()


Unnamed: 0,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,month (name),day of week (name)
0,0,0,1,49.712,14.395,81,0.0,16,January,Saturday
1,0,0,1,48.236,13.635,80,0.0,40,January,Saturday
2,0,0,1,48.236,13.635,80,0.0,32,January,Saturday
3,0,0,1,49.712,14.395,75,0.0,13,January,Saturday
4,0,0,1,49.712,14.395,75,0.0,1,January,Saturday


In [27]:
df['atemp'] = df['atemp'].apply(ctf)
df.head()

Unnamed: 0,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,month (name),day of week (name)
0,0,0,1,49.712,57.911,81,0.0,16,January,Saturday
1,0,0,1,48.236,56.543,80,0.0,40,January,Saturday
2,0,0,1,48.236,56.543,80,0.0,32,January,Saturday
3,0,0,1,49.712,57.911,75,0.0,13,January,Saturday
4,0,0,1,49.712,57.911,75,0.0,1,January,Saturday


Create a new column, 'temp_variance,' which shows how much warmer or colder the current temperature ('temp') is than the average temperate for that day of the year ('atemp'). If the current temperature is warmer than average ('atemp'), the value in 'temp_variance' should be positive. Drop the 'atemp' column.

In [32]:
df['temp_variance'] = ('temp' )

In [33]:
df['temp_variance']

0        temp
1        temp
2        temp
3        temp
4        temp
         ... 
10881    temp
10882    temp
10883    temp
10884    temp
10885    temp
Name: temp_variance, Length: 10886, dtype: object

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   holiday             10886 non-null  int64  
 1   workingday          10886 non-null  int64  
 2   weather             10886 non-null  int64  
 3   temp                10886 non-null  float64
 4   atemp               10886 non-null  float64
 5   humidity            10886 non-null  int64  
 6   windspeed           10886 non-null  float64
 7   count               10886 non-null  int64  
 8   month (name)        10886 non-null  object 
 9   day of week (name)  10886 non-null  object 
 10  temp_variance       10886 non-null  object 
dtypes: float64(3), int64(5), object(3)
memory usage: 935.6+ KB


In [31]:
#Drop the 'atemp'columns
df.drop(columns=['atemp'], inplace=True)