In [7]:
# Download and extract the UCI Bike Sharing dataset into the working directory.
# Pure-Python replacement for the shell calls: `python -m unzip` relies on a
# module that is not part of the standard library, and re-running the download
# saved numbered duplicates ("bike+sharing+dataset (1).zip") while extraction
# still targeted the original archive name.
import zipfile
from pathlib import Path
from urllib.request import urlretrieve

DATASET_URL = "https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip"
DATASET_ZIP = Path("bike+sharing+dataset.zip")

# Only fetch the archive when it is missing, so the cell is safe to re-run.
if not DATASET_ZIP.exists():
    urlretrieve(DATASET_URL, DATASET_ZIP)

# Extract next to the notebook; the archive is expected to contain hour.csv,
# which the loading cell reads from the working directory.
with zipfile.ZipFile(DATASET_ZIP) as archive:
    archive.extractall()


Saved under bike+sharing+dataset (1).zip


# Description of the dataset
* season: Season of the year (1:winter, 2:spring, 3:summer, 4:fall).
* yr: Year (0: 2011, 1:2012).
* mnth: Month of the year (1 to 12).
* hr: Hour of the day (0 to 23).
* holiday: Whether the day is a holiday.
* weekday: Day of the week.
* workingday: Whether the day is a working day.
* weathersit: Weather situation (1: Clear, 2: Mist, 3: Light Snow/Rain, 4: Heavy Rain/Snow).
* temp: Normalized temperature in Celsius.
* atemp: Normalized "feels-like" (apparent) temperature in Celsius.
* hum: Normalized humidity.
* windspeed: Normalized wind speed.
* cnt: Count of total rental bikes (target variable).

In [288]:
import pandas as pd

# Read the hourly bike-sharing records from the working directory.
df = pd.read_csv(filepath_or_buffer="hour.csv")

In [289]:
# Preview the first rows to sanity-check the loaded columns and value ranges.
df.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [290]:
# Flag each record as 'day' (hours 6 through 18, both inclusive) or 'night'
# and store the flag as a categorical column.
df['day_night'] = pd.Categorical(
    ['day' if 6 <= hour <= 18 else 'night' for hour in df['hr']]
)

In [291]:
# Remove columns that must not be used as predictors and cast the discrete
# fields to categorical dtype.
# - 'instant' is a running row id in the previewed data.
# - 'casual' + 'registered' add up to 'cnt' in the previewed rows, so keeping
#   them would leak the target.
# - 'dteday' is not used as a feature; it is dropped directly instead of being
#   parsed to datetime first (the parsed value was never used).
# errors='ignore' plus a non-mutating chain keeps the cell idempotent: with
# in-place drops a second execution raised KeyError on the missing columns.
df = (
    df.drop(columns=['instant', 'casual', 'registered', 'dteday'], errors='ignore')
      .astype({
          'season': 'category',
          'holiday': 'category',
          'weekday': 'category',
          'weathersit': 'category',
          'workingday': 'category',
          'mnth': 'category',
          'yr': 'category',
          'hr': 'category',
      })
)

In [292]:
# Separate the target column ('cnt') from the predictor columns.
y = df['cnt']
X = df.drop('cnt', axis=1)

# 1. Creating new features

In [214]:
# Uncomment to see the results with the squared features only
# X['new_feature1']  = X['temp']**2
# X['new_feature2']  = X['hum']**2

In [242]:
# Uncomment to see the results with the interaction features only
# X['new_feature1']  = X['temp']*X['hum']
# X['new_feature2']  = X['hum']*X['windspeed']

In [304]:
# Engineered features for the final model: the two interaction terms together
# with the two squared terms.
X['new_feature1'] = X['temp'].mul(X['hum'])        # temp x hum interaction
X['new_feature2'] = X['hum'].mul(X['windspeed'])   # hum x windspeed interaction
X['new_feature3'] = X['temp'].pow(2)               # squared temperature
X['new_feature4'] = X['hum'].pow(2)                # squared humidity

In [277]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from category_encoders import TargetEncoder
# from sklearn.base import BaseEstimator, TransformerMixin
# from sklearn.utils import set_config
from sklearn.linear_model import LinearRegression

In [294]:
# Numerical columns to impute and scale: the raw continuous fields plus the
# four engineered ones.
# For the two-feature experiments use this list instead:
# numerical_features = ['temp', 'hum', 'windspeed','new_feature1','new_feature2']
numerical_features = [
    'temp', 'hum', 'windspeed',
    'new_feature1', 'new_feature2', 'new_feature3', 'new_feature4',
]
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # fill missing values with the column mean
    ('scaler', MinMaxScaler()),                   # min-max normalise each column
])

In [295]:
# Render the numerical preprocessing pipeline to inspect its steps.
numerical_pipeline

In [296]:
# Impute and min-max scale the numerical columns, overwriting them in X.
# NOTE(review): the pipeline is fitted on the full dataset before the
# train/test split further down, so test-set statistics leak into the
# preprocessing — consider fitting on the training rows only.
X[numerical_features] = numerical_pipeline.fit_transform(X[numerical_features])

In [297]:
# Categorical features
categorical_features = ['season', 'weathersit', 'day_night']
categorical_pipeline = Pipeline([
('imputer', SimpleImputer(strategy='most_frequent')),
('target_encoder', TargetEncoder())
])
# Transforming above
X_encoded = categorical_pipeline.fit_transform(X[categorical_features],y)

In [298]:
# Wrap the encoder output in a DataFrame. As the rendered preview shows, the
# resulting columns carry positional labels (0, 1, 2) rather than the original
# names — presumably one per entry of categorical_features, in that order.
X_encoded = pd.DataFrame(X_encoded, columns=categorical_pipeline.named_steps['target_encoder'].get_feature_names_out(categorical_features))
# Preview the encoded categorical features only; they are merged with the
# numerical features in a later cell.
X_encoded

Unnamed: 0,0,1,2
0,111.114569,204.869272,98.894138
1,111.114569,204.869272,98.894138
2,111.114569,204.869272,98.894138
3,111.114569,204.869272,98.894138
4,111.114569,204.869272,98.894138
...,...,...,...
17374,111.114569,175.165493,98.894138
17375,111.114569,175.165493,98.894138
17376,111.114569,204.869272,98.894138
17377,111.114569,204.869272,98.894138


In [299]:
# Replace the raw categorical columns with their target-encoded counterparts
# (column-wise concat, aligned on the row index).
# NOTE(review): not re-runnable — once the raw columns have been dropped from X
# a second execution fails on the drop.
X = pd.concat([X.drop(columns=categorical_features), X_encoded], axis=1)

In [300]:
# Give the positionally-labelled encoder outputs string names so that every
# column label in X is a string.
# NOTE(review): the feature column named 'y' shares its name with the target
# variable `y`; a more descriptive label would avoid confusion.
X = X.rename(columns={0: 'x', 1: 'y', 2: 'w'})

In [301]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [302]:
# from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
# Fit a linear-regression baseline on the training split.
model = LinearRegression()
model.fit(X_train, y_train)

In [270]:
# Evaluate on the held-out split — all four engineered features included.
y_pred = model.predict(X_test)

mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}', f'R-squared: {r2}', sep='\n')

Mean Squared Error: 14939.362143172706
R-squared: 0.5282119665386169


In [187]:
# Evaluate on the held-out split — run with temp**2 and hum**2 only.
# NOTE(review): the output recorded under this cell comes from an earlier run
# with that feature set; as written the cell scores whatever `model` is
# currently fitted, so a plain top-to-bottom run will not reproduce it.
y_pred = model.predict(X_test)

mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}', f'R-squared: {r2}', sep='\n')

Mean Squared Error: 15077.88199105194
R-squared: 0.5238374821396173


In [250]:
# Predictions with features hum*temp and hum*windspeed
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Mean Squared Error: 15026.46101940761
R-squared: 0.525461366670848


In [50]:
# Evaluate on the held-out split — run without any engineered features.
# NOTE(review): the output recorded under this cell comes from an earlier run
# without feature engineering; as written the cell scores whatever `model` is
# currently fitted, so a plain top-to-bottom run will not reproduce it.
y_pred = model.predict(X_test)

mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}', f'R-squared: {r2}', sep='\n')

Mean Squared Error: 15026.422105375314
R-squared: 0.5254625955837424


In [29]:
# End-to-end pipeline: column-wise preprocessing followed by the regressor.
# The two preprocessing pipelines must run side by side on their own column
# subsets. Chaining them as consecutive Pipeline steps would feed every column
# (including string categories) through the mean imputer/scaler and then
# target-encode the scaled output. ColumnTransformer routes the numerical and
# categorical columns to the right sub-pipeline; the remaining columns are
# passed through unchanged, mirroring the manual preprocessing cells.
# Fit this on the feature frame *before* the manual scaling/encoding cells are
# applied (engineered columns included) — the X built above has already had
# its categorical columns replaced.
final_pipeline = Pipeline([
    ('preprocess', ColumnTransformer(
        transformers=[
            ('num_preprocess', numerical_pipeline, numerical_features),
            ('cat_preprocess', categorical_pipeline, categorical_features),
        ],
        remainder='passthrough',
    )),
    ('model', model),
])

In [30]:
# Render the assembled pipeline to inspect its structure.
final_pipeline