In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [32]:
# Load the dataset
data = pd.read_csv("day.csv")

# Display the first few rows
print(data.head())

   instant      dteday  season  yr  mnth  holiday  weekday  workingday  \
0        1  01-01-2018       1   0     1        0        6           0   
1        2  02-01-2018       1   0     1        0        0           0   
2        3  03-01-2018       1   0     1        0        1           1   
3        4  04-01-2018       1   0     1        0        2           1   
4        5  05-01-2018       1   0     1        0        3           1   

   weathersit       temp     atemp      hum  windspeed  casual  registered  \
0           2  14.110847  18.18125  80.5833  10.749882     331         654   
1           2  14.902598  17.68695  69.6087  16.652113     131         670   
2           1   8.050924   9.47025  43.7273  16.636703     120        1229   
3           1   8.200000  10.60610  59.0435  10.739832     108        1454   
4           1   9.305237  11.46350  43.6957  12.522300      82        1518   

    cnt  
0   985  
1   801  
2  1349  
3  1562  
4  1600  


In [33]:
# Check for missing values
print(data.isnull().sum())

# Drop rows with missing values (if any)
data = data.dropna()

instant       0
dteday        0
season        0
yr            0
mnth          0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
casual        0
registered    0
cnt           0
dtype: int64


In [34]:
# Define categorical and numerical features
categorical_features = ['season', 'holiday', 'weekday', 'workingday', 'weathersit']
numerical_features = ['temp', 'atemp', 'hum', 'windspeed']

In [35]:
# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

In [36]:
# Split the data into features and target
X = data.drop(['cnt', 'instant', 'dteday', 'casual', 'registered'], axis=1)
y = data['cnt']

In [37]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=46)

In [38]:
# Build the model pipeline
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', LinearRegression())])

In [39]:
# Train the model
model.fit(X_train, y_train)

In [40]:
# Make predictions
y_pred = model.predict(X_test)

In [41]:
# Evaluate the model
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('R-squared:', r2_score(y_test, y_pred))

Mean Absolute Error: 1093.627285494828
Mean Squared Error: 1605292.481120152
R-squared: 0.5698880673634432


In [42]:
# Get the coefficients from the model
coefficients = model.named_steps['regressor'].coef_

In [43]:
# Get the feature names from the preprocessor
feature_names = numerical_features + list(model.named_steps['preprocessor'].transformers_[1][1].get_feature_names_out(categorical_features))

In [44]:
# Create a DataFrame for the coefficients
coeff_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

print(coeff_df)

         Feature  Coefficient
0           temp   976.723903
1          atemp   112.437937
2            hum  -316.221038
3      windspeed  -265.977907
4       season_1  -755.631708
5       season_2   253.015712
6       season_3  -230.620661
7       season_4   733.236658
8      holiday_0   175.368100
9      holiday_1  -175.368100
10     weekday_0  -141.521496
11     weekday_1  -158.582241
12     weekday_2    -6.766583
13     weekday_3   -54.936873
14     weekday_4    18.809454
15     weekday_5   134.228683
16     weekday_6   208.769056
17  workingday_0  -108.120540
18  workingday_1   108.120540
19  weathersit_1   752.876648
20  weathersit_2   422.448018
21  weathersit_3 -1175.324665


## Key Inferences:

- Temperature and Apparent Temperature have a strong positive impact on bike rentals, indicating more bikes are rented on warmer days.

- Humidity and Windspeed negatively impact bike rentals, meaning rentals decrease as humidity and windspeed increase.

- Seasonality plays a role, with the highest rentals in season 4 and the lowest in season 1.

- Non-holidays see higher rentals compared to holidays.

- Day of the Week shows variability in bike rentals, with weekends (especially Saturday) seeing higher rentals compared to weekdays.

- Working Days have slightly higher rentals compared to non-working days.

- Weather Situations: Favorable weather conditions (weather situation 1) significantly increase rentals, while adverse conditions (weather situation 3) drastically decrease rentals.