In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Load the daily and hourly datasets; both paths are relative to the
# notebook's working directory.
data, hour = (
    pd.read_csv("day.csv"),
    pd.read_csv("hour.csv"),
)


In [2]:
# Fit an ordinary least-squares baseline that predicts the 'cnt' column of the
# daily dataset from the calendar and weather columns, then report hold-out
# metrics and the fitted coefficients.

# --- Select input and output variables ---
X = data[['season','yr','mnth','holiday','weekday','workingday',
          'weathersit','temp','atemp','hum','windspeed']]
y = data['cnt']

# --- One-hot encoding for categorical columns ---
# These columns hold category codes rather than quantities, so each level gets
# its own indicator column. drop_first=True drops one level per column, which
# avoids a full set of dummies summing to the intercept.
categorical_cols = ['season','mnth','weekday','weathersit']
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

print("Shape before encoding:", X.shape)
print("Shape after encoding:", X_encoded.shape)

# --- Split data into training and test sets ---
# 25% of the rows are held out; the fixed random_state keeps the split (and
# therefore the reported metrics) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.25, random_state=42)

# --- Train the linear regression model ---
model = LinearRegression()
model.fit(X_train, y_train)

# --- Predict on the test set ---
y_pred = model.predict(X_test)

# --- Evaluate the model ---
r2 = r2_score(y_test, y_pred)
# RMSE is taken as the square root of the MSE instead of passing
# `squared=False`: that keyword was deprecated in scikit-learn 1.4 and removed
# in 1.6, where it raises a TypeError. The square root gives the same value on
# every scikit-learn version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
mae = mean_absolute_error(y_test, y_pred)

print("Model performance:")
print(f"R² = {r2:.3f}")
print(f"RMSE = {rmse:.2f}")
print(f"MAE = {mae:.2f}")

# --- Optional: check coefficients ---
# One row per encoded feature, largest positive coefficient first.
# NOTE(review): 'workingday' looks derivable from 'holiday' plus the weekday
# dummies, and 'temp'/'atemp' are both included -- if these are (near-)collinear
# the individual coefficients are not reliable as effect sizes. TODO confirm
# before interpreting this table.
coeffs = pd.DataFrame({'Feature': X_encoded.columns, 'Coefficient': model.coef_})
display(coeffs.sort_values(by='Coefficient', ascending=False))


Shape before encoding: (731, 11)
Shape after encoding: (731, 29)
Model performance:
R² = 0.844
RMSE = 774.99
MAE = 566.09




Unnamed: 0,Feature,Coefficient
4,atemp,4696.036515
0,yr,1986.047624
9,season_4,1679.874799
7,season_2,967.631758
8,season_3,906.392635
17,mnth_9,821.513526
3,temp,658.344775
11,mnth_3,561.078382
13,mnth_5,529.070688
26,weekday_6,493.225433
