In [1]:
!pip install xgboost
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold, cross_val_predict
from sklearn.metrics import mean_squared_error, mean_absolute_error

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [4]:
df = pd.read_csv("hour.csv")
df.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [5]:
df = df.drop(['instant','dteday','casual','registered'], axis=1)

In [6]:
X = df.drop('cnt', axis=1)
y = df['cnt']

In [7]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [8]:
def evaluate_model(model, X, y):

    preds = cross_val_predict(model, X, y, cv=kf)

    rmse = np.sqrt(mean_squared_error(y, preds))
    mae = mean_absolute_error(y, preds)

    return rmse, mae, preds

In [9]:
rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)

rf_rmse, rf_mae, rf_preds = evaluate_model(rf, X, y)

In [10]:
subag = BaggingRegressor(
    estimator=DecisionTreeRegressor(),
    n_estimators=200,
    max_samples=0.6,
    random_state=42,
    n_jobs=-1
)

sub_rmse, sub_mae, sub_preds = evaluate_model(subag, X, y)

In [11]:
gb = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=3,
    random_state=42
)

gb_rmse, gb_mae, gb_preds = evaluate_model(gb, X, y)

In [12]:
results = pd.DataFrame({
    "Model": ["Random Forest (Bagging)", "Subagging", "Gradient Boosting"],
    "RMSE": [rf_rmse, sub_rmse, gb_rmse],
    "MAE": [rf_mae, sub_mae, gb_mae]
})

results

Unnamed: 0,Model,RMSE,MAE
0,Random Forest (Bagging),51.116767,31.930885
1,Subagging,42.423419,25.594912
2,Gradient Boosting,71.463079,48.56067


In [13]:
results.to_csv("cv_regression_results.csv", index=False)

In [14]:
final_df = pd.DataFrame({
    "ActualCnt": y,
    "PredictedCnt": gb_preds
})

final_df.to_csv("final_predictions.csv", index=False)

In [15]:
rf.fit(X, y)

importances = pd.Series(rf.feature_importances_, index=X.columns)
top_features = importances.sort_values(ascending=False).head(8)

top_features

Unnamed: 0,0
hr,0.648769
temp,0.12217
yr,0.085912
workingday,0.058928
season,0.021549
weathersit,0.0159
hum,0.015885
atemp,0.012544
