In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# ---------------------------------------------------------
# Custom RMSLE Function
# ---------------------------------------------------------
def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error between two arrays.

    Both inputs are clamped at zero first so that ``log1p`` is never
    applied to a negative value.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values.

    Returns:
        The RMSLE as a NumPy float scalar.
    """
    # Clamp negatives to 0 before taking logs.
    clipped_pred = np.maximum(0, y_pred)
    clipped_true = np.maximum(0, y_true)

    log_gap = np.log1p(clipped_pred) - np.log1p(clipped_true)
    mean_squared_gap = np.mean(log_gap ** 2)
    return np.sqrt(mean_squared_gap)


# ---------------------------------------------------------
# Model functions
# ---------------------------------------------------------

def train_linear_regression(X_train, y_train):
    """Fit a ``LinearRegression`` on the training data.

    Prints a progress line before fitting.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.

    Returns:
        The fitted ``LinearRegression`` estimator.
    """
    print('4. train model : linear_regression')
    # scikit-learn's fit() returns the estimator itself.
    return LinearRegression().fit(X_train, y_train)


def train_ridge(X_train, y_train, alpha=1.0):
    """Fit a ``Ridge`` regressor on the training data.

    Prints a progress line before fitting.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.
        alpha: Regularisation strength forwarded to ``Ridge``.

    Returns:
        The fitted ``Ridge`` estimator.
    """
    print('5. train model : ridge')
    ridge = Ridge(alpha=alpha)
    # scikit-learn's fit() returns the estimator itself.
    return ridge.fit(X_train, y_train)


def train_lasso(X_train, y_train, alpha=0.001):
    """Fit a ``Lasso`` regressor on the training data.

    Prints a progress line before fitting. The solver is allowed up to
    20000 iterations.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.
        alpha: Regularisation strength forwarded to ``Lasso``.

    Returns:
        The fitted ``Lasso`` estimator.
    """
    print('6. train model : lasso')
    lasso = Lasso(max_iter=20000, alpha=alpha)
    # scikit-learn's fit() returns the estimator itself.
    return lasso.fit(X_train, y_train)


def train_random_forest(X_train, y_train, n_estimators=300, max_depth=15, random_state=42):
    """Fit a ``RandomForestRegressor`` on the training data.

    Prints a progress line before fitting. The forest is built with
    ``n_jobs=-1``.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.
        n_estimators: Number of trees in the forest.
        max_depth: Maximum depth of each tree.
        random_state: Seed forwarded to the estimator.

    Returns:
        The fitted ``RandomForestRegressor``.
    """
    print('7. train model : random_forest')
    forest_settings = {
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "random_state": random_state,
        "n_jobs": -1,
    }
    # scikit-learn's fit() returns the estimator itself.
    return RandomForestRegressor(**forest_settings).fit(X_train, y_train)


def train_gradient_boosting(X_train, y_train, learning_rate=0.05, n_estimators=500, max_depth=4, random_state=42):
    """Fit a ``GradientBoostingRegressor`` on the training data.

    Prints a progress line before fitting.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.
        learning_rate: Shrinkage applied to each boosting stage.
        n_estimators: Number of boosting stages.
        max_depth: Maximum depth of the individual trees.
        random_state: Seed forwarded to the estimator.

    Returns:
        The fitted ``GradientBoostingRegressor``.
    """
    print('8. train model : gradient_boosting')
    booster_settings = {
        "learning_rate": learning_rate,
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "random_state": random_state,
    }
    # scikit-learn's fit() returns the estimator itself.
    return GradientBoostingRegressor(**booster_settings).fit(X_train, y_train)


# ---------------------------------------------------------
# Evaluate model (correct handling of log-transformed target)
# ---------------------------------------------------------
def evaluate_model(model, X_test, y_test_log):
    """Score a model trained on a ``log1p``-transformed target.

    Both the held-out target and the model's predictions are mapped back
    with ``expm1`` before any metric is computed, so all four metrics are
    calculated on the back-transformed values.

    Args:
        model: Fitted estimator exposing ``predict``; its output is treated
            as ``log1p`` of the target.
        X_test: Feature matrix to predict on.
        y_test_log: ``log1p``-transformed true target values.

    Returns:
        Dict with the keys ``"RMSLE"``, ``"RMSE"``, ``"MAE"`` and ``"R2"``.
    """
    # Undo the log1p transform on the ground truth.
    actual = np.expm1(y_test_log)

    # Predict in log space, invert the transform, then clamp negatives to 0.
    # The clamp affects every metric below, not only RMSLE.
    predicted = np.maximum(0, np.expm1(model.predict(X_test)))

    scores = {}
    scores["RMSLE"] = rmsle(actual, predicted)
    scores["RMSE"] = np.sqrt(mean_squared_error(actual, predicted))
    scores["MAE"] = mean_absolute_error(actual, predicted)
    scores["R2"] = r2_score(actual, predicted)
    return scores


# ---------------------------------------------------------
# Feature Engineering
# ---------------------------------------------------------
def add_derived_features(df):
    """Return a new frame with calendar features derived from ``datetime``.

    The input frame is NOT modified: all work happens on a copy, so the
    caller keeps its original ``datetime`` column and gains no extra
    columns as a side effect.

    Added columns:
        hour, day, month, year, weekday: components of the parsed timestamp
            (``weekday`` follows pandas: Monday=0 ... Sunday=6).
        is_weekend: 1 when ``weekday`` is 5 or 6, else 0.
        is_commute_hour: 1 when ``hour`` is one of 7, 8, 9, 17, 18, 19.
        is_night: 1 when ``hour`` is between 0 and 5 inclusive.

    Args:
        df: DataFrame containing a ``datetime`` column parseable by
            ``pd.to_datetime``.

    Returns:
        A new DataFrame with the derived columns appended and the
        ``datetime`` column removed.

    Raises:
        KeyError: If ``df`` has no ``datetime`` column.
    """
    # Copy first so column assignments below never leak into the caller's frame.
    out = df.copy()
    stamps = pd.to_datetime(out['datetime'])
    out['datetime'] = stamps

    out['hour'] = stamps.dt.hour
    out['day'] = stamps.dt.day
    out['month'] = stamps.dt.month
    out['year'] = stamps.dt.year
    out['weekday'] = stamps.dt.weekday

    out['is_weekend'] = out['weekday'].isin([5, 6]).astype(int)
    out['is_commute_hour'] = out['hour'].isin([7, 8, 9, 17, 18, 19]).astype(int)
    out['is_night'] = out['hour'].isin([0, 1, 2, 3, 4, 5]).astype(int)

    # The raw timestamp is fully represented by the derived columns.
    return out.drop(columns=['datetime'])


# ---------------------------------------------------------
# Preprocessing
# ---------------------------------------------------------
def preprocess_data(df):
    """Turn the raw table into a dense feature matrix and a log-scaled target.

    Steps, in order: take ``log1p`` of the ``count`` column as the target,
    add the derived calendar features, drop ``count``/``casual``/
    ``registered`` and ``atemp``, then fit a ``ColumnTransformer`` that
    standard-scales the numeric columns, one-hot encodes the categorical
    columns and passes every remaining column through unchanged.
    Progress is reported with ``print``.

    Args:
        df: DataFrame holding at least the ``datetime``, ``count``,
            ``casual``, ``registered`` and ``atemp`` columns plus the
            feature columns listed below.

    Returns:
        Tuple ``(X_transformed, y_log, preprocessor)``: the transformed
        feature matrix (densified if the transformer returned a sparse
        matrix), the ``log1p`` target, and the fitted ``ColumnTransformer``.
    """
    print(' preprocess_data ...')

    # The target is log(1 + count).
    y_log = np.log1p(df['count'])

    print(f'  Before : {list(df.columns)}')

    df = add_derived_features(df)

    # First drop the target and the two columns that depend on it,
    # then the redundant 'atemp' column.
    df = df.drop(columns=['count', 'casual', 'registered']).drop(columns=['atemp'])

    print(f'  After : {list(df.columns)}')

    X = df.copy()

    # Columns to standard-scale vs. columns to one-hot encode.
    scaled_columns = ['temp', 'humidity', 'windspeed']
    one_hot_columns = ['season', 'weather', 'hour', 'month', 'weekday']

    # Columns not named above are passed through unchanged.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), scaled_columns),
            ('cat', OneHotEncoder(handle_unknown='ignore'), one_hot_columns),
        ],
        remainder='passthrough',
    )

    print("  Fitting & transforming X ...")
    X_transformed = preprocessor.fit_transform(X)

    # convert sparse -> dense
    if hasattr(X_transformed, "toarray"):
        X_transformed = X_transformed.toarray()

    print("  X transformed shape:", X_transformed.shape)

    return X_transformed, y_log, preprocessor


# ---------------------------------------------------------
# Main
# ---------------------------------------------------------
# Load the raw training table.
print("1. Reading training data...")
df = pd.read_csv("bike_train.csv")

# Build the feature matrix and the log-scaled target.
# NOTE(review): the preprocessor is fitted on every row before the split
# below, so the scaler statistics include the held-out rows -- confirm
# this is acceptable for the reported scores.
X_processed, y_log, preprocessor = preprocess_data(df)

# Hold out 20% of the rows for evaluation (fixed seed for repeatability).
print('3. Split train-test data...')
X_train, X_test, y_train_log, y_test_log = train_test_split(
    X_processed,
    y_log,
    test_size=0.2,
    random_state=42,
)

# Fit every candidate model on the log-scaled target.
lin_model = train_linear_regression(X_train, y_train_log)
ridge_model = train_ridge(X_train, y_train_log)
lasso_model = train_lasso(X_train, y_train_log)
rf_model = train_random_forest(X_train, y_train_log, n_estimators=500, max_depth=25)
gb_model = train_gradient_boosting(
    X_train, y_train_log, learning_rate=0.05, n_estimators=600, max_depth=4
)

# Score each model on the held-out rows; one table row per model.
print("\n9. Evaluate models ...")
results = {
    label: evaluate_model(estimator, X_test, y_test_log)
    for label, estimator in (
        ("Linear Regression", lin_model),
        ("Ridge Regression", ridge_model),
        ("Lasso Regression", lasso_model),
        ("Random Forest", rf_model),
        ("Gradient Boosting", gb_model),
    )
}

print(pd.DataFrame(results).T)


1. Reading training data...
 preprocess_data ...
  Before : ['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count']
  After : ['season', 'holiday', 'workingday', 'weather', 'temp', 'humidity', 'windspeed', 'hour', 'day', 'month', 'year', 'weekday', 'is_weekend', 'is_commute_hour', 'is_night']
  Fitting & transforming X ...
  X transformed shape: (10450, 61)
3. Split train-test data...
4. train model : linear_regression
5. train model : ridge
6. train model : lasso
7. train model : random_forest
8. train model : gradient_boosting

9. Evaluate models ...
                      RMSLE       RMSE        MAE        R2
Linear Regression  0.610896  97.363086  61.856613  0.711028
Ridge Regression   0.610842  97.386207  61.851350  0.710891
Lasso Regression   0.611516  98.094007  62.027979  0.706673
Random Forest      0.387113  41.607428  25.587240  0.947227
Gradient Boosting  0.384784  50.884093  31.125403  0.921072
