In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import xgboost as xgb
import lightgbm as lgb
import matplotlib.pyplot as plt


In [None]:
# Load the sample and keep only rows whose target is a finite, non-missing number.
df = pd.read_csv('sample_3000.csv')
df = df.loc[np.isfinite(df['real_flow'])].dropna(subset=['real_flow'])

# Target vector, and feature matrix with the non-feature columns removed.
# errors='ignore' lets the drop succeed even when some of these columns are absent.
y = df['real_flow']
X = df.drop(
    columns=['real_flow', 'stationID', 'name', 'date', 'longitude', 'latitude'],
    errors='ignore',
)

# Keep only rows with no missing feature values and align the target to them.
valid_idx = X.dropna().index
X, y = X.loc[valid_idx], y.loc[valid_idx]

# One-hot encode the categorical columns; drop_first removes one level per
# column so the dummy columns are not perfectly collinear.
X = pd.get_dummies(X, columns=['weekday', 'hours'], drop_first=True)

# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Train four regressors on the same training split, then predict on the test split.
# `fit` returns the fitted estimator itself, so construction and training are chained.

# LinearRegression
lr = LinearRegression().fit(X_train, y_train)

# RandomForestRegressor (100 trees, fixed seed)
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# XGBoost (100 boosting rounds, learning rate 0.1, fixed seed)
xgb_model = xgb.XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
).fit(X_train, y_train)

# LightGBM (100 boosting rounds, learning rate 0.1, fixed seed)
lgb_model = lgb.LGBMRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
).fit(X_train, y_train)

# Test-set predictions, one array per model.
y_pred_lr = lr.predict(X_test)
y_pred_rf = rf.predict(X_test)
y_pred_xgb = xgb_model.predict(X_test)
y_pred_lgb = lgb_model.predict(X_test)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000549 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3457
[LightGBM] [Info] Number of data points in the train set: 2400, number of used features: 24
[LightGBM] [Info] Start training from score 721.003333


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def print_metrics(y_test, y_pred, name):
    """Print MSE, RMSE, MAE and R^2 for one set of predictions on a single line.

    Args:
        y_test: True target values.
        y_pred: Predicted target values, aligned with ``y_test``.
        name: Label printed at the start of the line to identify the model.

    Returns:
        None. The metrics are written to stdout, each formatted to four decimals.
    """
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    # RMSE is derived from MSE rather than computed by a separate metric call.
    rmse = mse ** 0.5

    print(f"{name}: MSE={mse:.4f}, RMSE={rmse:.4f}, MAE={mae:.4f}, R^2={r2:.4f}")

# Report the same metrics for every model, in the order they were trained.
for model_label, model_pred in (
    ("Linear Regression", y_pred_lr),
    ("Random Forest", y_pred_rf),
    ("XGBoost", y_pred_xgb),
    ("LightGBM", y_pred_lgb),
):
    print_metrics(y_test, model_pred, model_label)


Linear Regression: MSE=1028978.8944, RMSE=1014.3860, MAE=581.6317, R^2=0.2626
Random Forest: MSE=27955.4490, RMSE=167.1988, MAE=75.1659, R^2=0.9800
XGBoost: MSE=91580.0312, RMSE=302.6219, MAE=163.4141, R^2=0.9344
LightGBM: MSE=73934.0930, RMSE=271.9082, MAE=150.5722, R^2=0.9470
