In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error, explained_variance_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import numpy as np

In [3]:
data = pd.read_csv('/weather-automated-sensors-dataset.csv')
data = data.dropna(subset=["Air Temperature"])
data["Measurement Timestamp"] = pd.to_datetime(data["Measurement Timestamp"], errors='coerce')
data["Month"] = data["Measurement Timestamp"].dt.month
data["Hour"] = data["Measurement Timestamp"].dt.hour
data["Month_sin"] = np.sin(2 * np.pi * data["Month"] / 12)
data["Month_cos"] = np.cos(2 * np.pi * data["Month"] / 12)
data["Hour_sin"] = np.sin(2 * np.pi * data["Hour"] / 24)
data["Hour_cos"] = np.cos(2 * np.pi * data["Hour"] / 24)

  data["Measurement Timestamp"] = pd.to_datetime(data["Measurement Timestamp"], errors='coerce')


In [4]:
data = data.drop(columns=["Station Name", "Measurement Timestamp", "Measurement Timestamp Label", "Measurement ID", "Month", "Hour"])


In [5]:
imputer = SimpleImputer(strategy="mean")
data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

In [6]:
X = data_imputed.drop(columns=["Air Temperature"])
y = data_imputed["Air Temperature"]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('gbm', GradientBoostingRegressor(random_state=42))
])

param_grid = {
    'gbm__n_estimators': [100, 150],
    'gbm__learning_rate': [0.05, 0.1],
    'gbm__max_depth': [3, 4],
}


grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)


best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)


Fitting 3 folds for each of 8 candidates, totalling 24 fits


In [8]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
tolerance = 2.0
accuracy = np.mean(np.abs(y_test - y_pred) <= tolerance) * 100
explained_var = explained_variance_score(y_test, y_pred)

print("Mean Absolute Error (MAE):", mae)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared (R2) Score:", r2)
print("Mean Absolute Percentage Error (MAPE):", mape)
print("Explained Variance Score:", explained_var)
print("Accuracy within tolerance ±2.0 units:", accuracy, "%")


Mean Absolute Error (MAE): 1.2289822579687857
Root Mean Squared Error (RMSE): 2.068630449843525
R-squared (R2) Score: 0.9560595673603721
Mean Absolute Percentage Error (MAPE): 15447946913097.916
Explained Variance Score: 0.9560609326879691
Accuracy within tolerance ±2.0 units: 80.65854071440664 %
