From 77bdbd8f1547c9fa092e884d9fe7ff8411d86841 Mon Sep 17 00:00:00 2001 From: raisa <> Date: Wed, 10 Apr 2024 14:37:09 +0100 Subject: [PATCH] replace pandas with Polars for forecasting co2 example --- examples/gaussian_process/plot_gpr_co2.py | 42 +++++++++++++---------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/examples/gaussian_process/plot_gpr_co2.py b/examples/gaussian_process/plot_gpr_co2.py index 33b0ab7271549..f0c1c0bf42851 100644 --- a/examples/gaussian_process/plot_gpr_co2.py +++ b/examples/gaussian_process/plot_gpr_co2.py @@ -33,24 +33,24 @@ # We will derive a dataset from the Mauna Loa Observatory that collected air # samples. We are interested in estimating the concentration of CO2 and # extrapolate it for further year. First, we load the original dataset available -# in OpenML. +# in OpenML as a pandas dataframe. This will be replaced with Polars +# once `fetch_openml` adds native support for it. from sklearn.datasets import fetch_openml co2 = fetch_openml(data_id=41187, as_frame=True) co2.frame.head() # %% -# First, we process the original dataframe to create a date index and select -# only the CO2 column. -import pandas as pd +# First, we process the original dataframe to create a date column and select +# it along with the CO2 column. +import polars as pl -co2_data = co2.frame -co2_data["date"] = pd.to_datetime(co2_data[["year", "month", "day"]]) -co2_data = co2_data[["date", "co2"]].set_index("date") +co2_data = pl.DataFrame({col: co2.frame[col].to_numpy() for col in co2.frame.columns}) +co2_data = co2_data.select(pl.date("year", "month", "day"), "co2") co2_data.head() # %% -co2_data.index.min(), co2_data.index.max() +co2_data["date"].min(), co2_data["date"].max() # %% # We see that we get CO2 concentration for some days from March, 1958 to @@ -58,7 +58,8 @@ # understanding. 
import matplotlib.pyplot as plt -co2_data.plot() +plt.plot(co2_data["date"], co2_data["co2"]) +plt.xlabel("date") plt.ylabel("CO$_2$ concentration (ppm)") _ = plt.title("Raw air samples measurements from the Mauna Loa Observatory") @@ -67,15 +68,14 @@ # for which no measurements were collected. Such a processing will have an # smoothing effect on the data. -try: - co2_data_resampled_monthly = co2_data.resample("ME") -except ValueError: - # pandas < 2.2 uses M instead of ME - co2_data_resampled_monthly = co2_data.resample("M") - - -co2_data = co2_data_resampled_monthly.mean().dropna(axis="index", how="any") -co2_data.plot() +co2_data = ( + co2_data.sort(by="date") + .group_by_dynamic("date", every="1mo") + .agg(pl.col("co2").mean()) + .drop_nulls() +) +plt.plot(co2_data["date"], co2_data["co2"]) +plt.xlabel("date") plt.ylabel("Monthly average of CO$_2$ concentration (ppm)") _ = plt.title( "Monthly average of air samples measurements\nfrom the Mauna Loa Observatory" @@ -88,7 +88,11 @@ # # As a first step, we will divide the data and the target to estimate. The data # being a date, we will convert it into a numeric. -X = (co2_data.index.year + co2_data.index.month / 12).to_numpy().reshape(-1, 1) +X = ( + co2_data.select(pl.col("date").dt.year() + pl.col("date").dt.month() / 12) + .to_numpy() + .reshape(-1, 1) +) y = co2_data["co2"].to_numpy() # %%