Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add get_srml iotools function; deprecate read_srml_month_from_solardat #1779

Merged
merged 13 commits into from
Jun 29, 2023
1 change: 1 addition & 0 deletions docs/sphinx/source/reference/iotools.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ of sources and file formats relevant to solar energy modeling.
iotools.parse_epw
iotools.read_srml
iotools.read_srml_month_from_solardat
iotools.get_srml
iotools.read_surfrad
iotools.read_midc
iotools.read_midc_raw_data_from_nrel
Expand Down
4 changes: 4 additions & 0 deletions docs/sphinx/source/whatsnew/v0.10.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ Enhancements

* Added `map_variables` parameter to :py:func:`pvlib.iotools.read_srml`
and :py:func:`pvlib.iotools.read_srml_month_from_solardat` (:pull:`1773`)
* Added :func:`pvlib.iotools.get_srml` that is similar to
:func:`pvlib.iotools.read_srml_month_from_solardat` but is able to fetch multiple months
of data using the `start` and `end` parameters.
(:pull:`1779`)
* Allow passing keyword arguments to :py:func:`scipy:scipy.optimize.brentq` and
:py:func:`scipy:scipy.optimize.newton` solvers in
:py:func:`~pvlib.singlediode.bishop88_mpp`,
Expand Down
4 changes: 3 additions & 1 deletion docs/sphinx/source/whatsnew/v0.9.6.rst
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,13 @@ Deprecations
(data period 2003-2012). Instead, ECMWF recommends using the CAMS global
reanalysis (EAC4) from the Atmosphere Data Store (ADS). See also :py:func:`pvlib.iotools.get_cams`.
(:issue:`1691`, :pull:`1654`)

* The ``recolumn`` parameter in :py:func:`pvlib.iotools.read_tmy3`, which maps
TMY3 column names to nonstandard alternatives, is now deprecated.
We encourage using ``map_variables`` (which produces standard pvlib names) instead.
(:issue:`1517`, :pull:`1623`)
* :py:func:`pvlib.iotools.read_srml_month_from_solardat` is deprecated and replaced by
:py:func:`pvlib.iotools.get_srml`. (:pull:`1779`)


Enhancements
~~~~~~~~~~~~
Expand Down
1 change: 1 addition & 0 deletions pvlib/iotools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from pvlib.iotools.epw import read_epw, parse_epw # noqa: F401
from pvlib.iotools.srml import read_srml # noqa: F401
from pvlib.iotools.srml import read_srml_month_from_solardat # noqa: F401
from pvlib.iotools.srml import get_srml # noqa: F401
from pvlib.iotools.surfrad import read_surfrad # noqa: F401
from pvlib.iotools.midc import read_midc # noqa: F401
from pvlib.iotools.midc import read_midc_raw_data_from_nrel # noqa: F401
Expand Down
111 changes: 104 additions & 7 deletions pvlib/iotools/srml.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@
"""
import numpy as np
import pandas as pd
import urllib
import warnings

from pvlib._deprecation import deprecated

# VARIABLE_MAP is a dictionary mapping SRML data element numbers to their
# pvlib names. For most variables, only the first three digits are used,
Expand All @@ -26,8 +29,9 @@

def read_srml(filename, map_variables=True):
"""
Read University of Oregon SRML 1min .tsv file into pandas dataframe. The
SRML is described in [1]_.
Read University of Oregon SRML 1min .tsv file into pandas dataframe.

The SRML is described in [1]_.

Parameters
----------
Expand All @@ -51,14 +55,14 @@ def read_srml(filename, map_variables=True):
the time of the row until the time of the next row. This is consistent
with pandas' default labeling behavior.

See SRML's `Archival Files`_ page for more information.

.. _Archival Files: http://solardat.uoregon.edu/ArchivalFiles.html
See [2]_ for more information concerning the file format.

References
----------
.. [1] University of Oregon Solar Radiation Monitoring Laboratory
`http://solardat.uoregon.edu/ <http://solardat.uoregon.edu/>`_
.. [2] `Archival (short interval) data files
<http://solardat.uoregon.edu/ArchivalFiles.html>`_
"""
tsv_data = pd.read_csv(filename, delimiter='\t')
data = _format_index(tsv_data)
Expand Down Expand Up @@ -168,10 +172,12 @@ def _format_index(df):
return df


@deprecated('0.10.0', alternative='pvlib.iotools.get_srml', removal='0.11.0')
AdamRJensen marked this conversation as resolved.
Show resolved Hide resolved
def read_srml_month_from_solardat(station, year, month, filetype='PO',
map_variables=True):
"""Request a month of SRML data from solardat and read it into
a Dataframe. The SRML is described in [1]_.
"""Request a month of SRML data and read it into a Dataframe.

The SRML is described in [1]_.

Parameters
----------
Expand Down Expand Up @@ -222,3 +228,94 @@ def read_srml_month_from_solardat(station, year, month, filetype='PO',
url = "http://solardat.uoregon.edu/download/Archive/"
data = read_srml(url + file_name, map_variables=map_variables)
return data


def get_srml(station, start, end, filetype='PO', map_variables=True,
url="http://solardat.uoregon.edu/download/Archive/"):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thinking more about minor consistency details, I guess we could standardize the order of map_variables and url. Looking through other functions, here's what I see:

  • map_variables first: get_cams
  • url first: get_psm3, get_pvgis_hourly, get_pvgis_tmy

#1767 currently does map_variables first but of course can be changed easily.

I don't see one as much better than the other. I guess I'd favor map_variables first since I think it is very uncommon that people will want to mess with url, but maybe map_variables=False has some somewhat common uses. @AdamRJensen what do you think?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I actually thought about discussing this for your PR, but I guess I don't have a strong opinion except for consistency. But the logic of having URL last makes sense to me - it's just more of a hassle changing this as there are more existing functions with URL first.

I opened #1791 based on this discussion fyi.

"""Request data from UoO SRML and read it into a Dataframe.

The University of Oregon Solar Radiation Monitoring Laboratory (SRML) is
described in [1]_. A list of stations can be found in [2]_.

Data is returned for the entire months between and including start and end.

Parameters
----------
station : str
Two letter station abbreviation.
start : datetime like
First day of the requested period
end : datetime like
Last day of the requested period
filetype : string, default: 'PO'
SRML file type to gather. See notes for explanation.
map_variables : bool, default: True
When true, renames columns of the DataFrame to pvlib variable names
where applicable. See variable :const:`VARIABLE_MAP`.
url : str, default: 'http://solardat.uoregon.edu/download/Archive/'
API endpoint URL

Returns
-------
data : pd.DataFrame
Dataframe with data from SRML.
meta : dict
Metadata.

Notes
-----
File types designate the time interval of a file and if it contains
raw or processed data. For instance, `RO` designates raw, one minute
data and `PO` designates processed one minute data. The availability
of file types varies between sites. Below is a table of file types
and their time intervals. See [1]_ for site information.

============= ============ ==================
time interval raw filetype processed filetype
============= ============ ==================
1 minute RO PO
5 minute RF PF
15 minute RQ PQ
hourly RH PH
============= ============ ==================

Warning
-------
SRML data has nighttime data prefilled with 0s through the end of the
current month (i.e., values are provided for data in the future).

References
----------
.. [1] University of Oregon Solar Radiation Monitoring Laboratory
`http://solardat.uoregon.edu/ <http://solardat.uoregon.edu/>`_
.. [2] Station ID codes - Solar Radiation Monitoring Laboratory
`http://solardat.uoregon.edu/StationIDCodes.html
<http://solardat.uoregon.edu/StationIDCodes.html>`_
"""
# Use pd.to_datetime so that strings (e.g. '2021-01-01') are accepted
start = pd.to_datetime(start)
end = pd.to_datetime(end)

# The archive is requested one file per calendar month, so build one
# timestamp per month touched by [start, end]. Flooring ``start`` to the
# first day of its month and stepping with the month-start alias ('MS')
# selects the same months as a month-end range would, without relying on
# the 'M' alias that newer pandas releases deprecate in favour of 'ME'.
months = pd.date_range(
    start.floor('d').replace(day=1), end, freq='1MS')
months_str = months.strftime('%y%m')  # e.g. January 2018 -> '1801'

# File names are built as <station><filetype><yymm>.txt
filenames = [f"{station}{filetype}{m}.txt" for m in months_str]

dfs = []  # Monthly dataframes that were retrieved successfully
for f in filenames:
    # Keep the try body limited to the call that performs the request
    try:
        dfi = read_srml(url + f, map_variables=map_variables)
    except urllib.error.HTTPError:
        # A missing month is reported to the caller but does not abort
        # the retrieval of the remaining months.
        warnings.warn(f"The following file was not found: {f}")
    else:
        dfs.append(dfi)
AdamRJensen marked this conversation as resolved.
Show resolved Hide resolved

data = pd.concat(dfs, axis='rows')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about following this with a data = data.loc[start:end]? Right now I get back a full month even if I request just a single day, which isn't really ideal.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not opposed to this suggestion, although we may run into some issues related to timezone. I think this is why the start/end parameters in Solar Forecast Arbiter are required to be timezone localized - that seems like a hassle though.

There already exist several functions in pvlib that return a full month when requesting a single day btw, e.g. get_bsrn, and probably others too.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, that's a good point. Up to you on what is best here. If we keep the current behavior of returning complete months, might be worth a note in the docstring.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not opposed to this suggestion, although we may run into some issues related to timezone. I think this is why the start/end parameters in Solar Forecast Arbiter are required to be timezone localized - that seems like a hassle though.

This sounds right. And a complicating factor for SRML was nighttime 0s in the future if we requested a day from the current month.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here's a raw data file for the current month that shows -999 and similar for future dates.

image

Here's a raw data file that includes -999 and similar as well as 0s

image

This caused a problem in SFA SolarArbiter/solarforecastarbiter-core#572 and it's reasonable to expect that it would cause a problem with other user code. I don't know if that's pvlib's problem to solve, but I think it's somewhat more likely to come up with this new function that accepts datetimes instead of entire months.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see how the new function makes this more of an issue. Currently, I would still request the same months with the old function; it would just be more manual work.

I do think that it deserves a Warning entry and perhaps we can also implement a line that cuts off future data? For example:

data = data.loc[:pd.Timestamp.today(), :]

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think many users would have different expectations of the new function that accepts start, end datetimes than the old function that accepts a year and a month.

Thanks for adding the warning. I'd rather see the previously rejected start:end slicing than .today() slicing. I'm also fine with just adding the warning and seeing if users complain.


meta = {'filetype': filetype,
'station': station,
'filenames': filenames}

return data, meta
63 changes: 59 additions & 4 deletions pvlib/tests/iotools/test_srml.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
import pytest

from pvlib.iotools import srml
from ..conftest import DATA_DIR, RERUNS, RERUNS_DELAY
from ..conftest import (DATA_DIR, RERUNS, RERUNS_DELAY, assert_index_equal,
assert_frame_equal, fail_on_pvlib_version)
from pvlib._deprecation import pvlibDeprecationWarning

srml_testfile = DATA_DIR / 'SRML-day-EUPO1801.txt'

Expand Down Expand Up @@ -74,19 +76,33 @@ def test__map_columns(column, expected):
assert srml._map_columns(column) == expected


@pytest.mark.remote_data
@pytest.mark.flaky(reruns=RERUNS, reruns_delay=RERUNS_DELAY)
def test_get_srml():
url = 'http://solardat.uoregon.edu/download/Archive/EUPO1801.txt'
file_data = srml.read_srml(url)
requested, _ = srml.get_srml(station='EU', start='2018-01-01',
end='2018-01-31')
AdamRJensen marked this conversation as resolved.
Show resolved Hide resolved
assert_frame_equal(file_data, requested)


@fail_on_pvlib_version('0.11')
@pytest.mark.remote_data
@pytest.mark.flaky(reruns=RERUNS, reruns_delay=RERUNS_DELAY)
def test_read_srml_month_from_solardat():
AdamRJensen marked this conversation as resolved.
Show resolved Hide resolved
url = 'http://solardat.uoregon.edu/download/Archive/EUPO1801.txt'
file_data = srml.read_srml(url)
requested = srml.read_srml_month_from_solardat('EU', 2018, 1)
with pytest.warns(pvlibDeprecationWarning, match='get_srml instead'):
requested = srml.read_srml_month_from_solardat('EU', 2018, 1)
assert file_data.equals(requested)


@fail_on_pvlib_version('0.11')
@pytest.mark.remote_data
@pytest.mark.flaky(reruns=RERUNS, reruns_delay=RERUNS_DELAY)
def test_15_minute_dt_index():
data = srml.read_srml_month_from_solardat('TW', 2019, 4, 'RQ')
with pytest.warns(pvlibDeprecationWarning, match='get_srml instead'):
data = srml.read_srml_month_from_solardat('TW', 2019, 4, 'RQ')
start = pd.Timestamp('20190401 00:00')
start = start.tz_localize('Etc/GMT+8')
end = pd.Timestamp('20190430 23:45')
Expand All @@ -96,14 +112,53 @@ def test_15_minute_dt_index():
assert (data.index[3::4].minute == 45).all()


@fail_on_pvlib_version('0.11')
@pytest.mark.remote_data
@pytest.mark.flaky(reruns=RERUNS, reruns_delay=RERUNS_DELAY)
def test_hourly_dt_index():
    """Check the index of an hourly ('PH') month from the deprecated reader.

    The request for station 'CD', April 1986 must emit the deprecation
    warning and return an index running from the first to the last hour of
    the month in 'Etc/GMT+8', with every stamp on the full hour.
    """
    with pytest.warns(pvlibDeprecationWarning, match='get_srml instead'):
        hourly = srml.read_srml_month_from_solardat('CD', 1986, 4, 'PH')

    first_stamp = pd.Timestamp('19860401 00:00').tz_localize('Etc/GMT+8')
    last_stamp = pd.Timestamp('19860430 23:00').tz_localize('Etc/GMT+8')

    assert hourly.index[0] == first_stamp
    assert hourly.index[-1] == last_stamp
    assert (hourly.index.minute == 0).all()


@pytest.mark.remote_data
@pytest.mark.flaky(reruns=RERUNS, reruns_delay=RERUNS_DELAY)
def test_get_srml_hourly():
    """Two months of hourly ('PH') data yield one continuous hourly index.

    Requests April through May 1986 for station 'CD' and compares the
    returned index with an hourly range in 'Etc/GMT+8' spanning both months.
    """
    # The metadata is not under test here, so it is discarded.
    data, _ = srml.get_srml(station='CD', start='1986-04-01',
                            end='1986-05-31', filetype='PH')
    expected_index = pd.date_range(start='1986-04-01', end='1986-05-31 23:59',
                                   freq='1h', tz='Etc/GMT+8')
    assert_index_equal(data.index, expected_index)


@pytest.mark.remote_data
@pytest.mark.flaky(reruns=RERUNS, reruns_delay=RERUNS_DELAY)
def test_get_srml_minute():
    """Compare a one-minute ('PO') download with the bundled sample file.

    Verifies the index of the downloaded month, that the download contains
    every column and timestamp of the local sample, and the returned metadata.
    """
    remote, meta = srml.get_srml(station='EU', start='2018-01-01',
                                 end='2018-01-31', filetype='PO')
    local = srml.read_srml(srml_testfile)

    full_month = pd.date_range(start='2018-01-01', end='2018-01-31 23:59',
                               freq='1min', tz='Etc/GMT+8')
    assert_index_equal(remote.index, full_month)

    # Every column of the local sample must also be present remotely
    assert local.columns.isin(remote.columns).all()
    # Every timestamp of the local sample must also be present remotely
    assert local.index.isin(remote.index).all()

    assert meta['station'] == 'EU'
    assert meta['filetype'] == 'PO'
    assert meta['filenames'] == ['EUPO1801.txt']


@pytest.mark.remote_data
@pytest.mark.flaky(reruns=RERUNS, reruns_delay=RERUNS_DELAY)
def test_get_srml_nonexisting_month_warning():
    """A month without a file on the server triggers a UserWarning."""
    with pytest.warns(UserWarning, match='file was not found: EUPO0912.txt'):
        # Request data for a period where not all files exist
        # Eugene (EU) station started reporting 1-minute data in January 2010
        # Only the warning is under test, so the return value is not bound.
        srml.get_srml(
            station='EU', start='2009-12-01', end='2010-01-31', filetype='PO')