Skip to content

Commit

Permalink
apacheGH-15070: [Python][CI] Compatibility with pandas 2.0 (apache#34878)
Browse files Browse the repository at this point in the history

### What changes are included in this PR?

- The issue with numpy 1.25 in the assert-equal helper was fixed in pandas 1.5.3 -> removing the skip (in theory, one can still run into this error when using an older pandas version with the latest numpy, but that combination is not something you should use)
- Casting tz-aware strings to datetime64[ns] was not fixed in pandas (pandas-dev/pandas#50140) -> updating our implementation to work around it
- Casting to numpy string dtype (pandas-dev/pandas#50127) is not yet fixed -> updating the skip

### Are there any user-facing changes?

No
* Closes: apache#15070

Authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Signed-off-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
  • Loading branch information
jorisvandenbossche authored and rtpsw committed May 16, 2023
1 parent 0299e03 commit 23793b1
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 29 deletions.
3 changes: 1 addition & 2 deletions python/pyarrow/pandas_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -1148,8 +1148,7 @@ def _reconstruct_columns_from_metadata(columns, column_indexes):
if pandas_dtype == "datetimetz":
tz = pa.lib.string_to_tzinfo(
column_indexes[0]['metadata']['timezone'])
dt = level.astype(numpy_dtype)
level = dt.tz_localize('utc').tz_convert(tz)
level = pd.to_datetime(level, utc=True).tz_convert(tz)
elif level.dtype != dtype:
level = level.astype(dtype)
# ARROW-9096: if original DataFrame was upcast we keep that
Expand Down
34 changes: 7 additions & 27 deletions python/pyarrow/tests/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,17 +187,12 @@ def test_column_index_names_are_preserved(self):
_check_pandas_roundtrip(df, preserve_index=True)

def test_column_index_names_with_tz(self):
if Version("2.0.0.dev0") <= Version(pd.__version__) < Version("2.0.0"):
# TODO: regression in pandas, should be fixed before final 2.0.0
# https://github.com/pandas-dev/pandas/issues/50140
pytest.skip("Regression in pandas 2.0.0.dev")
# ARROW-13756
# Bug if index is a timezone-aware DatetimeIndex

df = pd.DataFrame(
np.random.randn(5, 3),
columns=pd.date_range(
"2021-01-01", "2021-01-3", freq="D", tz="CET")
columns=pd.date_range("2021-01-01", periods=3, freq="50D", tz="CET")
)
_check_pandas_roundtrip(df, preserve_index=True)

Expand Down Expand Up @@ -453,11 +448,11 @@ def test_mixed_column_names(self):
preserve_index=True)

def test_binary_column_name(self):
if Version("2.0.0.dev0") <= Version(pd.__version__) < Version("2.0.0"):
# TODO: regression in pandas, should be fixed before final 2.0.0
if Version("2.0.0") <= Version(pd.__version__) < Version("2.1.0"):
# TODO: regression in pandas, hopefully fixed in next version
# https://issues.apache.org/jira/browse/ARROW-18394
# https://github.com/pandas-dev/pandas/issues/50127
pytest.skip("Regression in pandas 2.0.0.dev")
pytest.skip("Regression in pandas 2.0.0")
column_data = ['い']
key = 'あ'.encode()
data = {key: column_data}
Expand Down Expand Up @@ -2064,11 +2059,6 @@ def test_nested_smaller_ints(self):
assert result3.equals(expected3)

def test_infer_lists(self):
if ((Version(np.__version__) >= Version("1.25.0.dev0")) and
(Version(pd.__version__) < Version("2.0.0"))):
# TODO: regression in pandas with numpy 1.25dev
# https://github.com/pandas-dev/pandas/issues/50360
pytest.skip("Regression in pandas with numpy 1.25")
data = OrderedDict([
('nan_ints', [[None, 1], [2, 3]]),
('ints', [[0, 1], [2, 3]]),
Expand Down Expand Up @@ -2118,11 +2108,6 @@ def test_infer_numpy_array(self):
_check_pandas_roundtrip(df, expected_schema=expected_schema)

def test_to_list_of_structs_pandas(self):
if ((Version(np.__version__) >= Version("1.25.0.dev0")) and
(Version(pd.__version__) < Version("2.0.0"))):
# TODO: regression in pandas with numpy 1.25dev
# https://github.com/pandas-dev/pandas/issues/50360
pytest.skip("Regression in pandas with numpy 1.25")
ints = pa.array([1, 2, 3], pa.int32())
strings = pa.array([['a', 'b'], ['c', 'd'], ['e', 'f']],
pa.list_(pa.string()))
Expand Down Expand Up @@ -2192,11 +2177,6 @@ def test_array_from_nested_arrays(self):
assert result.equals(expected)

def test_nested_large_list(self):
if ((Version(np.__version__) >= Version("1.25.0.dev0")) and
(Version(pd.__version__) < Version("2.0.0"))):
# TODO: regression in pandas with numpy 1.25dev
# https://github.com/pandas-dev/pandas/issues/50360
pytest.skip("Regression in pandas with numpy 1.25")
s = (pa.array([[[1, 2, 3], [4]], None],
type=pa.large_list(pa.large_list(pa.int64())))
.to_pandas())
Expand Down Expand Up @@ -2950,11 +2930,11 @@ def _fully_loaded_dataframe_example():

@pytest.mark.parametrize('columns', ([b'foo'], ['foo']))
def test_roundtrip_with_bytes_unicode(columns):
if Version("2.0.0.dev0") <= Version(pd.__version__) < Version("2.0.0"):
# TODO: regression in pandas, should be fixed before final 2.0.0
if Version("2.0.0") <= Version(pd.__version__) < Version("2.1.0"):
# TODO: regression in pandas, hopefully fixed in next version
# https://issues.apache.org/jira/browse/ARROW-18394
# https://github.com/pandas-dev/pandas/issues/50127
pytest.skip("Regression in pandas 2.0.0.dev")
pytest.skip("Regression in pandas 2.0.0")

df = pd.DataFrame(columns=columns)
table1 = pa.Table.from_pandas(df)
Expand Down

0 comments on commit 23793b1

Please sign in to comment.