From f7c95a9e5f5c7a0cdd1a9df1043778e803d6f8c7 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Wed, 1 May 2024 14:33:07 -0700 Subject: [PATCH 1/3] Initial commit Signed-off-by: Balaji Veeramani --- python/ray/data/__init__.py | 11 +++++++- .../ray/data/datasource/parquet_datasource.py | 26 +++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/python/ray/data/__init__.py b/python/ray/data/__init__.py index b5dc6b811821..873ea50f2fa9 100644 --- a/python/ray/data/__init__.py +++ b/python/ray/data/__init__.py @@ -78,7 +78,16 @@ # anything. pass else: - if parse_version(pyarrow_version) >= parse_version("14.0.1"): + from ray._private.ray_constants import env_bool + + RAY_DATA_AUTOLOAD_UNSAFE_ARROW_TYPES = env_bool( + "RAY_DATA_AUTOLOAD_UNSAFE_ARROW_TYPES", False + ) + + if ( + parse_version(pyarrow_version) >= parse_version("14.0.1") + and RAY_DATA_AUTOLOAD_UNSAFE_ARROW_TYPES + ): pa.PyExtensionType.set_auto_load(True) # Import these arrow extension types to ensure that they are registered. from ray.air.util.tensor_extensions.arrow import ( # noqa diff --git a/python/ray/data/datasource/parquet_datasource.py b/python/ray/data/datasource/parquet_datasource.py index fd4642e5d848..e552b8654fcd 100644 --- a/python/ray/data/datasource/parquet_datasource.py +++ b/python/ray/data/datasource/parquet_datasource.py @@ -158,6 +158,30 @@ def set_schema_pickled(self, schema_pickled: bytes): self.schema_pickled = schema_pickled +def _check_for_legacy_tensor_type(schema): + """Check for the legacy tensor extension type and raise an error if found. + + Ray Data uses an extension type to represent tensors in Arrow tables. Previously, + the extension type extended `PyExtensionType`. However, this base type can expose + users to arbitrary code execution. To prevent this, we don't load the type by + default. + """ + import pyarrow as pa + + for name, type in zip(schema.names, schema.types): + if isinstance(type, pa.UnknownExtensionType) and isinstance( + type, pa.PyExtensionType + ): + raise RuntimeError( + f"Ray Data couldn't infer the type of column '{name}'. This might mean " + "you're trying to read data written with an older version of Ray. " + "Reading data written with older versions of Ray might expose you to " + "arbitrary code execution. To try reading the data anyway, set " + "`RAY_DATA_AUTOLOAD_UNSAFE_ARROW_TYPES=1` on all nodes." + "To learn more, see https://github.com/ray-project/ray/issues/41314." + ) + + @PublicAPI class ParquetDatasource(Datasource): """Parquet datasource, for reading and writing Parquet files. @@ -258,6 +282,8 @@ def __init__( [schema.field(column) for column in columns], schema.metadata ) + _check_for_legacy_tensor_type(schema) + if _block_udf is not None: # Try to infer dataset schema by passing dummy table through UDF. dummy_table = schema.empty_table() From d491a5a25b523353390a8e37c2573f93a90838da Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Wed, 1 May 2024 14:35:54 -0700 Subject: [PATCH 2/3] Uppdate name Signed-off-by: Balaji Veeramani --- python/ray/data/__init__.py | 6 +++--- python/ray/data/datasource/parquet_datasource.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/ray/data/__init__.py b/python/ray/data/__init__.py index 873ea50f2fa9..9c2857128da7 100644 --- a/python/ray/data/__init__.py +++ b/python/ray/data/__init__.py @@ -80,13 +80,13 @@ else: from ray._private.ray_constants import env_bool - RAY_DATA_AUTOLOAD_UNSAFE_ARROW_TYPES = env_bool( - "RAY_DATA_AUTOLOAD_UNSAFE_ARROW_TYPES", False + RAY_DATA_AUTOLOAD_PYEXTENSIONTYPE = env_bool( + "RAY_DATA_AUTOLOAD_PYEXTENSIONTYPE", False ) if ( parse_version(pyarrow_version) >= parse_version("14.0.1") - and RAY_DATA_AUTOLOAD_UNSAFE_ARROW_TYPES + and RAY_DATA_AUTOLOAD_PYEXTENSIONTYPE ): pa.PyExtensionType.set_auto_load(True) # Import these arrow extension types to ensure that they are registered. diff --git a/python/ray/data/datasource/parquet_datasource.py b/python/ray/data/datasource/parquet_datasource.py index e552b8654fcd..2cda1549b61c 100644 --- a/python/ray/data/datasource/parquet_datasource.py +++ b/python/ray/data/datasource/parquet_datasource.py @@ -177,7 +177,7 @@ def _check_for_legacy_tensor_type(schema): "you're trying to read data written with an older version of Ray. " "Reading data written with older versions of Ray might expose you to " "arbitrary code execution. To try reading the data anyway, set " - "`RAY_DATA_AUTOLOAD_UNSAFE_ARROW_TYPES=1` on all nodes." + "`RAY_DATA_AUTOLOAD_PYEXTENSIONTYPE=1` on all nodes." "To learn more, see https://github.com/ray-project/ray/issues/41314." ) From b929e1a083bf8711a749c9d935eaeec73d280cf0 Mon Sep 17 00:00:00 2001 From: Balaji Veeramani Date: Mon, 6 May 2024 13:18:56 -0700 Subject: [PATCH 3/3] Fix failing test Signed-off-by: Balaji Veeramani --- ...000000.parquet => 1_000000_000000.parquet} | Bin 51975 -> 51825 bytes ...000001.parquet => 1_000007_000000.parquet} | Bin 52031 -> 51881 bytes ...000002.parquet => 1_000009_000000.parquet} | Bin 43004 -> 42854 bytes 3 files changed, 0 insertions(+), 0 deletions(-) rename python/ray/data/examples/data/parquet_images_mini/{652597cc55e94e3e8d91776eec17774a_000000.parquet => 1_000000_000000.parquet} (95%) rename python/ray/data/examples/data/parquet_images_mini/{652597cc55e94e3e8d91776eec17774a_000001.parquet => 1_000007_000000.parquet} (94%) rename python/ray/data/examples/data/parquet_images_mini/{652597cc55e94e3e8d91776eec17774a_000002.parquet => 1_000009_000000.parquet} (93%) diff --git a/python/ray/data/examples/data/parquet_images_mini/652597cc55e94e3e8d91776eec17774a_000000.parquet b/python/ray/data/examples/data/parquet_images_mini/1_000000_000000.parquet similarity index 95% rename from python/ray/data/examples/data/parquet_images_mini/652597cc55e94e3e8d91776eec17774a_000000.parquet rename to python/ray/data/examples/data/parquet_images_mini/1_000000_000000.parquet index f23198520b7ce342e2b5a267bcab211679fb6ff3..cfb619b04e6930c9922a1be461f93845c186751a 100644 GIT binary patch delta 727 zcmZpl#{6*#vrvFzkfA7x=nHPqPuxDDznDZBHVW}DGYV{$XYSl9!N4XUX~CMAo0y); zl9O3n!k(Iwnwy$eGWo+n4SfbtCI(RkQ2`*uzyLIZPgKZ96lj1D0|UDRGf*r!u>>eD zBr2u@6c(5)a!|qTO$(chD37Em$Y2T9oW!Km98o4QCb259CPuXxEo^G9n%QK`7&MrH zdL=*vgQO17Xf&%PR~Q3pAg-fhx}%GuqbrC8+wN2j5(M&r)`RSEE3WX(N-y?wajo=sE>HJ0Gyq9~-4Ph( z6&zxiY!(%fVGtE=T9p`XS?Zr_n3Lk}mK_-plmSu*@`w}EJfNYWAbXtB>m5BD9l;iX zWg!Y(ox!#uIRGLKa)na?&<`a*1~9~c27qa3a5>uAGDs*CBo-Bxrk3a?7Zm6w78T`} kE0m=c6=&w>DHxjS8R!{F>c}uK01?AG4hDt*W?)(d0A5hQCIA2c delta 807 zcma)5-D}fO6i?H%s4a?>ThlR^_)rGQOh1C=Vc_jY(#DqB#wKar+B9prC8e-twFN24 zVEDR-|Hml!CagnY`ge%<;)6bkAb9Vs8}32ylAL?Ke&_s74*B*fc<RZgwy+?vLOtjII$e(v2obR9AAMMv@peaRk~DFQE=io2 zu8^q=a=e~avt3E`%(S!DaRkoNa}KUsfVe>qwZn(>*})niSQM4!j-c1vzp&fGk)bzi z#G7kjp65tK?aqjUIID6=wk?IX+j`mS>WyJ1&vS;xPdgcT%g}A2w>7~tC%kPAo5Zp-D<>H$pO=}dG4W7CaxP?`e2amGFj$m1V@;or=z1|pyTGA zqr%LJ?m)I;EO#re@Xbmu_H=ROt@L*;Pxm!6 z0Lev8PB#wL~|$V6xNgWvm1|VX1 K$HBk=N}m9s;XIVn^K@1E$f9Kt6{ z1d@obBrz}~ad0#|xyjz3UWMSOjNcpzKYVS(Hc$Z@O@h{lpuT)L8w#fj`!t0LRHZ&u zRyGsXeM5J6LlXw3mI+Fp;M6thMq&>~Nn+pfGO1!aE9glz-H_D4O1gV>H^bX{)+NpI zh;(gY4&>10+eD+R3!+lp&gdmi4PT#5?{^Ko;@~Ox8e#n>MNu2nrr3tOCOftwIHtZH zH1vvJ&k2-ZJtc4bP1foQj@6d;8iKNs=g)|$lwhW>uDLzKP&VB{fRuE$h|+fT3EJ=W zY;8P|_sp;PII&$3EJ@umyi8wDio2#J4qUl}gVlto34BRbL|1%Nb^MCr=F)^A*$SSR zGA^TRnT&ylwaAw^cM=VUtka;iGXtSIRmN+TTs#$zks2w03(}RXk3;aax(Z^qeY5j4 ou)9l^?=P91&Q5P`&+hEDcG`0fmic8qzBmg2e!%l90FeLbHz9-U`Tzg` diff --git a/python/ray/data/examples/data/parquet_images_mini/652597cc55e94e3e8d91776eec17774a_000002.parquet b/python/ray/data/examples/data/parquet_images_mini/1_000009_000000.parquet similarity index 93% rename from python/ray/data/examples/data/parquet_images_mini/652597cc55e94e3e8d91776eec17774a_000002.parquet rename to python/ray/data/examples/data/parquet_images_mini/1_000009_000000.parquet index a698d140c8987ec2a86945b83a4b70695b16c26f..7512bec2deb62839eeaf0e749b35f180002d716d 100644 GIT binary patch delta 715 zcmex!p6S^+CZPbwAVX0W(FT6eE`A@;2h5@j8-;E#GYV{e&Ae%*1OuCZqy=kcZen^W zOHO8S343ZzYHn&?$>gW=H1runnHWSFLusLdeDHGQXE`2aacbR;1VFX8* zqo<>zW1!<^&qcz_ita$RW3szjX_UKL6_Du+;s=8Wpb|$%7lX;UOH(Auzyj$YG7`)L z$@w}uPJX#mquvWH;ur>ECg-}D`s4;>q`145Mn-s5CK;!srMQQcCL38+g5=#mgsWR{ zXtrg9e{PrwL@uW+DXX9~GTbl+B<<*!?&tzk1){-Daw-Q20{K8Ufh>0`uJFxDFZOhC z<*oF0E>HJ0Gyus(PL^FJRd1M+;_j9m84;8L5(D|f391)pKq$yer*uaTM@O)kU`dEN zS7)#VNOnQQK^8g{0Nq&vWB@}9s2@y2gUiv@mO(hG_^!GxnT0OHV%6u&xd5fD)|;-W-~M5ZVb{ct{9$k5@O#A!{UG;!lN-W#L0g5zJBlh-n0DuggO7j1a?t10a^HL z8NOUD!mmEaeh7T=EwINwd>>yy7A%DyE_NKfZL)S}Fl0MJ)A4@J;u|bv00bDx06?xg zFi?aHWCH-)2>GzD)obH;00viy+w;$&;^#9q0{vm0R1GcKdegKa6J&yi!RHIH^HVl< za>_u=it+lh#Nq{!_p|%|rdYa$i~VQwW8?LRxpj7XgPat~d*9xu zOId`E;O)-u9_QY!_*SSb6DO+RH;cUSycH^;JT}_I+aQ9*auvb*HIE-r2o3IvP&Q z`sGBr zSCCL4W8h&8vWcr8(eV0rXwvqDfl!kwGzUX$JQasM4N?HFkbbhA%M^TVZGq4|SMNU= nnZq^1b=UNMzc-1F&HkX%>qZ}}bL(7qH39&Bz>AvzkU#7