# Benchmarking the new Pandas PyArrow Backend

In [1]:
# Print the installed versions of the libraries being compared.
# NOTE(review): pyarrow is not in the list even though the PyArrow backend is
# what gets benchmarked — consider adding it to the -p argument.
%load_ext watermark
%watermark -p numpy,pandas,polars

numpy : 1.23.5
pandas: 2.0.0rc0
polars: 0.16.10



You can install the pandas 2.0 release candidate with either conda or pip:


```bash
conda install -c conda-forge/label/pandas_rc pandas==2.0.0rc0

pip install --upgrade --pre pandas==2.0.0rc0
```

# 1) Float 

### PyArrow

In [2]:
import numpy as np
import pandas as pd
import pyarrow as pa

# 1,000,000 x 100 array of random floats; every frame in the float section is built from it.
# NOTE(review): no seed is set, so the values shown below differ between runs.
numbers = np.random.rand(1_000_000, 100)

# The same data stored with the PyArrow-backed float64 extension dtype.
df_pa = pd.DataFrame(numbers, dtype="float64[pyarrow]")
df_pa.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.739789,0.171243,0.77956,0.453279,0.316272,0.202055,0.701529,0.223398,0.180779,0.361951,...,0.030731,0.012013,0.758276,0.347108,0.152085,0.914769,0.485596,0.196424,0.558316,0.036644
1,0.188237,0.364673,0.708197,0.003689,0.869673,0.99384,0.408806,0.73837,0.509626,0.516874,...,0.784797,0.437274,0.00418,0.281059,0.209215,0.581456,0.034112,0.567413,0.883934,0.41937
2,0.387432,0.542618,0.385627,0.165912,0.675967,0.496472,0.232236,0.759717,0.075509,0.060214,...,0.097261,0.680323,0.554496,0.107696,0.711632,0.346951,0.744994,0.662483,0.150556,0.776504
3,0.19037,0.856863,0.553109,0.461054,0.568778,0.187617,0.384497,0.648418,0.560689,0.211983,...,0.967316,0.576226,0.671552,0.382588,0.209125,0.873574,0.289106,0.868413,0.225736,0.269728
4,0.685521,0.300789,0.751959,0.415466,0.670178,0.405571,0.772665,0.705782,0.755357,0.00082,...,0.978006,0.052614,0.52944,0.267572,0.396589,0.195685,0.323305,0.160982,0.977584,0.781165


In [3]:
# Time mean() on the PyArrow-backed frame: 30 repeats of a single call each.
%timeit -r 30 -n 1 df_pa.mean()

25.4 ms ± 5.37 ms per loop (mean ± std. dev. of 30 runs, 1 loop each)


### Polars

In [4]:
import polars as pl

# Build the Polars frame from the PyArrow-backed pandas frame created above.
df_pl = pl.DataFrame(df_pa)
df_pl.head()

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,...,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,...,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.739789,0.171243,0.77956,0.453279,0.316272,0.202055,0.701529,0.223398,0.180779,0.361951,0.551334,0.387156,0.221788,0.074998,0.277764,0.039131,0.625279,0.988637,0.709296,0.643103,0.657713,0.171454,0.855669,0.997973,0.953301,0.953758,0.663223,0.397593,0.966815,0.292429,0.4586,0.732552,0.87994,0.494767,0.699149,0.654407,0.497486,...,0.886984,0.032967,0.088698,0.503685,0.84004,0.729495,0.960647,0.185206,0.233505,0.247867,0.240035,0.624804,0.906294,0.88794,0.306743,0.970122,0.134148,0.087597,0.409002,0.716249,0.425359,0.950504,0.776859,0.471383,0.911685,0.530392,0.098582,0.030731,0.012013,0.758276,0.347108,0.152085,0.914769,0.485596,0.196424,0.558316,0.036644
0.188237,0.364673,0.708197,0.003689,0.869673,0.99384,0.408806,0.73837,0.509626,0.516874,0.500183,0.474612,0.779803,0.326324,0.107263,0.557989,0.70311,0.851079,0.619356,0.37407,0.681312,0.147267,0.068138,0.085389,0.603855,0.008732,0.065126,0.569115,0.917873,0.060423,0.722972,0.126521,0.22078,0.26409,0.868152,0.337481,0.429603,...,0.066624,0.420628,0.173113,0.400634,0.574797,0.833651,0.206654,0.486913,0.027891,0.318097,0.176941,0.50806,0.483328,0.952277,0.136796,0.556167,0.665925,0.66638,0.225327,0.56068,0.424299,0.117952,0.817975,0.98181,0.861922,0.479305,0.556877,0.784797,0.437274,0.00418,0.281059,0.209215,0.581456,0.034112,0.567413,0.883934,0.41937
0.387432,0.542618,0.385627,0.165912,0.675967,0.496472,0.232236,0.759717,0.075509,0.060214,0.871908,0.830904,0.76113,0.833074,0.593831,0.613885,0.910025,0.49179,0.899138,0.767953,0.85621,0.589193,0.845345,0.227758,0.3273,0.011847,0.696758,0.770045,0.067501,0.558609,0.83442,0.428581,0.141987,0.032758,0.896121,0.651265,0.089106,...,0.52693,0.306121,0.494742,0.691013,0.899266,0.941592,0.412065,0.368851,0.225886,0.285843,0.084886,0.952567,0.681991,0.378658,0.640901,0.389181,0.46891,0.115059,0.441754,0.906279,0.279367,0.170382,0.510947,0.678537,0.928045,0.531335,0.590408,0.097261,0.680323,0.554496,0.107696,0.711632,0.346951,0.744994,0.662483,0.150556,0.776504
0.19037,0.856863,0.553109,0.461054,0.568778,0.187617,0.384497,0.648418,0.560689,0.211983,0.877886,0.037949,0.045737,0.52556,0.037172,0.303819,0.815909,0.678298,0.703495,0.160242,0.563789,0.999017,0.107396,0.987818,0.213269,0.864438,0.781261,0.398976,0.811049,0.201187,0.610532,0.981617,0.696182,0.476824,0.456393,0.3071,0.070704,...,0.212025,0.718776,0.863772,0.836723,0.272121,0.800949,0.734792,0.506621,0.321371,0.087973,0.867293,0.627667,0.889183,0.998005,0.422262,0.412171,0.784891,0.464939,0.591279,0.634254,0.340295,0.884701,0.795625,0.239074,0.04274,0.072529,0.548758,0.967316,0.576226,0.671552,0.382588,0.209125,0.873574,0.289106,0.868413,0.225736,0.269728
0.685521,0.300789,0.751959,0.415466,0.670178,0.405571,0.772665,0.705782,0.755357,0.00082,0.225622,0.39374,0.156345,0.491989,0.74189,0.247517,0.814866,0.929771,0.929555,0.729604,0.329593,0.704661,0.680124,0.862563,0.663751,0.535962,0.752143,0.617749,0.74543,0.558282,0.584999,0.351613,0.444335,0.391234,0.565424,0.811048,0.299882,...,0.832284,0.09156,0.844384,0.997165,0.111232,0.217267,0.411379,0.217178,0.076892,0.042135,0.763825,0.239185,0.413552,0.466653,0.86741,0.332848,0.773879,0.749281,0.466701,0.631788,0.223374,0.98785,0.972429,0.51155,0.908381,0.717534,0.967404,0.978006,0.052614,0.52944,0.267572,0.396589,0.195685,0.323305,0.160982,0.977584,0.781165


In [5]:
# Same mean() benchmark on the Polars frame.
%timeit -r 30 -n 1 df_pl.mean()

14.9 ms ± 1.07 ms per loop (mean ± std. dev. of 30 runs, 1 loop each)


### Pandas & NumPy

In [6]:
# Baseline: the same array in a pandas frame using the plain "float64" dtype.
df = pd.DataFrame(numbers, dtype="float64")
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.739789,0.171243,0.77956,0.453279,0.316272,0.202055,0.701529,0.223398,0.180779,0.361951,...,0.030731,0.012013,0.758276,0.347108,0.152085,0.914769,0.485596,0.196424,0.558316,0.036644
1,0.188237,0.364673,0.708197,0.003689,0.869673,0.99384,0.408806,0.73837,0.509626,0.516874,...,0.784797,0.437274,0.00418,0.281059,0.209215,0.581456,0.034112,0.567413,0.883934,0.41937
2,0.387432,0.542618,0.385627,0.165912,0.675967,0.496472,0.232236,0.759717,0.075509,0.060214,...,0.097261,0.680323,0.554496,0.107696,0.711632,0.346951,0.744994,0.662483,0.150556,0.776504
3,0.19037,0.856863,0.553109,0.461054,0.568778,0.187617,0.384497,0.648418,0.560689,0.211983,...,0.967316,0.576226,0.671552,0.382588,0.209125,0.873574,0.289106,0.868413,0.225736,0.269728
4,0.685521,0.300789,0.751959,0.415466,0.670178,0.405571,0.772665,0.705782,0.755357,0.00082,...,0.978006,0.052614,0.52944,0.267572,0.396589,0.195685,0.323305,0.160982,0.977584,0.781165


In [7]:
# Same mean() benchmark on the "float64" pandas frame.
%timeit -r 30 -n 1 df.mean()

153 ms ± 3.71 ms per loop (mean ± std. dev. of 30 runs, 1 loop each)


In [8]:
%timeit -r 30 -n 1 numbers.mean()

19.5 ms ± 462 µs per loop (mean ± std. dev. of 30 runs, 1 loop each)


# 2) Int

### PyArrow

In [9]:
# Rebind `numbers` to a 1,000,000 x 100 array of random integers for the int section.
numbers = np.random.randint(0, 1000, (1_000_000, 100))

# Rebind `df_pa` to the integer data stored with the PyArrow-backed int64 dtype.
df_pa = pd.DataFrame(numbers, dtype="int64[pyarrow]")
df_pa.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,377,481,507,655,978,637,309,892,59,214,...,142,89,613,239,582,772,911,536,723,425
1,715,113,433,43,777,513,947,301,971,148,...,60,123,139,608,838,209,382,162,955,822
2,795,217,870,600,947,778,133,267,258,922,...,164,150,714,955,816,991,9,530,982,268
3,342,115,549,703,558,802,563,719,943,850,...,274,810,701,976,915,958,560,753,296,701
4,596,593,964,330,490,549,262,671,892,981,...,600,329,664,105,918,662,792,534,83,616


In [10]:
# Time sum() on the PyArrow-backed integer frame: 30 repeats of a single call each.
%timeit -r 30 -n 1 df_pa.sum()

18.7 ms ± 2.77 ms per loop (mean ± std. dev. of 30 runs, 1 loop each)


### Polars

In [11]:
# Build the Polars frame from the PyArrow-backed integer frame created above.
df_pl = pl.DataFrame(df_pa)
df_pl.head()

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,...,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,...,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
377,481,507,655,978,637,309,892,59,214,674,141,905,297,581,333,9,820,793,500,380,889,200,863,968,509,889,174,488,805,577,455,514,990,962,656,300,...,765,308,998,100,89,92,551,202,578,566,619,380,389,881,466,559,702,92,631,119,783,195,98,273,8,97,253,142,89,613,239,582,772,911,536,723,425
715,113,433,43,777,513,947,301,971,148,953,709,769,881,115,358,758,358,216,702,831,723,236,218,688,968,89,737,407,290,967,25,786,828,609,306,841,...,682,402,544,384,34,248,130,73,66,608,398,653,302,85,609,841,174,757,868,200,853,495,237,567,30,512,418,60,123,139,608,838,209,382,162,955,822
795,217,870,600,947,778,133,267,258,922,579,674,706,337,870,755,57,212,208,727,721,804,539,795,94,766,72,354,845,44,404,751,320,371,979,772,132,...,462,705,893,773,657,980,614,535,691,352,453,286,330,475,161,757,431,662,794,725,459,967,698,1,788,208,1,164,150,714,955,816,991,9,530,982,268
342,115,549,703,558,802,563,719,943,850,5,711,433,267,920,432,4,853,613,892,220,937,232,292,676,69,266,289,415,494,232,260,149,231,620,808,759,...,313,522,250,304,258,245,337,779,506,743,19,538,881,830,296,174,798,919,163,576,630,502,444,931,986,346,174,274,810,701,976,915,958,560,753,296,701
596,593,964,330,490,549,262,671,892,981,267,184,270,18,700,400,794,566,826,445,496,405,607,471,252,417,867,823,9,982,912,673,428,691,436,815,335,...,919,401,711,784,346,971,111,967,788,60,735,825,312,979,702,148,139,615,431,195,706,869,875,479,346,206,423,600,329,664,105,918,662,792,534,83,616


In [12]:
# Same sum() benchmark on the Polars frame.
%timeit -r 30 -n 1 df_pl.sum()

14.5 ms ± 694 µs per loop (mean ± std. dev. of 30 runs, 1 loop each)


### Pandas & NumPy

In [13]:
# Baseline: the same integer array in a pandas frame using the plain "int64" dtype.
df = pd.DataFrame(numbers, dtype="int64")
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,377,481,507,655,978,637,309,892,59,214,...,142,89,613,239,582,772,911,536,723,425
1,715,113,433,43,777,513,947,301,971,148,...,60,123,139,608,838,209,382,162,955,822
2,795,217,870,600,947,778,133,267,258,922,...,164,150,714,955,816,991,9,530,982,268
3,342,115,549,703,558,802,563,719,943,850,...,274,810,701,976,915,958,560,753,296,701
4,596,593,964,330,490,549,262,671,892,981,...,600,329,664,105,918,662,792,534,83,616


In [14]:
# Same sum() benchmark on the "int64" pandas frame.
%timeit -r 30 -n 1 df.sum()

46.6 ms ± 195 µs per loop (mean ± std. dev. of 30 runs, 1 loop each)


In [15]:
%timeit -r 30 -n 1 numbers.sum()

17.6 ms ± 687 µs per loop (mean ± std. dev. of 30 runs, 1 loop each)


# 3) Str

In [16]:
import string

# Pool of characters to sample from: the lowercase ASCII letters plus a space.
ALPHABET = np.array([*string.ascii_lowercase, " "])


def generate_guess(sentence):
    """Return an array of random ALPHABET characters, one per character of `sentence`.

    Only the length of `sentence` is used; its content has no influence on
    which characters are drawn.
    """
    n_chars = len(sentence)
    random_chars = np.random.choice(ALPHABET, size=n_chars)
    return random_chars


# generate_guess only uses the length of its argument, so the repeated sentence
# just sets how many random characters are drawn.
s = generate_guess("Why don't scientists trust atoms? Because they make up everything!" * 10_000)
s

array(['v', 'c', 'a', ..., 'g', 'n', 'g'], dtype='<U1')

### Pandas & NumPy

In [17]:
# One random character per row, stored with the pandas "string" extension dtype.
df = pd.DataFrame(s, dtype="string")

df.head()

Unnamed: 0,0
0,v
1,c
2,a
3,t
4,a


In [18]:
# Preview a .str method on the column: count occurrences of "a" in each row.
df[0].str.count("a")

0         0
1         0
2         1
3         0
4         1
         ..
659995    0
659996    0
659997    0
659998    0
659999    0
Name: 0, Length: 660000, dtype: Int64

In [19]:
# Time replacing "a" with "b" through the .str accessor on the "string"-dtype column.
%timeit -r 30 -n 1 df[0].str.replace("a", "b")

64.7 ms ± 654 µs per loop (mean ± std. dev. of 30 runs, 1 loop each)


In [20]:
# Same replacement applied directly to the NumPy character array via np.char.replace.
%timeit -r 30 -n 1 np.char.replace(s, "a", "b")

279 ms ± 1.43 ms per loop (mean ± std. dev. of 30 runs, 1 loop each)


### PyArrow

In [21]:
# The same characters stored with the PyArrow-backed string dtype.
df_pa = pd.DataFrame(s, dtype="string[pyarrow]")

In [22]:
# Same "a" -> "b" replacement benchmark on the PyArrow-backed string column.
%timeit -r 30 -n 1 df_pa[0].str.replace("a", "b")

5.45 ms ± 183 µs per loop (mean ± std. dev. of 30 runs, 1 loop each)


### Polars

In [23]:
# Build the Polars frame from the PyArrow-backed string frame created above.
df_pl = pl.DataFrame(df_pa)
df_pl.head()

0
str
"""v"""
"""c"""
"""a"""
"""t"""
"""a"""


In [24]:
# Time the replacement on the Polars column (addressed by the string name "0").
# NOTE(review): Polars documents str.replace as replacing only the first match
# and treating the pattern as a regex by default, whereas pandas' .str.replace
# replaces every occurrence. The results agree here because every value is a
# single character, but str.replace_all(..., literal=True) looks like the closer
# equivalent — confirm against the Polars docs for the version in use.
%timeit -r 30 -n 1 df_pl["0"].str.replace("a", "b")

11.6 ms ± 177 µs per loop (mean ± std. dev. of 30 runs, 1 loop each)
