In [2]:
import pandas as pd
df = pd.read_csv("https://raw.githubusercontent.com/allisonhorst/"
                 "palmerpenguins/47a3476d2147080e7ceccef4cf70105c808f2cbf/"
                 "data-raw/penguins_raw.csv")
                 # Increase dataset to 1m rows and reset index
df = df.sample(1_000_000, replace=True).reset_index(drop=True)

In [3]:
import numpy as np
# Update sample number (0 to 999'999)
df["Sample Number"] = df.index
# Add some random variation to numeric columns
df[["Culmen Length (mm)", "Culmen Depth (mm)", 
    "Flipper Length (mm)", "Body Mass (g)"]] = df[["Culmen Length (mm)", "Culmen Depth (mm)", 
                                                   "Flipper Length (mm)", "Body Mass (g)"]] \
                                               + np.random.rand(df.shape[0], 4)

In [4]:
# Write to csv
df.to_csv("penguin-dataset.csv")

In [5]:
pip install pyarrow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [6]:
import pyarrow as pa
# Write to Arrow
# Convert from pandas to Arrow
table = pa.Table.from_pandas(df)
# Write out to file
with pa.OSFile('penguin-dataset.arrow', 'wb') as sink:
    with pa.RecordBatchFileWriter(sink, table.schema) as writer:
        writer.write_table(table)

In [7]:
# Create dataframe where missing numeric values are filled with zero
df_nonan = df.copy()
df_nonan[["Culmen Length (mm)", "Culmen Depth (mm)", 
          "Flipper Length (mm)", "Body Mass (g)"]] = df[["Culmen Length (mm)", "Culmen Depth (mm)", 
                                                         "Flipper Length (mm)", "Body Mass (g)"]].fillna(0)

In [8]:
# Convert from no-NaN pandas to Arrow
table_nonan = pa.Table.from_pandas(df_nonan)
# Write out to file
with pa.OSFile('penguin-dataset-nonan.arrow', 'wb') as sink:
    with pa.RecordBatchFileWriter(sink, table_nonan.schema) as writer:
        writer.write_table(table_nonan)

In [9]:
from pathlib import Path
sz = Path('penguin-dataset.csv').stat().st_size
print(sz)

from pathlib import Path
sz = Path('penguin-dataset.arrow').stat().st_size
print(sz)

from pathlib import Path
sz = Path('penguin-dataset-nonan.arrow').stat().st_size
print(sz)

217220910
197875410
197375410


The difference between reading memory-mapped Arrow files with and without zero-copying 2 times performance improvement shown below

In [10]:
# Read csv and calculate mean
%%timeit
pd.read_csv("penguin-dataset.csv")["Flipper Length (mm)"].mean()

1 loop, best of 5: 2.64 s per loop


In [11]:
# Read Arrow using file API and calculate mean
%%timeit
with pa.OSFile('penguin-dataset.arrow', 'rb') as source:
    table = pa.ipc.open_file(source).read_all().column("Flipper Length (mm)")
result = table.to_pandas().mean()

10 loops, best of 5: 54.7 ms per loop


In [12]:
# Read Arrow with memory-mapped API with missing values
%%timeit
source = pa.memory_map('penguin-dataset.arrow', 'r')
table = pa.ipc.RecordBatchFileReader(source).read_all().column("Flipper Length (mm)")
result = table.to_pandas().mean()

100 loops, best of 5: 4.56 ms per loop


In [13]:
# Read Arrow with memory-mapped API without missing values (zero-copy)
%%timeit
source = pa.memory_map('penguin-dataset-nonan.arrow', 'r')
table = pa.ipc.RecordBatchFileReader(source).read_all().column("Flipper Length (mm)")
result = table.to_pandas().mean()

100 loops, best of 5: 2.66 ms per loop


In [14]:
pip install psutil

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [16]:
import os
import psutil
# Measure initial memory consumption
memory_init = psutil.Process(os.getpid()).memory_info().rss >> 20


# Read csv
col_csv = pd.read_csv("penguin-dataset.csv")["Flipper Length (mm)"]
memory_post_csv = psutil.Process(os.getpid()).memory_info().rss >> 20


# Read Arrow using file API
with pa.OSFile('penguin-dataset.arrow', 'rb') as source:
    col_arrow_file = pa.ipc.open_file(source).read_all().column("Flipper Length (mm)").to_pandas()
memory_post_arrowos = psutil.Process(os.getpid()).memory_info().rss >> 20


# Read Arrow with memory-mapped API with missing values
source = pa.memory_map('penguin-dataset.arrow', 'r')
table_mmap = pa.ipc.RecordBatchFileReader(source).read_all().column("Flipper Length (mm)")
col_arrow_mapped = table_mmap.to_pandas()
memory_post_arrowmmap = psutil.Process(os.getpid()).memory_info().rss >> 20


# Read Arrow with memory-mapped API without missing values (zero-copy)
source = pa.memory_map('penguin-dataset-nonan.arrow', 'r')
table_mmap_zc = pa.ipc.RecordBatchFileReader(source).read_all().column("Flipper Length (mm)")
col_arrow_mapped_zc = table_mmap_zc.to_pandas()
memory_post_arrowmmap_zc = psutil.Process(os.getpid()).memory_info().rss >> 20


# Display memory consumption - memory_init
print(f"initial memory consumption: {memory_init}\n")
print(f"csv memory consumption: {memory_post_csv}\n"
f"Arrow file memory consumption: {memory_post_arrowos - memory_post_csv}\n"
      f"Arrow mapped (no zero-copy) memory consumption: {memory_post_arrowmmap - memory_post_arrowos}\n"
      f"Arrow mapped (zero-copy) memory consumption: {memory_post_arrowmmap_zc - memory_post_arrowmmap}\n")


initial memory consumption: 1137

csv memory consumption: 1195
Arrow file memory consumption: 197
Arrow mapped (no zero-copy) memory consumption: 5
Arrow mapped (zero-copy) memory consumption: 0



By using the memory-mapping function and with filled NaN values, the pandas DataFrame was created directly on top of the stored Arrow file. No copying: 0 MB of RAM