Comparison of CSV file reading performance: `modin` + `ray` versus `pandas`, `pyarrow` and `dask`

## Install modin

In [1]:
# One-time setup: uncomment the line below to install modin together with its ray extra.
# pandas, pyarrow and dask are imported further down and must be installed as well.
# !pip install 'modin[ray]'

In [2]:
# One-time setup: uncomment the line below to download yellow_tripdata_2020-01.csv,
# the CSV file that the read benchmarks in this notebook load.
# !wget https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2020-01.csv

Number of CPUs

In [3]:
import os

# Show the machine's CPU count as context for the timings in this notebook.
cpu_banner = f"CPUs: {os.cpu_count()}"
print(cpu_banner)

CPUs: 4


## Init ray

In [4]:
import ray

# Start a local Ray runtime before modin is used.
# `_plasma_directory="/tmp"` overrides the directory used by Ray's plasma object store.
# `ignore_reinit_error=True` keeps this cell re-runnable in a live kernel: without it,
# calling `ray.init()` a second time raises because Ray is already initialized.
ray.init(_plasma_directory="/tmp", ignore_reinit_error=True)

2021-06-26 11:36:10,157	INFO services.py:1315 -- View the Ray dashboard at http://127.0.0.1:8265
2021-06-26 11:36:10,162	INFO services.py:1797 -- object_store_memory is not verified when plasma_directory is set.


{'node_ip_address': '192.168.0.77',
 'raylet_ip_address': '192.168.0.77',
 'redis_address': '192.168.0.77:6379',
 'object_store_address': '/tmp/ray/session_2021-06-26_11-36-07_986819_93941/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-06-26_11-36-07_986819_93941/sockets/raylet',
 'webui_url': '127.0.0.1:8265',
 'session_dir': '/tmp/ray/session_2021-06-26_11-36-07_986819_93941',
 'metrics_export_port': 58071,
 'node_id': '59c065d0cabeb0873519919b179758ac9d94e30a13b13f0416129393'}

## Import all libs

In [5]:
import modin.pandas as mpd
import pandas as pd
import pyarrow.csv as csv
import dask.dataframe as dd

In [6]:
# CSV file read by every benchmark cell; the relative path is resolved against
# the notebook's working directory.
file_name = "yellow_tripdata_2020-01.csv"

## modin

In [7]:
%%timeit

df = mpd.read_csv(file_name)

12 s ± 1.69 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


## pandas

In [8]:
%%timeit

df = pd.read_csv(file_name)



13 s ± 1.34 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


## pyarrow

In [9]:
%%timeit

table = csv.read_csv(file_name)
df = table.to_pandas()

4.68 s ± 416 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## dask

In [10]:
%%timeit

# Not fair comparison since it returns dask dataframe not pandas.DataFrame
# like all other methods
df = dd.read_csv(file_name)

9.71 ms ± 158 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
