## Preparing CSV files

In [None]:
from progressivis.datasets import wget_file
import os
# Remote bz2-compressed CSV and the local paths used to cache it.
url = "https://www.aviz.fr/nyc-taxi/yellow_tripdata_2015-01.csv.bz2"
# When True, USE_FILE points at a decompressed copy rather than the archive.
UNCOMPRESSED = True
bz2_file = "/tmp/yellow_tripdata_2015-01.csv.bz2"
csv_file = "/tmp/yellow_tripdata_2015-01.csv"

# Download the archive only when it is not already cached locally.
downloaded = False
if not os.path.exists(bz2_file):
    print(f"loading {bz2_file} ...", end=" ")
    wget_file(bz2_file, url)
    print("done")
    downloaded = True

# Decompress after a fresh download, and also whenever the archive is cached
# but the CSV is missing; otherwise USE_FILE would name a non-existent file.
if UNCOMPRESSED and (downloaded or not os.path.exists(csv_file)):
    import bz2
    import shutil
    print(f"uncompressing {bz2_file} ...", end=" ")
    # Stream into a temporary file and rename it at the end, so that the
    # whole dataset is never held in memory and an interrupted run does not
    # leave a truncated CSV that later runs would take for a complete one.
    part_file = csv_file + ".part"
    with bz2.open(bz2_file, "rb") as rf, open(part_file, "wb") as wf:
        shutil.copyfileobj(rf, wf)
    os.replace(part_file, csv_file)
    print("done")
USE_FILE = csv_file if UNCOMPRESSED else bz2_file

# Subset of CSV columns passed as `usecols` to every loader in this notebook.
USECOLS = [
    "trip_distance",
    "pickup_longitude", "pickup_latitude",
    "dropoff_longitude", "dropoff_latitude",
]

## Loading a CSV file with pandas read_csv()

In [None]:
import pandas as pd
import time
# Time a plain pandas load of the selected columns.
# perf_counter() is used rather than time() because it is monotonic and
# therefore not affected by system clock adjustments during the measurement.
start_t = time.perf_counter()
df = pd.read_csv(USE_FILE, usecols=USECOLS)
end_t = time.perf_counter()
print(f"Elapsed time: {(end_t - start_t):.2f}")
# Summarise the resulting DataFrame.
df.info()

## Loading the same CSV file with ProgressiVis

In [None]:
from progressivis import Sink, SimpleCSVLoader, get_dataset, Scheduler
from progressivis.core import aio
import time

s = Scheduler.default
module = SimpleCSVLoader(
            USE_FILE, usecols=USECOLS, scheduler=s
        )
sink = Sink(scheduler=s)
sink.input.inp = module.output.result
start_t = time.time()
await s.start()
end_t = time.time()
print(f"Elapsed time: {(end_t - start_t):.2f}")
module.result

## Loading the same CSV file with the ProgressiVis threaded CSV loader

**NB:** Consider restarting the kernel and re-running the first cell before running the cell below.

In [None]:
from progressivis import Sink, ThreadedCSVLoader, get_dataset, Scheduler
from progressivis.core import aio
import time
s = Scheduler.default
module = ThreadedCSVLoader(
            USE_FILE, usecols=USECOLS, scheduler=s
        )
sink = Sink(scheduler=s)
sink.input.inp = module.output.result
start_t = time.time()
await s.start()
end_t = time.time()
print(f"Elapsed time: {(end_t - start_t):.2f}")
module.result