### Based on https://github.com/FalkonML/falkon/blob/master/notebooks/NycTaxiDataset.ipynb

In [1]:
import psycopg2
import sys, os, time

In [2]:
csv_size = 3_000_000
folder = os.path.abspath("./taxi_data_preprocessed")
try:
    os.makedirs(folder)
except FileExistsError:
    pass

### Preprocess data into compressed CSVs using SQL

In [3]:
def make_copy_sql(from_id, num_rows, out_file):
    return """
    COPY (
        SELECT
            id,
            EXTRACT(EPOCH FROM CAST(pickup_datetime AS time)) as time,
            EXTRACT(ISODOW FROM pickup_datetime) as dow,
            EXTRACT(DAY FROM pickup_datetime) as dom,
            EXTRACT(MONTH FROM pickup_datetime) as month,
            round(pickup_latitude, 6) as pickup_lat,
            round(pickup_longitude, 6) as pickup_lon,
            round(dropoff_latitude, 6) as dropoff_lat,
            round(dropoff_longitude, 6) as dropoff_lon,
            round(trip_distance, 3) as distance,
            EXTRACT(EPOCH FROM dropoff_datetime - pickup_datetime) as duration
        FROM trips WHERE 
            (pickup_nyct2010_gid IS NOT NULL) AND 
            (dropoff_nyct2010_gid IS NOT NULL) AND
            (EXTRACT(EPOCH FROM dropoff_datetime - pickup_datetime) BETWEEN 0 AND 18000) AND
            (id > %d)
        ORDER BY id ASC
        LIMIT %d
    ) TO '%s'
    WITH (FORMAT csv, HEADER true);""" % (from_id, num_rows, out_file)

In [4]:
conn = psycopg2.connect(database="nyc-taxi-data")

In [5]:
index = 0
i = 0
with conn.cursor() as cursor:
    while True:
        t_s = time.time()
        fn = os.path.join(folder, "%d.csv" % (index))
        # Postgres server needs to have permission to create the file
        os.umask(0)
        with open(os.open(fn, os.O_CREAT | os.O_WRONLY, 0o777), 'w') as fh:
            cursor.copy_expert(make_copy_sql(index, csv_size, fn), fh)
        # Read last line of written file to check the new index
        last_line = os.popen('tail -n 1 %s' % (fn)).read()
        if last_line == "":
            break
        index = int(last_line.split(",")[0])
        # Compress the file
        os.popen('gzip -f %s' % (fn))

        i += 1
        print("%d - %.2fs - Retrieved new start ID %d" % (i, time.time() - t_s, index))

1 - 34.12s - Retrieved new start ID 3111558
2 - 30.14s - Retrieved new start ID 6223504
3 - 7.00s - Retrieved new start ID 9335587
4 - 9.70s - Retrieved new start ID 12447424
5 - 5.29s - Retrieved new start ID 14092112


ValueError: invalid literal for int() with base 10: 'id'

### Turn zipped csv files into a compressed h5py dataset

In [12]:
import pandas as pd
import h5py
import time
import os
import numpy as np

In [13]:
data_folder = "./taxi_data_preprocessed"
def list_files(folder):
    for r, d, f in os.walk(folder):
        for file in f:
            if file.endswith('.csv.gz'):
                yield os.path.join(r, file)

In [14]:
all_x = []
all_y = []
for f in list_files(data_folder):
    t_s = time.time()
    df = pd.read_csv(f, header=0, index_col=False)
#     print(df.head())
    Y = df['duration'].to_numpy(np.int32, copy=True)
    X = df[['time', 'dow', 'dom', 'month', 'pickup_lat',
            'pickup_lon', 'dropoff_lat', 'dropoff_lon',
            'distance']].to_numpy(np.float64, copy=True)
    all_x.append(X)
    all_y.append(Y)
    del df
    print("file %s read in %.2fs" % (f, time.time() - t_s))

file ./taxi_data_preprocessed/9335587.csv.gz read in 4.97s
file ./taxi_data_preprocessed/6223504.csv.gz read in 4.50s
file ./taxi_data_preprocessed/3111558.csv.gz read in 4.70s
file ./taxi_data_preprocessed/0.csv.gz read in 4.60s
file ./taxi_data_preprocessed/12447424.csv.gz read in 2.36s


In [15]:
num_samples = sum([arr.shape[0] for arr in all_x])
dim = all_x[0].shape[1]
print(num_samples, ",", dim)

13585727 , 9


In [16]:
max_chunk_size = 2 * 2**20  # 2MB
chunk_x = int(max_chunk_size / dim / 8)
chunk_y = chunk_x
print("Chunk size:", chunk_x)

Chunk size: 29127


In [17]:
with h5py.File(os.path.join(data_folder, 'full.h5py'), 'w', libver='latest') as f:
    Xdset = f.create_dataset("X", (num_samples, dim), dtype='float64', 
                             compression="gzip", chunks=(chunk_x, dim))
    Ydset = f.create_dataset("Y", (num_samples, 1), dtype='int32')
    current_i = 0
    for X, Y in zip(all_x, all_y):
        t_s = time.time()
        X = np.ascontiguousarray(X)
        Y = Y.reshape((-1, 1))
        Xdset.write_direct(X, dest_sel=np.s_[current_i:current_i+X.shape[0], :])
        Ydset.write_direct(Y, dest_sel=np.s_[current_i:current_i+Y.shape[0], :])
        current_i += X.shape[0]
        print("i: %d/%d in %.2fs" % (current_i, num_samples, time.time() - t_s))

i: 3000000/13585727 in 12.53s
i: 6000000/13585727 in 12.44s
i: 9000000/13585727 in 12.13s
i: 12000000/13585727 in 11.80s
i: 13585727/13585727 in 6.25s
