In [1]:
import pandas as pd
import itertools as it
from time import time

In [7]:
def measure_read_time(filename, start):
    '''
    Report how long a read took and append the result to the timing log.

    filename : label that is printed and logged for this measurement
    start    : value of time() captured just before the read began

    The elapsed time is computed first, so neither the print nor the log
    write is included in the measurement. Every call appends one line to
    'record_read.txt' in the current working directory.
    '''
    elapsed = time() - start
    record = '%s : %.4f sec' % (filename, elapsed)
    print(filename, '%.4f' % elapsed)
    with open('record_read.txt', 'a') as log:
        log.write(record + '\n')
    

In [20]:
def read_file(extension, hyperparameter = None) :
    '''
    Read one benchmark file with the matching pandas reader and time the read.

    File name pattern:
        hyperparameter is None       -> 'test.<extension>'
        hyperparameter is a string   -> 'test_<hyperparameter>.<extension>'
        hyperparameter is a sequence -> 'test_<non-None items joined by "_">.<extension>'

    extension      : 'h5', 'xlsx', 'csv', 'json', 'ftr', 'parquet', 'dta' or 'pkl'
    hyperparameter : None, a single option string (compression for csv / pkl;
                     only used in the file name for parquet), or a pair --
                     (orient, compression) for json, (format, complib) for h5.
                     None is accepted for every extension.

    Returns None. The elapsed time is printed and logged by measure_read_time.
    Raises ValueError when no reader is registered for the extension.

    Options available on the writing side, per format:
    pd.to_csv, compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
    pd.to_json, orient : {'split', 'records', 'index', 'columns', 'values', 'table'}, 
                        default 'columns' 
                compression{'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
    pd.to_excel, None 
    pd.to_hdf, complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib', 
                    format : {'fixed', 'table', None}, default 'fixed',
    pd.to_feather, None
    pd.to_parquet, compression{'snappy', 'gzip', 'brotli', None}, default 'snappy'
    pd.to_stata, None
    pd.to_pickle, compression{'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
    '''

    if hyperparameter is None:
        filename = 'test.%s' % (extension)
    else:
        if isinstance(hyperparameter, str):
            tag = hyperparameter
        else:
            tag = '_'.join(part for part in hyperparameter if part is not None)
        filename = 'test_%s.%s' % (tag, extension)

    # json and h5 take a pair of options. Fall back to (None, None) so the
    # default hyperparameter=None no longer fails on subscripting None.
    pair = (None, None) if hyperparameter is None else hyperparameter

    # One zero-argument reader per extension. The lambdas are only evaluated
    # for the selected extension, so pair[...] is never indexed for formats
    # that take a plain string.
    readers = {
        'h5': lambda: pd.read_hdf(filename, 'key', complib=pair[1]),
        'xlsx': lambda: pd.read_excel(filename),
        'csv': lambda: pd.read_csv(filename, compression=hyperparameter),
        'json': lambda: pd.read_json(filename, orient=pair[0], compression=pair[1]),
        'ftr': lambda: pd.read_feather(filename),
        'parquet': lambda: pd.read_parquet(filename),
        'dta': lambda: pd.read_stata(filename),
        # NOTE: unpickling can execute arbitrary code; only point this at
        # files generated locally for the benchmark.
        'pkl': lambda: pd.read_pickle(filename, compression=hyperparameter),
    }

    try:
        reader = readers[extension]
    except KeyError:
        # Previously an unknown extension fell through every branch and the
        # call returned without reading or recording anything.
        raise ValueError(
            'unsupported extension %r; expected one of %s'
            % (extension, ', '.join(sorted(readers)))
        ) from None

    # Start the clock only once everything else is set up, so the measurement
    # covers the pandas read call alone.
    start = time()
    reader()
    measure_read_time(filename, start)

In [15]:
# Option grids; each entry is handed to read_file as its `hyperparameter`.

# Formats with two options: start from a name -> choices mapping, then expand
# it into every combination (first key varies slowest).
json = {
    'orient' : ['split', 'records', 'index', 'columns', 'values', 'table'],
    'compression' : ['infer', 'gzip', 'bz2', 'zip', 'xz', None],
}
json = list(it.product(*json.values()))

h5 = {
    'format' : ['fixed', 'table', None],
    'complib' : ['zlib', 'lzo', 'bzip2', 'blosc'],
}
h5 = list(it.product(*h5.values()))

# Formats with a single option: one entry per run.
csv = ['infer', 'gzip', 'bz2', 'zip', 'xz', None]
parquet = ['snappy', 'gzip', 'brotli', None]
pkl = ['infer', 'gzip', 'bz2', 'zip', 'xz', None]

In [38]:
# Feather has no option grid: time the single file test.ftr.
read_file('ftr')

test.ftr 0.1319


In [11]:
# Time read_pickle once for each compression setting listed in `pkl`.
for i in pkl:
    read_file('pkl', i)

test_infer.pkl 0.9683
test_gzip.pkl 2.3499
test_bz2.pkl 14.9036
test_zip.pkl 2.8101
test_xz.pkl 4.9856
test.pkl 0.9685


In [12]:
# Stata has no option grid: time the single file test.dta.
read_file('dta')

test.dta 6.0826


In [25]:
# Time read_parquet once per entry in `parquet`; the entry only selects the
# file name, read_file passes no compression argument to read_parquet.
for i in parquet:
    read_file('parquet', i)

test_snappy.parquet 0.1641
test_gzip.parquet 0.3675
test_brotli.parquet 0.1962
test.parquet 0.1309


In [17]:
# Time read_json for every (orient, compression) pair in `json`.
# NOTE(review): the recorded output stops at test_table_xz.json with no time,
# so this run looks like it was interrupted before finishing -- confirm and re-run.
for i in json:
    read_file('json', i)

test_split_infer.json 30.6063
test_split_gzip.json 34.3981
test_split_bz2.json 49.7849
test_split_zip.json 32.4178
test_split_xz.json 37.6870
test_split.json 30.6260
test_records_infer.json 52.7509
test_records_gzip.json 58.0450
test_records_bz2.json 86.6441
test_records_zip.json 58.0955
test_records_xz.json 58.2658
test_records.json 50.8276
test_index_infer.json 146.1879
test_index_gzip.json 153.0526
test_index_bz2.json 186.2147
test_index_zip.json 152.6008
test_index_xz.json 156.6578
test_index.json 145.6078
test_columns_infer.json 87.5633
test_columns_gzip.json 89.5058
test_columns_bz2.json 118.7542
test_columns_zip.json 88.7969
test_columns_xz.json 99.0491
test_columns.json 79.6321
test_values_infer.json 26.3524
test_values_gzip.json 29.8537
test_values_bz2.json 45.2295
test_values_zip.json 28.2081
test_values_xz.json 33.9267
test_values.json 27.5413
test_table_infer.json 52.3023
test_table_gzip.json 58.3693
test_table_bz2.json 88.4866
test_table_zip.json 56.9743
test_table_xz.json

In [18]:
# Time read_csv once for each compression setting listed in `csv`.
for i in csv:
    read_file('csv', i)

test_infer.csv 10.1735
test_gzip.csv 13.9003
test_bz2.csv 36.7044
test_zip.csv 13.6252
test_xz.csv 20.0898
test.csv 9.8901


In [21]:
# Time read_hdf for every (format, complib) pair in `h5`; a None format is
# left out of the file name, giving e.g. test_zlib.h5.
for i in h5:
    read_file('h5', i)

test_fixed_zlib.h5 1.1123
test_fixed_lzo.h5 1.0547
test_fixed_bzip2.h5 1.0491
test_fixed_blosc.h5 1.0509
test_table_zlib.h5 4.3380
test_table_lzo.h5 4.3472
test_table_bzip2.h5 4.3019
test_table_blosc.h5 4.3024
test_zlib.h5 0.9725
test_lzo.h5 0.9630
test_bzip2.h5 1.0863
test_blosc.h5 0.9703
