In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import string

In [None]:
# Use seaborn's 'whitegrid' style for the plots drawn in this notebook
sns.set_style('whitegrid')

In [None]:
# Benchmark results keyed by storage format; each entry holds 'read', 'write' and 'size'
results = {}

## Generate Test Data

The test `DataFrame` can be configured to contain numerical data, text data, or both. For the HDF5 library, we test both the fixed and the table format.

In [None]:
def generate_test_data(nrows=100000, numerical_cols=2000, text_cols=0, text_length=10, seed=None):
    """Build a test DataFrame of random floats and, optionally, constant text columns.

    Parameters
    ----------
    nrows : int
        Number of rows.
    numerical_cols : int
        Number of columns filled with uniform random floats in [0, 1).
    text_cols : int
        Number of text columns; every text cell holds the same random string.
    text_length : int
        Length of that string (drawn from ASCII letters).
    seed : int, optional
        Seed for reproducible output. With the default ``None`` the global
        ``random`` and ``np.random`` generators are used, exactly as before.

    Returns
    -------
    pd.DataFrame
        Frame of shape ``(nrows, numerical_cols + text_cols)`` whose column
        labels are the strings '0', '1', ..., numeric columns first.
    """
    if seed is None:
        # Keep using the global generators so any seeding done by the caller still applies.
        pick_letter, draw_floats = random.choice, np.random.random
    else:
        pick_letter = random.Random(seed).choice
        draw_floats = np.random.RandomState(seed).random_sample
    s = "".join(pick_letter(string.ascii_letters) for _ in range(text_length))
    data = pd.concat([pd.DataFrame(draw_floats(size=(nrows, numerical_cols))),
                      pd.DataFrame(np.full(shape=(nrows, text_cols), fill_value=s))],
                     axis=1, ignore_index=True)
    # ignore_index=True relabels the columns 0..n-1; store them as strings
    data.columns = [str(i) for i in data.columns]
    return data

In [None]:
# Label for this benchmark run; also used as the stem of the results CSV file name
data_type = 'Mixed'

In [None]:
# Build the test frame with 1,000 numeric and 1,000 text columns and print its summary
df = generate_test_data(numerical_cols=1000, text_cols=1000)
df.info()

## Parquet

### Size

In [None]:
# Scratch file for the Parquet benchmark (relative to the working directory)
parquet_file = Path('test.parquet')

In [None]:
# Write the frame once and record the resulting file size in bytes
df.to_parquet(parquet_file)
size = parquet_file.stat().st_size

### Read

In [None]:
%%timeit -o
# Time reading the Parquet file; -o returns the TimeitResult as the cell output
df = pd.read_parquet(parquet_file)

In [None]:
# `_` is IPython's last cell output, here the TimeitResult of the read benchmark above
read = _

In [None]:
# Delete the file written for the size/read measurements
parquet_file.unlink()

### Write

In [None]:
%%timeit -o
# Each timed iteration writes the file and deletes it again; the delete is included in the timing
df.to_parquet(parquet_file)
parquet_file.unlink()

In [None]:
# `_` is IPython's last cell output, here the TimeitResult of the write benchmark above
write = _

### Results

In [None]:
# Record mean per-execution read/write time (seconds) and file size (bytes) for Parquet.
# TimeitResult.average is already normalised by the loop count, whereas the mean of
# all_runs is only a per-execution time when timeit happened to pick loops == 1.
results['Parquet'] = {'read': read.average,
                      'write': write.average,
                      'size': size}

## HDF5

In [None]:
# Scratch HDF5 store file shared by all HDF5 benchmarks below
test_store = Path('index.h5')

### Fixed Format

#### Size

In [None]:
# Store the frame under key 'file' (put() without a format argument, i.e. pandas'
# default fixed format) and record the file size in bytes
with pd.HDFStore(test_store) as store:
    store.put('file', df)
size = test_store.stat().st_size

#### Read

In [None]:
%%timeit -o
# Time a full read of the stored frame; -o returns the TimeitResult as the cell output
with pd.HDFStore(test_store) as store:
    store.get('file')

In [None]:
# `_` is IPython's last cell output, here the TimeitResult of the read benchmark above
read = _

In [None]:
# Delete the store file written for the size/read measurements
test_store.unlink()

#### Write

In [None]:
%%timeit -o
# Each timed iteration writes the frame and deletes the store again; the delete is included in the timing
with pd.HDFStore(test_store) as store:
    store.put('file', df)
test_store.unlink()

In [None]:
# `_` is IPython's last cell output, here the TimeitResult of the write benchmark above
write = _

#### Results

In [None]:
# Record mean per-execution read/write time (seconds) and file size (bytes) for HDF5 fixed format.
# TimeitResult.average is already normalised by the loop count, whereas the mean of
# all_runs is only a per-execution time when timeit happened to pick loops == 1.
results['HDF Fixed'] = {'read': read.average,
                        'write': write.average,
                        'size': size}

### Table Format

#### Size

In [None]:
# Append the frame under key 'file' in table format ('t') and record the file size in bytes
with pd.HDFStore(test_store) as store:
    store.append('file', df, format='t')
size = test_store.stat().st_size

#### Read

In [None]:
%%timeit -o
# Time a full read of the table; -o returns the TimeitResult as the cell output
with pd.HDFStore(test_store) as store:
    df = store.get('file')

In [None]:
# `_` is IPython's last cell output, here the TimeitResult of the read benchmark above
read = _

In [None]:
# Delete the store file written for the size/read measurements
test_store.unlink()

#### Write

Note that `write` in table format does not work with text data.

In [None]:
%%timeit -o
# Each timed iteration writes the frame in table format and deletes the store again;
# the delete is included in the timing
with pd.HDFStore(test_store) as store:
    store.append('file', df, format='t')
test_store.unlink()

In [None]:
# `_` is IPython's last cell output, here the TimeitResult of the write benchmark above
write = _

#### Results

In [None]:
# Record mean per-execution read/write time (seconds) and file size (bytes) for HDF5 table format.
# TimeitResult.average is already normalised by the loop count, whereas the mean of
# all_runs is only a per-execution time when timeit happened to pick loops == 1.
results['HDF Table'] = {'read': read.average,
                        'write': write.average,
                        'size': size}

### Table Select

#### Size

In [None]:
# Write the frame in table format, requesting 'company' and 'form' as data columns,
# then record the file size in bytes.
# NOTE(review): generate_test_data labels its columns '0', '1', ...; 'company' and 'form'
# look like leftovers from another data set - confirm which columns should be indexed.
with pd.HDFStore(test_store) as store:
    store.append('file', df, format='t', data_columns=['company', 'form'])
size = test_store.stat().st_size

#### Read

In [None]:
# NOTE(review): not referenced by any other cell shown in this notebook; presumably
# intended as a filter value for a select() query - confirm or remove.
company = 'APPLE INC'

In [None]:
%%timeit
with pd.HDFStore(test_store) as store:
    s = store.get('file')

In [None]:
# `_` is IPython's last cell output.
# NOTE(review): it only holds a TimeitResult if the preceding cell ran `%%timeit` with -o.
read = _

In [None]:
# Delete the store file written for the size/read measurements
test_store.unlink()

#### Write

In [None]:
%%timeit
with pd.HDFStore(test_store) as store:
    store.append('file', df, format='t', data_columns=['company', 'form'])
test_store.unlink() 

In [None]:
# `_` is IPython's last cell output.
# NOTE(review): it only holds a TimeitResult if the preceding cell ran `%%timeit` with -o.
write = _

#### Results

In [None]:
# Record mean per-execution read/write time (seconds) and file size (bytes) for the
# HDF5 table written with data columns.
# TimeitResult.average is already normalised by the loop count, whereas the mean of
# all_runs is only a per-execution time when timeit happened to pick loops == 1.
results['HDF Select'] = {'read': read.average,
                         'write': write.average,
                         'size': size}

## CSV

In [None]:
# Scratch file for the CSV benchmark (relative to the working directory)
test_csv = Path('test.csv')

### Size

In [None]:
# Write the frame once and record the resulting file size in bytes.
# The size must be assigned: the CSV results cell reads `size`, which would otherwise
# still hold the value measured for the last HDF5 benchmark.
df.to_csv(test_csv)
size = test_csv.stat().st_size

### Read

In [None]:
%%timeit -o
# Time reading the CSV file; -o returns the TimeitResult as the cell output
df = pd.read_csv(test_csv)

In [None]:
# `_` is IPython's last cell output, here the TimeitResult of the read benchmark above
read = _

In [None]:
# Delete the file written for the size/read measurements
test_csv.unlink()

### Write

In [None]:
%%timeit -o
# Each timed iteration writes the file and deletes it again; the delete is included in the timing
df.to_csv(test_csv)
test_csv.unlink()

In [None]:
# `_` is IPython's last cell output, here the TimeitResult of the write benchmark above
write = _

### Results

In [None]:
# Record mean per-execution read/write time (seconds) and file size (bytes) for CSV.
# TimeitResult.average is already normalised by the loop count, whereas the mean of
# all_runs is only a per-execution time when timeit happened to pick loops == 1.
results['CSV'] = {'read': read.average,
                  'write': write.average,
                  'size': size}

In [None]:
# Persist this run: one row per storage format, tagged with the run label in 'Data'.
# to_csv() returns None when given a path, so it has to be the last call in the chain,
# and the label lives in `data_type` (there is no variable named `data`).
# The transpose puts 'read'/'write'/'size' in columns, which is the layout expected
# when the file is read back with index_col=0 and the columns are capitalised.
(pd.DataFrame(results)
 .T
 .assign(Data=data_type)
 .to_csv(f'{data_type}.csv'))

## Store Results

In [None]:
# Combine the stored results of the numeric-only and the mixed run.
# NOTE(review): 'Numeric.csv' is not written when data_type == 'Mixed'; presumably it comes
# from an earlier run of this notebook with data_type = 'Numeric' - confirm.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = (pd.concat([pd.read_csv('Numeric.csv', index_col=0),
                 pd.read_csv('Mixed.csv', index_col=0)])
      .rename(columns=str.capitalize))
df.index.name = 'Storage'
# Move the run label into the columns: rows are storage formats, columns are (metric, Data)
df = df.set_index('Data', append=True).unstack()
df.Size /= 1e9  # bytes -> GB, matching the x-axis label of the size plot

In [None]:
# One horizontal bar chart per metric: timings use a log-scaled x-axis, size is shown in GB
fig, axes = plt.subplots(ncols=3, figsize=(16, 4))
for ax, op in zip(axes, ['Read', 'Write', 'Size']):
    is_timing = op != 'Size'
    df.loc[:, op].plot.barh(title=op, ax=ax, logx=is_timing)
    ax.set_xlabel('seconds (log scale)' if is_timing else 'GB')
fig.tight_layout()
fig.savefig('storage', dpi=300);