# Comparing File Formats

In [1]:
import os
import pandas as pd
from time import perf_counter
import plotly.express as px

In [2]:
# Base name shared by every copy of the example dataset; the format-specific
# extension is appended when the files are read.
file_name = "ex_data"

# Data directory, expressed relative to the current working directory.
data_folder = os.path.relpath("../data", start=os.getcwd())

# Extension-less path to the dataset.
file_path = os.path.join(data_folder, file_name)

# Parquet file the benchmark timings are written to and later read back from.
times_file_path = os.path.join(data_folder, "times.parquet")

In [3]:
# Number of timed reads per file format; lower it for a quick smoke run.
N_RUNS = 1000


def _time_reads(read_func, path, n_runs=N_RUNS):
    """Time repeated calls to ``read_func(path)``.

    Parameters
    ----------
    read_func : callable
        Reader to benchmark; called with ``path`` as its only argument.
    path : str
        File handed to ``read_func`` on every call.
    n_runs : int, optional
        How many times the file is read. Defaults to ``N_RUNS``.

    Returns
    -------
    list of float
        One ``perf_counter`` duration (in seconds) per read, in call order.
    """
    durations = []
    for _ in range(n_runs):
        start = perf_counter()
        frame = read_func(path)
        durations.append(perf_counter() - start)
        # Release the result only after the clock has stopped, so freeing
        # one read's data is never charged to a timed read.
        del frame
    return durations


# One column of timings per format, using the matching pandas reader
# (pd.read_json, pd.read_csv, pd.read_parquet).
times = {
    ext: _time_reads(getattr(pd, f"read_{ext}"), f"{file_path}.{ext}")
    for ext in ["json", "csv", "parquet"]
}
times_df = pd.DataFrame(times)

# Persist the measurements so they can be reloaded from disk later.
times_df.to_parquet(times_file_path)

In [4]:
# Load the timings stored at `times_file_path` and summarise each format's
# read-time distribution (count, mean, std, min, quartiles, max).
times_df = pd.read_parquet(path=times_file_path)
times_df.describe()

Unnamed: 0,json,csv,parquet
count,1000.0,1000.0,1000.0
mean,0.051457,0.004547,0.000857
std,0.004754,0.000552,0.001267
min,0.045634,0.003719,0.000679
25%,0.048202,0.004256,0.000762
50%,0.050839,0.004511,0.000792
75%,0.053428,0.004723,0.000838
max,0.091307,0.012413,0.040754


In [5]:
# Box plot of the per-read durations, one box per file format.
# The formats differ by well over an order of magnitude, so a log y-axis keeps
# the fastest format's box legible next to the slowest one.
fig = px.box(
    times_df,
    y=times_df.columns,
    log_y=True,
    title="Read time per file format",
    # Wide-form input gets the default axis names "variable" and "value";
    # replace them with labels that describe the data.
    labels={"variable": "file format", "value": "read time (s)"},
)
fig.show()