In [3]:
import random
import string
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.csv as csv
from datetime import datetime
from functools import lru_cache

In [4]:
def gen_random_string(length: int = 32) -> str:
    """Return a random string made of ``length`` characters.

    Each character is drawn independently (with replacement) from the
    uppercase ASCII letters followed by the decimal digits, using the
    module-level ``random`` generator.
    """
    alphabet = string.ascii_uppercase + string.digits
    picked = random.choices(alphabet, k=length)
    return ''.join(picked)

In [6]:
# Smoke test: call with no argument, so the default length of 32 applies.
gen_random_string()

'H1RFBD7BJSOG27QL4Y3YGF9A0NRBTYJG'

In [7]:
# Call again with the same default; the value shown differs from the previous cell's.
gen_random_string()

'QLS4F1Y4NR9OBCT0X0B3UKE87D3Z0WK2'

In [51]:
# One timestamp per minute, from 2019-01-01 00:00 through 2022-04-20 00:00
# (both endpoints included). This index drives the size of the benchmark frame.
dt = pd.date_range(
    start=datetime(2019, 1, 1),
    end=datetime(2022, 4, 20),
    freq='min',
)

In [52]:
# Seed the random generators so the generated data is reproducible.
# `np.random.seed` must be *called*: assigning to it (`np.random.seed = 42`)
# replaces the function with an int, seeds nothing, and makes any later
# `np.random.seed(...)` call fail. If the old assignment already ran in this
# kernel, restart the kernel before running this cell.
np.random.seed(42)
# gen_random_string() draws from the stdlib `random` module, which has its own
# generator state, so seed it as well.
random.seed(42)
# Number of rows to generate for every column: one per timestamp in `dt`.
df_size = dt.shape[0]

In [53]:
%%time
# Build the benchmark frame: the timestamp column, five float columns and two
# columns of random 32-character strings. The string columns are produced one
# Python call per row.
df = pd.DataFrame({
    'date': dt,
    # Columns 'a'..'e', each filled by its own np.random.rand(df_size) draw,
    # generated in that order.
    **{name: np.random.rand(df_size) for name in 'abcde'},
    'str1': [gen_random_string() for _ in range(df_size)],
    'str2': [gen_random_string() for _ in range(df_size)],
})

Wall time: 1min 1s


In [54]:
# (rows, columns) of the generated frame.
df.shape

(1735201, 8)

In [55]:
# Preview the first rows of the generated frame.
df.head()

Unnamed: 0,date,a,b,c,d,e,str1,str2
0,2019-01-01 00:00:00,0.021719,0.789401,0.996341,0.200924,0.516212,X0Z3ISVP38V3DZNYRCERWA8V3RXJ61EM,PPB1TRP11N2XAYG4UZOQ0W38IEO7AQB6
1,2019-01-01 00:01:00,0.860443,0.769696,0.365889,0.331651,0.459428,K5KENXUIG7AMG0IJWDSVKK0TGJBAPXKS,JAEDXZVT1HCM4GJIT3N2PD3AJWZ113W9
2,2019-01-01 00:02:00,0.548508,0.612747,0.640164,0.839859,0.306226,G2OM7MRASVNJVSLSTMR4MY00QWA85YD1,SGGPHQUY70NI8FABBUZQNGXTOEXNARXQ
3,2019-01-01 00:03:00,0.890642,0.209416,0.599862,0.600903,0.428214,HRIEMQBHE1TQK0Z0QV8I1V418ZNSACCX,BIYQKWBN8K0IKPDAE6SY8DU5AB8WMXPH
4,2019-01-01 00:04:00,0.969293,0.456791,0.757094,0.338912,0.702087,WOPCVZFZ2HE3Z26OY58NEKMP1HV22LOX,F8MEH2JHMTDRS3QP1CGBXZ7KURC2N19F


In [56]:
%%time
# pandas baseline: write the frame to an uncompressed CSV, omitting the index column.
df.to_csv('csv_pandas.csv', index=False)

Wall time: 42.5 s


In [57]:
%%time

# Same pandas export, this time gzip-compressed.
df.to_csv('csv_pandas.csv.gz', index=False, compression='gzip')

Wall time: 1min 40s


In [58]:
%%time

# Read the uncompressed CSV back with pandas. No dtype/parse_dates arguments are
# passed, so column types are left to pandas' own inference.
df1=pd.read_csv('csv_pandas.csv')

Wall time: 20 s


In [59]:
%%time

# Read the gzip-compressed CSV back with pandas. No compression argument is given;
# this relies on pandas detecting gzip from the '.gz' suffix.
df2=pd.read_csv('csv_pandas.csv.gz')

Wall time: 13.8 s


In [60]:
# Work on a copy so `df` itself is left untouched.
df_pa = df.copy()
# Replace the timestamps with whole seconds since the Unix epoch (int64).
# Casting to datetime64[s] first keeps the result correct whatever resolution
# the column is stored in; dividing the raw int64 view by 10 ** 9 is only right
# for nanosecond-resolution data.
df_pa['date'] = df_pa['date'].values.astype('datetime64[s]').astype(np.int64)

In [61]:
# Preview the first rows of df_pa, whose 'date' column now holds integer epoch seconds.
df_pa.head()

Unnamed: 0,date,a,b,c,d,e,str1,str2
0,1546300800,0.021719,0.789401,0.996341,0.200924,0.516212,X0Z3ISVP38V3DZNYRCERWA8V3RXJ61EM,PPB1TRP11N2XAYG4UZOQ0W38IEO7AQB6
1,1546300860,0.860443,0.769696,0.365889,0.331651,0.459428,K5KENXUIG7AMG0IJWDSVKK0TGJBAPXKS,JAEDXZVT1HCM4GJIT3N2PD3AJWZ113W9
2,1546300920,0.548508,0.612747,0.640164,0.839859,0.306226,G2OM7MRASVNJVSLSTMR4MY00QWA85YD1,SGGPHQUY70NI8FABBUZQNGXTOEXNARXQ
3,1546300980,0.890642,0.209416,0.599862,0.600903,0.428214,HRIEMQBHE1TQK0Z0QV8I1V418ZNSACCX,BIYQKWBN8K0IKPDAE6SY8DU5AB8WMXPH
4,1546301040,0.969293,0.456791,0.757094,0.338912,0.702087,WOPCVZFZ2HE3Z26OY58NEKMP1HV22LOX,F8MEH2JHMTDRS3QP1CGBXZ7KURC2N19F


In [62]:
%%time

# Convert the pandas frame into a pyarrow Table; this table is what the
# pyarrow CSV writer calls in this notebook take as input.
df_pa_table =pa.Table.from_pandas(df_pa)

Wall time: 548 ms


In [63]:
%%time

# Write the Arrow table to an uncompressed CSV.
# Note: `csv` here is pyarrow.csv (imported as `csv`), not the stdlib csv module.
csv.write_csv(df_pa_table, 'csv_pyarrow.csv')

Wall time: 5.3 s


In [64]:
%%time

# Gzip-compressed export: wrap the target file in a pyarrow CompressedOutputStream
# and let the CSV writer stream into it. The `with` block closes the stream on exit.
with pa.CompressedOutputStream('csv_pyarrow.csv.gz', 'gzip') as out:
    csv.write_csv(df_pa_table, out)

Wall time: 39.5 s


In [65]:
%%time

# Read the uncompressed CSV back with pyarrow.csv.read_csv.
# NOTE(review): despite the df_ prefix, this should be a pyarrow Table rather than a
# pandas DataFrame — confirm before calling DataFrame methods on it.
df_pa_1=csv.read_csv('csv_pyarrow.csv')

Wall time: 4.65 s


In [66]:
%%time

# Read the gzip-compressed CSV back with pyarrow. No explicit decompression step is
# used — presumably pyarrow infers gzip from the '.gz' extension; confirm against the
# pyarrow.csv.read_csv documentation.
df_pa_2=csv.read_csv('csv_pyarrow.csv.gz')

Wall time: 3.78 s


In [78]:
def fastwrite():
    """Write ``df_pa_table`` to 'csv_pyarrow1.csv' with pyarrow's CSV writer.

    This function is intentionally not memoized. It takes no arguments and
    exists only for its side effect, so wrapping it in ``lru_cache`` would make
    every call after the first return the cached ``None`` without writing
    anything: the file would go stale if ``df_pa_table`` changed, and repeat
    timings would measure a cache lookup rather than a write.

    Returns:
        None.
    """
    csv.write_csv(df_pa_table, 'csv_pyarrow1.csv')

def fastread():
    """Read 'csv_pyarrow1.csv' with pyarrow's CSV reader and return the result.

    Returning the loaded table (instead of discarding it) lets callers inspect
    or validate what was read; callers that only time the call can ignore the
    return value.

    Returns:
        The object produced by ``csv.read_csv`` (``csv`` is ``pyarrow.csv``).
    """
    return csv.read_csv('csv_pyarrow1.csv')


In [79]:
%%time
# First timed call to fastwrite().
fastwrite()

Wall time: 5.91 s


In [80]:
%%time
# Repeat the call to compare its timing with the first run.
# NOTE(review): if fastwrite is memoized, this call returns the cached result without
# touching the file, so its timing would not be a write benchmark.
fastwrite()

In [81]:
%%time
# Timed read of 'csv_pyarrow1.csv' (the file fastwrite() writes) via fastread().
fastread()

Wall time: 1.21 s


In [83]:
%%time
# Repeat the read to compare against the previous timing.
fastread()

Wall time: 3.75 s
