# Parquet et Avro


## Avro

### Définir un schema Avro

In [1]:
!pip install fastavro

Collecting fastavro
[?25l  Downloading https://files.pythonhosted.org/packages/3c/47/1ce5e783fd7ecabcda82c6cb35c79779c747e77d23e459d46ece529392db/fastavro-1.0.0.post1-cp36-cp36m-manylinux2014_x86_64.whl (1.4MB)
[K     |████████████████████████████████| 1.4MB 2.8MB/s 
[?25hInstalling collected packages: fastavro
Successfully installed fastavro-1.0.0.post1


In [2]:
from fastavro import writer, reader, parse_schema

schema = {
    'doc': 'A weather reading.',
    'name': 'Weather',
    'namespace': 'test',
    'type': 'record',
    'fields': [
        {'name': 'station', 'type': 'string'},
        {'name': 'time', 'type': 'long'},
        {'name': 'temp', 'type': 'int'},
    ],
}

### Ecrire des records dans un fichier Avro

In [3]:
parsed_schema = parse_schema(schema)

# 'records' can be an iterable (including generator)
records = [
    {u'station': u'011990-99999', u'temp': 0, u'time': 1433269388},
    {u'station': u'011990-99999', u'temp': 22, u'time': 1433270389},
    {u'station': u'011990-99999', u'temp': -11, u'time': 1433273379},
    {u'station': u'012650-99999', u'temp': 111, u'time': 1433275478},
]

# Writing
with open('weather.avro', 'wb') as out:
    writer(out, parsed_schema, records)

In [4]:
!ls -lh 

total 8.0K
drwxr-xr-x 1 root root 4.0K Oct 14 16:31 sample_data
-rw-r--r-- 1 root root  337 Oct 27 09:11 weather.avro


### Lire le fichier Avro précédemment enregistré

In [6]:
with open('weather.avro', 'rb') as fo:
    for record in reader(fo):
        print(record)

{'station': '011990-99999', 'time': 1433269388, 'temp': 0}
{'station': '011990-99999', 'time': 1433270389, 'temp': 22}
{'station': '011990-99999', 'time': 1433273379, 'temp': -11}
{'station': '012650-99999', 'time': 1433275478, 'temp': 111}


### Enregistrer une dataframe Pandas

In [8]:
!pip install pandavro

Collecting pandavro
  Downloading https://files.pythonhosted.org/packages/9f/6d/cd1944c0851514636c696a18cb2d9f01b66d2b73bb0c5d5c29b2581fd311/pandavro-1.5.2.tar.gz
Building wheels for collected packages: pandavro
  Building wheel for pandavro (setup.py) ... [?25l[?25hdone
  Created wheel for pandavro: filename=pandavro-1.5.2-cp36-none-any.whl size=2956 sha256=c1f320dd2c2cab306751954acf9cfe41d3d189496b725851aff74ccf89260ac9
  Stored in directory: /root/.cache/pip/wheels/9c/71/4f/f60022ad5c477241cb81edb1e29c50c55e98850224e0676ec6
Successfully built pandavro
Installing collected packages: pandavro
Successfully installed pandavro-1.5.2


In [9]:
import numpy as np
import pandas as pd
import pandavro as pdx

filename = "df.avro"

df = pd.DataFrame({"Boolean": [True, False, True, False],
                    "Float64": np.random.randn(4),
                    "Int64": np.random.randint(0, 10, 4),
                    "String": ['foo', 'bar', 'foo', 'bar'],
                    "DateTime64": [pd.Timestamp('20190101'), pd.Timestamp('20190102'),
                                  pd.Timestamp('20190103'), pd.Timestamp('20190104')]})

pdx.to_avro(filename, df)

In [10]:
!ls -lh

total 12K
-rw-r--r-- 1 root root  512 Oct 27 09:16 df.avro
drwxr-xr-x 1 root root 4.0K Oct 14 16:31 sample_data
-rw-r--r-- 1 root root  337 Oct 27 09:11 weather.avro


### Lire le fichier Avro créé

In [11]:
saved = pdx.read_avro(filename)
print(saved)

   Boolean   Float64  Int64 String                DateTime64
0     True -0.580502      1    foo 2019-01-01 00:00:00+00:00
1    False  0.096166      9    bar 2019-01-02 00:00:00+00:00
2     True -0.433939      2    foo 2019-01-03 00:00:00+00:00
3    False  1.051204      2    bar 2019-01-04 00:00:00+00:00


## Parquet

### Ecrire un fichier Parquet

In [12]:
!pip install pyarrow



In [13]:
import pyarrow as pa
import pyarrow.parquet as pq

table = pa.Table.from_pandas(df)
pq.write_table(table, 'df.parquet')

In [14]:
!ls -lh

total 16K
-rw-r--r-- 1 root root  512 Oct 27 09:16 df.avro
-rw-r--r-- 1 root root 2.3K Oct 27 09:18 df.parquet
drwxr-xr-x 1 root root 4.0K Oct 14 16:31 sample_data
-rw-r--r-- 1 root root  337 Oct 27 09:11 weather.avro


### Lire le fichier Parquet

In [15]:
table2 = pq.read_table('df.parquet')
table2.to_pandas()

Unnamed: 0,Boolean,Float64,Int64,String,DateTime64
0,True,-0.580502,1,foo,2019-01-01
1,False,0.096166,9,bar,2019-01-02
2,True,-0.433939,2,foo,2019-01-03
3,False,1.051204,2,bar,2019-01-04


### Lire un sous-ensemble de colonnes

In [16]:
pq.read_pandas('df.parquet', columns=['String', 'Int64']).to_pandas()

Unnamed: 0,String,Int64
0,foo,1
1,bar,9
2,foo,2
3,bar,2


## Comparaison Avro et Parquet

In [17]:
!ls -lh

total 16K
-rw-r--r-- 1 root root  512 Oct 27 09:16 df.avro
-rw-r--r-- 1 root root 2.3K Oct 27 09:18 df.parquet
drwxr-xr-x 1 root root 4.0K Oct 14 16:31 sample_data
-rw-r--r-- 1 root root  337 Oct 27 09:11 weather.avro


In [14]:
filename_avro = 'df_test.avro'
filename_parquet = 'df_test.parquet'

df = pd.DataFrame({'a': range(100000)})
pdx.to_avro(filename_avro, df)
pq.write_table(pa.Table.from_pandas(df), filename_parquet)

!ls -lh

total 1000K
-rw-r--r-- 1 root root  512 Oct 26 19:46 df.avro
-rw-r--r-- 1 root root 2.3K Oct 26 19:46 df.parquet
-rw-r--r-- 1 root root 384K Oct 26 19:46 df_test.avro
-rw-r--r-- 1 root root 600K Oct 26 19:46 df_test.parquet
drwxr-xr-x 1 root root 4.0K Oct 14 16:31 sample_data
-rw-r--r-- 1 root root  337 Oct 26 19:46 weather.avro


In [15]:
df = pd.DataFrame({'a': (i % 2 for i in range(100000))})
pdx.to_avro(filename_avro, df)
pq.write_table(pa.Table.from_pandas(df), filename_parquet)

!ls -lh

total 216K
-rw-r--r-- 1 root root  512 Oct 26 19:46 df.avro
-rw-r--r-- 1 root root 2.3K Oct 26 19:46 df.parquet
-rw-r--r-- 1 root root 196K Oct 26 19:46 df_test.avro
-rw-r--r-- 1 root root 1.4K Oct 26 19:46 df_test.parquet
drwxr-xr-x 1 root root 4.0K Oct 14 16:31 sample_data
-rw-r--r-- 1 root root  337 Oct 26 19:46 weather.avro


## Compression avec Parquet


In [17]:
!apt-get install libsnappy-dev
!pip install python-snappy

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  libsnappy-dev
0 upgraded, 1 newly installed, 0 to remove and 21 not upgraded.
Need to get 27.2 kB of archives.
After this operation, 108 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/main amd64 libsnappy-dev amd64 1.1.7-1 [27.2 kB]
Fetched 27.2 kB in 1s (39.4 kB/s)
Selecting previously unselected package libsnappy-dev:amd64.
(Reading database ... 144611 files and directories currently installed.)
Preparing to unpack .../libsnappy-dev_1.1.7-1_amd64.deb ...
Unpacking libsnappy-dev:amd64 (1.1.7-1) ...
Setting up libsnappy-dev:amd64 (1.1.7-1) ...
Collecting python-snappy
  Using cached https://files.pythonhosted.org/packages/45/35/65d9f8cc537129894b4b32647d80212d1fa342877581c5b8a69872cea8be/python-snappy-0.5.4.tar.gz
Building wheels for collected packages: python-snappy
  Building wheel for python-snappy (s

In [22]:
table = pa.Table.from_pandas(df)
pq.write_table(table, 'df_snappy.parquet', compression='snappy')
pq.write_table(table, 'df_gzip.parquet', compression='gzip')

!ls -lh

total 224K
-rw-r--r-- 1 root root  512 Oct 26 19:46 df.avro
-rw-r--r-- 1 root root  891 Oct 26 19:53 df_gzip.parquet
-rw-r--r-- 1 root root 2.3K Oct 26 19:46 df.parquet
-rw-r--r-- 1 root root 1.4K Oct 26 19:53 df_snappy.parquet
-rw-r--r-- 1 root root 196K Oct 26 19:46 df_test.avro
-rw-r--r-- 1 root root 1.4K Oct 26 19:46 df_test.parquet
drwxr-xr-x 1 root root 4.0K Oct 14 16:31 sample_data
-rw-r--r-- 1 root root  337 Oct 26 19:46 weather.avro
