# Install pyarrow

In [1]:
%%sh
pip install pyarrow



You should consider upgrading via the 'pip install --upgrade pip' command.


## Import necessary modules

In [2]:
import pyarrow.parquet as pq
import numpy as np
import pandas as pd

## Setup data

In [3]:
import random
import uuid
import json

def gen_sample_acct():
    """Build one sample account record.

    Returns:
        dict: ``acct_id`` is a random integer in [1, 99999999] and
        ``acct_feature1`` is a freshly generated UUID4 rendered as a string.
    """
    account_id = random.randint(1, 99999999)
    feature_value = str(uuid.uuid4())
    return {'acct_id': account_id, 'acct_feature1': feature_value}

def gen_sample_accounts(num_records):
    """Return a list holding ``num_records`` sample account dicts.

    Each element is produced by a separate call to ``gen_sample_acct``.
    """
    records = []
    for _ in range(num_records):
        records.append(gen_sample_acct())
    return records

def write_json(sample_dict, filename):
    """Serialize ``sample_dict`` as JSON and write it to ``filename``.

    The file is opened in text write mode, so existing content is replaced.

    Args:
        sample_dict: Any object accepted by ``json.dumps``.
        filename: Path of the file to write.

    Returns:
        str: The literal string ``"success"`` once the write has completed.
    """
    with open(filename, 'w') as out_file:
        serialized = json.dumps(sample_dict)
        out_file.write(serialized)
    return "success"

## Generate sample data

In [4]:
# Generate five sample account records and keep them for the cells below.
sample_accts = gen_sample_accounts(5)
# Persist the records as JSON; as the last expression, write_json's return
# value is what this cell displays.
write_json(sample_accts, 'sample_accounts_pyarrow.json')

'success'

In [5]:
%%sh 
# List working-directory entries whose names contain "sample_" (case-insensitive), newest first.
ls -lt | grep -i "sample_"

-rw-r--r--  1 sharattadimalla  staff    399 Nov 11 10:41 sample_accounts_pyarrow.json
drwxr-xr-x  6 sharattadimalla  staff    192 Nov 10 12:43 sample_accounts.parquet
drwxr-xr-x  6 sharattadimalla  staff    192 Nov 10 12:43 sample_customer.parquet
-rw-r--r--  1 sharattadimalla  staff  16784 Nov 10 12:42 sample_customers.json
-rw-r--r--  1 sharattadimalla  staff    399 Nov 10 12:40 sample_accounts.json
-rw-r--r--  1 sharattadimalla  staff  16787 Nov 10 12:33 sample_data.json
drwxr-xr-x  6 sharattadimalla  staff    192 Nov 10 11:43 sample_data.parquet


In [6]:
# Display the generated account records.
sample_accts

[{'acct_id': 30071544,
  'acct_feature1': 'ab4a7c54-7f98-4d64-a874-3972286e1cbf'},
 {'acct_id': 53960302,
  'acct_feature1': '7820eb21-3b92-4b79-89b1-76e4c0046a0b'},
 {'acct_id': 18450629,
  'acct_feature1': 'e04384c0-55af-4fdb-b7bb-a8474f1fd3bb'},
 {'acct_id': 80074688,
  'acct_feature1': 'b10ddc34-a62b-42f9-8540-134a103e6a41'},
 {'acct_id': 7222352, 'acct_feature1': 'c5a38824-e656-4c89-9770-a55c5d0dd84c'}]

## Create pandas dataframe 

In [7]:
# Build a DataFrame from the list of account dicts (one row per dict).
df = pd.DataFrame(sample_accts)

In [8]:
# Preview the first rows of the accounts DataFrame.
df.head()

Unnamed: 0,acct_feature1,acct_id
0,ab4a7c54-7f98-4d64-a874-3972286e1cbf,30071544
1,7820eb21-3b92-4b79-89b1-76e4c0046a0b,53960302
2,e04384c0-55af-4fdb-b7bb-a8474f1fd3bb,18450629
3,b10ddc34-a62b-42f9-8540-134a103e6a41,80074688
4,c5a38824-e656-4c89-9770-a55c5d0dd84c,7222352


## Create pyarrow table

In [10]:
import pyarrow as pa
# Convert the pandas DataFrame into a pyarrow Table.
table = pa.Table.from_pandas(df)

In [11]:
# Display the Table's schema and metadata.
table

pyarrow.Table
acct_feature1: string
acct_id: int64
__index_level_0__: int64
metadata
--------
OrderedDict([(b'pandas',
              b'{"index_columns": ["__index_level_0__"], "column_indexes": ['
              b'{"name": null, "field_name": null, "pandas_type": "unicode",'
              b' "numpy_type": "object", "metadata": {"encoding": "UTF-8"}}]'
              b', "columns": [{"name": "acct_feature1", "field_name": "acct_'
              b'feature1", "pandas_type": "unicode", "numpy_type": "object",'
              b' "metadata": null}, {"name": "acct_id", "field_name": "acct_'
              b'id", "pandas_type": "int64", "numpy_type": "int64", "metadat'
              b'a": null}, {"name": null, "field_name": "__index_level_0__",'
              b' "pandas_type": "int64", "numpy_type": "int64", "metadata": '
              b'null}], "pandas_version": "0.24.1"}')])

## Write flattened parquet

In [13]:
# Write the flat accounts Table to a Parquet file in the working directory.
pq.write_table(table, 'sample_accounts_pyarrow.parquet')

In [14]:
%%sh
# List the "sample_" entries again to confirm the Parquet file was written.
ls -lt | grep -i "sample_"

-rw-r--r--  1 sharattadimalla  staff   1873 Nov 11 10:49 sample_accounts_pyarrow.parquet
-rw-r--r--  1 sharattadimalla  staff    399 Nov 11 10:41 sample_accounts_pyarrow.json
drwxr-xr-x  6 sharattadimalla  staff    192 Nov 10 12:43 sample_accounts.parquet
drwxr-xr-x  6 sharattadimalla  staff    192 Nov 10 12:43 sample_customer.parquet
-rw-r--r--  1 sharattadimalla  staff  16784 Nov 10 12:42 sample_customers.json
-rw-r--r--  1 sharattadimalla  staff    399 Nov 10 12:40 sample_accounts.json
-rw-r--r--  1 sharattadimalla  staff  16787 Nov 10 12:33 sample_data.json
drwxr-xr-x  6 sharattadimalla  staff    192 Nov 10 11:43 sample_data.parquet


## Setup nested data

In [27]:
def gen_sample_accounts(num_records):
    """Produce ``num_records`` sample accounts as a list of dicts.

    NOTE(review): a function with this name and behavior is already defined
    earlier in this notebook; this definition simply rebinds the same name.
    """
    return [gen_sample_acct() for _ in range(num_records)]

def gen_sample_app():
    """Build one sample application record.

    Returns:
        dict: ``app_id`` is a random integer in [1, 99999999] and
        ``app_feature1`` is a freshly generated UUID4 rendered as a string.
    """
    application_id = random.randint(1, 99999999)
    feature_value = str(uuid.uuid4())
    return {'app_id': application_id, 'app_feature1': feature_value}

def gen_sample_apps(num_records):
    """Return a list holding ``num_records`` sample application dicts.

    Each element is produced by a separate call to ``gen_sample_app``.
    """
    generated = []
    for _ in range(num_records):
        generated.append(gen_sample_app())
    return generated

def gen_sample_customers(num_records):
    """Return ``num_records`` sample customers with nested accounts and apps.

    Each customer dict has a random integer ``cust_id`` in [1, 99999999], a
    UUID4 string ``cust_feature1``, and two nested lists, ``accts`` and
    ``apps``. ``num_records`` is also used as the length of each nested list.
    """
    customer_list = []
    for _ in range(num_records):
        # Values are evaluated in key order, matching the random-call sequence.
        customer = {
            'cust_id': random.randint(1, 99999999),
            'cust_feature1': str(uuid.uuid4()),
            'accts': gen_sample_accounts(num_records),
            'apps': gen_sample_apps(num_records),
        }
        customer_list.append(customer)
    return customer_list

# Generate 10 customers; each carries 10 accounts and 10 apps because
# gen_sample_customers reuses num_records for the nested lists.
sample_customers = gen_sample_customers(10)
# Display the full nested structure.
sample_customers

[{'cust_id': 10538913,
  'cust_feature1': '55a5ff61-9142-4168-8b27-e05ae499d512',
  'accts': [{'acct_id': 62043918,
    'acct_feature1': '268612ff-dc97-4bd3-91b4-82331dc65aab'},
   {'acct_id': 45452152,
    'acct_feature1': 'f8710128-421d-42de-bbba-b3bed9ca02c0'},
   {'acct_id': 53535005,
    'acct_feature1': '9b948278-9434-4d96-99f8-330347f18b89'},
   {'acct_id': 74679862,
    'acct_feature1': 'a8638b27-5bea-409b-bd89-a1747d1dc68a'},
   {'acct_id': 55054665,
    'acct_feature1': 'cd8f75a9-7a57-4ed2-b09d-cb751b29b36e'},
   {'acct_id': 66588038,
    'acct_feature1': '2ccb8c6f-a6ef-4cca-8981-4ea36a377b45'},
   {'acct_id': 22050796,
    'acct_feature1': 'cebd1b49-69ec-4f33-9cc4-569f589b5e3f'},
   {'acct_id': 58403517,
    'acct_feature1': '1d0bb441-e374-4451-a4ad-4502555fe346'},
   {'acct_id': 51240810,
    'acct_feature1': 'afb89e41-745e-4744-a4a7-63cfdd8a09be'},
   {'acct_id': 74760271,
    'acct_feature1': 'd311afc2-f5b3-4282-ac49-5e90c3527f67'}],
  'apps': [{'app_id': 56607329,
    'a

In [28]:
# One row per customer; the accts and apps columns hold lists of dicts.
df_nested = pd.DataFrame(sample_customers)

In [29]:
# Preview the first rows of the nested customers DataFrame.
df_nested.head()

Unnamed: 0,accts,apps,cust_feature1,cust_id
0,"[{'acct_id': 62043918, 'acct_feature1': '26861...","[{'app_id': 56607329, 'app_feature1': '6873be8...",55a5ff61-9142-4168-8b27-e05ae499d512,10538913
1,"[{'acct_id': 37883031, 'acct_feature1': '334b6...","[{'app_id': 39514391, 'app_feature1': '0994839...",08a322f2-bede-4ebf-a1e0-badb86c395e4,65127062
2,"[{'acct_id': 14426869, 'acct_feature1': '3a940...","[{'app_id': 77031006, 'app_feature1': '47aad62...",aad812a4-f624-4fbe-ae57-29c456b9de92,27430688
3,"[{'acct_id': 48328287, 'acct_feature1': 'c784d...","[{'app_id': 52396176, 'app_feature1': '7d991b3...",80594cc2-def4-4623-aa03-773254905766,34158639
4,"[{'acct_id': 56401250, 'acct_feature1': 'ff07d...","[{'app_id': 73976825, 'app_feature1': '275862c...",823cfaab-1252-4520-bc3f-d6925f293180,17037471


## Create a pyarrow table

In [30]:
# Converting a DataFrame whose columns hold lists of dicts raised
# NotImplementedError with the pyarrow version used for this notebook.
# Catch and display the error so the notebook still completes on "Run All".
try:
    table_nested = pa.Table.from_pandas(df_nested)
except NotImplementedError as err:
    table_nested = None  # keep the name defined even when conversion fails
    print("NotImplementedError:", err)

NotImplementedError: struct<acct_feature1: string, acct_id: int64>

## Nested data (lists of structs) could not be converted with `pa.Table.from_pandas` in the pyarrow version used here