## Install fastparquet

In [1]:
pip install fastparquet

Collecting fastparquet
[?25l  Downloading https://files.pythonhosted.org/packages/58/49/dccb790fa17ab3fbf84a6b848050083c7a1899e9586000e34e3e4fbf5538/fastparquet-0.3.2.tar.gz (151kB)
[K     |████████████████████████████████| 153kB 863kB/s eta 0:00:01
Collecting numba>=0.28 (from fastparquet)
[?25l  Downloading https://files.pythonhosted.org/packages/0e/6e/e3bd3b844ee2dd815c313807a6030706845ffa6387e95d20e2eb79370036/numba-0.46.0-cp37-cp37m-macosx_10_9_x86_64.whl (2.0MB)
[K     |████████████████████████████████| 2.0MB 1.7MB/s eta 0:00:01
Collecting thrift>=0.11.0 (from fastparquet)
[?25l  Downloading https://files.pythonhosted.org/packages/c6/b4/510617906f8e0c5660e7d96fbc5585113f83ad547a3989b80297ac72a74c/thrift-0.11.0.tar.gz (52kB)
[K     |████████████████████████████████| 61kB 4.3MB/s eta 0:00:011
Collecting llvmlite>=0.30.0dev0 (from numba>=0.28->fastparquet)
[?25l  Downloading https://files.pythonhosted.org/packages/ca/ba/1b1e5c4c707c082b1125c30f93887b827ee3db6a2e86184532fe529d

## Set up flat sample data

In [2]:
import random
import uuid
import json

def gen_sample_acct():
    """Build one synthetic account record.

    Returns:
        dict: ``acct_id`` is a random integer in [1, 99999999] and
        ``acct_feature1`` is a random UUID4 rendered as a string.
    """
    account_id = random.randint(1, 99999999)
    feature = str(uuid.uuid4())
    return {"acct_id": account_id, "acct_feature1": feature}

def gen_sample_accounts(num_records):
    """Generate a list of synthetic account records.

    Args:
        num_records: How many account dicts to produce.

    Returns:
        list: ``num_records`` dicts, each built by ``gen_sample_acct()``.
    """
    accounts = []
    for _ in range(num_records):
        accounts.append(gen_sample_acct())
    return accounts

# Generate 10 flat account records to use as the sample dataset.
sample_accounts = gen_sample_accounts(10)

## Create a pandas DataFrame from the flat data

In [3]:
import pandas as pd
# Build a DataFrame from the list of account dicts in `sample_accounts`.
df = pd.DataFrame(sample_accounts)

In [4]:
# Preview the first rows of the flat account data.
df.head()

Unnamed: 0,acct_feature1,acct_id
0,45e15cf3-e8df-4036-b181-76f231ec776c,64881132
1,e7653a0b-f2f8-4000-b726-e882d2594c36,61232087
2,43ef3747-3b74-4918-acaf-6116f22d6998,25430820
3,c6521705-2af2-4aa7-bd9c-fe39dcdd5f40,56853772
4,ed1ed83d-a2bc-48ac-a11a-099ed020577b,11622854


## Write the flat data to Parquet using fastparquet

In [5]:
from fastparquet import write
# Write the flat DataFrame to a Parquet file; the relative path resolves against
# the current working directory.
write('sample_data_fastparquet.parquet', df)

In [6]:
%%sh
# List directory entries (long format, newest first) and keep only the lines
# containing "sample_data_" (case-insensitive) to confirm the file was written.
ls -lt | grep -i "sample_data_"

-rw-r--r--  1 sharattadimalla  staff   1530 Nov 11 11:22 sample_data_fastparquet.parquet


## Create a nested dataset

In [7]:
def gen_sample_app():
    """Build one synthetic application record.

    Returns:
        dict: ``app_id`` is a random integer in [1, 99999999] and
        ``app_feature1`` is a random UUID4 rendered as a string.
    """
    application_id = random.randint(1, 99999999)
    feature = str(uuid.uuid4())
    return {"app_id": application_id, "app_feature1": feature}

def gen_sample_apps(num_records):
    """Generate a list of synthetic application records.

    Args:
        num_records: How many application dicts to produce.

    Returns:
        list: ``num_records`` dicts, each built by ``gen_sample_app()``.
    """
    applications = []
    for _ in range(num_records):
        applications.append(gen_sample_app())
    return applications

def gen_sample_customers(num_records):
    """Generate synthetic customer records with nested accounts and apps.

    ``num_records`` controls both the number of customers and the length of
    each customer's nested ``accts`` and ``apps`` lists.

    Args:
        num_records: Number of customers, and of accounts/apps per customer.

    Returns:
        list: Dicts with keys ``cust_id``, ``cust_feature1``, ``accts`` and
        ``apps`` (the last two are lists of dicts).
    """
    customers = []
    for _ in range(num_records):
        # Field order matters: it fixes the dict key order and the sequence
        # in which the random values are drawn.
        customer = {
            "cust_id": random.randint(1, 99999999),
            "cust_feature1": str(uuid.uuid4()),
            "accts": gen_sample_accounts(num_records),
            "apps": gen_sample_apps(num_records),
        }
        customers.append(customer)
    return customers

# Generate 10 customers; each one carries 10 nested accounts and 10 nested apps,
# because the same count is passed through to both nested generators.
sample_customers = gen_sample_customers(10)

## Create a pandas DataFrame from the nested data

In [9]:
import pandas as pd
# Each customer dict becomes one row; the `accts` and `apps` columns hold
# Python lists of dicts rather than scalar values.
df_nested = pd.DataFrame(sample_customers)
df_nested.head()

Unnamed: 0,accts,apps,cust_feature1,cust_id
0,"[{'acct_id': 24239356, 'acct_feature1': 'ea05f...","[{'app_id': 1415442, 'app_feature1': '2e501cd6...",fcfd7727-9da7-43be-9517-b3c5b5e1ae64,74047523
1,"[{'acct_id': 17291389, 'acct_feature1': '3a740...","[{'app_id': 75577862, 'app_feature1': '9ca29d9...",ce6443b4-f505-4768-9844-d892c8b3b0fb,24848578
2,"[{'acct_id': 82829924, 'acct_feature1': '2855c...","[{'app_id': 28552607, 'app_feature1': '735019d...",1cdbe438-221e-4ef4-b1ea-eba0364fa1bb,88979656
3,"[{'acct_id': 25556498, 'acct_feature1': 'c1a7d...","[{'app_id': 78932919, 'app_feature1': 'f0d6d01...",634ab3fb-5908-45de-8034-77fba8945e22,7040609
4,"[{'acct_id': 58124673, 'acct_feature1': 'af620...","[{'app_id': 37376563, 'app_feature1': '41dda91...",982fc88d-2560-47bd-a0d9-1f0e968ae722,99016535


In [10]:
# Write the nested DataFrame with the same fastparquet `write` call used for the
# flat data. The parquet-tools schema output later in this notebook shows the
# list-valued `accts` and `apps` columns stored as JSON-annotated binary.
write('sample_nested_data_fastparquet.parquet', df_nested)

In [11]:
%%sh
# grep takes a regular expression, not a shell glob: a trailing "_*" only means
# "zero or more underscores". Match the literal file-name prefix instead.
ls -lt | grep -i "sample_nested_"

-rw-r--r--  1 sharattadimalla  staff  17854 Nov 11 11:30 sample_nested_data_fastparquet.parquet


In [12]:
%%sh
# Print the Parquet schema of the nested file (needs the parquet-tools CLI on PATH).
parquet-tools schema sample_nested_data_fastparquet.parquet

message schema {
  optional binary accts (JSON);
  optional binary apps (JSON);
  optional binary cust_feature1 (UTF8);
  optional int64 cust_id;
}

