# Examples of streaming data to Parquet format
- [Reference](https://arrow.apache.org/docs/python/ipc.html)

In [None]:
from datetime import datetime
import json

import ijson
import pandas as pd
import pyarrow as pa
from pyarrow.parquet import ParquetWriter

In [None]:
%%bash

# List the files currently present in the data/ directory.
ls data/

## Set up data schema

In [None]:
# Arrow schema for the rows written to the project Parquet file.
# Field order matters: row tuples are matched to these columns by position.
project_schema = pa.schema([
    # Identity
    ('aha_id', pa.string()),
    ('reference_prefix', pa.string()),
    ('name', pa.string()),
    # Running counters and ordering
    ('last_release_num', pa.int32()),
    ('last_feature_num', pa.int32()),
    ('last_idea_num', pa.int32()),
    ('position', pa.int32()),
    # Positioning text
    ('positioning_customer', pa.string()),
    ('positioning_problem', pa.string()),
    ('positioning_benefit1', pa.string()),
    ('positioning_benefit2', pa.string()),
    ('positioning_benefit3', pa.string()),
    # Product line and planning settings
    ('product_line', pa.bool_()),
    ('product_line_type', pa.string()),
    ('capacity_planning_enabled', pa.bool_()),
    ('ideas_scoring_system_id', pa.string()),
    ('ideas_default_user_id', pa.string()),
    ('default_capacity_units', pa.int32()),
    ('default_feature_remaining_estimate', pa.bool_()),
    ('last_page_num', pa.int32()),
    ('color', pa.int32()),
    ('workflow_screen_enabled', pa.bool_()),
    ('competitor_scoring_system_id', pa.string()),
    ('initiative_workflow_id', pa.string()),
    ('strategic_imperative_workflow_id', pa.string()),
    ('estimated_time_as_work_done', pa.bool_()),
    ('last_epic_num', pa.int32()),
    # Stored as a string column (the writer serialises it with json.dumps)
    ('configuration', pa.string()),
    ('workspace_type', pa.string()),
    # Millisecond-resolution timestamps without a time zone
    ('created_at', pa.timestamp('ms')),
    ('updated_at', pa.timestamp('ms')),
    # Values taken from the record's "links" section
    ('parent_id', pa.string()),
    ('scoring_system_id', pa.string()),
    ('idea_workflow_id', pa.string()),
    ('feature_workflow_id', pa.string()),
    ('release_workflow_id', pa.string()),
])

# Arrow schema for the rows written to the requirement Parquet file.
# Field order matters: row tuples are matched to these columns by position.
requirement_schema = pa.schema([
    ('aha_id', pa.string()),
    ('reference_num', pa.string()),
    ('created_by_user_id', pa.string()),
    ('position', pa.int32()),
    # Estimates are stored as 32-bit floats
    ('original_estimate', pa.float32()),
    ('remaining_estimate', pa.float32()),
    ('work_done', pa.float32()),
    ('name', pa.string()),
    # Millisecond-resolution timestamps without a time zone
    ('created_at', pa.timestamp('ms')),
    ('updated_at', pa.timestamp('ms')),
    # Values taken from the record's "links" section
    ('feature_id', pa.string()),
    ('project_id', pa.string()),
    ('workflow_status_id', pa.string()),
    ('assigned_to_user_id', pa.string()),
])

## Streaming write
Use a DataFrame to accumulate the data set, then write it to Parquet in batches.
- [Reference](https://stackoverflow.com/questions/56377848/writing-stream-of-big-data-to-parquet-with-python)

In [None]:
%%time

# Stream the backup JSON record by record and split it into two Parquet
# files: one for 'Project' records and one for 'Requirement' records.
# Rows are buffered in plain lists and written out in batches so the whole
# backup never has to be held in memory at once.

backup_path = "./data/aha-account-6240998105453674102-backup-2020-12-28-18-53.json"
project_pq = './data/aha-project-backup-2020-12-28-18-53.parquet'
requirement_pq = './data/aha-requirement-backup-2020-12-28-18-53.parquet'

# Number of buffered rows that triggers a write to the Parquet file.
PROJECT_BATCH_SIZE = 10
REQUIREMENT_BATCH_SIZE = 1000

# Format of the 'created_at' / 'updated_at' strings in the backup.
TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S %Z"


def _str_or_none(value):
    """Return ``str(value)``, but keep ``None`` as ``None``.

    A bare ``str(None)`` would store the literal text "None" in the string
    column instead of a null.
    """
    return None if value is None else str(value)


def _float_or_none(value):
    """Return ``float(value)`` for truthy values, otherwise ``None``."""
    # NOTE(review): a zero estimate is falsy and therefore becomes null here
    # (same as before this refactor) -- confirm that is intended.
    return float(value) if value else None


def _parse_timestamp(text):
    """Parse a timestamp string from the backup into a ``datetime``."""
    return datetime.strptime(text, TIMESTAMP_FORMAT)


def _project_row(record):
    """Build one row tuple, in ``project_schema`` column order."""
    fields, links = record['fields'], record['links']
    return (
        record['id'],
        fields['reference_prefix'],
        fields['name'],
        fields['last_release_num'],
        fields['last_feature_num'],
        fields['last_idea_num'],
        fields['position'],
        fields['positioning_customer'],
        fields['positioning_problem'],
        fields['positioning_benefit1'],
        fields['positioning_benefit2'],
        fields['positioning_benefit3'],
        fields['product_line'],
        fields['product_line_type'],
        fields['capacity_planning_enabled'],
        _str_or_none(fields.get('ideas_scoring_system_id')),
        fields['ideas_default_user_id'],
        fields['default_capacity_units'],
        fields['default_feature_remaining_estimate'],
        fields['last_page_num'],
        fields['color'],
        fields['workflow_screen_enabled'],
        _str_or_none(fields['competitor_scoring_system_id']),
        _str_or_none(fields['initiative_workflow_id']),
        _str_or_none(fields['strategic_imperative_workflow_id']),
        fields['estimated_time_as_work_done'],
        fields['last_epic_num'],
        # Nested configuration is serialised so it fits a string column.
        json.dumps(fields['configuration']),
        fields['workspace_type'],
        _parse_timestamp(fields['created_at']),
        _parse_timestamp(fields['updated_at']),
        links.get('parent_id'),
        links.get('scoring_system_id'),
        links['idea_workflow_id'],
        links['feature_workflow_id'],
        links['release_workflow_id'],
    )


def _requirement_row(record):
    """Build one row tuple, in ``requirement_schema`` column order."""
    fields, links = record['fields'], record['links']
    return (
        record['id'],
        fields['reference_num'],
        _str_or_none(fields['created_by_user_id']),
        fields['position'],
        _float_or_none(fields['original_estimate']),
        _float_or_none(fields['remaining_estimate']),
        _float_or_none(fields['work_done']),
        fields['name'],
        _parse_timestamp(fields['created_at']),
        _parse_timestamp(fields['updated_at']),
        links['feature_id'],
        links['project_id'],
        links['workflow_status_id'],
        links.get('assigned_to_user_id'),
    )


def _flush(writer, rows, schema):
    """Write the buffered ``rows`` through ``writer`` and empty the buffer.

    Does nothing when the buffer is empty, so it is safe to call
    unconditionally at the end of the stream.
    """
    if not rows:
        return
    frame = pd.DataFrame(rows, columns=schema.names)
    writer.write_table(pa.Table.from_pandas(frame, schema=schema))
    rows.clear()


i = 0  # number of Requirement records seen
project, requirement = [], []

# ijson expects a binary file object and decodes the JSON itself.
with open(backup_path, 'rb') as f:
    objects = ijson.items(f, 'records.item')

    with ParquetWriter(project_pq, project_schema, compression='SNAPPY') as project_writer, \
            ParquetWriter(requirement_pq, requirement_schema, compression='SNAPPY') as requirement_writer:

        for record in objects:
            record_class = record['class']

            if record_class == 'Project':
                project.append(_project_row(record))
                if len(project) >= PROJECT_BATCH_SIZE:
                    _flush(project_writer, project, project_schema)

            elif record_class == 'Requirement':
                i += 1
                requirement.append(_requirement_row(record))
                if len(requirement) >= REQUIREMENT_BATCH_SIZE:
                    _flush(requirement_writer, requirement, requirement_schema)

        # Write whatever is left in the buffers; without this the final
        # partial batch of each record type would be lost.
        _flush(project_writer, project, project_schema)
        _flush(requirement_writer, requirement, requirement_schema)

print(f"{i} records found")

## Read the Parquet data file into pandas

In [None]:
# Load the requirement Parquet file back into a DataFrame to check the output.
output_df = pd.read_parquet(path=requirement_pq)

# Trailing expression: the notebook displays the first five rows.
output_df.head(5)