## Examples: Importing Parquet files using Apache PyArrow
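- Reference package: `azure-storage==0.37.0` (note: the code below imports `BlobServiceClient` from `azure.storage.blob`, which is provided by the `azure-storage-blob` v12 package)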

In [None]:
from io import BytesIO
import os

from azure.core.exceptions import ResourceNotFoundError
from azure.storage.blob import BlobServiceClient
import pandas as pd
import pyarrow.parquet as pq

In [None]:
%%bash

# List the current working directory in long format, including hidden entries.
ls -al

In [None]:
# Storage settings are read from environment variables; indexing os.environ
# raises KeyError immediately if any of them is not set.
account_name = os.environ['STORE_ACCOUNT_NAME']
# Passed as the `credential` when the BlobServiceClient is created below.
account_key = os.environ['STORE_ACCOUNT_KEY']
container_name = os.environ['STORE_CONTAINER_NAME']

## Read Single Parquet File

In [None]:
# Blob name (path inside the container) of the single Parquet part file to read.
parquet_file = 'patient_score_kinnser/2c3491d7-5d7f-44df-80cb-654035b4652e/part-00000.parquet'
# parquet_file = 'patient_score_ltc400/0a094741-5a7d-4b42-9443-3802ebb0f582/part-00000.parquet'

# Build the client chain: storage account -> container -> individual blob.
blob_service = BlobServiceClient(
    f'https://{account_name}.blob.core.windows.net/',
    credential=account_key,
)
container_client = blob_service.get_container_client(container_name)
blob_client = container_client.get_blob_client(parquet_file)

### To pandas

In [None]:
# Download the blob into an in-memory buffer and load it as a pandas DataFrame.
# Start from None so that a missing blob can neither leave `parquet_df`
# undefined (NameError below) nor silently preview a stale frame from an
# earlier run of this cell.
parquet_df = None

try:
    with BytesIO() as byte_stream:
        storage_stream = blob_client.download_blob()
        storage_stream.download_to_stream(byte_stream)
        # Convert to pandas while the buffer is still open.
        parquet_df = pq.read_table(source=byte_stream).to_pandas()
except ResourceNotFoundError:
    print("No blob found.")

# Alternate 1:
#     with open('data/temp.parquet', "wb") as my_blob:
#         storage_stream = blob_client.download_blob()
#         my_blob.write(storage_stream.readall())
#     with open('data/temp.parquet', "rb") as my_blob:
#         parquet_df = pq.read_table(source=my_blob).to_pandas()

# Preview the first rows; evaluates to None (nothing displayed) if the blob was not found.
parquet_df.head() if parquet_df is not None else None

### To parquet file

In [None]:
# Open the blob as a pyarrow ParquetFile so its metadata/schema can be inspected
# without converting the data to pandas.
#
# The buffer is deliberately NOT managed by a `with` block: the ParquetFile keeps
# reading from it, so closing the buffer would break any later read through
# `parquet_file`.
byte_stream = BytesIO()

try:
    storage_stream = blob_client.download_blob()
    storage_stream.download_to_stream(byte_stream)
except ResourceNotFoundError:
    print("No blob found.")
else:
    # NOTE: this rebinds `parquet_file` from the blob name (str) to a ParquetFile.
    parquet_file = pq.ParquetFile(byte_stream)


# parquet_file.schema
# If the blob was not found, `parquet_file` is still the blob-name string, so
# only show metadata when a ParquetFile was actually opened.
parquet_file.metadata if isinstance(parquet_file, pq.ParquetFile) else None

## Read Partitioned Parquet File
- [Reference](http://arrow.apache.org/docs/python/parquet.html#reading-from-partitioned-datasets)
- [Stack Overflow](https://stackoverflow.com/questions/58626126/partition-parquet-files-on-azure-blob-pyarrow)

In [None]:
# Connect to the storage account and the container holding the dataset.
blob_service = BlobServiceClient(
    account_url=f'https://{account_name}.blob.core.windows.net/',
    credential=account_key)
container_client = blob_service.get_container_client(container_name)

# Collect the names of every blob whose name starts with the dataset prefix.
blob_prefix = 'cc_crosswalk_kinnser/7d1fb957-c9e2-4500-bd5b-be57ae339c83'
parquet_blobs = []

for blob in container_client.list_blobs(name_starts_with=blob_prefix):
    print(f"Found {blob.name}")
    parquet_blobs.append(blob.name)

# Local directory that receives the downloaded part files.
target_directory = 'data/cc_crosswalk_kinnser'
os.makedirs(target_directory, exist_ok=True)

for blob in parquet_blobs:

    # Only the final path component is kept, so all files land directly in
    # target_directory (blobs sharing a base name would overwrite each other).
    file_name = os.path.split(blob)[1]
    target_path = os.path.join(target_directory, file_name)

    try:
        print(f'Downloading {file_name} to {target_path}')
        blob_client = container_client.get_blob_client(blob)

        # Start the download BEFORE creating the local file: if the blob is
        # missing, no empty file is left behind to break ParquetDataset below.
        storage_stream = blob_client.download_blob()
        with open(target_path, "wb") as f:
            storage_stream.download_to_stream(f)

    except ResourceNotFoundError:
        print("No blob found.")

# Read every file in the local directory as one dataset and convert it to pandas.
dataset = pq.ParquetDataset(target_directory)
table = dataset.read()

cc_crosswalk_kinnser_df = table.to_pandas()

In [None]:
# Preview the first rows of the DataFrame built from the downloaded Parquet files.
cc_crosswalk_kinnser_df.head()