# Import From S3
- Requires [fsspec](https://pypi.org/project/fsspec/) and [s3fs](https://pypi.org/project/s3fs/) so that pandas can read `s3://` paths directly
- An alternative is [aws-data-wrangler](https://github.com/awslabs/aws-data-wrangler)

In [1]:
import csv
from datetime import datetime
from os import environ
import gzip as gz
import logging

import boto3
from botocore.exceptions import ClientError, EventStreamError
import pandas as pd
from sqlalchemy import create_engine, text
from sqlalchemy.orm import sessionmaker

In [None]:
%%bash

# Install the pinned fsspec and s3fs releases required for the pandas S3 integration.
pip3 install fsspec==2021.4.0
pip3 install s3fs==2021.4.0

In [2]:
# Root logging: INFO and above, rendered as "<12-hour time AM/PM> <LEVEL>: <message>".
logging.basicConfig(
    level=logging.INFO,
    datefmt='%I:%M:%S %p',
    format='%(asctime)s %(levelname)s: %(message)s',
)

# Logger used by the cells in this notebook.
logger = logging.getLogger(__name__)

In [None]:
s3_client = boto3.client('s3')

# Single listing request for everything under the unload prefix.
# NOTE: one list_objects_v2 call returns at most 1,000 keys; larger prefixes
# need a paginator or continuation tokens to see every object.
listing_request = {
    'Bucket': 'my-bucket',
    'Prefix': 'org/unload/test_file',
}
exports = s3_client.list_objects_v2(**listing_request)

# Default to an empty list so the loop is a no-op when 'Contents' is absent
# from the response.
matched_objects = exports.get('Contents', [])
for part in matched_objects:
    logger.info(f"lake_path is {part['Key']}")

## Inspect Data

In [None]:
# Pipe-delimited file read straight from S3 (relies on fsspec/s3fs being installed).
s3_path = 's3://my-bucket/org/unload/test_file'

# header=None is passed below, so the column labels are supplied explicitly.
columns = [
    'client_id',
    'patient_id',
    'type',
    'description',
    'onset_date',
    'resolved_date',
    'severity',
    'reaction_code',
    'reaction',
    'product_code',
]

section_df = pd.read_table(s3_path, sep='|', names=columns, header=None)

# (rows, columns) of the loaded frame.
section_df.shape

In [None]:
# Raise the notebook display limits; max_colwidth=None disables cell truncation.
display_options = {
    'display.max_rows': 1000,
    'display.max_columns': 21,
    'display.max_colwidth': None,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# First column of rows at positions 2360 through 2399.
section_df.iloc[2360:2400, 0]

In [None]:
# Count non-null values in every column per client.
# section_df is loaded with explicit string column names, so the first column
# has to be addressed by its label; the integer key 0 is not a column label
# and groupby([0]) raises KeyError.
section_df.groupby(['client_id']).count()

## Read Spark Output

In [None]:
# Location of the Parquet output to load from S3.
s3_path_out = 's3://my-bucket/org/unload/test_file'

parquet_df = pd.read_parquet(s3_path_out)

# (rows, columns) of the Parquet data.
parquet_df.shape

In [None]:
# Preview the first five rows of the Parquet data.
parquet_df.head(n=5)

## Summary Stats

In [None]:
# Pipe-delimited summary file; header=None and no names are given, so pandas
# labels the columns 0, 1, 2, ...
summary_path = 's3://my-bucket/org/unload/test_file'

summary_df = pd.read_table(summary_path, header=None, sep='|')

# (rows, columns) of the summary frame.
summary_df.shape

In [None]:
# Preview the first five rows of the summary frame.
summary_df.head(n=5)

In [None]:
pd.set_option('display.max_rows', 100)

# Keep only the rows whose first column is 'IMMUNIZATIONS', restrict to the
# first three columns, then total the remaining column per (column 0, column 1) pair.
is_immunization = summary_df[0] == 'IMMUNIZATIONS'
immunization_rows = summary_df.loc[is_immunization].iloc[:, :3]
immunization_rows.groupby([0, 1]).sum()