# Exploratory Data Analysis with S3

This example shows how to easily access data stored as CSV and Parquet files in S3, using the credentials already provided in Workbench.

In [1]:
import boto3
import pandas as pd
import io
import pyarrow

## List files in an S3 bucket

In [6]:
# Print the key of every object in the bucket.
# A single list call returns at most 1,000 keys, and the response carries no
# 'Contents' entry when nothing matches, so page through the results and fall
# back to an empty list for pages without contents.
s3_client = boto3.client('s3')

paginator = s3_client.get_paginator('list_objects_v2')
for page in paginator.paginate(Bucket='tn-example-data'):
    for entry in page.get('Contents', []):
        print(entry['Key'])

bank_failures.parquet
heart.csv
iris.parquet
loan_data/
loan_data/.DS_Store
loan_data/year=2007/.DS_Store
loan_data/year=2007/month=10/part-0.parquet
loan_data/year=2007/month=11/part-0.parquet
loan_data/year=2007/month=12/part-0.parquet
loan_data/year=2007/month=6/part-0.parquet
loan_data/year=2007/month=7/part-0.parquet
loan_data/year=2007/month=8/part-0.parquet
loan_data/year=2007/month=9/part-0.parquet
loan_data/year=2008/.DS_Store
loan_data/year=2008/month=1/part-0.parquet
loan_data/year=2008/month=10/part-0.parquet
loan_data/year=2008/month=11/part-0.parquet
loan_data/year=2008/month=12/part-0.parquet
loan_data/year=2008/month=2/part-0.parquet
loan_data/year=2008/month=3/part-0.parquet
loan_data/year=2008/month=4/part-0.parquet
loan_data/year=2008/month=5/part-0.parquet
loan_data/year=2008/month=6/part-0.parquet
loan_data/year=2008/month=7/part-0.parquet
loan_data/year=2008/month=8/part-0.parquet
loan_data/year=2008/month=9/part-0.parquet
loan_data/year=2009/.DS_Store
loan_data/y

## Access a CSV file stored in S3

Files can be read directly into a session as a DataFrame, or downloaded first and then read into a DataFrame.

1. Read the file directly into a pandas DataFrame

In [7]:
# Fetch the object from S3 and parse its bytes in memory; nothing is written
# to local disk.
s3_client = boto3.client('s3')

bucket, key = "tn-example-data", "heart.csv"

obj = s3_client.get_object(Bucket=bucket, Key=key)
csv_bytes = obj['Body'].read()  # the response body is a stream; read it fully
pd.read_csv(io.BytesIO(csv_bytes))


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


2. Download the CSV file and load it into a pandas DataFrame

In [8]:
# Copy the object from S3 to a local file first, then load that file with pandas.
s3_client = boto3.client('s3')

bucket, key = "tn-example-data", "heart.csv"
filename = "heart-local.csv"  # local path the object is written to

s3_client.download_file(Bucket=bucket, Key=key, Filename=filename)
pd.read_csv(filename)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1
