In [1]:
import os.path

import boto3
import pandas as pd
from decouple import config
from s3logparse.s3logparse import parse_log_lines, LogLine

In [2]:
# Build the S3 resource handle. Both credentials are looked up through
# decouple's config() so that no secret is written into the notebook itself.
s3 = boto3.resource(
    service_name='s3',
    aws_access_key_id=config('ACCESS_KEY'),
    aws_secret_access_key=config('SECRET_KEY'),
)

In [3]:
# Handle to the bucket whose objects are listed and downloaded in the cells below.
bucket = s3.Bucket('encode-public-logs')

In [4]:
# Directory (relative to the current working directory) in which downloaded
# log files are cached between runs.
LOG_PATH = 's3_logs'

# exist_ok=True keeps this cell idempotent on re-run without a separate
# exists() check, which removes the check-then-create race and fails early
# if the path exists but is not a directory.
os.makedirs(LOG_PATH, exist_ok=True)

In [5]:
def get_local_or_download(name):
    """Return an open handle to the log object `name`, downloading it if needed.

    The object is cached at ``{LOG_PATH}/{name}``. When that file already
    exists it is opened directly and `bucket` is not contacted.

    Args:
        name: Key of the object in `bucket`; also used as the relative path
            of the cached copy below ``LOG_PATH``.

    Returns:
        A file object opened for reading in text mode. The caller owns the
        handle and must close it (for example by using it in a ``with``
        statement).
    """
    local_path = f'{LOG_PATH}/{name}'
    if not os.path.exists(local_path):
        # A key may contain '/' separators; make sure the matching local
        # directory exists, otherwise the download cannot create its target.
        os.makedirs(os.path.dirname(local_path), exist_ok=True)
        bucket.download_file(name, local_path)
    return open(local_path)

In [6]:
# Column names come straight from the LogLine record type so that they stay
# aligned with the fields of every parsed entry.
column_headers = LogLine._fields
rows = []

# Only the first 100 objects whose key starts with '2019' are read.
for log_object in bucket.objects.filter(Prefix='2019').limit(100):
    with get_local_or_download(log_object.key) as log_file:
        # Each parsed entry becomes one plain list, i.e. one DataFrame row.
        rows.extend(
            list(entry) for entry in parse_log_lines(log_file.readlines())
        )

data = pd.DataFrame(rows, columns=column_headers)
print(data)

                                          bucket_owner         bucket  \
0    50fe8c9d2e5e9d4db8f4fd5ff68ec949de9d4ca39756c3...  encode-public   
1    50fe8c9d2e5e9d4db8f4fd5ff68ec949de9d4ca39756c3...  encode-public   
2    50fe8c9d2e5e9d4db8f4fd5ff68ec949de9d4ca39756c3...  encode-public   
3    50fe8c9d2e5e9d4db8f4fd5ff68ec949de9d4ca39756c3...  encode-public   
4    50fe8c9d2e5e9d4db8f4fd5ff68ec949de9d4ca39756c3...  encode-public   
5    50fe8c9d2e5e9d4db8f4fd5ff68ec949de9d4ca39756c3...  encode-public   
6    50fe8c9d2e5e9d4db8f4fd5ff68ec949de9d4ca39756c3...  encode-public   
7    50fe8c9d2e5e9d4db8f4fd5ff68ec949de9d4ca39756c3...  encode-public   
8    50fe8c9d2e5e9d4db8f4fd5ff68ec949de9d4ca39756c3...  encode-public   
9    50fe8c9d2e5e9d4db8f4fd5ff68ec949de9d4ca39756c3...  encode-public   
10   50fe8c9d2e5e9d4db8f4fd5ff68ec949de9d4ca39756c3...  encode-public   
11   50fe8c9d2e5e9d4db8f4fd5ff68ec949de9d4ca39756c3...  encode-public   
12   50fe8c9d2e5e9d4db8f4fd5ff68ec949de9d4ca39756c3