In [1]:
import os.path

import boto3
import pandas as pd
from decouple import config
from s3logparse.s3logparse import parse_log_lines, LogLine

In [2]:
s3 = boto3.resource(
    's3',
    aws_access_key_id=config('ACCESS_KEY'),
    aws_secret_access_key=config('SECRET_KEY')
)

In [3]:
bucket = s3.Bucket(name='encode-public-logs')

In [4]:
LOG_PATH = 's3_logs'

if not os.path.exists(LOG_PATH):
    os.mkdir(LOG_PATH)

In [5]:
def get_local_or_download(name):
    local_path = f'{LOG_PATH}/{name}'
    if not os.path.exists(local_path):
        bucket.download_file(name, local_path)
    return open(local_path)

In [9]:
rows = []
column_headers = LogLine._fields

ALREADY_TRIED_FILE_PATH = 'already-tried.txt'

already_tried = []
if os.path.exists(ALREADY_TRIED_FILE_PATH):
    with open(ALREADY_TRIED_FILE_PATH, 'r') as already_tried_file:
        already_tried = already_tried_file.read().splitlines()
        
def is_log_file_we_want(content):
    return b'REST.GET.OBJECT' in content;

for file in bucket.objects.filter(Prefix='2019').limit(3000):
    if file.key in already_tried:
        continue
    local_path = f'{LOG_PATH}/{file.key}'
    if os.path.exists(local_path):
        with open(local_path, 'r') as local_copy:
            content = local_copy.readlines()
            for log_entry in parse_log_lines(content):
                rows.append(list(log_entry))
    else:
        content = file.get()['Body'].read()
        if is_log_file_we_want(content):
            with open(local_path, 'w') as local_copy:
                local_copy.write(str(content))
                rows.append(list(parse_log_lines(content)))
        else:
            already_tried.append(file.key)
with open(ALREADY_TRIED_FILE_PATH, 'w') as already_tried_file:
    for tried in already_tried:
        already_tried_file.write(tried + '\n')

data = pd.DataFrame(rows, columns=column_headers)
print(data)

                                        bucket_owner         bucket  \
0  b'50fe8c9d2e5e9d4db8f4fd5ff68ec949de9d4ca39756...  encode-public   
1  b'50fe8c9d2e5e9d4db8f4fd5ff68ec949de9d4ca39756...  encode-public   
2  b'50fe8c9d2e5e9d4db8f4fd5ff68ec949de9d4ca39756...  encode-public   
3  b'50fe8c9d2e5e9d4db8f4fd5ff68ec949de9d4ca39756...  encode-public   

                  timestamp     remote_ip  \
0 2019-02-08 21:57:53+00:00  171.66.209.5   
1 2019-02-08 21:58:05+00:00  171.66.209.5   
2 2019-02-08 22:58:02+00:00  171.66.209.4   
3 2019-02-08 23:00:15+00:00  171.66.209.5   

                                    requester        request_id  \
0  arn:aws:iam::407227577691:user/caseylitton  A1E34A80F6423271   
1  arn:aws:iam::407227577691:user/caseylitton  CB382FA22780C50D   
2                                        None  352478B159D39173   
3                                        None  E3F97BCC8A464D76   

                            operation  \
0  REST.GET.OBJECT_LOCK_CONFIGURATION   
