# Top S3 Downloaders

## Step 1: Import Python Packages

In [None]:
import json                      # Needed to JSONify data for pandas
import time                      # Used to wait for query to finish
from datetime import datetime    # DateTime package for time conversions
from io import StringIO          # File-like wrapper for JSON text given to pandas

import boto3                     # Amazon Python SDK
import pandas as pd              # Used to manipulate data

## Step 2: Query for s3:GetObject

In [None]:
# Query window: the 24 hours (86400 seconds) ending now, as epoch seconds
end_time = datetime.now().timestamp()
start_time = end_time - 86400

# Create CloudWatch Logs client and start a Logs Insights query that returns
# the raw CloudTrail record (@message) of every S3 GetObject event
client = boto3.client('logs')
response = client.start_query(
    queryLanguage='CWLI',
    logGroupName='/baker221b/cloudtrail',
    startTime=int(start_time),
    endTime=int(end_time),
    queryString='fields @message | filter eventSource == "s3.amazonaws.com" and eventName == "GetObject"'
)
query_id = response['queryId']

# The query runs asynchronously, so poll until it leaves the in-progress
# states instead of assuming it is done after a fixed delay
while True:
    response = client.get_query_results(
        queryId=query_id
    )
    status = response['status']
    if status not in ('Scheduled', 'Running'):
        break
    time.sleep(1)

# Any final status other than Complete means the results cannot be trusted
if status != 'Complete':
    raise RuntimeError(f'CloudWatch Logs query {query_id} ended with status {status}')
results = response['results']

# Each result row is a list of {'field': ..., 'value': ...} pairs; pick the
# @message pair by name rather than relying on its position in the row
results_with_message = []
for result in results:
    for field in result:
        if field['field'] == '@message':
            results_with_message.append(field['value'])
            break

# Shove data into a pandas DataFrame
data = [json.loads(item) for item in results_with_message]
if data:
    # read_json expects a path or file-like object, so wrap the JSON text
    df = pd.read_json(StringIO(json.dumps(data)), orient='records')
    # Flatten the nested additionalEventData dicts into their own columns
    extended_event_data = pd.json_normalize(df['additionalEventData'])
    df = pd.concat([df, extended_event_data], axis=1)
else:
    # No matching events: keep the columns the later steps read so they
    # produce empty tables instead of raising KeyError
    df = pd.DataFrame(columns=['sourceIPAddress', 'bytesTransferredOut'])
df

## Step 3: Detection 1: Rank Top Downloaders by Download Occurrences

In [None]:
# Count GetObject events per source IP address; value_counts already orders
# the result from most to fewest occurrences
occurrences_by_ip = df['sourceIPAddress'].value_counts()
counts = occurrences_by_ip.rename_axis('source').reset_index(name='occurrences')
counts

## Step 4: Detection 2: Rank Top Downloaders by Sum of Downloaded File Sizes

In [None]:
# Total bytes transferred out per source IP address
bytes_out_by_ip = df.groupby('sourceIPAddress')['bytesTransferredOut'].sum()

# groupby orders rows by IP address, so sort by the total to get a ranking
# with the largest downloader first
sum_by_ip = bytes_out_by_ip.sort_values(ascending=False).reset_index()
sum_by_ip.columns = ['source', 'totalDataOut']
sum_by_ip