There are two ways of collecting and analyzing NGINX logs in AWS:

1. Download all the log entries from CloudWatch filtered on `StreamPrefix='application/nginx'` and analyze them locally
2. Execute a query with boto3 and analyze the results

This notebook aims to show how to use the second approach.


In [None]:
# Upgrade pip itself first, then install/upgrade the third-party packages.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install --upgrade pip
%pip install boto3 prettytable pandas --upgrade

In [4]:
import boto3
import subprocess
import json
import os
import time, pytz
import re
import random
import pandas as pd
from botocore.exceptions import ClientError
from datetime import datetime, timezone, timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed

# Pin the process timezone to UTC so that local-time conversions such as
# datetime.fromtimestamp() yield UTC values. time.tzset() makes the C library
# re-read TZ; it is only available on Unix.
os.environ['TZ'] = 'UTC'
time.tzset()

import sys
sys.path.append('..')
from awsutils import *

In [None]:
def create_session(profile_name='acl-production', region='us-east-1'):
    """Set up the notebook-wide AWS session and service clients.

    Rebinds the module-level globals ``aws_session``, ``sts_client`` and
    ``logs_client`` and prints the account ID reported by STS, which doubles
    as a quick check that the credentials work.

    Args:
        profile_name: Forwarded to ``set_aws_credentials`` (awsutils helper;
            presumably an AWS CLI profile name -- confirm).
        region: Forwarded to ``set_aws_credentials`` as the AWS region.
    """
    global aws_session, sts_client, logs_client

    # The helper is expected to hand back a session object exposing .client().
    aws_session = set_aws_credentials(profile_name, region)

    # Build the STS and CloudWatch Logs clients from that one session.
    sts_client, logs_client = (
        aws_session.client(service) for service in ('sts', 'logs')
    )

    # Report which account the credentials resolve to.
    caller_identity = sts_client.get_caller_identity()
    print("Current AWS Account ID:", caller_identity["Account"])

create_session()

# Log Collector from AWS CloudWatch

In [None]:
# collect logs from the specified log group and log stream into a file
# (requires `logs_client`, which create_session() publishes as a global)

# Log group to query; uncomment exactly one of the alternatives below.
log_group = 'projects-main'
#log_group = 'reports-main'
#log_group = 'results-main'
#log_group =  'api_proxy-main'
#log_group =  'launchpad-main'
#log_group =  'risks-main'


# Log stream kind; it is interpolated into the 'application/<log_stream>' filter below.
log_stream = 'nginx'
#log_stream = 'application'

# Query window. exact_date comes from awsutils; its result is later passed to
# datetime.fromtimestamp(), so it presumably returns an epoch timestamp for
# (year, month, day, hour, minute) -- TODO confirm units and timezone.
start_date = exact_date(2025, 1, 27, 15, 30)
end_date = exact_date(2025, 1, 27, 16, 10)


# CloudWatch Logs Insights query: the two commands are joined by '|', and the
# filter keeps only events whose @logStream matches 'application/<log_stream>'.
base_query = "fields @timestamp, @message"
base_query += f"| filter @logStream like 'application/{log_stream}'"
print (f"query: {base_query}")


# create folder for the log fragments
# Folder name is <log_stream>_<log_group>_<YYYYMMDD of start_date>; exist_ok
# lets the cell be re-run without failing on an existing folder.
date_str = datetime.fromtimestamp(start_date).strftime("%Y%m%d")
folder_name = f'{log_stream}_{log_group}_{date_str}'
os.makedirs(folder_name, exist_ok=True)

# fetch the log data in slices
# cloudwatch_crawler is an awsutils helper; presumably it runs base_query over
# [start_date, end_date] split into `slices` parts and writes the results into
# folder_name -- confirm against awsutils.
cloudwatch_crawler(logs_client, log_group, base_query, start_date, end_date, folder_name, slices=1)
print("Done.")

In [23]:
# Precompile regular expressions for better performance
# Trailing protocol token of a request line, e.g. " HTTP/1.1", " HTTP/1.0" or
# " HTTP/2.0". The version is matched generically (with the dot escaped) so
# that non-1.1 requests do not keep a suffix that ID_PATTERN would then turn
# into "HTTP/{id}.0", splitting one path into several aggregation keys.
HTTP_PATTERN = re.compile(r' HTTP/\d+(?:\.\d+)?$')
# A run of digits directly after "/" or "=" is treated as a numeric ID.
ID_PATTERN = re.compile(r'([/=])\d+')

# Create a single mapping for all character replacements
# Keys use upper-case hex; lookups upper-case the matched escape first.
CHAR_REPLACEMENTS = {
    '%5D': ']',
    '%5B': '[',
    '%2C': ',',
    '%3A': ':',
    '%7B': '{',
    '%7D': '}',
    '%22': '"',
    '%20': ' ',
    '%3F': '?',
    '%3D': '=',
    '%26': '&',
    '%25': '%',
    '%2F': '/',
    '%5C': '\\',
}

# Create escape pattern at module level
# The hex digits of a percent-escape are case-insensitive, so "%5b" has to be
# decoded exactly like "%5B".
ESCAPE_PATTERN = re.compile(
    '|'.join(map(re.escape, CHAR_REPLACEMENTS.keys())),
    re.IGNORECASE,
)

def normalize_request_path(request):
    """Reduce a logged request line to a canonical key for aggregation.

    Steps, in order:
      1. Strip a trailing " HTTP/<version>" protocol token, if present.
      2. Decode the percent-escapes listed in ``CHAR_REPLACEMENTS`` in a single
         pass, so "%2520" becomes "%20" rather than a space.
      3. Replace every digit run that directly follows "/" or "=" with "{id}".

    Args:
        request: Request string, e.g. "GET /projects/123 HTTP/1.1".

    Returns:
        The normalized string, e.g. "GET /projects/{id}".
    """
    # Remove the protocol suffix first so its version digits are never mistaken for an ID
    request = HTTP_PATTERN.sub('', request)

    # Replace all escaped characters in one pass (case-insensitive hex digits)
    request = ESCAPE_PATTERN.sub(
        lambda m: CHAR_REPLACEMENTS[m.group(0).upper()],
        request
    )

    # Replace numeric IDs using precompiled pattern
    request = ID_PATTERN.sub(r'\1{id}', request)

    return request

def aggregate_requests(aggregated_data, data):
    """Fold a batch of log entries into per-request totals.

    ``aggregated_data`` is updated in place. It maps each normalized request
    string to ``{'count': int, 'total_duration': int}``, where the duration is
    ``request_time`` multiplied by 1000 and truncated to an int (milliseconds).

    Entries missing an expected key are skipped silently. Any other failure
    is printed together with the offending entry, and processing continues.

    Args:
        aggregated_data: Accumulator dict, mutated in place.
        data: Iterable of entries shaped like
            ``{"@message": {"request": ..., "request_time": ...}}``.
    """
    for entry in data:
        try:
            message = entry["@message"]
            path_key = normalize_request_path(message["request"])
            # Scale by 1000 and truncate to whole milliseconds.
            duration_ms = int(float(message["request_time"]) * 1000)

            if path_key not in aggregated_data:
                # First occurrence of this normalized request.
                aggregated_data[path_key] = {'count': 1, 'total_duration': duration_ms}
            else:
                aggregated_data[path_key]['count'] += 1
                aggregated_data[path_key]['total_duration'] += duration_ms
        except KeyError:
            # Entry lacks one of the expected fields; skip it silently.
            continue
        except Exception as e:
            print("Error processing entry:", entry)
            print(e)
            continue


In [None]:
# Read every .json fragment in the folder line by line and fold its entries
# into `aggregated_data` (normalized request -> {'count', 'total_duration'}).
aggregated_data = {}

# NOTE(review): this folder is hard-coded and differs from the name the
# collector cell derives for its configured start date (2025-01-27 would give
# 'nginx_projects-main_20250127', assuming exact_date yields that date) --
# confirm which download is meant to be analysed here.
folder_name = 'nginx_projects-main_20250128'
log_files = [os.path.join(folder_name, f) for f in os.listdir(folder_name) if f.endswith('.json')]

# Running total of entries seen across all fragments (for the progress line).
item_count=0
        
for fname in log_files:
    with open(fname, 'r') as f:
        # Each line is parsed as its own JSON document. len(data) is counted as
        # the number of entries, so each line is presumably a list of log
        # entries -- confirm against the crawler's output format.
        for line in f:
            data = json.loads(line)
            item_count += len(data)
            # '\r' with end="" rewrites the same output line as a live counter.
            print(f'\rnumber of log entries: {item_count}', end="")
            # awsutils helper whose return value is ignored; presumably it
            # decodes each entry's "@message" JSON string in place, since
            # aggregate_requests indexes entry["@message"]["request"] -- confirm.
            parse_json_messages(data)
            aggregate_requests(aggregated_data, data)


In [None]:
# Report count, total and mean duration for every normalized request path
# collected in `aggregated_data`.
print("\nAggregated Request Statistics:")
separator = "-" * 80
for path, stats in aggregated_data.items():
    # Mean duration per request for this path (durations are stored in ms).
    avg_duration = stats['total_duration'] / stats['count']
    print(
        f"\nPath: {path}",
        f"Count: {stats['count']}",
        f"Total Duration (ms): {stats['total_duration']}",
        f"Average Duration (ms): {avg_duration:.2f}",
        separator,
        sep="\n",
    )

    
