# Example of analyzing CloudWatch logs in Python
This script provides an example of reading and analyzing CloudWatch logs in Python.  
It can serve as a basis for a more elaborate analytics script.  

Before using: 
- make sure you have AWS CLI profiles configured  
- select the correct profile_name to connect to AWS  
- be sure to change 'log_group' variable depending on whether you are connecting to dev or prod

In [62]:
import json
import os
import time
from datetime import datetime, timedelta
from os import path

import boto3
import pandas as pd

In [63]:
# Open an AWS session for the selected CLI profile and region, then create the
# client for the 'logs' service that the query function below relies on.
session = boto3.Session(
    region_name='eu-west-1',
    profile_name='production',  # change to your profile name
)
client = session.client('logs')

In [64]:

def get_cloudwatch_logs(cic_id, n_max=100, start_time=None, end_time=None, log_group='cic_logs'):
    """Run a CloudWatch Logs Insights query for one client id and return its results.

    The query is started through the module-level ``client`` and polled once per
    second until it is no longer scheduled or running.

    Args:
        cic_id: Value matched against the ``clientid`` field in the query filter.
        n_max: Maximum number of log events to return (the query ``limit``).
        start_time: Start of the query window, either a ``datetime`` or an
            integer epoch timestamp in seconds. Defaults to five hours before
            the moment of the call.
        end_time: End of the query window, either a ``datetime`` or an integer
            epoch timestamp in seconds. Defaults to the moment of the call.
        log_group: Name of the log group to query: 'cic_logs' in production,
            'CIC_logs' in dev.

    Returns:
        The last ``get_query_results`` response. The query may have ended in a
        state other than 'Complete', so check ``response['status']`` before
        relying on ``response['results']``.
    """
    # The default window is resolved here, at call time. Putting datetime.now()
    # in the signature would freeze it at the moment the function was defined.
    now = datetime.now()
    if start_time is None:
        start_time = now - timedelta(hours=5)
    if end_time is None:
        end_time = now

    # start_query expects integer epoch seconds
    if isinstance(start_time, datetime):
        start_time = int(start_time.timestamp())
    if isinstance(end_time, datetime):
        end_time = int(end_time.timestamp())

    query = f"fields @message, @timestamp, @logStream, @log | filter clientid = '{cic_id}' | limit {n_max}"

    start_query_response = client.start_query(
        logGroupName=log_group,
        startTime=start_time,
        endTime=end_time,
        queryString=query,
    )
    query_id = start_query_response['queryId']

    # A freshly started query can report 'Scheduled' before it becomes
    # 'Running'; both mean the results are not ready yet.
    pending_states = ('Scheduled', 'Running')
    while True:
        time.sleep(1)
        response = client.get_query_results(queryId=query_id)
        if response['status'] not in pending_states:
            return response

Example of usage and conversion to a pandas DataFrame

In [29]:
# Fetch the logs of a single CIC for a fixed time window
cic_id = 'CIC-3d024a6b-c933-54e0-b1f9-52bae643355d'
start_time = datetime(2023, 7, 11, 4, 0, 0)
end_time = datetime(2023, 7, 11, 9, 59, 59)
response = get_cloudwatch_logs(cic_id, n_max=10000, start_time=start_time, end_time=end_time)


def _row_to_frame(row):
    """Flatten the 'logs' list found in the JSON payload of a result row's first field."""
    message = json.loads(row[0]['value'])
    return pd.json_normalize(message['payload']['logs'])


# Build one frame per result row and stack them into a single dataframe
df = pd.concat([_row_to_frame(row) for row in response['results']], ignore_index=True)

# Tag every line with the CIC id, derive a readable timestamp from the 'ts'
# column (interpreted as microseconds) and put the newest entries first
df['cic_id'] = cic_id
df['timestamp'] = pd.to_datetime(df['ts'], unit='us')
df = df.sort_values(by='timestamp', ascending=False)

In [31]:
# Select the log lines whose 'service' column contains a given substring.
# na=False makes rows without a 'service' value count as non-matching; without
# it str.contains returns NaN for those rows and the boolean indexing raises.
df_ot = df[df['service'].str.contains('opentherm', na=False)]
df_nm = df[df['service'].str.contains('Network', na=False)]
# Example of filtering on the message text instead:
# df.query('message.str.contains("ERROR") & message.str.contains("MODBUS IO")', engine='python')

In [35]:
# Save data as data/<start date>_<cic_id>_logs.xlsx
file_name = f'{start_time.strftime("%Y-%m-%d")}_{cic_id}_logs.xlsx'
os.makedirs('data', exist_ok=True)  # to_excel does not create a missing output directory
df.to_excel(path.join('data', file_name), index=False)