# Example of analyzing CloudWatch logs in Python
This notebook provides an example of reading and analyzing AWS CloudWatch logs in Python.  
It can serve as a basis for a more elaborate analytics script.  

Before using:
- make sure you have AWS CLI profiles configured  
- select the correct `profile_name` to connect to AWS  
- be sure to set `log_group` depending on whether you are connecting to dev or prod

In [68]:
import boto3
from datetime import datetime, timedelta
import time
import pandas as pd
import json

In [33]:
# Open a boto3 session for a named AWS CLI profile, then create the
# CloudWatch Logs client that the rest of the notebook queries through.
session = boto3.Session(
    profile_name='default',  # change to your profile name
)
client = session.client(service_name='logs')

In [122]:

def get_cloudwatch_logs(cic_id, n_max=10, log_group='CIC_logs', hours=5, poll_interval=1):
    """Run a CloudWatch Logs Insights query for one client id and wait for it to finish.

    The query selects @message, @timestamp, @logStream and @log for events whose
    ``clientid`` equals ``cic_id``, sorted newest first and limited to ``n_max``.
    It is executed through the module-level ``client``.

    Args:
        cic_id: Client id to filter on. It is interpolated verbatim into the
            query string, so it must not contain a single quote.
        n_max: Maximum number of log events the query returns.
        log_group: Name of the log group to query
            ('cic_logs' in production, 'CIC_logs' in dev).
        hours: Length of the look-back window in hours; the window ends now.
        poll_interval: Seconds to sleep before each poll for results.

    Returns:
        The last ``get_query_results`` response, i.e. the first one whose
        'status' is neither 'Scheduled' nor 'Running'. The response is returned
        as-is whatever the final status is, so callers that care about failures
        should inspect ``response['status']`` before using ``response['results']``.
    """
    query = (
        "fields @message, @timestamp, @logStream, @log"
        f" | filter clientid = '{cic_id}'"
        " | sort @timestamp desc"
        f" | limit {n_max}"
    )

    # Sample the clock once so both ends of the window share the same reference.
    now = datetime.now()
    start_query_response = client.start_query(
        logGroupName=log_group,
        startTime=int((now - timedelta(hours=hours)).timestamp()),
        endTime=int(now.timestamp()),
        queryString=query,
    )
    query_id = start_query_response['queryId']

    # A freshly submitted query can still be queued ('Scheduled') on the first
    # poll; treat that like 'Running' instead of returning an empty result.
    pending_statuses = ('Scheduled', 'Running')
    while True:
        time.sleep(poll_interval)
        response = client.get_query_results(queryId=query_id)
        if response['status'] not in pending_statuses:
            return response

Example of usage and conversion to a pandas DataFrame

In [124]:
# get logs
cic_id = 'CIC-46f31f71-9b6d-5af5-ba63-1c71d639dcc5'
response = get_cloudwatch_logs(cic_id)

# Each result row is a list of {'field': ..., 'value': ...} dicts. Pick the
# '@message' value by field name rather than by position, so the parsing does
# not depend on the order of the fields in the query, then expand the
# 'payload' -> 'logs' list of the JSON message into a frame.
log_frames = [
    pd.json_normalize(
        json.loads({col['field']: col['value'] for col in row}['@message'])['payload']['logs']
    )
    for row in response['results']
]

# make dataframe of log messages; pd.concat raises on an empty list, so fall
# back to an empty frame with the expected columns when nothing matched
if log_frames:
    logs_raw = pd.concat(log_frames, ignore_index=True)
else:
    logs_raw = pd.DataFrame(columns=['service', 'message', 'ts'])

# add column with cic_id and human readable timestamp ('ts' is parsed as
# microseconds since the epoch), newest entries first
df = (
    logs_raw
    .assign(cic_id=cic_id, timestamp=lambda frame: pd.to_datetime(frame['ts'], unit='us'))
    .sort_values(by='timestamp', ascending=False)
)

In [125]:
# query message field for certain errors
# Boolean-mask selection: keep rows whose message mentions both markers.
df.loc[
    df['message'].str.contains("ERROR")
    & df['message'].str.contains("MODBUS IO")
]

Unnamed: 0,service,message,ts,cic_id,timestamp
60,cic-modbus.service,ERROR:root:Received MODBUS IO exception.,1689847046331909,CIC-46f31f71-9b6d-5af5-ba63-1c71d639dcc5,2023-07-20 09:57:26.331909
55,cic-modbus.service,ERROR:root:Received MODBUS IO exception.,1689847045278283,CIC-46f31f71-9b6d-5af5-ba63-1c71d639dcc5,2023-07-20 09:57:25.278283
50,cic-modbus.service,ERROR:root:Received MODBUS IO exception.,1689847044243170,CIC-46f31f71-9b6d-5af5-ba63-1c71d639dcc5,2023-07-20 09:57:24.243170
45,cic-modbus.service,ERROR:root:Received MODBUS IO exception.,1689847043208631,CIC-46f31f71-9b6d-5af5-ba63-1c71d639dcc5,2023-07-20 09:57:23.208631
42,cic-modbus.service,ERROR:root:Received MODBUS IO exception.,1689847042170152,CIC-46f31f71-9b6d-5af5-ba63-1c71d639dcc5,2023-07-20 09:57:22.170152
...,...,...,...,...,...
782,cic-modbus.service,ERROR:root:Received MODBUS IO exception.,1689846881590610,CIC-46f31f71-9b6d-5af5-ba63-1c71d639dcc5,2023-07-20 09:54:41.590610
777,cic-modbus.service,ERROR:root:Received MODBUS IO exception.,1689846880581466,CIC-46f31f71-9b6d-5af5-ba63-1c71d639dcc5,2023-07-20 09:54:40.581466
772,cic-modbus.service,ERROR:root:Received MODBUS IO exception.,1689846879495177,CIC-46f31f71-9b6d-5af5-ba63-1c71d639dcc5,2023-07-20 09:54:39.495177
766,cic-modbus.service,ERROR:root:Received MODBUS IO exception.,1689846878459807,CIC-46f31f71-9b6d-5af5-ba63-1c71d639dcc5,2023-07-20 09:54:38.459807


In [126]:
# Structure of the query response:
# response['results'] is a list of lists (one inner list per result row)
# response['results'][0] is a list of 5 dicts (one for each field in the query + 1)
# response['results'][0][0] is a dict with 2 keys: 'field' ('@message') and 'value' (the message)
# json.loads(response['results'][0][0]['value']) is a dictionary parsed from the JSON message
# pd.json_normalize(json.loads(response['results'][0][0]['value'])['payload']['logs']) is a dataframe of service, message and ts