In [14]:
%pip install matplotlib pandas 


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
from datetime import datetime
from datetime import timedelta
import os
import sys

In [76]:
def import_data(fname):
    
    # Import data from CSV into a panda dataframe
    df = pd.read_csv(fname, header=0, sep=',', parse_dates=[0], index_col=0)

    # print number of rows and columns
    print(df.shape)

    # print the column names
    print(df.columns)

    return df


def preprocess(df):
    # get rid of the double quotes first
    df['Duration_float'] = df['Duration'].str.replace('"', '')

    # now Duration_float is either in milliseconds or seconds, we need to convert everything to milliseconds
    # if it ends with 'ms' we just need to get rid of the 'ms'
    # if it ends with 's' we need to get rid of the 's' and multiply by 1000
    df['Duration_float'] = df['Duration_float'].str.replace(r'.dd', '', regex=True)
    df['Duration_float'] = df['Duration_float'].str.replace(r'ms', '', regex=True)
    df['Duration_float'] = df['Duration_float'].str.replace(r's', '000', regex=True)

    # convert to float
    df['Duration_float'] = df['Duration_float'].astype(float)

# Active Records statistics

In [84]:
df = import_data('./extract-2024-01-23T12_29_42.992Z.csv')
preprocess(df)

df.rename(columns={'active_record.instantiation.record_count': 'record_count'}, inplace=True)

# group by Resource, sum the Duration and calculate the average active_record.instantiation.record_count
df_aggregated = df.groupby('Resource').agg({'Duration_float': 'sum', 'record_count': 'mean', 'Resource': 'count'}).sort_values(by=['Duration_float'], ascending=False)

df_aggregated['AVG Duration ms'] = df_aggregated['Duration_float'] / df_aggregated['Resource']

# convert Duration to human readable format
df_aggregated['Duration_float'] = df_aggregated['Duration_float'].apply(lambda x: timedelta(milliseconds=x))

df_aggregated.rename(columns={'Duration_float': 'Duration', 'record_count': 'Avg record count', 'Resource': 'Number of calls'}, inplace=True)


# print the output as a table in markdown format
print(df_aggregated.head(30).to_markdown())



(33377, 6)
Index(['Service', 'Resource', 'Duration', 'Http Method', 'Status Code',
       'active_record.instantiation.record_count'],
      dtype='object')
| Resource                              | Duration               |   Avg record count |   Number of calls |   AVG Duration ms |
|:--------------------------------------|:-----------------------|-------------------:|------------------:|------------------:|
| "GroupProjectRole"                    | 0 days 00:32:18.348800 |            53457.3 |              1960 |           988.953 |
| "EntityResponsibility"                | 0 days 00:22:55.798110 |            51118.7 |              2832 |           485.804 |
| "CustomProjectPlanningAttributeValue" | 0 days 00:20:16.159170 |            13960.7 |              2321 |           523.981 |
| "Signoff"                             | 0 days 00:15:14.728880 |            68248.1 |              2368 |           386.288 |
| "CustomControlAttributeValue"         | 0 days 00:13:45.311350 |         

# net/http statistics

In [18]:
# replace """ with " in the file content
def replace_triple_quotes(fname):
    with open(fname, 'r') as f:
        file_content = f.read()
    file_content = file_content.replace('"""', '"')
    with open(fname, 'w') as f:
        f.write(file_content)

#replace_triple_quotes('./extract-httpnet-2024-01-30T16_51_38.690Z.csv')

In [85]:
df = import_data('./extract-httpnet-2024-01-30T16_51_38.690Z.csv')
preprocess(df)

df.rename(columns={'network.destination.ip': 'destination'}, inplace=True)

# group by Resource, sum the Duration
df_aggregated = df.groupby('destination').agg({'Duration_float': 'sum', 'destination': 'count'}).sort_values(by=['Duration_float'], ascending=False)

# convert Duration to human readable format
df_aggregated['AVG Duration ms'] = df_aggregated['Duration_float'] / df_aggregated['destination']
df_aggregated['Duration_float'] = df_aggregated['Duration_float'].apply(lambda x: timedelta(milliseconds=x))
df_aggregated.rename(columns={'Duration_float': 'Duration'}, inplace=True)
df_aggregated.rename(columns={'destination': 'count'}, inplace=True)

# print the output as a table in markdown format
print(df_aggregated.head(20).to_markdown())


(94438, 7)
Index(['Service', 'Resource', 'Duration', 'Http Method', 'Status Code',
       'network.destination.ip', 'Request path'],
      dtype='object')
| destination                                                                     | Duration               |   count |   AVG Duration ms |
|:--------------------------------------------------------------------------------|:-----------------------|--------:|------------------:|
| sqs.eu-central-1.amazonaws.com                                                  | 0 days 08:29:57.649320 |   31198 |           980.757 |
| accounts.highbond.com                                                           | 0 days 04:19:07.784660 |   46918 |           331.382 |
| apis.highbond.com                                                               | 0 days 01:30:06.611890 |    6554 |           824.933 |
| notifier-configs.airbrake.io                                                    | 0 days 00:27:54.017770 |    4382 |           382.021 |
| vpc-opens

## accounts.highbond.com

In [87]:
# filter df for network.destination.ip="accounts.highbond.com"
df_filtered = df[df['destination'] == 'accounts.highbond.com']
print(df_filtered.shape)


# replace the user string in paths like this: /users/ddgx9-9TpvWVqxZzjjsb/groups using regex: r'/users/[^/]+'
df_filtered['Request path'] = df_filtered['Request path'].str.replace(r'/users/[^/]+', '/users/<user_id>', regex=True)
df_filtered['Request path'] = df_filtered['Request path'].str.replace(r'/authorize/[^/]+', '/authorize/<token>', regex=True)
df_filtered['Request path'] = df_filtered['Request path'].str.replace(r'/login/oauth/authorize[^/]+', '/login/oauth/authorize?client_id', regex=True)


# aggregate data by 'Request path' and sum the Duration
df_aggregated = df_filtered.groupby('Request path').agg({'Duration_float': 'sum', 'Request path': 'count'}).sort_values(by=['Duration_float'], ascending=False)

df_aggregated['Duration AVG ms'] = df_aggregated['Duration_float'] / df_aggregated['Request path']
# convert Duration to human readable format
df_aggregated['Duration_float'] = df_aggregated['Duration_float'].apply(lambda x: timedelta(milliseconds=x))

df_aggregated.rename(columns={'Duration_float': 'Duration Sum', 'Request path': 'Count'}, inplace=True)


# print the output as a table in markdown format
print(df_aggregated.to_markdown())

(46918, 8)
| Request path                     | Duration Sum           |   Count |   Duration AVG ms |
|:---------------------------------|:-----------------------|--------:|------------------:|
| /api/users/<user_id>             | 0 days 01:41:50.597510 |   13017 |           469.432 |
| /api/users/<user_id>/groups      | 0 days 01:27:04.386060 |   13008 |           401.629 |
| /api/sessions/get                | 0 days 00:29:58.306690 |   11422 |           157.442 |
| /api/sessions/refresh            | 0 days 00:18:50.449430 |    6738 |           167.772 |
| /login/oauth/token               | 0 days 00:09:37.503750 |    1081 |           534.231 |
| /login/oauth/authorize?client_id | 0 days 00:06:26.218060 |     879 |           439.383 |
| /oauth/authorize/<token>         | 0 days 00:05:20.323160 |     773 |           414.39  |


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Request path'] = df_filtered['Request path'].str.replace(r'/users/[^/]+', '/users/<user_id>', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Request path'] = df_filtered['Request path'].str.replace(r'/authorize/[^/]+', '/authorize/<token>', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

## apis.highbonds.com

In [21]:
# filter df for network.destination.ip="accounts.highbond.com"
df_filtered = df[df['network.destination.ip'] == '"apis.highbond.com"']
print(df_filtered.shape)



(0, 8)


# Postgress statistics

In [89]:
df = import_data('./extract-postgress-2024-02-02T12_20_14.334Z.csv')
preprocess(df)

# group by Resource, sum the Duration and count the appearances of each Resource
df_aggregated = df.groupby('Resource').agg({'Duration_float': 'sum', 'Resource': 'count'}).sort_values(by=['Duration_float'], ascending=False)

df_aggregated['AVG Duration ms'] = df_aggregated['Duration_float'] / df_aggregated['Resource']


#convert Duration to human readable format
df_aggregated['Duration_float'] = df_aggregated['Duration_float'].apply(lambda x: timedelta(milliseconds=x))

df_aggregated.rename(columns={'Duration_float': 'Duration', 'Resource': 'Number of calls'}, inplace=True)

# print the output as a table in markdown format
print(df_aggregated.head(30).to_markdown())


(100000, 5)
Index(['Service', 'Resource', 'Duration', 'Http Method', 'Status Code'], dtype='object')
| Resource                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         