In [2]:
%pip install matplotlib pandas prettytable


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
from datetime import datetime
from datetime import timedelta
import os
import sys

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [4]:
def import_data(fname):
    # Import data from CSV into a panda dataframe from ./extract-2024-01-23T12_29_42.992Z.csv   
    df = pd.read_csv(fname, header=0, sep=',', parse_dates=[0], index_col=0)


    # print number of rows and columns
    print(df.shape)

    # print the column names
    print(df.columns)

    return df


In [5]:
def preprocess(df):
    # get rid of the double quotes first
    df['Duration_float'] = df['Duration'].str.replace('"', '')

    #then get rid of the trailing 'ms' or 's' and multiply by 1000 those wich ended to 's' to convert to milliseconds
    df['Duration_float'] = df['Duration_float'].str.replace('ms', '')
    df['Duration_float'] = df['Duration_float'].str.replace('s', '000')

    # convert Duration to float
    df['Duration_float'] = df['Duration_float'].astype(float)



# Active Records statistics

In [6]:
df = import_data('./extract-2024-01-23T12_29_42.992Z.csv')
preprocess(df)
# group by Resource, sum the Duration and calculate the average active_record.instantiation.record_count
df_aggregated = df.groupby('Resource').agg({'Duration_float': 'sum', 'active_record.instantiation.record_count': 'mean'}).sort_values(by=['Duration_float'], ascending=False)

# convert Duration to human readable format
from prettytable import PrettyTable
x = PrettyTable()
x.field_names = ["Resource", "Duration", "Record count"]
for index, row in df_aggregated.iterrows():
    if row['Duration_float'] < 10000: # 10 seconds
        break
    x.add_row([index, timedelta(milliseconds=row['Duration_float']), row['active_record.instantiation.record_count']])
print(x)


(33377, 6)
Index(['Service', 'Resource', 'Duration', 'Http Method', 'Status Code',
       'active_record.instantiation.record_count'],
      dtype='object')
+---------------------------------------+----------------+--------------------+
|                Resource               |    Duration    |    Record count    |
+---------------------------------------+----------------+--------------------+
|           "GroupProjectRole"          | 0:32:18.348800 | 53457.32551020408  |
|         "EntityResponsibility"        | 0:22:55.798110 | 51118.686440677964 |
| "CustomProjectPlanningAttributeValue" | 0:20:16.159170 | 13960.707022834986 |
|               "Signoff"               | 0:15:14.728880 | 68248.12162162163  |
|     "CustomControlAttributeValue"     | 0:13:45.311350 | 42503.359628770304 |
|               "Control"               | 0:11:47.960760 | 84361.17026748971  |
|             "ProjectRole"             | 0:10:45.424460 | 543950.7462028418  |
|               "FlexDate"              | 0

# net/http statistics

In [7]:
df = import_data('./extract-httpnet-2024-01-30T16_51_38.690Z.csv')
preprocess(df)
# group by Resource, sum the Duration and calculate the average active_record.instantiation.record_count
df_aggregated = df.groupby('network.destination.ip').agg({'Duration_float': 'sum'}).sort_values(by=['Duration_float'], ascending=False)


# convert Duration to human readable format
from prettytable import PrettyTable
x = PrettyTable()
x.field_names = ["Resource", "Duration"]
for index, row in df_aggregated.iterrows():
    if row['Duration_float'] < 10000: # 10 seconds
        break
    x.add_row([index, timedelta(milliseconds=row['Duration_float'])])
print(x)



(94438, 7)
Index(['Service', 'Resource', 'Duration', 'Http Method', 'Status Code',
       'network.destination.ip', 'Request path'],
      dtype='object')
+-----------------------------------------------------------------------------------+----------------+
|                                      Resource                                     |    Duration    |
+-----------------------------------------------------------------------------------+----------------+
|                          "sqs.eu-central-1.amazonaws.com"                         | 8:29:57.649320 |
|                              "accounts.highbond.com"                              | 4:19:07.784660 |
|                                "apis.highbond.com"                                | 1:30:06.611890 |
|                           "notifier-configs.airbrake.io"                          | 0:27:54.017770 |
| "vpc-opensearch-eu-main-tznlywm3luxlqyizpm7ai4kl3e.eu-central-1.es.amazonaws.com" | 0:15:20.549390 |
|          "projects-

In [17]:
# print 

# filter df for network.destination.ip="accounts.highbond.com"
df_filtered = df[df['network.destination.ip'] == '"accounts.highbond.com"']
print(df_filtered.shape)


# replace the user string in paths like this: /users/ddgx9-9TpvWVqxZzjjsb/groups using regex: r'/users/[^/]+'
df_filtered['Request path'] = df_filtered['Request path'].str.replace(r'/users/[^/]+', '/users/<user_id>', regex=True)


# aggregate data by 'Request path' and sum the Duration
df_aggregated = df_filtered.groupby('Request path').agg({'Duration_float': 'sum'}).sort_values(by=['Duration_float'], ascending=False)



# convert Duration to human readable format
from prettytable import PrettyTable
x = PrettyTable()
x.field_names = ["Resource", "Duration"]
for index, row in df_aggregated.iterrows():
    if row['Duration_float'] < 10000: # 10 seconds
        break
    x.add_row([index, timedelta(milliseconds=row['Duration_float'])])
print(x)

(46918, 8)
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------+
|                                                                                 Resource                                                                                |    Duration    |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------+
|                                                                          "/api/users/<user_id>                                                                          | 1:41:50.597510 |
|                                                                      "/api/users/<user_id>/groups"                                                                      | 1:27:04.386060 |
|                                           

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Request path'] = df_filtered['Request path'].str.replace(r'/users/[^/]+', '/users/<user_id>', regex=True)
