In [None]:
import os, json
import pandas as pd
import requests

# Aiven token, project name and Kafka service name.
# Each value is read from an environment variable (AIVEN_AUTH_TOKEN,
# AIVEN_PROJECT, KAFKA_SERVICE) so the token does not have to be saved inside
# the notebook. To fill them in by hand instead, replace the '' fallbacks.
aiven_token = os.environ.get('AIVEN_AUTH_TOKEN', '')
aiven_project = os.environ.get('AIVEN_PROJECT', '')
kafka_service = os.environ.get('KAFKA_SERVICE', '')

# Stop early with a readable message rather than sending a request to a
# malformed URL when one of the settings is still blank.
if not (aiven_token and aiven_project and kafka_service):
    raise ValueError('Set aiven_token, aiven_project and kafka_service before running this cell')

# Topic-list endpoint of the service; the per-topic cell appends '/<topic_name>' to it.
get_topic_data = 'https://api.aiven.io/v1/project/' + aiven_project + '/service/' + kafka_service + '/topic'
response = requests.get(
    get_topic_data,
    headers={'Authorization': 'aivenv1 ' + aiven_token},
    timeout=30,  # without a timeout, requests may wait indefinitely
)
# Surface HTTP failures (bad token, unknown project/service) as an HTTPError
# instead of a KeyError on the missing 'topics' key below.
response.raise_for_status()

# One row per topic, flattened from the 'topics' list of the JSON response.
topics = pd.json_normalize(response.json()['topics'])
topics.head(3)

In [None]:
from tqdm import tqdm

# Build `all`: one row per partition for every topic listed in `topics`,
# tagged with the topic name plus the "group" and "component" derived from it.
#
# Topic naming convention
#
#  aaaa.bbbbb.cccc.ddddd
#  ----
#    |  -----
#  group  |
#      component
#
# `group` is the first dot-separated segment. `component` is stored as the
# first two segments joined ("aaaa.bbbbb") when the name has at least two
# dots; otherwise it falls back to the group value.
#
# NOTE: the name `all` shadows the Python builtin; it is kept because the
# remaining cells of this notebook refer to it.

partition_frames = []  # per-topic frames, concatenated once after the loop
with requests.Session() as session:  # reuse one connection for all topic requests
    session.headers.update({'Authorization': 'aivenv1 ' + aiven_token})
    for ind in tqdm(topics.index):
        topic_name = topics.at[ind, 'topic_name']
        response = session.get(get_topic_data + '/' + topic_name, timeout=30)
        # Raise on HTTP errors rather than failing later on a missing 'topic' key.
        response.raise_for_status()
        t = pd.json_normalize(response.json()['topic'], record_path=['partitions'])
        t['topic'] = topic_name

        segments = topic_name.split('.')
        t['group'] = segments[0]
        # More than one dot -> "first.second"; zero or one dot -> same as group.
        t['component'] = '.'.join(segments[:2]) if len(segments) > 2 else segments[0]

        # presumably `size` is reported in bytes, making this MiB — confirm against the API
        t['size_mb'] = t['size'] / 1024 / 1024
        partition_frames.append(t)

# A single concat avoids re-copying the accumulated frame on every iteration;
# pd.concat raises on an empty list, so keep the empty-frame result for that case.
all = pd.concat(partition_frames) if partition_frames else pd.DataFrame()

all.head(3)


In [None]:
# Summary statistics (pandas `describe` defaults) over the per-partition frame.
all.describe()

In [None]:
# Per topic: number of partitions, summed partition size and the standard
# deviation of partition sizes, ordered from the largest topic down.
(
    all
    .groupby('topic')
    .agg(
        part_count=('partition', 'count'),
        topic_size_mb=('size_mb', 'sum'),
        part_stddev=('size_mb', 'std'),
    )
    .sort_values(by='topic_size_mb', ascending=False)
)

In [None]:
import matplotlib.pyplot as plt

# Sum partition sizes per "group" value.
by_group = (
    all
    .groupby(['group'])
    .agg(group_size_mb=('size_mb', 'sum'))
)

# Bar chart of the ten groups with the largest summed size.
by_group['group_size_mb'].nlargest(10).plot.bar()
plt.show()

In [None]:
# Sum partition sizes per "component" value, then chart the ten largest.
by_component = (
    all
    .groupby(['component'])
    .agg(component_size_mb=('size_mb', 'sum'))
)

ax = (
    by_component['component_size_mb']
    .nlargest(10)
    .plot.bar()
)
plt.show()

In [None]:
# Twenty largest components by summed size.
# `by_component` only carries the 'component_size_mb' column, so that is the
# column to select here (selecting 'size_mb' raises KeyError).
# To restrict the chart to a single prefix, use e.g.:
#by_component[by_component.index.str.startswith('protocol.')]['component_size_mb'].nlargest(20).plot(kind='bar')
by_component['component_size_mb'].nlargest(20).plot(kind='bar')
plt.show()

In [None]:
# Keep only topics whose name ends with 'changelog' (missing names count as
# no match), then list partition count and summed size, largest first.
filtered = all[all['topic'].str.endswith('changelog', na=False)]
(
    filtered
    .groupby('topic')
    .agg(
        partitions=('partition', 'count'),
        topic_size_mb=('size_mb', 'sum'),
    )
    .sort_values(by='topic_size_mb', ascending=False)
)

In [None]:
# Twenty largest groups, leaving out every group whose name starts with 'protocol'.
(
    by_group
    .loc[~by_group.index.str.startswith('protocol'), 'group_size_mb']
    .nlargest(20)
    .plot.bar()
)
plt.show()

In [None]:
# Twenty largest components among those whose name starts with 'mongodb.'.
(
    by_component
    .loc[by_component.index.str.startswith('mongodb.'), 'component_size_mb']
    .nlargest(20)
    .plot.bar()
)
plt.show()

In [None]:
# No filter is applied here, so every topic is listed. To narrow the table to
# one prefix, swap the assignment for e.g.:
#filtered = all.loc[all['topic'].str.startswith('mongodb.', na=False)]
filtered = all
(
    filtered
    .groupby('topic')
    .agg(
        partitions=('partition', 'count'),
        total_size_mb=('size_mb', 'sum'),
    )
    .sort_values(by='total_size_mb', ascending=False)
)