In [1]:
import os
import pandas
import matplotlib.pyplot as plt
import matplotlib.image as img
import os
import seaborn as sns
# Set default Seaborn style
sns.set(style="darkgrid")
#from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression

In [2]:
import math

def roundup(x):
    return int(math.ceil(x / 10.0)) * 10

# This is still experimental

This notebook allows you to calculate the **ACM Hub Observability** sizing for fleet management based on your inputs as explained in __Critical Input Parameters to Size__ section

## Methodology of Calculation
1. We have obtained the number of time series per cluster by running https://github.com/stolostron/multicluster-observability-operator/tree/main/tools/simulator/metrics-collector/metrics-extractor

1. From number of time series AND number of clusters, we infer
    1. Memory requirement (2 hours of this time series data is stored in memory)
    1. CPU Requirement
    1. Disk needed for PVs (volume of data stored is dictated by settings in MultiCluster Observability CR)
    1. Storage needed for Object store (volume of data stored is dictated by settings in MultiCluster Observability CR). This is has many simplifying assumptions.

_All calculations are in this notebook below - and they are easy to follow._
 

In [3]:
master_output_df = pandas.DataFrame(columns=['NumManagedCluster', 'TimeSeriesPerCluster', 'MemoryGB','CPUCorevCPU','PVCGB','ObjStoreGB'])

In [4]:
fname = os.path.join("..","images","sizing_summary_based_on_max_run_08222023.csv")
master_df = pandas.read_csv(fname,index_col=0)
#master_df.info()
#master_df


In [5]:
#0.5 bytes - 1.5 bytes is maximum range. This is for storage in disk (not memory)
observed_bytes_for_storage_per_ts=2

### Formulating User Input

In [6]:
def set_input(NumMngCluster,TimeSeriesPerCluster):    
    number_of_managed_clusters=NumMngCluster
    number_of_time_series_per_cluster= TimeSeriesPerCluster

    #sampling interval - how frequently do we send data to ACM for storage
    number_of_samples_per_hour=12
    number_of_hours_pv_retention_hrs=24
    number_of_days_for_storage=30

    # Present Sizing for Object Store
    input_data = {'Specs': ['Number of Managed Clusters', 
                            'Number of time series per cluster', 
                            'Number of metric samples per hour',
                            'Base Operative time series count',
                            'Number of hours of retention in Receiver PV',
                            'Target days for storage of data assuming downsampling'], 
            'Value': [number_of_managed_clusters,number_of_time_series_per_cluster,
                      number_of_samples_per_hour,
                      number_of_managed_clusters*number_of_time_series_per_cluster,
                      number_of_hours_pv_retention_hrs,number_of_days_for_storage]
               } 

    # Create DataFrame 
    input_df = pandas.DataFrame(input_data) 

    # Print the output. 
    return input_df

### Inferring Memory Requirement


In [7]:
def calculate_memory(master_df,number_of_managed_clusters,number_of_time_series_per_cluster):
    print("Total Time Series count: ",number_of_managed_clusters*number_of_time_series_per_cluster)
    print("------------------------------------")

    X = master_df['TimeSeriesCount'].values.reshape(-1,1)
    Y = master_df['ACMObsMemUsageWSSGB'].values.reshape(-1,1)

    regressor = LinearRegression()  
    #training the algorithm
    regressor.fit(X,Y) 
    #To retrieve the intercept:
    alpha = regressor.intercept_
    print(f'alpha = {alpha}')
    #For retrieving the slope:
    beta = regressor.coef_
    print(f'beta = {beta}')

    y_pred = alpha + beta*number_of_managed_clusters*number_of_time_series_per_cluster
    print(f'A rough Predicted Memory need in GB for {number_of_managed_clusters*number_of_time_series_per_cluster} timeseries - IFF linearity holds = {y_pred} GB')

    memory_data = {'Projected Memory (GB)': [y_pred], 
            'Note': ['This is good of linearity assumption holds']
               } 
  
    memory_df = pandas.DataFrame(memory_data) 
    return memory_df

### Inferring CPU requirement


In [8]:
def calculate_cpu(master_df,number_of_managed_clusters,number_of_time_series_per_cluster):
    print("Total Time Series count: ",number_of_managed_clusters*number_of_time_series_per_cluster)
    print("------------------------------------")

    X = master_df['TimeSeriesCount'].values.reshape(-1,1)
    Y = master_df['ACMObsCPUCoreUsage'].values.reshape(-1,1)

    regressor = LinearRegression()  
    #training the algorithm
    regressor.fit(X,Y) 
    #To retrieve the intercept:
    alpha = regressor.intercept_
    print(f'alpha = {alpha}')
    #For retrieving the slope:
    beta = regressor.coef_
    print(f'beta = {beta}')

    y_pred = alpha + beta*number_of_managed_clusters*number_of_time_series_per_cluster
    print(f'A rough Predicted CPU vCPU needed for {number_of_managed_clusters*number_of_time_series_per_cluster} timeseries - IFF linearity holds = {y_pred} vCPU')

    cpu_data = {'Projected CPU  (vCPU)': [y_pred], 
            'Note': ['This is good of linearity assumption holds']
               } 
  
    cpu_df = pandas.DataFrame(cpu_data) 
    return cpu_df

### Inferring PVC Requirement


In [9]:
def calculate_pvc(number_of_samples_per_hour, number_of_managed_clusters, number_of_time_series_per_cluster,number_of_hours_pv_retention_hrs,observed_bytes_for_storage_per_ts):

    #actual_number_of_time_series_local_retention=(
    #    actual_number_of_time_series_per_2h*(number_of_hours_pv_retention_hrs/2)
    #)

    actual_number_of_time_series_local_retention=(
    number_of_samples_per_hour*
    number_of_managed_clusters*
    number_of_time_series_per_cluster*
    number_of_hours_pv_retention_hrs
    )

    inferred_gb_storage_recv=(
        (actual_number_of_time_series_local_retention*observed_bytes_for_storage_per_ts)/(1024*1024*1024)
    )

    inferred_gb_storage_am=10
    inferred_gb_storage_compactor=100
    inferred_gb_storage_rule=30
    inferred_gb_storage_store=100

    print("Number of hours of retention in Receiver PV: ", number_of_hours_pv_retention_hrs)
    print("Assumed storage space per time series: ",observed_bytes_for_storage_per_ts, " bytes")

    # Present Sizing for PVC
    pvc_data = {'Pod': ['alertmanager', 'thanos-receiver', 
                        'thanos-compactor', 'thanos-rule','thanos-store'], 
            'Number of Replicas': [3,3,1,3,3],
            'Sizes Per Replica (GB)': [roundup(inferred_gb_storage_am), 
                                       roundup(inferred_gb_storage_recv), 
                                       roundup(inferred_gb_storage_compactor), 
                                       roundup(inferred_gb_storage_rule),
                                       roundup(inferred_gb_storage_store)],
            'Total Size (GB)': [roundup(inferred_gb_storage_am)*3, 
                                roundup(inferred_gb_storage_recv)*3, 
                                roundup(inferred_gb_storage_compactor)*1, 
                                roundup(inferred_gb_storage_rule)*3,
                                roundup(inferred_gb_storage_store)*3]} 

    # Create DataFrame 
    pvc_df = pandas.DataFrame(pvc_data) 

    # Print the output. 
    return pvc_df


### Inferring Object Store Requirement


In [10]:
def calculate_objstore(number_of_samples_per_hour, number_of_managed_clusters, number_of_time_series_per_cluster,number_of_hours_pv_retention_hrs,observed_bytes_for_storage_per_ts,number_of_days_for_storage):
    
    actual_number_of_time_series_local_retention=(
    number_of_samples_per_hour*
    number_of_managed_clusters*
    number_of_time_series_per_cluster*
    number_of_hours_pv_retention_hrs
    )

    inferred_gb_storage_recv=(
        (actual_number_of_time_series_local_retention*observed_bytes_for_storage_per_ts)/(1024*1024*1024)
    )
    
    #this is needed just in case the number_of_hours_pv_retention_hrs is set to something other than 24hrs
    inferred_obj_storage_per_day=inferred_gb_storage_recv*24/number_of_hours_pv_retention_hrs
    inferred_obj_storage_for_life=inferred_obj_storage_per_day*number_of_days_for_storage*3

    print("Target days for storage of data assuming downsampling: ", number_of_days_for_storage)
    print("Inferred Object Storage needed: ", inferred_obj_storage_for_life , "GB")

    # Present Sizing for Object Store
    obj_data = {'Steps': ['Raw storage for 1 day (GB)', 
                          'number of days to be stored', 
                          'Raw storage for all days (GB)', 
                          '5m storage for all days (GB)',
                          '1hr storage for all days (GB)',
                          'Total storage for all days (GB)'], 
            'Value': [inferred_obj_storage_per_day,
                      number_of_days_for_storage,
                      inferred_obj_storage_for_life/3,
                      inferred_obj_storage_for_life/3,
                      inferred_obj_storage_for_life/3,
                      roundup(inferred_obj_storage_for_life)]
               } 

    # Create DataFrame 
    objstore_df = pandas.DataFrame(obj_data) 

    # Print the output. 
    return objstore_df



In [11]:
def save_date(input_df,memory_df,cpu_df,pvc_df,objstore_df):
    cluster=input_df['Value'][0]
    ts=input_df['Value'][1]
    mem=float(memory_df['Projected Memory (GB)'][0][0])
    cpu=float(cpu_df['Projected CPU  (vCPU)'][0][0])
    pvc=pvc_df['Total Size (GB)'].sum()
    objstore=objstore_df['Value'].iloc[5]
    return cluster,ts,mem,cpu,pvc,objstore


### Critical Input Parameters to Size
>- **Please tweak these values as needed**
>- *Our calculations are solely based on what you input below*


In [12]:
customer_df = pandas.DataFrame({
    'NumMngCluster': [150, 300, 3000],
    'TimeSeriesPerCluster': [4500, 4500, 4500]
})


### Running Final Calculation

In [13]:
for index, row in customer_df.iterrows():
    print("for :"  , row['NumMngCluster'] )
    input_df=set_input(row['NumMngCluster'],row['TimeSeriesPerCluster'])
    
    print(input_df)

    memory_df = calculate_memory(master_df,row['NumMngCluster'],row['TimeSeriesPerCluster'])
    cpu_df = calculate_cpu(master_df,row['NumMngCluster'],row['TimeSeriesPerCluster'])
    pvc_df = calculate_pvc(input_df['Value'][2], input_df['Value'][0], input_df['Value'][1],input_df['Value'][4],observed_bytes_for_storage_per_ts)
    objstore_df = calculate_objstore(input_df['Value'][2], input_df['Value'][0], input_df['Value'][1],input_df['Value'][4],observed_bytes_for_storage_per_ts,input_df['Value'][5])
    cluster,ts,mem,cpu,pvc,objstore=save_date(input_df,memory_df,cpu_df,pvc_df,objstore_df)
    master_output_df = master_output_df._append({'NumManagedCluster': cluster, 'TimeSeriesPerCluster': ts, 'MemoryGB': mem,'CPUCorevCPU':cpu,'PVCGB':pvc,'ObjStoreGB':objstore}, ignore_index=True)
    print("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")
#print("--------------- Final Results ------------")
#print(master_output_df)

for : 150
                                               Specs   Value
0                         Number of Managed Clusters     150
1                  Number of time series per cluster    4500
2                  Number of metric samples per hour      12
3                   Base Operative time series count  675000
4        Number of hours of retention in Receiver PV      24
5  Target days for storage of data assuming downs...      30
Total Time Series count:  675000
------------------------------------
alpha = [-3.78028111]
beta = [[1.60469085e-05]]
A rough Predicted Memory need in GB for 675000 timeseries - IFF linearity holds = [[7.0513821]] GB
Total Time Series count:  675000
------------------------------------
alpha = [0.56711376]
beta = [[2.83832114e-06]]
A rough Predicted CPU vCPU needed for 675000 timeseries - IFF linearity holds = [[2.48298053]] vCPU
Number of hours of retention in Receiver PV:  24
Assumed storage space per time series:  2  bytes
Target days for storage of data

  mem=float(memory_df['Projected Memory (GB)'][0][0])
  cpu=float(cpu_df['Projected CPU  (vCPU)'][0][0])
  master_output_df = master_output_df._append({'NumManagedCluster': cluster, 'TimeSeriesPerCluster': ts, 'MemoryGB': mem,'CPUCorevCPU':cpu,'PVCGB':pvc,'ObjStoreGB':objstore}, ignore_index=True)
  mem=float(memory_df['Projected Memory (GB)'][0][0])
  cpu=float(cpu_df['Projected CPU  (vCPU)'][0][0])
  mem=float(memory_df['Projected Memory (GB)'][0][0])
  cpu=float(cpu_df['Projected CPU  (vCPU)'][0][0])


## Final Recommendation

In [14]:
master_output_df

Unnamed: 0,NumManagedCluster,TimeSeriesPerCluster,MemoryGB,CPUCorevCPU,PVCGB,ObjStoreGB
0,150.0,4500.0,7.051382,2.482981,550.0,40.0
1,300.0,4500.0,17.883045,4.398847,550.0,70.0
2,3000.0,4500.0,212.852983,38.884449,550.0,660.0


In [15]:
try:
    master_output_df.to_csv('../output/master_output.csv', index = True, header=True)  
    print("master_outputDF saved..")
except Exception as e:
    print("Failure in saving master_outputDF: ",e)   

master_outputDF saved..
