In [1]:
import random
import numpy as np
from collections import defaultdict

In [2]:
##### SETTINGS FOR THE GENERATOR #####

# Files to write to.
data_file = "../data/PrometheusDataSecond.csv"
results_file = "../data/PrometheusAnswersFirst.csv"

# Number of records (lines in csv file) to generate
num_records = 2000

# Number of values to add to a checkpoint set. This simulates how many values are recorded by an instrument.
num_values = 8

# Range of values to generate.
value_lower_limit = -50
value_upper_limit = 50

# Histogram boundaries for buckets. Buckets in the Go SDK include "lower" buckets -- For example, if values are from 0 to 1
# and there is a boundary at 0.5, the buckets would be (-int, 0.5) and (-inf, +inf) instead of (-inf, 0.5), [0.5, +inf]
histogram_boundaries = [-25, 0, 25]

# Quantiles for distributions.
quantiles = [0.25, 0.5, 0.75]

# List of all aggregation types in the OTel Go SDK.
aggregations = ["hist", "dist", "sum", "mmsc", "lval"]

# A 2D dictionary of answers. Rows represent aggregation types and columns hold properties (name, description, label).
# Each individual dictionary element is a list of 11 elements, which represent:
# 0: Final Balue (sum / last value)
# 1. Min
# 2. Max
# 3. Count
# 4. (-inf, -25) bucket
# 5. (-inf, 0) bucket 
# 6. (-inf, 25) bucket
# 7. (-inf, +inf) bucket
# 8. 0.25 quantile
# 9. 0.5 quantile
# 10. 0.75 quantile.
answers = defaultdict(lambda: defaultdict(lambda: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))


In [3]:
##### GENERATING DATA FILE #####

# Open the data file to write to.
f = open(data_file, "w")

# Write `num_records` records to the file.
for i in range(num_records):
    # Randomly select an aggregation type.
    agg_type = random.choice(aggregations)

    # Create unique strings for the name, description, and label.
    name = f"name{i}_{agg_type}"
    description = f"description{i}"
    label = f"{{key{i}:value{i}}}"

    # Create a properties string that identifies the record with the name, description, and label.
    agg_properties = f"{name},{description},{label}"

    # Generate a list of `num_values` random values that will be used to update the CheckpointSet.
    values = random.sample(range(value_lower_limit, value_upper_limit), num_values)

    # Write different types of records depending on the aggregation type.
    record = f"{agg_type}|{str(values).replace(' ', '')}|{agg_properties}"
    if agg_type == "sum":
        # Final value (sum).
        answers["sum"][agg_properties][0] = sum(values)

    elif agg_type == "lval":
        # Final value (last value).
        answers["lval"][agg_properties][0] = values[len(values) - 1]

    elif agg_type == "mmsc":
        # Final value (sum), min, max, and count.
        answers["mmsc"][agg_properties][0] = sum(values)
        answers["mmsc"][agg_properties][1] = min(values)
        answers["mmsc"][agg_properties][2] = max(values)
        answers["mmsc"][agg_properties][3] = num_values

    # Distribution aggregations are MinMaxSumCount aggregations with quantiles.
    elif agg_type == "dist":
        # Final value (sum), min, max, and count.
        answers["dist"][agg_properties][0] = sum(values)
        answers["dist"][agg_properties][1] = min(values)
        answers["dist"][agg_properties][2] = max(values)
        answers["dist"][agg_properties][3] = num_values

        # Quantiles are calculated using numpy.
        values_numpy = np.array(values)
        answers["dist"][agg_properties][8] = int(np.percentile(values_numpy, quantiles[0]))
        answers["dist"][agg_properties][9] = int(np.percentile(values_numpy, quantiles[1]))
        answers["dist"][agg_properties][10] = int(np.percentile(values_numpy, quantiles[2]))
        
    elif agg_type == "hist":
        # Final value (sum).
        answers["hist"][agg_properties][0] = sum(values)

        # Count.
        answers["hist"][agg_properties][3] = num_values

        # (-inf, -25) bucket.
        answers["hist"][agg_properties][4] = len([i for i in values if i < -25])

        # (-inf, 0) bucket
        answers["hist"][agg_properties][5] = len([i for i in values if i < 0])

        # (-inf, 25) bucket
        answers["hist"][agg_properties][6] = len([i for i in values if i < 25])

        # (-inf, +inf) bucket
        answers["hist"][agg_properties][7] = num_values

    # Write the record to the file.
    f.write(record + "\n")
    
    
f.close()

In [4]:
import math

file = open(, "w")

for record in records:
    agg = record[0]    
    
    # record = (agg, ndl, val)
    if (agg == "counter" or agg == "gauge"):
        fa.write(agg + '|' + str(record[2]) + '|' + record[1] + '\n')
#         print("{}|{}|{}".format(agg, str(record[2]), record[1]))
        
    
    # record = (agg, ndl, update_values, quantiles)
    elif (agg == "sketch" or agg == "exact"):
        s = sum(record[2])
        c = len(record[2])
        quantile_values = []
        
        for quantile in record[3]:
            pos = (len(record[2])-1) * quantile
            ceiling = math.ceil(pos)
            quantile_values.append(record[2][ceiling])
        
        quantile_pairs = "[(" + str(record[3][0]) + "," + str(quantile_values[0]) + ")"
        for q, v in zip(record[3][1:], quantile_values[1:]):
            quantile_pairs = quantile_pairs + ",(" + str(q) + "," + str(v) + ")"
        quantile_pairs += "]"
        
        fa.write("summary|" + str(s) + "|" + str(c) + "|" + quantile_pairs + "|" + record[1] + '\n')
#         print("summary|{}|{}|{}|{}".format(s, c, quantile_pairs, record[1]))


    #record = (agg, ndl, update_values, boundaries)
    elif (agg == "histogram"):
        s = sum(record[2])
        c = len(record[2])
        
        
        # buckets is a dict of that correspond to pairs of (buckets, bucket_count)
        buckets = defaultdict(int)
        
        update_values_sorted = sorted(record[2])
        curr_boundary = -1
        values_within_bound = 0
        for boundary in record[3]:
            if(values_within_bound != len(update_values_sorted)):
                for v in update_values_sorted:
                    if v < boundary and v > curr_boundary:
                        buckets[str(boundary)] += 1
                        values_within_bound += 1
                    elif v < boundary and v <= curr_boundary:
                        continue
                    else:
                        curr_boundary = boundary
                        break
        if(len(update_values_sorted) - values_within_bound != 0):
            buckets["inf"] += len(update_values_sorted) - values_within_bound

        buckets_str = "["
        for k, v in buckets.items():
            buckets_str = buckets_str + "(" + k + "," + str(v) + "),"
        buckets_str += "]"
        
        # extra comma will be present at the end of bucket_str; no functional impact
        fa.write("histogram|" + str(s) + "|" + str(c) + "|" + buckets_str + "|" + record[1] + '\n')
#         print("histogram|{}|{}|{}|{}".format(s, c, buckets_str, record[1]))
            
fa.close()

SyntaxError: invalid syntax (<ipython-input-4-1b41e8bd425c>, line 3)