In [1]:
import pandas as pd
import numpy as np
from performance_anomaly_detection.data_preprocessing.traces_preprocess import TracesPreprocessor
from performance_anomaly_detection.data_preprocessing.metrics_preprocess import MetricsPreprocessor
from performance_anomaly_detection.data_preprocessing.data_combiner import combine_on_metrics, combine_on_spans, combine_on_traces


In [2]:
initial_start_time = 1578330000000000

def create_lambda_for_cutting_start_end(start, end):
    return lambda s :  s[s.find(start)+len(start):s.rfind(end)] if not pd.isnull(s) else s

traces_preprocessor = TracesPreprocessor("../data_collector/out/tracing_data.csv")
traces_preprocessor.minimize_start_time(initial_start_time)

trace_ids_map = traces_preprocessor.create_ids_map("trace_id")
span_ids_map = traces_preprocessor.create_ids_map("span_id")
traces_preprocessor.map_ids("trace_id", trace_ids_map)
traces_preprocessor.map_ids("span_id", span_ids_map)

clean_operation_column_lambda = lambda s: s.replace(".hipstershop", "").replace("./", ".").replace("/", ".")
traces_preprocessor.apply_lambda_to_column_overwrite("operation_name", clean_operation_column_lambda)

process_extract_lambda = create_lambda_for_cutting_start_end("process(service_name='", "', tags=[])")
traces_preprocessor.apply_lambda_to_column_overwrite("process", process_extract_lambda)

ref_type_extract_lambda = create_lambda_for_cutting_start_end("ref_type='", "', trace_id=")
traces_preprocessor.get_data()["ref_type"] = traces_preprocessor.apply_lambda_to_column("refs", process_extract_lambda)

ref_trace_extract_lambda = create_lambda_for_cutting_start_end(", trace_id=", ", span_id=")
traces_preprocessor.get_data()["ref_trace"] = traces_preprocessor.apply_lambda_to_column("refs", process_extract_lambda)
traces_preprocessor.map_ids("ref_trace", trace_ids_map)

ref_trace_extract_lambda = create_lambda_for_cutting_start_end(", span_id=", ", span_id=")
traces_preprocessor.get_data()["ref_span"] = traces_preprocessor.apply_lambda_to_column("refs", process_extract_lambda)
traces_preprocessor.map_ids("ref_span", span_ids_map)

traces_preprocessor.drop_single_column("refs")
traces_preprocessor.sort_values()

traces_preprocessor.create_end_time()


In [3]:
print("Rows: ", len(traces_preprocessor.get_data()))
print("Unique traces: ", len(traces_preprocessor.get_data().trace_id.unique()))


Rows:  324684
Unique traces:  43682


In [4]:
traces_preprocessor.get_data().head()

Unnamed: 0,trace_id,span_id,duration,flags,operation_name,parent_id,process,start_time,ref_type,ref_trace,ref_span,end_time
271974,36547,271974,1570563,1,Recv.,0,frontend,4721406063396,,,,4721407633959
242982,32623,242982,1514738,1,Recv.,0,frontend,4721406296431,,,,4721407811169
271972,36547,271972,113958,1,Sent.CurrencyService.Convert,0,frontend,4721406859509,"hild-of', trace_id=b'\n\xbcI\x9f\x8br!>\x1f{\x...",,,4721406973467
242985,32623,242985,127573,1,Sent.CurrencyService.Convert,0,frontend,4721406869722,"hild-of', trace_id=b'AC\x16\xd1N\x04\xb9\r\xee...",,,4721406997295
119970,16282,119970,1457313,1,Recv.,0,frontend,4721406880824,,,,4721408338137


In [5]:
metrics_preprocessor = MetricsPreprocessor("../data_collector/out/prom_currencyservice.csv")
metrics_preprocessor.normalize_time()
metrics_preprocessor.minimize_start_time(initial_start_time)
metrics_preprocessor.decumulate_data()
metrics_preprocessor.clean_column_names()

In [6]:
metrics_preprocessor.get_data().head()

Unnamed: 0,time,receive_packets,receive_packets_dropped,transmit_packets,transmit_packets_dropped,receive_bytes,transmit_errors,container_memory_usage_bytes_value,container_memory_working_set_bytes_value,container_cpu_system_seconds_total_value,container_cpu_usage_seconds_total_value
1,4722465678000,675.0,0.0,618.0,0.0,78262.0,0.0,0.0,0.0,0.0,0.0
2,4722480677000,596.0,0.0,527.0,0.0,68746.0,0.0,0.0,0.0,0.0,0.0
3,4722495658000,638.0,0.0,572.0,0.0,73626.0,0.0,0.0,0.0,0.0,0.0
4,4722510656000,600.0,0.0,534.0,0.0,67744.0,0.0,0.0,0.0,0.0,0.0
5,4722525633000,475.0,0.0,424.0,0.0,56847.0,0.0,0.0,0.0,0.0,0.0


### 1. Combine on metrics - get operations, durations in between metric scrapes

In [7]:
tracing_data = traces_preprocessor.get_data()
metrics_data = metrics_preprocessor.get_data()
combined_on_metrics_data = combine_on_metrics(tracing_data, metrics_data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [8]:
combined_on_metrics_data

Unnamed: 0,step,Recv.,Recv.ProductCatalogService.ListProducts,Recv._healthz,Recv.grpc.health.v1.Health.Check,Recv.setCurrency,Sent.AdService.GetAds,Sent.CartService.GetCart,Sent.CurrencyService.Convert,Sent.CurrencyService.GetSupportedCurrencies,...,receive_packets,receive_packets_dropped,transmit_packets,transmit_packets_dropped,receive_bytes,transmit_errors,container_memory_usage_bytes_value,container_memory_working_set_bytes_value,container_cpu_system_seconds_total_value,container_cpu_usage_seconds_total_value
0,1.0,187712.200000,3049.800000,117.0,116.00,1987.440000,19.880000,125451.560000,4953.640000,3938.960000,...,596.0,0.0,527.0,0.0,68746.0,0.0,0.0,0.0,0.0,0.0
1,2.0,90697.533333,5797.533333,140.5,189.50,619.000000,20.800000,30781.066667,4112.314815,9107.700000,...,638.0,0.0,572.0,0.0,73626.0,0.0,0.0,0.0,0.0,0.0
2,3.0,68756.608696,3160.739130,141.0,286.75,4617.826087,19.739130,7989.869565,2997.033816,8289.739130,...,600.0,0.0,534.0,0.0,67744.0,0.0,0.0,0.0,0.0,0.0
3,4.0,156149.000000,1694.130435,27036.5,222.00,3027.260870,25.695652,84646.739130,5889.487923,4569.347826,...,475.0,0.0,424.0,0.0,56847.0,0.0,0.0,0.0,0.0,0.0
4,5.0,117895.960000,4830.800000,119.0,203.00,445.880000,51.480000,53165.160000,5338.253333,1677.360000,...,573.0,0.0,527.0,0.0,66680.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
713,714.0,82577.193548,662.483871,365.0,152.50,360.322581,19.741935,35267.870968,3728.032258,8159.032258,...,660.0,0.0,574.0,0.0,74467.0,0.0,0.0,0.0,0.0,0.0
714,715.0,161973.160000,906.240000,376.0,169.50,399.320000,16058.680000,80601.040000,4879.311111,13347.760000,...,549.0,0.0,495.0,0.0,65056.0,0.0,0.0,0.0,0.0,0.0
715,716.0,72357.461538,737.192308,189.5,77.00,6010.653846,19.038462,9071.076923,5799.850427,6548.230769,...,617.0,0.0,572.0,0.0,71024.0,0.0,0.0,0.0,0.0,0.0
716,717.0,114049.923077,2388.961538,451.0,228.75,397.807692,17.720000,66432.115385,4145.897778,1913.153846,...,620.0,0.0,550.0,0.0,69348.0,0.0,0.0,0.0,0.0,0.0


In [9]:
combined_on_metrics_data.to_csv("data_based_on_metrics/data_50_100_250.csv", sep=",", index=False)

### 2. Combine on spans - add metrics to each row for process

In [10]:
tracing_data = traces_preprocessor.get_data()
metrics_data = metrics_preprocessor.get_data()
combined_on_spans_data = combine_on_spans(tracing_data, metrics_data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [11]:
combined_on_spans_data

Unnamed: 0,operation_name,duration,start_time,step,time,receive_packets,receive_packets_dropped,transmit_packets,transmit_packets_dropped,receive_bytes,transmit_errors,container_memory_usage_bytes_value,container_memory_working_set_bytes_value,container_cpu_system_seconds_total_value,container_cpu_usage_seconds_total_value
0,Recv.setCurrency,812,4722467230915,1.0,4722480677000,596.0,0.0,527.0,0.0,68746.0,0.0,0.0,0.0,0.0,0.0
1,Recv.,68302,4722467236568,1.0,4722480677000,596.0,0.0,527.0,0.0,68746.0,0.0,0.0,0.0,0.0,0.0
2,Sent.CurrencyService.GetSupportedCurrencies,1605,4722467237116,1.0,4722480677000,596.0,0.0,527.0,0.0,68746.0,0.0,0.0,0.0,0.0,0.0
3,Sent.ProductCatalogService.ListProducts,1919,4722467238744,1.0,4722480677000,596.0,0.0,527.0,0.0,68746.0,0.0,0.0,0.0,0.0,0.0
4,Recv.ProductCatalogService.ListProducts,1458,4722467238993,1.0,4722480677000,596.0,0.0,527.0,0.0,68746.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294940,Sent.CurrencyService.Convert,1054,4733227461450,718.0,4733227851000,670.0,0.0,599.0,0.0,76641.0,0.0,0.0,0.0,0.0,0.0
294941,Sent.CurrencyService.Convert,836,4733227462549,718.0,4733227851000,670.0,0.0,599.0,0.0,76641.0,0.0,0.0,0.0,0.0,0.0
294942,Sent.CurrencyService.Convert,914,4733227463414,718.0,4733227851000,670.0,0.0,599.0,0.0,76641.0,0.0,0.0,0.0,0.0,0.0
294943,Sent.CurrencyService.Convert,918,4733227464348,718.0,4733227851000,670.0,0.0,599.0,0.0,76641.0,0.0,0.0,0.0,0.0,0.0


In [12]:
combined_on_spans_data.to_csv("data_based_on_spans/data_50_100_250.csv", sep=",", index=False)

### 3. Combine on traces

In [13]:
tracing_data = traces_preprocessor.get_data()
metrics_data = metrics_preprocessor.get_data()
combine_on_traces_data = combine_on_traces(tracing_data, metrics_data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [14]:
combine_on_traces_data

Unnamed: 0,trace_id,Recv.,Recv.ProductCatalogService.ListProducts,Recv._healthz,Recv.grpc.health.v1.Health.Check,Recv.setCurrency,Sent.AdService.GetAds,Sent.CartService.GetCart,Sent.CurrencyService.Convert,Sent.CurrencyService.GetSupportedCurrencies,...,receive_packets_dropped,transmit_packets,transmit_packets_dropped,receive_bytes,transmit_errors,container_memory_usage_bytes_value,container_memory_working_set_bytes_value,container_cpu_system_seconds_total_value,container_cpu_usage_seconds_total_value,trace_duration
0,1,0.0,0.0,0.0,234.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,546.0,0.0,73733.0,0.0,0.0,0.0,0.0,0.0,234
1,948,69319.0,597.0,0.0,0.0,0.0,15.0,57988.0,794.666667,1314.0,...,0.0,546.0,0.0,73733.0,0.0,0.0,0.0,0.0,0.0,69319
2,1569,896627.0,558.0,0.0,0.0,0.0,13.0,799116.0,10485.888889,990.0,...,0.0,546.0,0.0,73733.0,0.0,0.0,0.0,0.0,0.0,896627
3,1853,0.0,0.0,0.0,0.0,161.0,0.0,0.0,0.000000,0.0,...,0.0,546.0,0.0,73733.0,0.0,0.0,0.0,0.0,0.0,161
4,2111,49279.0,500.0,0.0,0.0,0.0,12.0,2616.0,4744.888889,1553.0,...,0.0,546.0,0.0,73733.0,0.0,0.0,0.0,0.0,0.0,49279
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39685,42687,0.0,0.0,0.0,0.0,254.0,0.0,0.0,0.000000,0.0,...,0.0,599.0,0.0,76641.0,0.0,0.0,0.0,0.0,0.0,254
39686,42705,0.0,0.0,0.0,0.0,393.0,0.0,0.0,0.000000,0.0,...,0.0,599.0,0.0,76641.0,0.0,0.0,0.0,0.0,0.0,393
39687,43045,0.0,0.0,0.0,0.0,228.0,0.0,0.0,0.000000,0.0,...,0.0,599.0,0.0,76641.0,0.0,0.0,0.0,0.0,0.0,228
39688,43148,0.0,0.0,0.0,79.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,599.0,0.0,76641.0,0.0,0.0,0.0,0.0,0.0,79


In [15]:
combine_on_traces_data.to_csv("data_based_on_traces/data_50_100_250.csv", sep=",", index=False)