### Imported Libraries

In [None]:
import pandas as pd
import numpy as np
import math as mt
from IPython.core.interactiveshell import InteractiveShell
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine
from utils import *
import datetime as dt
from scipy import stats
from scipy.spatial.distance import pdist
import matplotlib as mpl
## agg backend is used to create plot as a .png file
#mpl.use('agg')
from matplotlib.ticker import MultipleLocator, FormatStrFormatter

### Parameters

In [None]:
# Column analysed throughout the notebook and the two measurement files:
# `dataset` provides the baseline run, `sdataset` the run under assessment.
metric = "response_time_max"
dataset = "baseline40-80-jan2022.csv"
sdataset = "hiccup-no-stiffle-40-80v1.csv"

# Show data frames in full. `None` is the supported "no limit" value for
# display.max_colwidth; the former -1 sentinel is deprecated and rejected by
# current pandas releases.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

In [None]:
# CSV files holding the SPA reference points, one file per data set.
# Combined reference file.
spadata = "spadata.csv"
# Individual data sets, in the order they are plotted further down.
spadatablob = "spadata-blob.csv"
spadatatrucks = "spadata-empty-semi-trucks.csv"
spadataexpensive = "spadata-expensive-db.csv"
spadatastifle = "spadata-stifle.csv"
spadatajam = "spadata-traffic-jam.csv"
spadatacont = "spadata-continuous.csv"
spadatahic = "spadata-hiccups.csv"

### Util Functions

In [None]:
def label_outliers(anomaly_counter):
    """Return "Outlier" when anomaly_counter equals -1, otherwise "Inliner"."""
    return "Outlier" if anomaly_counter == -1 else "Inliner"

In [None]:
def calc_baseline(mean, std):
    """Return the baseline threshold: the mean plus three standard deviations.

    Args:
        mean: Mean of the metric (a scalar or any object supporting ``+``).
        std: Standard deviation of the metric (same kind of value as ``mean``).

    Returns:
        ``mean + 3 * std``.
    """
    return mean + 3 * std
   

In [None]:
def eval_counters(counter, baseline):
    """Return "Fail" when the measurement reaches or exceeds the baseline, else "Pass"."""
    return "Fail" if counter >= baseline else "Pass"

### Data Extraction

In [None]:
# Load the baseline run (`dataset`) and the run under assessment (`sdataset`).
bstatistic_df, statistic_df = map(pd.read_csv, (dataset, sdataset))

In [None]:
# Load every SPA reference file into its own data frame, in file order.
(
    spa_df,
    spablob_df,
    spatrucks_df,
    spaexpensive_df,
    spastifle_df,
    spajam_df,
    spacont_df,
    spahic_df,
) = map(
    pd.read_csv,
    (
        spadata,
        spadatablob,
        spadatatrucks,
        spadataexpensive,
        spadatastifle,
        spadatajam,
        spadatacont,
        spadatahic,
    ),
)

In [None]:
# Display the baseline measurements read from `dataset`.
bstatistic_df

In [None]:
# Display the measurements under assessment read from `sdataset`.
statistic_df

In [None]:
# Display the combined SPA reference data read from `spadata`.
spa_df

In [None]:
#spablob_df

In [None]:
# Display one SPA data frame; uncomment a different name to inspect it instead.
#spatrucks_df
#spaexpensive_df
#spastifle_df
#spajam_df
#spacont_df
spahic_df

In [None]:
# Scatter the (ndistance, Slope) reference points of every SPA data set on one
# pair of axes, one colour per data set.
# `plt.figure` was previously assigned without being called, which only bound
# the function object; call it so the plot starts on a fresh figure.
figure = plt.figure()
ax = plt.gca()

spa_series = (
    (spablob_df, 'blue', 'the blob'),
    (spatrucks_df, 'green', 'empty semi trucks'),
    (spaexpensive_df, 'red', 'expensive db calls'),
    (spastifle_df, 'purple', 'the stifle'),
    (spajam_df, 'orange', 'traffic jam'),
    (spacont_df, 'black', 'continuous violated requirements'),
    (spahic_df, 'magenta', 'application hiccups'),
)
for spa_frame, spa_color, spa_label in spa_series:
    ax.scatter(spa_frame['ndistance'], spa_frame['Slope'],
               color=spa_color, label=spa_label)

ax.set_xlabel(' normalized distance > 1 failed performance requirement baseline')
ax.set_ylabel('slope > 0.1 failed scalability trend')
ax.set_ylim(-10, 550)

# Reference lines at the thresholds named in the axis labels.
plt.axhline(0.1, color='black')  # slope threshold
plt.axvline(1, color='black')  # normalized-distance threshold

ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.05),
          ncol=2, fancybox=True, shadow=True)
plt.grid()


In [None]:
# Scatter the combined SPA reference points (ndistance vs. slope).
# `plt.figure` was previously assigned without being called, which only bound
# the function object; call it so the plot starts on a fresh figure.
figure = plt.figure()
ax = plt.gca()
ax.scatter(spa_df['ndistance'], spa_df['slope'], color='red')

ax.set_xlabel(' normalized distance > 1 failed performance requirement baseline')
ax.set_ylabel('slope > 0.1 failed scalability trend')
ax.set_ylim(-10, 400)

# Reference lines at the thresholds named in the axis labels.
plt.axhline(0.1, color='black')  # slope threshold
plt.axvline(1, color='black')  # normalized-distance threshold
plt.grid()


### Statistical analysis

In [None]:
# Relative frequency of each load level in the baseline data.
bstatistic_df.load.value_counts(normalize=True)

In [None]:
# Probability of each load level: rows observed at that load divided by the
# total number of rows, then attached to every baseline row.
load_counts = bstatistic_df.groupby(['load']).instance.count()
load_probabilities_df = (load_counts / load_counts.sum()).rename("load_probability").to_frame()
lstatistic_df = bstatistic_df.merge(load_probabilities_df, left_on=['load'], right_index=True)

In [None]:
# Activation probability of each counter: rows recorded for that counter
# divided by the total number of rows, then attached to every row.
counter_counts = lstatistic_df.groupby(['counter_name']).instance.count()
probabilities_df = (
    (counter_counts / counter_counts.sum())
    .rename("activation_probability")
    .reset_index()
)
lstatistic_df = lstatistic_df.merge(probabilities_df, on=['counter_name'])

In [None]:
# Baseline per counter, derived from the baseline run: mean and standard
# deviation of the metric, combined by calc_baseline.
assessment_df = bstatistic_df.groupby(['counter_name'])[metric].agg(['mean', 'std'])
# calc_baseline is plain arithmetic, so it can be applied to whole columns
# instead of row by row.
assessment_df['baseline'] = calc_baseline(assessment_df['mean'], assessment_df['std'])

# Attach each counter's baseline to the measurements under assessment.
# NOTE(review): re-running this cell merges a second time and yields
# baseline_x / baseline_y columns; re-load statistic_df before re-running.
statistic_df = pd.merge(statistic_df, assessment_df[['baseline']],
                        left_on='counter_name', right_index=True).reset_index(drop=True)

# Pass/Fail verdict per row; iterating the two columns avoids a row-wise
# DataFrame.apply and also works when the frame is empty.
statistic_df["assessment"] = [
    eval_counters(measured, limit)
    for measured, limit in zip(statistic_df[metric], statistic_df["baseline"])
]
statistic_df

In [None]:
# Row count per (load, load_probability, counter_name, activation_probability),
# expressed as a share of its (load, load_probability, counter_name) group.
s_df = lstatistic_df.groupby(['load', 'load_probability', 'counter_name', 'activation_probability']).instance.count().rename("s")
s_df = pd.DataFrame(s_df / s_df.groupby(level=[0, 1, 2]).transform("sum"))
s_df = s_df.reset_index()

# Multiply that share by the activation probability. This replaces the former
# column-wise groupby(..., axis=1).prod(), whose axis=1 form is deprecated in
# pandas; the resulting columns are load, load_probability, counter_name, s.
s_df["s"] = s_df[["activation_probability", "s"]].prod(axis=1)
s_df = s_df.drop(columns=["activation_probability"])

In [None]:
# Preview the first rows of s_df.
s_df.head()

In [None]:
# Stamp every assessed row with the local time at which this cell ran.
analysis_time = dt.datetime.today()
statistic_df['analysis_timestamp'] = analysis_time

In [None]:
# Work on a copy of the assessed measurements and add an empty 'anomaly'
# placeholder column. An explicit NaN replaces the former `pd.Series()`
# (no dtype), which is deprecated usage and yields the same all-NaN column.
lb_detection_df = statistic_df.copy()
lb_detection_df['anomaly'] = np.nan
assessment_df

In [None]:
# For every (counter, load) pair take the maximum measured metric and the
# baseline (which originates from the baseline run), then compare the two.
df = statistic_df.groupby(['counter_name', 'load']).agg({metric: 'max', 'baseline': 'mean'})
df.columns.name = None
df = df.reset_index()

# distance > 0 means the measurement stayed below the baseline.
df['distance'] = df.baseline - df[metric]
df['ndistance'] = 2 * df[metric] / (df.baseline + df[metric])

# A row is flagged unless its distance is strictly positive.
flagged = ~(df.distance > 0)
df['assessment'] = flagged
df['binary'] = flagged.astype('int64')

# Number of flagged load levels per counter, joined back as 'sbinary'.
bs_df = df.groupby(['counter_name']).binary.sum().to_frame()
df = pd.merge(df, bs_df, on="counter_name").rename(
    columns={"binary_x": "binary", "binary_y": "sbinary"})
df

In [None]:


#ndistance = 2*measurement/(baseline+measurement):
#measurement << baseline: ndistance -> 0; measurement = baseline: ndistance = 1
#measurement >> baseline: ndistance -> 2



In [None]:
# Display the per-counter sum of the 'binary' flags.
bs_df.binary

In [None]:
# Add a 'slope' column to a copy of df: for each counter, the slope of the
# least-squares line of the metric against load over the assessed rows.
slope_df = df.copy()
# Explicit NaN placeholder (replaces the deprecated dtype-less `pd.Series()`).
slope_df['slope'] = np.nan
# One regression per counter; the former nested loop visited each counter
# exactly once, so a single groupby pass is equivalent.
for counter, counter_rows in lb_detection_df.groupby('counter_name'):
    fit = stats.linregress(counter_rows['load'], counter_rows[metric])
    slope_df.loc[slope_df.counter_name == counter, 'slope'] = fit.slope
slope_df


In [None]:
# Export counter name, normalized distance and slope of the load-80 rows.
export_columns = ['counter_name', 'ndistance', 'slope']
slope_df.loc[slope_df.load == 80, export_columns].to_csv('edb-slope_nd20.csv', index=False)

In [None]:
# Load-80 rows, reduced to the columns used by the plots below.
at_load_80 = slope_df.load == 80
meas_80_df = slope_df.loc[at_load_80, ['counter_name', 'ndistance', 'slope']]
meas_80_df

In [None]:
# Load-80 rows including the assessment flag, used by the seaborn plot.
full_meas_columns = ['ndistance', 'slope', 'counter_name', 'assessment']
full_meas_80_df = slope_df.loc[slope_df.load == 80, full_meas_columns]

In [None]:
# Scatter slope against normalized distance for the load-80 measurements.
ax = plt.gca()
ax.scatter(meas_80_df['ndistance'], meas_80_df['slope'], color='blue')
# The plotted frame is filtered on load == 80, so the label states 80%
# (it previously said 40%).
ax.set_xlabel(' normalized distance > 1 failed performance requirement baseline, load = 80%')
ax.set_ylabel('slope > 0.1 failed scalability trend')

# Reference lines at the thresholds named in the axis labels.
plt.axhline(0.1, color='black')  # slope threshold
plt.axvline(1, color='black')  # normalized-distance threshold
plt.grid()

In [None]:
#meas_90_df['slope']/meas_90_df['slope'].max()

In [None]:
#spa_t_df['slope']/spa_t_df['slope'].max()

In [None]:
# Scatter the load-80 measurements with the slope divided by its maximum, and
# overlay one vertical (ndistance) and one horizontal (normalized slope) guide
# line per SPA reference row.
ax = plt.gca()
ax.scatter(meas_80_df['ndistance'],
           meas_80_df['slope'] / meas_80_df['slope'].max(), color='blue')
ax.set_xlabel(' normalized distance > 1 failed performance requirement baseline, load = 80%')
ax.set_ylabel('normalized slope')

# Copy of the SPA reference data with an empty 'index' placeholder column
# (explicit NaN instead of the deprecated dtype-less `pd.Series()`).
spa_t_df = spa_df.copy()
spa_t_df['index'] = np.nan
# Named so that the builtin `max` is no longer shadowed.
spa_slope_max = spa_t_df['slope'].max()

# One colour per SPA row; zip stops at the shorter input, so at most the first
# seven rows are drawn and a shorter file no longer raises a KeyError.
spa_colors = ('magenta', 'purple', 'green', 'yellow', 'blue', 'orange', 'red')
for spa_color, spa_ndistance, spa_name in zip(spa_colors, spa_t_df['ndistance'], spa_t_df['spa']):
    plt.axvline(spa_ndistance, color=spa_color, label=spa_name)
for spa_color, spa_slope in zip(spa_colors, spa_t_df['slope']):
    plt.axhline(spa_slope / spa_slope_max, color=spa_color)

ax.set_ylim(-0.2, 1.1)
ax.set_xlim(0, 2)

# Reference lines at the thresholds (previously drawn twice).
plt.axhline(0.1, color='black')  # slope threshold
plt.axvline(1, color='black')  # normalized-distance threshold

# Only the default legend is kept: the customised ax.legend(...) call that
# used to precede it was immediately replaced by this one.
plt.legend()
plt.show()

In [None]:
# Display the working copy of the SPA reference data.
spa_t_df

In [None]:
# Scatter the load-80 measurements (raw slope) and overlay one vertical guide
# line at the ndistance of each SPA reference row.
ax = plt.gca()
ax.scatter(meas_80_df['ndistance'], meas_80_df['slope'], color='blue')
# The plotted frame is filtered on load == 80, so the label states 80%
# (it previously said 90%).
ax.set_xlabel(' normalized distance > 1 failed performance requirement baseline, load = 80%')
ax.set_ylabel('slope > 0.1 failed scalability trend')

# Copy of the SPA reference data with an empty 'index' placeholder column
# (explicit NaN instead of the deprecated dtype-less `pd.Series()`).
spa_t_df = spa_df.copy()
spa_t_df['index'] = np.nan
# Named so that the builtin `max` is no longer shadowed.
spa_slope_max = spa_t_df['slope'].max()

# Every ndistance value is an x position, so all rows use axvline; the seventh
# row was previously drawn with axhline by mistake. zip stops at the shorter
# input, so at most the first seven rows are drawn.
spa_colors = ('magenta', 'purple', 'green', 'yellow', 'blue', 'orange', 'red')
for spa_color, spa_ndistance, spa_name in zip(spa_colors, spa_t_df['ndistance'], spa_t_df['spa']):
    plt.axvline(spa_ndistance, color=spa_color, label=spa_name)

ax.set_ylim(-30, 350)
ax.set_xlim(0, 2)

# Reference lines at the thresholds (previously drawn twice).
plt.axhline(0.1, color='black')  # slope threshold
plt.axvline(1, color='black')  # normalized-distance threshold

# Only the default legend is kept: the customised ax.legend(...) call that
# used to precede it was immediately replaced by this one.
plt.legend()
# The grid must be enabled before show(); afterwards it no longer affects the
# displayed figure.
plt.grid()
plt.show()

In [None]:

# Scatter slope against normalized distance for every (counter, load) row.
# `plt.figure` was previously assigned without being called, which only bound
# the function object; call it so the plot starts on a fresh figure.
figure = plt.figure()
ax = plt.gca()
ax.scatter(slope_df['ndistance'], slope_df['slope'], color='red')
ax.set_xlabel(' normalized distance > 1 failed performance requirement baseline')
ax.set_ylabel('slope > 0.1 failed scalability trend')

# Reference lines at the thresholds named in the axis labels.
plt.axhline(0.1, color='black')  # slope threshold
plt.axvline(1, color='black')  # normalized-distance threshold
plt.grid()


In [None]:
#slope_df

In [None]:
# One panel per load level: slope vs. normalized distance, coloured by counter
# and with the marker style showing the assessment flag; saved as a PDF.
sns.set()
g = sns.relplot(
    data=slope_df,
    x='ndistance',
    y='slope',
    hue="counter_name",
    style="assessment",
    col="load",
)
plt.savefig('hiccup-sens20.pdf')

In [None]:
# Slope vs. normalized distance for the load-80 rows only, coloured by counter
# and with the marker style showing the assessment flag; saved as a PDF.
sns.set()
sns.relplot(
    data=full_meas_80_df,
    x='ndistance',
    y='slope',
    hue="counter_name",
    style="assessment",
    legend="full",
)
plt.savefig('hiccup-40-80load80-0821.pdf')

In [None]:
#quadratic
#for service in lb_detection_df.counter_name.unique():
#    for counter in lb_detection_df.loc[lb_detection_df.counter_name == service, 'counter_name'].unique():
#        figure = plt.figure
#        ax = plt.gca()
#        ax.scatter(slope_df['ndistance'], slope_df['quad'])
 #       ax.set_xlabel(' normalized distance')
 #       ax.set_ylabel('quadratic')
 #   plt.show()

In [None]:
# Plot slope against the absolute distance to the baseline, one figure per
# counter. The former nested loop visited each counter exactly once, so a
# single loop is equivalent.
# NOTE(review): the scatter is not filtered by `counter`, so every figure shows
# the same complete data set - confirm whether a per-counter filter or a single
# figure was intended.
for counter in lb_detection_df.counter_name.dropna().unique():
    # `plt.figure` was previously assigned without being called.
    figure = plt.figure()
    ax = plt.gca()
    ax.scatter(slope_df['distance'], slope_df['slope'])
    ax.set_xlabel('distance')
    ax.set_ylabel('slope')
    plt.show()


In [None]:
# Per counter: True when at least one of its load levels is flagged in df.
failed_distance_df = df.groupby(['counter_name']).assessment.any().to_frame().reset_index()
failed_distance_df.columns.name = None
failed_distance_df

In [None]:
# For every counter with at least one flagged load level, plot the per-load
# maximum of the metric (green points) against the baseline (red line).
failed_counters = failed_distance_df.loc[failed_distance_df.assessment, 'counter_name'].values
for counter in failed_counters:
    counter_rows = df.loc[df.counter_name == counter]
    x, y, b = counter_rows['load'], counter_rows[metric], counter_rows['baseline']
    plt.figure()
    plt.xlabel(str(counter) + ' MAX FAILED ')
    plt.scatter(x, y, s=10, color='green')
    plt.plot(x, b, color='red', label='baseline')

In [None]:
# For every counter, fit a least-squares line of the metric against load and
# plot the samples (green) together with the baseline (red). The x-label
# reports whether the slope stays below 0.01 ("SLOPE PASS") or not
# ("SLOPE FAIL"), together with the fit statistics.
for counter in lb_detection_df.counter_name.dropna().unique():
    counter_rows = lb_detection_df.loc[lb_detection_df.counter_name == counter]
    x, y, b = counter_rows['load'], counter_rows[metric], counter_rows['baseline']
    slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
    print('counter = ' + str(counter) + ' slope = ' + str(slope))

    if slope < 0.01:
        plt.figure()
        # The full fit statistics are printed for passing counters only.
        print('p_value = '+ str(p_value))
        print('slope = ' + str(slope))
        print('intercept = ' +str(intercept))
        print('r_value = ' +str(r_value))
        print('std_err = ' +str(std_err))
        verdict = '\n SLOPE PASS -- p_value = '
    elif slope >= 0.01:
        plt.figure()
        verdict = '\n SLOPE FAIL p_value = '
    else:
        # A NaN slope satisfies neither comparison; nothing is plotted.
        continue

    plt.xlabel(str(counter) + verdict + str(p_value) + '   slope = '+ str(slope) + ' r_value = ' + str(r_value))
    plt.scatter(x, y, s=10, color='green')
    plt.plot(x, b, color='red', label='baseline')
            
            