### Imported Libraries

In [None]:
#import os
#os.chdir('C:/Users/emrcaah/chanterelle-experimentation-master/chanterelle-experimentation-master/notebooks')

In [None]:
import pandas as pd
import numpy as np
import math as mt
from IPython.core.interactiveshell import InteractiveShell
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine
from utils import *
import datetime as dt
from scipy import stats
from scipy.spatial.distance import pdist
import matplotlib as mpl
## agg backend is used to create plot as a .png file
#mpl.use('agg')
from matplotlib.ticker import MultipleLocator, FormatStrFormatter

### Parameters

In [None]:
# Counter column analysed throughout the notebook, and the name of the
# measurement CSV file.
metric = "response_time_max"
dataset = "pm_transformed-fixed-sp.csv"
#dataset = "load40and90-sim.csv"
# Show complete frames: no truncation of rows, columns, line width or cells.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
# None is the supported "unlimited" value; the legacy -1 sentinel is rejected
# by current pandas releases.
pd.set_option('display.max_colwidth', None)

In [None]:
# CSV files with reference points, each loaded into its own DataFrame below.
# `spadata` is read with columns 'ndistance', 'slope' and 'spa'; the other
# files are plotted via their 'ndistance' and 'Slope' columns under these
# legend labels: blob -> 'the blob', trucks -> 'empty semi trucks',
# expensive -> 'expensive db calls', stifle -> 'the stifle',
# jam -> 'traffic jam', cont -> 'continuous violated requirements',
# hic -> 'application hiccups'.
spadata = "spadata.csv"
spadatablob = "spadata-blob.csv"
spadatatrucks = "spadata-empty-semi-trucks.csv"
spadataexpensive = "spadata-expensive-db.csv"
spadatastifle = "spadata-stifle.csv"
spadatajam = "spadata-traffic-jam.csv"
spadatacont = "spadata-continuous.csv"
spadatahic = "spadata-hiccups.csv"

### Util Functions

In [None]:
def label_outliers(anomaly_counter):
    """Map an anomaly flag to a label.

    Returns "Outlier" when ``anomaly_counter`` equals -1 and "Inliner" for
    every other value.
    """
    return "Outlier" if anomaly_counter == -1 else "Inliner"

In [None]:
def calc_baseline(mean, std, scale=0.6, n_sigma=3):
    """Return the pass/fail baseline ``mean * scale + n_sigma * std``.

    With the defaults the mean is scaled to 60% of its value (a 40%
    reduction, i.e. ``T = mean * (1 - load)`` with ``load = 0.4``) to
    approximate the response time without queuing, and three standard
    deviations are added as tolerance.

    Args:
        mean: Mean of the metric (scalar or array-like).
        std: Standard deviation of the metric (scalar or array-like).
        scale: Factor applied to ``mean``.
        n_sigma: Number of standard deviations added on top.
    """
    return mean * scale + n_sigma * std

In [None]:
def eval_counters(counter, baseline):
    """Assess one measurement against its baseline.

    Returns "Fail" when ``counter`` reaches or exceeds ``baseline`` and
    "Pass" otherwise.
    """
    return "Fail" if counter >= baseline else "Pass"

### Data Extraction

In [None]:
# Read the measurements from the file named by the `dataset` parameter so
# that switching data sets only requires editing the parameters cell.
statistic_df = pd.read_csv(dataset)

In [None]:
spa_df = pd.read_csv(spadata)
spablob_df = pd.read_csv(spadatablob)
spatrucks_df = pd.read_csv(spadatatrucks)
spaexpensive_df = pd.read_csv(spadataexpensive)
spastifle_df = pd.read_csv(spadatastifle)
spajam_df = pd.read_csv(spadatajam)
spacont_df = pd.read_csv(spadatacont)
spahic_df = pd.read_csv(spadatahic)

In [None]:
statistic_df.head()

In [None]:
# Scatter the reference points of every anti-pattern data set in the
# (normalized distance, slope) plane and save the chart as spa_all.pdf.
figure, ax = plt.subplots()  # explicit new figure: nothing leaks in from earlier plots

spa_series = [
    (spablob_df, 'blue', 'the blob'),
    (spatrucks_df, 'green', 'empty semi trucks'),
    (spaexpensive_df, 'red', 'expensive db calls'),
    (spastifle_df, 'purple', 'the stifle'),
    (spajam_df, 'orange', 'traffic jam'),
    (spacont_df, 'black', 'continuous violated requirements'),
    (spahic_df, 'magenta', 'application hiccups'),
]
for frame, colour, name in spa_series:
    ax.scatter(frame['ndistance'], frame['Slope'], color=colour, label=name)

ax.set_xlabel(' normalized distance > 1 failed performance requirement baseline')
ax.set_ylabel('slope > 0.1 failed scalability trend')
ax.set_ylim(-10, 550)
ax.axhline(0.1, color='black')  # slope threshold
ax.axvline(1, color='black')    # normalized-distance threshold
ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.05),
          ncol=2, fancybox=True, shadow=True)
ax.grid()
figure.savefig('spa_all.pdf')


In [None]:
# Scatter the points of spa_df in the (normalized distance, slope) plane
# and save the chart as spa_90.pdf.
figure, ax = plt.subplots()  # explicit new figure: nothing leaks in from earlier plots
ax.scatter(spa_df['ndistance'], spa_df['slope'], color='red')

ax.set_xlabel(' normalized distance > 1 failed performance requirement baseline')
ax.set_ylabel('slope > 0.1 failed scalability trend')
ax.set_ylim(-10, 400)
ax.axhline(0.1, color='black')  # slope threshold
ax.axvline(1, color='black')    # normalized-distance threshold
ax.grid()
figure.savefig('spa_90.pdf')


### Statistical analysis

In [None]:
# Distribution (probability of S)
statistic_df["load"].value_counts(normalize=True)

In [None]:
# Probability of each load level: non-null `instance` count per load divided
# by the total, merged back onto every measurement row.
load_counts = statistic_df.groupby(['load']).instance.count()
load_probabilities_df = load_counts.rename("load_probability").to_frame()
print(load_probabilities_df)
load_probabilities_df['load_probability'] = (
    load_probabilities_df['load_probability']
    / load_probabilities_df['load_probability'].sum()
)
statistic_df = statistic_df.merge(load_probabilities_df,
                                  left_on=['load'], right_index=True)

In [None]:
print(load_probabilities_df)

In [None]:
statistic_df.head()

In [None]:
# Activation probability of each operation: its non-null `instance` count
# divided by the count over all operations, merged back onto every row.
operation_counts = statistic_df.groupby(['counter_name']).instance.count()
probabilities_df = operation_counts.rename("activation_probability").to_frame()
probabilities_df['activation_probability'] = (
    probabilities_df['activation_probability']
    / probabilities_df['activation_probability'].sum()
)
probabilities_df = probabilities_df.reset_index()
statistic_df = statistic_df.merge(probabilities_df, on=['counter_name'])

In [None]:
statistic_df.head()

In [None]:
# Per-counter mean and standard deviation of the metric, plus the baseline
# derived from them by calc_baseline.
per_counter_metric = statistic_df.groupby(['counter_name'])[metric]
assessment_df = pd.DataFrame(per_counter_metric.agg(['mean', 'std']))

# calc_baseline is plain arithmetic, so it can be applied to whole columns.
assessment_df['baseline'] = calc_baseline(assessment_df['mean'], assessment_df['std'])
print(assessment_df)


In [None]:
# Attach each counter's baseline to its measurement rows, then label every
# row "Pass" or "Fail" with eval_counters.
baseline_lookup = assessment_df[['baseline']]
statistic_df = (
    statistic_df
    .merge(baseline_lookup, left_on='counter_name', right_index=True)
    .reset_index(drop=True)
)
statistic_df["assessment"] = [
    eval_counters(measured, limit)
    for measured, limit in zip(statistic_df[metric], statistic_df["baseline"])
]
statistic_df.head()

In [None]:
# Fraction of passing executions per (load, counter), multiplied by the
# activation probability of the counter.
group_keys = ['load', 'load_probability', 'counter_name',
              'activation_probability', 'assessment']
s_df = statistic_df.groupby(group_keys).instance.count().rename("s")
# Normalise the Pass/Fail counts within each (load, load_probability, counter).
s_df = pd.DataFrame(s_df / s_df.groupby(level=[0, 1, 2]).transform("sum"))
s_df = s_df.reset_index()
s_df = (s_df[s_df.assessment == 'Pass']
        .sort_values(['load', 's'], ascending=[True, False])
        .reset_index(drop=True))
# Explicit column product instead of a column-wise groupby (axis=1), which
# pandas has deprecated.
s_df['s'] = s_df['activation_probability'] * s_df['s']
s_df = s_df[['load', 'load_probability', 'counter_name', 's']]

In [None]:
s_df.head()

In [None]:
# Add analysis timestamp
statistic_df['analysis_timestamp'] = dt.datetime.today()
#domain_metric_df['analysis_timestamp'] = dt.datetime.today()

In [None]:
# Prepare data for training: work on a copy of the measurements with an
# empty (all-NaN) 'anomaly' column.
lb_detection_df = statistic_df.copy()
# Assign NaN directly; an empty pd.Series() without dtype is deprecated.
lb_detection_df['anomaly'] = np.nan

In [None]:
lb_detection_df.head()

In [None]:
df=pd.DataFrame(statistic_df.groupby(['counter_name','load']).agg({metric:'max','baseline':'mean'}))
df.columns.name=None

In [None]:
# Per (counter, load): worst-case metric and mean baseline, with distance
# measures derived from them.
worst_case = statistic_df.groupby(['counter_name', 'load']).agg(
    {metric: 'max', 'baseline': 'mean'})
df = pd.DataFrame(worst_case)
df.columns.name = None
df = df.reset_index()
df['distance'] = df['baseline'] - df[metric]
df['ndistance'] = 2 * df[metric] / (df['baseline'] + df[metric])
# True (failed) unless the measurement is strictly below the baseline.
df['assessment'] = [not (gap > 0) for gap in df['distance']]
df['binary'] = [0 if gap > 0 else 1 for gap in df['distance']]

# Number of failing load levels per counter, attached as 'sbinary'.
bs_df = df.groupby(['counter_name']).binary.sum().to_frame()

df = df.merge(bs_df, on="counter_name")
df = df.rename(columns={"binary_x": "binary", "binary_y": "sbinary"})

In [None]:
#ndistance = 2*measurement/(baseline+measurement):
#measurement << baseline: ndistance -> 0, measurement = baseline: ndistance = 1
#measurement >> baseline: ndistance -> 2

In [None]:
df.head()

In [None]:
# Fit one linear regression per counter over all of its measurements and
# store the slope on every row of that counter in slope_df.
slope_df = df.copy()
slope_df['slope'] = np.nan  # filled per counter below
for counter in lb_detection_df.counter_name.unique():
    samples = lb_detection_df.loc[lb_detection_df.counter_name == counter]
    # NOTE(review): linregress(x, y) regresses y on x, so this call yields
    # the slope of load as a function of the metric, not the metric as a
    # function of load. Kept as-is because the reference slopes loaded from
    # the SPA CSV files are compared against these values - confirm which
    # orientation they were computed with before changing it.
    regression = stats.linregress(samples[metric], samples['load'])
    slope_df.loc[slope_df.counter_name == counter, 'slope'] = regression.slope

In [None]:
slope_df.head()

In [None]:
#slope_df.loc[slope_df.load == 90,['counter_name','ndistance','slope', 'sbinary']].to_csv('eo.csv',index=False)
slope_df.loc[slope_df.load == 90,['counter_name','ndistance','slope']].to_csv('eo2.csv',index=False)

In [None]:
meas_90_df = slope_df.loc[slope_df.load == 90,['ndistance','slope']]

In [None]:
full_meas_90_df = slope_df.loc[slope_df.load == 90,['ndistance','slope','counter_name','assessment']]

In [None]:
# Slope vs. normalized distance of the load = 90 measurements, saved as
# partition_induced.pdf.
fig, ax = plt.subplots()  # explicit new figure instead of the implicit current axes
ax.scatter(meas_90_df['ndistance'], meas_90_df['slope'], color='blue')
ax.set_xlabel(' normalized distance > 1 failed performance requirement baseline, load = 90%')
ax.set_ylabel('slope > 0.1 failed scalability trend')
ax.axhline(0.1, color='black')  # slope threshold
ax.axvline(1, color='black')    # normalized-distance threshold
ax.grid()
fig.savefig('partition_induced.pdf')

In [None]:
slope_df

In [None]:
meas_90_df

In [None]:
# Normalized slope vs. normalized distance of the load = 90 measurements,
# with one vertical and one horizontal guide line per reference point of
# spa_df. Saved as partition.pdf.
fig, ax = plt.subplots()
ax.scatter(meas_90_df['ndistance'],
           meas_90_df['slope'] / meas_90_df['slope'].max(), color='blue')
ax.set_xlabel(' normalized distance > 1 failed performance requirement baseline, load = 90%')
ax.set_ylabel('nomalized slope')

# Working copy of the reference points; the all-NaN 'index' column is
# dropped again by later cells.
spa_t_df = spa_df.copy()
spa_t_df['index'] = np.nan
max1 = spa_t_df['slope'].max()

# One colour per reference point (the first seven rows are drawn).
guide_colours = ['magenta', 'purple', 'green', 'yellow', 'blue', 'orange', 'red']
for x_ref, y_ref, spa_name, colour in zip(spa_t_df['ndistance'], spa_t_df['slope'],
                                          spa_t_df['spa'], guide_colours):
    ax.axvline(x_ref, color=colour, label=spa_name)
    ax.axhline(y_ref / max1, color=colour)

ax.set_ylim(-0.2, 1.1)
ax.set_xlim(0, 2)
ax.axhline(0.1, color='black')  # slope threshold
ax.axvline(1, color='black')    # normalized-distance threshold
ax.legend()
fig.savefig('partition.pdf')

In [None]:
# One chart per reference point of spa_df: the load = 90 measurements
# (normalized slope vs. normalized distance) with that point's guide lines.
spa_t_df = spa_df.copy()
spa_t_df['index'] = np.nan  # all-NaN helper column, dropped again by later cells
max1 = spa_t_df['slope'].max()
normalized_slope = meas_90_df['slope'] / meas_90_df['slope'].max()

for x_ref, y_ref, spa_name in zip(spa_t_df['ndistance'], spa_t_df['slope'],
                                  spa_t_df['spa']):
    fig, ax = plt.subplots()
    ax.scatter(meas_90_df['ndistance'], normalized_slope, color='blue')
    ax.set_xlabel(' normalized distance > 1 failed performance requirement baseline, load = 90%')
    ax.set_ylabel('nomalized slope')

    ax.axvline(x_ref, color='orange', label=spa_name)
    ax.axhline(y_ref / max1, color='orange')

    ax.set_ylim(-0.2, 1.1)
    ax.set_xlim(0, 2)
    ax.axhline(0.1, color='black')  # slope threshold
    ax.axvline(1, color='black')    # normalized-distance threshold
    ax.legend()
    plt.show()
 

### Detection Step

In [None]:
#only assessment = true counters are taken
meas_90 = slope_df.loc[(slope_df.load == 90) &(slope_df.assessment== True), ['counter_name','ndistance','slope']]

In [None]:
meas_90

In [None]:
spa_mean_std= pd.read_csv("spasmeanstd.csv", sep=';')

In [None]:
spa_mean_std

In [None]:
def calculate_baseline(mean, std):
    """Return 60% of ``mean`` plus one ``std``."""
    scaled_mean = mean * 0.6
    return scaled_mean + std

In [None]:
spa_mean_std['baseline'] = spa_mean_std.apply(lambda x: calculate_baseline(x["mean"], x["stddev"]), axis=1)

In [None]:
spa_mean_std

In [None]:
spa_mean_std=pd.concat([spa_mean_std, spa_t_df],axis=1)
spa_mean_std=spa_mean_std.drop(['spa', 'index'], axis = 1) 


In [None]:
spa_mean_std

In [None]:
meas_90=meas_90.reset_index()
meas_90

In [None]:
# (ndistance, slope) points of the measured services (mm) and of the
# reference SPAs (ff). The columns are selected by name and copied, so the
# normalization below neither depends on column positions nor writes back
# into meas_90 / spa_mean_std.
mm = meas_90[['ndistance', 'slope']].copy()
ff = spa_mean_std[['ndistance', 'slope']].copy()
# Scale each slope column by its own maximum.
mm['slope'] = mm['slope'] / mm['slope'].max()
ff['slope'] = ff['slope'] / ff['slope'].max()

In [None]:
mm

In [None]:
ff

In [None]:
a=[1.00,0.1]

In [None]:
ff_l=len(ff)

In [None]:
ff.loc[ff_l]=a

In [None]:
ff

In [None]:
#ans.shape

In [None]:
# Euclidean distance between every service point (rows of mm) and every
# reference point (rows of ff). The result is flattened service by service:
# entries [i*len(ff), (i+1)*len(ff)) belong to the i-th row of mm.
service_xy = mm[['ndistance', 'slope']].to_numpy(dtype=float)
reference_xy = ff[['ndistance', 'slope']].to_numpy(dtype=float)
pairwise = np.linalg.norm(service_xy[:, None, :] - reference_xy[None, :, :], axis=2)
ress = pd.Series(pairwise.ravel())

In [None]:
# For every measured service, tabulate its distance to each reference point
# and report the closest one. `ress` holds len(ff) consecutive distances per
# service, in the row order of meas_90 / mm.
n_refs = len(ff)
# Reference labels: the SPA names, padded with 'Base' for the extra
# threshold row that ff carries beyond the SPA list.
spa_labels = (spa_mean_std['SPAs']
              .reset_index(drop=True)
              .reindex(range(n_refs))
              .fillna('Base'))

distance_tables = {}  # service name -> table of (SPAs, euclidean_distance)
for position, service_name in enumerate(meas_90['counter_name']):
    block = ress.iloc[position * n_refs:(position + 1) * n_refs]
    table = pd.DataFrame({
        'SPAs': spa_labels,
        # Re-index so the distances line up with the labels row by row.
        'euclidean_distance': block.reset_index(drop=True),
    })
    closest_spa = table.loc[table['euclidean_distance'].idxmin(), 'SPAs']
    distance_tables[service_name] = table
    print('\n', table)
    print('The', service_name, 'Service belongs to', closest_spa)

In [None]:
# Row labels assigned to Y (mm rows followed by ff rows): presumably the
# seven services, then the seven SPA reference points, then the 'Base'
# threshold point.
# NOTE(review): 'Recompose' precedes 'Offline' here, whereas the scatter
# labels in the k-means plots put 'Offline' (row 5) before 'Recompose'
# (row 6) - confirm the order matches the rows of mm.
names=['Status_Updates',  'Control', 'DB Data Management',
       'Enquiry', 'Interrogation', 'Recompose', 'Offline','Application hiccups', 'Continuous violated req.', 'Traffic jam',
       'The stifle', 'Expensive DB call', 'Empty semi-trucks', 'The Blob','Base']

In [None]:
# Stack the service points on top of the reference points, renumber the
# rows and label them with `names`.
Y = pd.concat([mm, ff], axis=0).reset_index(drop=True)
Y['names'] = names
Y = Y[['names', 'ndistance', 'slope']]

In [None]:
from scipy.spatial.distance import cdist

# The services occupy the first len(mm) rows of Y; the reference points follow.
n_services = len(mm)
x_test = Y.iloc[:n_services, 1:].to_numpy(dtype=float)
xcv = Y.iloc[n_services:, 1:].to_numpy(dtype=float)

# cdist takes only the two point sets and the metric; the position (within
# xcv) of the closest reference point per service is the row-wise argmin.
idx_test = cdist(x_test, xcv, 'euclidean').argmin(axis=1)
idx_test

In [None]:
from sklearn.cluster import KMeans
# Cluster rows 0-7 plus row 12 of Y into two groups and plot them.
# NOTE(review): according to the `names` list, row 7 is 'Application hiccups'
# and row 12 is 'Empty semi-trucks', while the title below says 'DBCALL'
# ('Expensive DB call' is row 11) - confirm the intended rows.
Z_names=Y.iloc[[0,1,2,3,4,5,6,7,12],0]
Z=Y.iloc[[0,1,2,3,4,5,6,7,12],1:].values
kmeans=KMeans(n_clusters=2,init='k-means++',random_state=0)
y_kmeans=kmeans.fit_predict(Z)

# One scatter per cluster label; columns 0 / 1 of Z are ndistance / slope.
plt.scatter(Z[y_kmeans==0,0],Z[y_kmeans==0,1],s=100,c='red',label='Cluster1')
plt.scatter(Z[y_kmeans==1,0],Z[y_kmeans==1,1],s=100,c='blue',label='Cluster2')
# Print the row names next to their cluster labels for manual inspection.
print(Z_names)
print(y_kmeans)


plt.title('DBCALL normalized slope')
plt.xlabel('nomalized distance')
plt.ylabel('nomalized slope')
plt.legend()
plt.show()

In [None]:
from sklearn.cluster import KMeans
# Cluster rows 0-7 plus row 11 of Y into two groups and plot them.
# NOTE(review): according to the `names` list, row 7 is 'Application hiccups'
# and row 11 is 'Expensive DB call', while the title below says 'Stiffle'
# ('The stifle' is row 10) - confirm the intended rows.
Z_names=Y.iloc[[0,1,2,3,4,5,6,7,11],0]
Z=Y.iloc[[0,1,2,3,4,5,6,7,11],1:].values

kmeans=KMeans(n_clusters=2,init='k-means++',random_state=0)
y_kmeans=kmeans.fit_predict(Z)

#Visualize the clusters

# One scatter per cluster label; columns 0 / 1 of Z are ndistance / slope.
plt.scatter(Z[y_kmeans==0,0],Z[y_kmeans==0,1],s=100,c='red',label='Cluster1')
plt.scatter(Z[y_kmeans==1,0],Z[y_kmeans==1,1],s=100,c='blue',label='Cluster2')

# Print the row names next to their cluster labels for manual inspection.
print(Z_names)
print(y_kmeans)
plt.title('Stiffle normalized slope')
plt.xlabel('nomalized distance')
plt.ylabel('nomalized slope')
plt.legend()
plt.show()

In [None]:
#SPA values convert to arr and given as centroid
xcv=Y.iloc[7:,1:].values

In [None]:
# k-means over all points of Y, seeded with the reference points in xcv as
# initial centres, then plotted cluster by cluster.
Z = Y.iloc[:, 1:].values
# n_init=1: with explicit start centres every restart would be identical.
kmeans = KMeans(n_clusters=8, init=xcv, n_init=1, random_state=0)
y_kmeans = kmeans.fit_predict(Z)
x_x = kmeans.cluster_centers_

# Clusters 0-6 carry the SPA names; cluster 7 is not drawn. Each cluster
# gets its own colour.
cluster_colours = ['red', 'blue', 'orange', 'green', 'pink', 'purple', 'brown']
for cluster, colour in enumerate(cluster_colours):
    members = Z[y_kmeans == cluster]
    plt.scatter(members[:, 0], members[:, 1], s=100, c=colour,
                label=spa_t_df['spa'][cluster])
# Initial centres of the seven SPA clusters.
plt.scatter(xcv[0:7, 0], xcv[0:7, 1], marker="x", s=50, color='black')

# Small markers for the individual services (rows 0-6 of Y).
service_markers = [
    ('Status_Updates', 'gray'),
    ('Control', 'brown'),
    ('DB Data Management', 'cyan'),
    ('Enquiry', 'red'),
    ('Interrogation', 'yellow'),
    ('Offline', 'green'),
    ('Recompose', 'black'),
]
for row, (service, colour) in enumerate(service_markers):
    plt.scatter(Y['ndistance'][row], Y['slope'][row], s=20, marker=".",
                c=colour, label=service)

plt.xlabel('nomalized distance')
plt.ylabel('nomalized slope')
plt.legend()
plt.tight_layout()
plt.savefig('kmeans_legend.pdf', dpi=300)
plt.savefig('kmeans_legend.jpeg', dpi=300)


In [None]:
# For every fitted cluster, record its centroid and its radius, i.e. the
# largest Euclidean distance from the centroid to a point of that cluster.
clusters_centroids = dict()
clusters_radii = dict()

for cluster, centre in enumerate(kmeans.cluster_centers_):
    members = Z[y_kmeans == cluster]
    clusters_centroids[cluster] = tuple(centre)
    # A cluster without members gets radius 0 instead of failing on max().
    clusters_radii[cluster] = (
        np.linalg.norm(members - centre, axis=1).max() if len(members) else 0.0
    )

In [None]:
import matplotlib.patches as mpatches

# Clusters with their bounding circles (centroid + radius), the service
# markers and the initial centres; saved as kmeans_centroid.pdf / .jpeg.
fig, ax = plt.subplots(1, figsize=(7, 5))

# (colour, legend label) per cluster; cluster 3 is shown as "Outlier" and
# cluster 7 is not drawn. Each circle uses the colour of its cluster.
cluster_styles = [
    ('red', spa_t_df['spa'][0]),
    ('blue', spa_t_df['spa'][1]),
    ('orange', spa_t_df['spa'][2]),
    ('green', "Outlier"),
    ('pink', spa_t_df['spa'][4]),
    ('purple', spa_t_df['spa'][5]),
    ('brown', spa_t_df['spa'][6]),
]
for cluster, (colour, name) in enumerate(cluster_styles):
    members = Z[y_kmeans == cluster]
    ax.scatter(members[:, 0], members[:, 1], s=100, c=colour, label=name)
    ax.add_patch(mpatches.Circle(clusters_centroids[cluster], clusters_radii[cluster],
                                 edgecolor=colour, fill=False))

# Small markers for the individual services (rows 0-6 of Y).
service_markers = [
    ('Status_Updates', 'gray'),
    ('Control', 'brown'),
    ('DB Data Management', 'cyan'),
    ('Enquiry', 'red'),
    ('Interrogation', 'yellow'),
    ('Offline', 'green'),
    ('Recompose', 'black'),
]
for row, (service, colour) in enumerate(service_markers):
    ax.scatter(Y['ndistance'][row], Y['slope'][row], s=20, marker=".",
               c=colour, label=service)

# Initial centres of the seven SPA clusters.
ax.scatter(xcv[0:7, 0], xcv[0:7, 1], marker="x", s=50, color='black')
ax.legend()
fig.tight_layout()
fig.savefig('kmeans_centroid.pdf', dpi=300)
fig.savefig('kmeans_centroid.jpeg', dpi=300)

In [None]:
# Unseeded k-means (k-means++ initialisation) over the same points.
kmeans = KMeans(n_clusters=8, init='k-means++', n_init=10, random_state=0)
y_kmeans = kmeans.fit_predict(Z)

cluster_colours = ['red', 'blue', 'orange', 'green',
                   'pink', 'purple', 'brown', 'yellow']
for cluster, colour in enumerate(cluster_colours):
    members = Z[y_kmeans == cluster]
    # Label every cluster so that the legend below has entries to show.
    plt.scatter(members[:, 0], members[:, 1], s=100, c=colour,
                label='Cluster{}'.format(cluster + 1))

plt.title('SPAs and Services')
plt.xlabel('nomalized distance')
plt.ylabel('nomalized slope')
plt.legend()
plt.show()


In [None]:
# Raw slope vs. normalized distance of the load = 90 measurements, with one
# vertical guide line per reference point of spa_df.
fig, ax = plt.subplots()
ax.scatter(meas_90_df['ndistance'], meas_90_df['slope'], color='blue')
ax.set_xlabel(' normalized distance > 1 failed performance requirement baseline, load = 90%')
ax.set_ylabel('slope > 0.1 failed scalability trend')

# Working copy of the reference points with an all-NaN 'index' column.
spa_t_df = spa_df.copy()
spa_t_df['index'] = np.nan
max1 = spa_t_df['slope'].max()

# One colour per reference point (the first seven rows are drawn).
guide_colours = ['magenta', 'purple', 'green', 'yellow', 'blue', 'orange', 'red']
for x_ref, spa_name, colour in zip(spa_t_df['ndistance'], spa_t_df['spa'],
                                   guide_colours):
    ax.axvline(x_ref, color=colour, label=spa_name)

ax.set_ylim(-30, 350)
ax.set_xlim(0, 2)
ax.axhline(0.1, color='black')  # slope threshold
ax.axvline(1, color='black')    # normalized-distance threshold
ax.legend()
ax.grid()  # must be set before show() to appear on the rendered figure
plt.show()

In [None]:

# Slope vs. normalized distance for every (counter, load) row of slope_df.
figure, ax = plt.subplots()  # explicit new figure: nothing leaks in from earlier plots
ax.scatter(slope_df['ndistance'], slope_df['slope'], color='red')
ax.set_xlabel(' normalized distance > 1 failed performance requirement baseline')
ax.set_ylabel('slope > 0.1 failed scalability trend')
ax.axhline(0.1, color='black')  # slope threshold
ax.axvline(1, color='black')    # normalized-distance threshold
ax.grid()


In [None]:
#sns.set(style="whitegrid")
sns.set()
sns.relplot(x='ndistance',y='slope',
              hue="counter_name",size="load",
              data=slope_df, style="assessment",legend="full" )
              #data=full_meas_90_df, style="assessment",legend="full" )


In [None]:
#full_meas_90_df
sns.set()
sns.relplot(x='ndistance',y='slope',
          hue="counter_name", style="assessment",
            data=full_meas_90_df,legend="full")
plt.savefig('full_meas_90.pdf')

In [None]:
# Slope vs. distance, one chart per counter. Only the rows of the current
# counter are drawn, so the charts differ between counters.
for counter in lb_detection_df.counter_name.unique():
    counter_rows = slope_df.loc[slope_df.counter_name == counter]
    figure, ax = plt.subplots()
    ax.scatter(counter_rows['distance'], counter_rows['slope'])
    ax.set_xlabel('distance')
    ax.set_ylabel('slope')
    ax.set_title(str(counter))
    plt.show()


In [None]:
failed_distance_df = pd.DataFrame(df.groupby(['counter_name']).assessment.any())
failed_distance_df.columns.name=None
failed_distance_df=failed_distance_df.reset_index()
failed_distance_df.head()

In [None]:
# One chart per counter that failed at any load level: worst-case metric
# (green dots) against the baseline (red line) over load.
failed_mask = failed_distance_df.assessment == True
for counter in failed_distance_df.loc[failed_mask, 'counter_name'].values:
    counter_rows = df.counter_name == counter
    x = df.loc[counter_rows, 'load']
    y = df.loc[counter_rows, metric]
    b = df.loc[counter_rows, 'baseline']
    plt.figure()
    plt.xlabel(str(counter) + ' MAX FAILED ')
    plt.scatter(x, y, s=10, color='green')
    plt.plot(x, b, color='red', label='baseline')

In [None]:
# Per counter: fit a linear regression over all measurements, print the
# slope and chart the data against the baseline. A slope below 0.01 is
# reported as PASS (with the full regression statistics); a slope of at
# least 0.01 with p_value < 0.05 is reported as FAIL; a steeper but
# non-significant slope produces no chart.
for counter in lb_detection_df.counter_name.unique():
    samples = lb_detection_df.loc[lb_detection_df.counter_name == counter]
    x = samples['load']
    y = samples[metric]
    b = samples['baseline']
    # NOTE(review): linregress(x, y) regresses y on x, so this call yields
    # the slope of load as a function of the metric. Kept as-is to stay
    # consistent with the slopes stored in slope_df - confirm the intended
    # orientation.
    slope, intercept, r_value, p_value, std_err = stats.linregress(y, x)
    print('counter = ' + str(counter) + ' slope = ' + str(slope))

    if slope < 0.01:
        verdict = 'SLOPE PASS -- '
        print('p_value = ' + str(p_value))
        print('slope = ' + str(slope))
        print('intercept = ' + str(intercept))
        print('r_value = ' + str(r_value))
        print('std_err = ' + str(std_err))
    elif slope >= 0.01 and p_value < 0.05:
        verdict = 'SLOPE FAIL '
    else:
        continue

    plt.figure()
    plt.xlabel(str(counter) + '\n ' + verdict + 'p_value = ' + str(p_value)
               + '   slope = ' + str(slope) + ' r_value = ' + str(r_value))
    plt.scatter(x, y, s=10, color='green')
    plt.plot(x, b, color='red', label='baseline')
            
            

In [None]:
data=spa_t_df['ndistance']

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import statistics 

data=spa_t_df['slope']
# Outlier detection on one-dimensional data sets.
def find_anomalies(data):
    """Return the values lying more than three standard deviations from the mean.

    The cut-off (three sample standard deviations) and the resulting lower
    and upper limits are printed, in that order, as a diagnostic.

    Args:
        data: Iterable of numbers.

    Returns:
        List of the values outside ``mean +/- 3 * stdev``, in input order.
        Empty when fewer than two values are given, because the sample
        standard deviation is undefined in that case.
    """
    values = list(data)
    if len(values) < 2:
        return []

    data_std = statistics.stdev(values)
    data_mean = statistics.mean(values)
    anomaly_cut_off = data_std * 3
    print(anomaly_cut_off)
    lower_limit = data_mean - anomaly_cut_off
    upper_limit = data_mean + anomaly_cut_off
    print(lower_limit)
    print(upper_limit)

    return [value for value in values
            if value > upper_limit or value < lower_limit]

find_anomalies(data)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
data=spa_t_df['ndistance']
sns.boxplot(data)

In [None]:
data=spa_t_df.copy()

In [None]:
dataa= data.drop(['spa', 'index'], axis = 1).values

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.boxplot(data=spa_t_df['slope'])

In [None]:
plt.hist(spa_t_df['ndistance'])

In [None]:
print(stats.shapiro((spa_t_df['slope'])))

In [None]:
import statsmodels.api as sm
import pylab
sm.qqplot(spa_t_df['slope'], loc = 4, scale = 3, line='s')
pylab.show()