In [None]:
import os
from os import listdir
from os.path import isfile, join
import glob
import pandas as pd
import numpy as np
import math
from scipy.stats import poisson
import statistics
import pickle as pkl
import networkx as nx
import scipy.sparse as sp
import time
import torch
from torch import optim
from sklearn.metrics import roc_auc_score, average_precision_score
import torch.nn.functional as F
from torch.nn.modules.module import Module
from torch.nn.parameter import Parameter
import torch.nn as nn
import torch.nn.modules.loss
from torch.distributions import Categorical, Normal, Dirichlet
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Data Trace Loading Alibaba Trace 2018 with 5 Task Domains

In [None]:
#load workload trace
df1=pd.read_csv('alibaba_2018_taskdomains_topics_5_1.csv',sep = ',')

In [None]:
# df1 contains Unnamed: 0, CPU_util, mem_util, net_in, net_out, disk and Task Domain.
# Drop the leftover CSV index column. errors='ignore' keeps this cell safe to
# re-run: the original `del` raised KeyError once the column was already gone.
df1 = df1.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
df1 = df1[['CPU_util','mem_util','net_in','net_out','disk','Task Domain']].copy()

In [None]:
# If any column contains null values, then drop null values
df1 = df1.dropna()

In [None]:
# Normalize the data without task domains

In [None]:
df_actual = df1[['CPU_util','mem_util','net_in','net_out','disk']].copy()
#norm = MinMaxScaler()
#act_correct = pd.DataFrame(norm.fit_transform(df_actual), columns=df_actual.columns)

In [None]:
# NOTE: this binds a second name to the same DataFrame (no copy is made), so any
# column later added through act_correct also appears in df_actual.
act_correct = df_actual

In [None]:
df1['Task Domain'].unique()

In [None]:
#Now add taskdomain to data
act_correct['Task Domain'] = df1['Task Domain']

In [None]:
# Train and Test Split

In [None]:
label=act_correct['Task Domain']
la=label.tolist()
#w_len=len(la)
la1=la

In [None]:
# Chronological train/test split of the task-domain label sequence.
split_ratio = 0.8  # fraction of the samples used for training (80% train, 20% test)
train_size = int(len(act_correct) * split_ratio)
test_size = len(act_correct) - train_size
# The first train_size labels form the training sequence, the rest the test sequence
train_data, test_data = la1[:train_size], la1[train_size:]
# Length of each initial partition and the resulting number of partitions
l = 35
w_len = len(train_data)
no_of_partitions = math.ceil(w_len / l)

In [None]:
# Creating partitions from training data

In [None]:
#Initial partitions
split_lists = [train_data[x:x+l] for x in range(0, len(train_data), l)]
#print(split_lists)

In [None]:
# score function for a partition
def score_calc(lis):
    """Score a partition as the harmonic mean of its modularity and likelihood.

    Parameters
    ----------
    lis : list
        Sequence of task-domain states forming one partition.

    Returns
    -------
    float
        ``2 * m * p / (m + p)`` with ``m = modularity(lis, 5)`` and
        ``p = likelihood_calc(lis)``.
    """
    # Evaluate each term exactly once: both helpers rebuild the transition
    # tensor from scratch, and the original expression called each of them twice.
    mod = modularity(lis, 5)
    lik = likelihood_calc(lis)
    return (2 * mod * lik) / (mod + lik)

In [None]:
def modularity(lis, n):
    """Relative deviation of a partition's transition tensor from a uniform baseline.

    Parameters
    ----------
    lis : list
        Sequence of states forming one partition.
    n : int
        Side length of the uniform baseline; every baseline entry is ``1/n``.

    Returns
    -------
    The sum of (transition tensor - baseline) divided by the sum of the
    transition tensor. The n x n baseline is broadcast against the array
    returned by ``twostep_transition_matrix``.
    """
    observed = np.array(twostep_transition_matrix(lis))
    uniform = [[1/n]*n for _ in range(n)]
    deviation = observed - uniform
    return np.sum(deviation) / np.sum(observed)

In [None]:
# Representing each partition with markov model 

In [None]:
def twostep_transition_matrix(transitions, n=5):
    """Estimate second-order Markov transition probabilities from a state sequence.

    Parameters
    ----------
    transitions : sequence of int
        Observed states. A state ``s`` is stored at index ``s - 1``, so a
        state of 0 wraps around to the last index (Python negative indexing).
    n : int, optional
        Number of states. Defaults to 5, the value that used to be hard-coded,
        so existing one-argument calls behave exactly as before.

    Returns
    -------
    list
        ``n x n x n`` nested list ``M`` where ``M[i][j][k]`` relates to the
        triple ``(i+1, j+1, k+1)``. Rows that were observed at least once are
        replaced by Laplace-smoothed probabilities ``(count + 1) / (total + n)``;
        rows that were never observed stay all zeros.
    """
    M = [[[0] * n for _ in range(n)] for _ in range(n)]

    # Count every consecutive triple of states
    for (i, j, k) in zip(transitions, transitions[1:], transitions[2:]):
        M[i-1][j-1][k-1] += 1

    # Convert counts to probabilities with Laplace smoothing
    for matrix in M:
        for row in matrix:
            s = sum(row)
            if s > 0:
                row[:] = [(f + 1) / (s + n) for f in row]

    return M


In [None]:
def likelihood_calc(lis):
    """Ratio of a partition's sequence likelihood to a Poisson term on its state counts.

    The numerator is the product, over every consecutive triple in ``lis``, of
    the matching entry of ``twostep_transition_matrix(lis)``. The denominator
    is the product of ``poisson.pmf(count + 1, lambd)`` over states 1..5, where
    ``count`` is how often that state occurs in ``lis``.

    NOTE(review): ``select_hyper`` is not defined in the visible part of this
    notebook - confirm it is defined before this function is first called.

    Parameters
    ----------
    lis : list
        Sequence of states (a list; ``list.count`` is used on it).
    """
    lambd = select_hyper(lis)
    trans = twostep_transition_matrix(lis)

    # Likelihood of the observed sequence under its own second-order model
    seq_likelihood = 1
    for first, second, third in zip(lis, lis[1:], lis[2:]):
        seq_likelihood = seq_likelihood * trans[first - 1][second - 1][third - 1]

    # Occurrences of each of the 5 task domains, offset by one
    counts = [lis.count(state) + 1 for state in range(1, 6)]
    prior = 1
    for c in counts:
        prior = prior * poisson.pmf(c, lambd)

    return seq_likelihood / prior

In [None]:
# Identifying correct partitions based on score function

import time
start_time = time.time()
partitions=[]   # final partition lengths, in temporal order
flag = 0        # 1 while the current partition is the result of a merge
N = no_of_partitions   # running count of partitions; decremented on every merge
for i in range(0,no_of_partitions-1):
    if flag == 0:
        ps=l
    x = score_calc(split_lists[i])
    # Candidate merge of the current partition with its successor, in temporal order
    merged_list = split_lists[i] + split_lists[i+1]
    y = score_calc(merged_list)
    if abs(y-x) > 0.05:
        flag = 1
        # Carry the merged sequence forward in the same (time-ordered) layout that
        # was just scored. The original extended the successor with the current
        # chunk, which placed earlier observations after later ones and changed
        # the triples counted in subsequent iterations.
        split_lists[i+1] = merged_list
        ps=len(split_lists[i+1])
        N = N-1
    else:
        flag = 0
        partitions.append(ps)
# Built-in sum keeps the total an int even when no partition was appended
# (np.sum([]) is the float 0.0, which islice() later rejects).
tot=sum(partitions)

# Whatever is left of the training sequence forms the last partition
partitions.append(len(train_data)-tot)

end_time = time.time()
repetition_running_time_partition_create = end_time - start_time


In [None]:
# Final Set of Partitions
from itertools import islice
Inputt = iter(la)
final_partitions = [list(islice(Inputt, elem))
          for elem in partitions]

In [None]:
len(final_partitions)

In [None]:
list_adj = [0 for _ in range(N)]
norm = [0 for _ in range(N)]
list_features = [0 for _ in range(N)]

In [None]:
import time
start_time = time.time()
# Adjacency Matrix Construction for each partition
# NOTE(review): the next cell repeats this loop with an additional guard for a
# zero denominator and overwrites list_adj, norm and the timing computed here.
for i in range(N):
    # Transition tensor of partition i (kept as numpy while norm[i] is derived)
    list_adj[i] = np.array(twostep_transition_matrix(final_partitions[i]))
    sum_adj = list_adj[i].sum()
    if sum_adj == 0:
        norm[i] = 1.0  # Set a default normalization value if the sum is zero
    else:
        # shape[0]^2 / (2 * (shape[0]^2 - sum of all entries))
        norm[i] = list_adj[i].shape[0] * list_adj[i].shape[0] / float((list_adj[i].shape[0] * list_adj[i].shape[0] - sum_adj) * 2)
    # The tensor is rebuilt from scratch and stored as a float32 torch tensor
    list_adj[i] = torch.from_numpy(np.array(twostep_transition_matrix(final_partitions[i]))).float()
end_time = time.time()

# Calculate the total elapsed time
total_running_time_partition = end_time - start_time
print("Total running time for constructing", len(partitions), "matrices:", total_running_time_partition, "seconds")



In [None]:
import time

start_time = time.time()

# Adjacency Matrix Construction for each partition
for i in range(N):
    # Build the transition tensor once and reuse it for both the normalisation
    # constant and the torch conversion (it used to be rebuilt a second time).
    adj_np = np.array(twostep_transition_matrix(final_partitions[i]))
    sum_adj = adj_np.sum()
    n_sq = adj_np.shape[0] * adj_np.shape[0]

    if sum_adj == 0:
        norm[i] = 1.0
    else:
        denominator = (n_sq - sum_adj) * 2
        if denominator != 0:
            norm[i] = n_sq / float(denominator)
        else:
            norm[i] = 0.0  # Set an appropriate value when denominator is zero

    # Stored as a float32 torch tensor for the GCN-VAE
    list_adj[i] = torch.from_numpy(adj_np).float()

end_time = time.time()

# Calculate the total elapsed time
total_running_time_partition = end_time - start_time
print("Total running time for constructing", len(partitions), "matrices:", total_running_time_partition, "seconds")


In [None]:
list_adj

In [None]:
# Merging mean and variance vectors alternatively
def countList(lst1, lst2):
    """Interleave two sequences element by element.

    Returns ``[lst1[0], lst2[0], lst1[1], lst2[1], ...]``, iterating over the
    indices of ``lst2``; ``lst1`` must therefore be at least as long as ``lst2``.
    """
    merged = []
    for idx in range(len(lst2)):
        merged.append(lst1[idx])
        merged.append(lst2[idx])
    return merged

In [None]:
act_correct['Task Domain'].unique()

In [None]:
# Feature Matrix Construction for each partition
# Builds a (5, 10) matrix with one row per task domain: for each of the first
# five columns of df1, the mean and variance of the min-max scaled values,
# interleaved as [mean_0, var_0, mean_1, var_1, ...].
#cols = [3,4,5,6,8,10,11,12,13,14,15,16,17]
j=0
features = np.zeros((5,2*5))
for j in range(0,5):
    # If taskdomains are numbered from 0 write j and if task domains are numbered from 1 write j+1
    rslt_df = df1.loc[df1['Task Domain'] == j] 
    #df_norm = rslt_df[rslt_df.columns[cols]]
    #task1_df = rslt_df[rslt_df.columns[cols]]
    task1_df = rslt_df[rslt_df.columns[0:5]]
    # NOTE: rebinding `norm` here replaces the per-partition `norm` list that
    # the adjacency-construction cell filled in.
    norm = MinMaxScaler()
    #applying norm to dataframe
    df_norm = pd.DataFrame(norm.fit_transform(task1_df), columns=task1_df.columns)
    #df_norm = df_norm.multiply(100)
    sha = task1_df.shape
    i=0
    mea = [0]*sha[1]
    var = [0]*sha[1]
    for col in df_norm:
        #print(col)
        a = df_norm[col].to_numpy()
        # Ignore NaNs when computing the statistics
        b = a[np.logical_not(np.isnan(a))]
        mea[i]= statistics.mean(b)
        # Sample variance; statistics.variance needs at least two data points
        var[i] = statistics.variance(b)
        i = i+1
    #features[j] = mea+var
    features[j]=countList(mea, var)
    #my_data = task1_df[col]
    #ks_statistic, p_value = kstest(my_data, 'norm')
    #print(ks_statistic, p_value)

In [None]:
# Converting Feature Matrix into required form
for i in range(N):
    #list_features[i] = torch.from_numpy(np.random.rand(6,26)).float()
    list_features[i] = torch.from_numpy(features).float()

In [None]:
#Final Data Set
data = list(zip(list_adj, list_features))

In [None]:
# Model Parameters
feat_dim=10
hidden1=64
hidden2=16
dropout=0.0
lr = 0.01
n_nodes=5
epochs=200

In [None]:
class GraphConvolution(Module):
    """
    Simple GCN layer, similar to https://arxiv.org/abs/1609.02907

    Computes ``act(adj @ (dropout(input) @ weight))``.

    Parameters
    ----------
    in_features : int
        Number of input features per node.
    out_features : int
        Number of output features per node.
    dropout : float
        Dropout probability applied to the input (only while training).
    act : callable
        Activation applied to the aggregated output.
    """

    def __init__(self, in_features, out_features, dropout=0., act=F.relu):
        super(GraphConvolution, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.dropout = dropout
        self.act = act
        # torch.empty replaces the legacy torch.FloatTensor(...) constructor;
        # the values are overwritten by reset_parameters() right away.
        self.weight = Parameter(torch.empty(in_features, out_features))
        self.reset_parameters()

    def reset_parameters(self):
        """(Re)initialise the weight matrix with Xavier/Glorot uniform values."""
        torch.nn.init.xavier_uniform_(self.weight)

    def forward(self, input, adj):
        """Apply the layer.

        Raises
        ------
        ValueError
            If ``adj`` is not a sparse (COO) torch tensor.
        """
        input = F.dropout(input, self.dropout, self.training)
        support = torch.mm(input, self.weight)
        # Check the layout directly instead of relying on the deprecated
        # torch.sparse.FloatTensor type alias.
        if not (isinstance(adj, torch.Tensor) and adj.is_sparse):
            raise ValueError("adj must be a sparse tensor")
        output = torch.sparse.mm(adj, support)
        output = self.act(output)
        return output

    def __repr__(self):
        return self.__class__.__name__ + ' (' \
               + str(self.in_features) + ' -> ' \
               + str(self.out_features) + ')'

In [None]:
class GCNModelVAE(nn.Module):
    """Variational graph auto-encoder with a two-layer GCN encoder.

    Parameters
    ----------
    input_feat_dim : int
        Number of input features per node.
    hidden_dim1 : int
        Width of the shared first GCN layer.
    hidden_dim2 : int
        Dimension of the latent embedding (outputs of gc2 and gc3).
    dropout : float
        Dropout probability passed to every layer and to the decoder.
    """

    def __init__(self, input_feat_dim, hidden_dim1, hidden_dim2, dropout):
        super(GCNModelVAE, self).__init__()
        # Use the constructor arguments; the original silently read the
        # module-level globals feat_dim / hidden1 / hidden2 instead.
        self.gc1 = GraphConvolution(input_feat_dim, hidden_dim1, dropout, act=F.relu)
        self.gc2 = GraphConvolution(hidden_dim1, hidden_dim2, dropout, act=lambda x: x)
        self.gc3 = GraphConvolution(hidden_dim1, hidden_dim2, dropout, act=lambda x: x)
        self.dc = InnerProductDecoder(dropout, act=lambda x: x)

    def encode(self, x, adj):
        """Return the pair (gc2 output, gc3 output), used as (mu, logvar)."""
        hidden1 = self.gc1(x, adj)
        return self.gc2(hidden1, adj), self.gc3(hidden1, adj)

    def reparameterize(self, mu, logvar):
        """Sample ``mu + eps * exp(logvar)`` while training; return ``mu`` otherwise."""
        if self.training:
            # exp(logvar) is used directly as the standard deviation here
            std = torch.exp(logvar)
            eps = torch.randn_like(std)
            return eps.mul(std).add_(mu)
        else:
            return mu

    def forward(self, x, adj):
        """Return ``(z, decoded adjacency, mu, logvar)``."""
        mu, logvar = self.encode(x, adj)
        z = self.reparameterize(mu, logvar)
        return z,self.dc(z), mu, logvar
        

In [None]:
class InnerProductDecoder(nn.Module):
    """Decoder for using inner product for prediction.

    Reconstructs an adjacency matrix as ``act(z @ z.T)``.

    Parameters
    ----------
    dropout : float
        Dropout probability applied to ``z`` (only while training).
    act : callable
        Activation applied to the inner products. Pass the identity to obtain
        raw logits, e.g. for ``binary_cross_entropy_with_logits``.
    """

    def __init__(self, dropout, act=torch.sigmoid):
        super(InnerProductDecoder, self).__init__()
        self.dropout = dropout
        self.act = act

    def forward(self, z):
        z = F.dropout(z, self.dropout, training=self.training)
        # Honour the configured activation. The original always applied a
        # sigmoid and ignored `act`, so a caller asking for logits received
        # probabilities instead.
        adj = self.act(torch.mm(z, z.t()))
        return adj

In [None]:
model = GCNModelVAE(feat_dim, hidden1, hidden2, dropout)

In [None]:
optimizer = optim.Adam(model.parameters(), lr)

In [None]:
def loss_function(preds, labels, mu, logvar, n_nodes):
    """Weighted reconstruction loss plus KL term for the graph VAE.

    The reconstruction part is 0.6 times the binary cross-entropy (with
    logits) between ``preds`` and ``labels``. The KL part follows Appendix B
    of Kingma and Welling, "Auto-Encoding Variational Bayes"
    (https://arxiv.org/abs/1312.6114), with ``logvar`` entering as
    ``2 * logvar`` and ``exp(logvar) ** 2``, scaled by ``-0.5 / n_nodes`` and
    averaged over rows.
    """
    reconstruction = F.binary_cross_entropy_with_logits(preds, labels) * 0.6
    # Row-wise sum of 1 + 2*logvar - mu^2 - exp(logvar)^2
    per_row = (1 + 2 * logvar - mu.pow(2) - logvar.exp().pow(2)).sum(1)
    kl_term = torch.mean(per_row) * (-0.5 / n_nodes)
    return reconstruction + kl_term

In [None]:
def preprocess_graph(adj):
    """Add self-loops to ``adj``, scale by inverse square-root row sums, and convert.

    With ``A_ = adj + I`` and ``D`` the diagonal matrix of row sums of ``A_``,
    this computes ``(A_ D^-1/2)^T D^-1/2`` and passes the result to
    ``sparse_mx_to_torch_sparse_tensor``.
    """
    a_hat = sp.coo_matrix(adj)
    a_hat = a_hat + sp.eye(a_hat.shape[0])
    inv_sqrt_rowsum = np.power(np.array(a_hat.sum(1)), -0.5).flatten()
    d_inv_sqrt = sp.diags(inv_sqrt_rowsum)
    scaled = a_hat.dot(d_inv_sqrt).transpose().dot(d_inv_sqrt)
    return sparse_mx_to_torch_sparse_tensor(scaled)

In [None]:
def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix to a torch sparse tensor.

    Parameters
    ----------
    sparse_mx : scipy sparse matrix
        Any format providing ``tocoo()``.

    Returns
    -------
    torch.Tensor
        Sparse COO tensor of dtype float32 with the same shape and entries.
    """
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(
        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    # torch.sparse_coo_tensor is the supported factory; calling the legacy
    # torch.sparse.FloatTensor(...) constructor is deprecated.
    return torch.sparse_coo_tensor(indices, values, shape)

In [None]:
list_embed = [0 for _ in range(N)]
list_mu = [0 for _ in range(N)]

In [None]:
import time
start_time = time.time()
hidden_emb = None

# data[i][0] is the transition tensor of partition i and data[i][1] its node
# features; the first slice of the tensor is used as the adjacency matrix.
adj_matrices = [data[i][0][0] for i in range(N)]
# The normalised adjacency does not change between epochs, so build the sparse
# tensors once instead of once per partition per epoch.
adj_normalized_list = [preprocess_graph(adj) for adj in adj_matrices]

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for i in range(N):
        optimizer.zero_grad()

        node_features = data[i][1]
        adj_matrix = adj_matrices[i]
        adj_normalized = adj_normalized_list[i]

        # Forward pass
        embed, recovered, mu, logvar = model(node_features, adj_normalized)

        # Keep the latest embedding and mean of every partition
        list_mu[i] = mu
        list_embed[i] = embed

        # Calculate loss
        loss = loss_function(preds=recovered, labels=adj_matrix, mu=mu, logvar=logvar, n_nodes=n_nodes)

        # Backpropagation and optimization
        loss.backward()
        optimizer.step()

        # Accumulate the total loss
        total_loss += loss.item()

    #print("Epoch:", '%04d' % (epoch + 1), "Average loss=", "{:.5f}".format(total_loss / N))

end_time = time.time()
# Calculate the total elapsed time
total_running_time_emb = end_time - start_time
# Average time per epoch: divide by the configured number of epochs rather
# than a literal 200, so the figure stays correct when `epochs` changes.
total_running_time_emb1 = total_running_time_emb/epochs
print("Total running time for GVAE:", total_running_time_emb1, "seconds")
print("Optimization Finished!")

In [None]:
list_graph_embed = [0 for _ in range(N)]
for i in range(N):
    list_graph_embed[i] = list_embed[i].sum(axis=0)

In [None]:
arr_graph_embed = [0 for _ in range(N)]
for i in range(N):
    arr_graph_embed[i] = list_graph_embed[i].detach().numpy()

In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score

In [None]:
from sklearn.metrics import silhouette_score
silhouette_scores = []
for num_clusters in range(2, 13):
    km = KMeans(n_clusters=num_clusters, max_iter=2000, init='k-means++')
    km.fit(arr_graph_embed)
    silhouette_scores.append(silhouette_score(arr_graph_embed, km.labels_))
    
plt.plot(range(2, 13), silhouette_scores, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score for Different Numbers of Clusters')
plt.show()

In [None]:
silhouette_scores

In [None]:
from sklearn.cluster import KMeans
sse = []
for num_clusters in range(2, 20):
    km = KMeans(n_clusters=num_clusters, max_iter=2000, init='k-means++')
    km.fit(arr_graph_embed)
    sse.append(km.inertia_)

plt.plot(range(2, 20), sse, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Within-Cluster Sum of Squares (SSE)')
plt.title('Elbow Method for Optimal Number of Clusters')
plt.show()

In [None]:
start_time=time.time()
km = KMeans(n_clusters=5, max_iter=2000, init='random')
km = km.fit(arr_graph_embed)
end_time = time.time()
# Calculate the total elapsed time
total_running_time_clust = end_time - start_time

In [None]:
km.cluster_centers_
label=km.labels_

In [None]:
start_time=time.time()

In [None]:
# count1[c] is the number of partitions whose cluster label equals c,
# for c in 0 .. number of distinct labels - 1
un = np.unique(label)
count1 = [sum(1 for j in range(N) if label[j] == i) for i in range(len(un))]

In [None]:
count1

In [None]:
# Creating a tuple to identify cluster representative
def clustering(cluster_index):
    """List the members of one cluster as ``(cluster_index, position, partition)`` tuples.

    ``position`` is the running index of the member inside its cluster and
    ``partition`` its index in ``label``. As a side effect, the member's
    embedding is written to the global ``lis_c[position]``.
    """
    members = []
    for i in range(len(label)):
        if label[i] != cluster_index:
            continue
        slot = len(members)
        lis_c[slot] = arr_graph_embed[i]
        members.append((cluster_index, slot, i))
    return members

In [None]:
def clus(cluster_index):
    """Collect the graph embeddings of every partition assigned to ``cluster_index``.

    Returns a list with one entry of ``arr_graph_embed`` per position ``i``
    where ``label[i] == cluster_index``, in order of appearance.

    The list is built directly from the matches instead of pre-allocating
    ``count1[cluster_index]`` placeholder rows, so the result no longer
    depends on the global ``count1`` being in sync with ``label``.
    """
    return [arr_graph_embed[i] for i in range(len(label)) if label[i] == cluster_index]

In [None]:
lis=[0 for _ in range(len(un))]
for i in range(len(un)):
    lis[i]=clus(i)

In [None]:
list_clustering = [0 for _ in range(len(un))]
for i in range(len(un)):
    lis_c=[[0]*5 for _ in range(count1[i])]
    list_clustering[i] = clustering(i)

In [None]:
 #Identifying cluster representative
def cluster_representative(cluster_index):
    """Pick the member of a cluster with the smallest total distance to the others.

    Reads the globals ``count1``, ``lis``, ``list_clustering`` and
    ``arr_graph_embed``.

    Returns a pair: the embedding of the chosen member and its index in
    ``arr_graph_embed``.
    """
    # Pairwise Euclidean distances between all members of the cluster
    Dist_Mat=[[0]*count1[cluster_index] for _ in range(count1[cluster_index])]
    ind=0
    for i in range(0,count1[cluster_index]):
        for j in range(0,count1[cluster_index]):
            Dist_Mat[i][j] = math.dist(lis[cluster_index][i],lis[cluster_index][j])
    Dist_Mat1=np.array(Dist_Mat)
    # Column sums: total distance from each member to every other member
    x=Dist_Mat1.sum(axis=0)
    # Positions where the total distance is minimal; the first one is used
    ind = np.where(x == min(x))
    # Third tuple element stored by clustering() is the index into arr_graph_embed
    s1 = list_clustering[cluster_index][ind[0][0]][2]
    return arr_graph_embed[list_clustering[cluster_index][ind[0][0]][2]], s1

In [None]:
clust_rep=[0 for _ in range(len(un))]
s1 = [0 for _ in range(len(un))]
for i in range(len(un)):
    clust_rep[i], s1[i] = cluster_representative(i)

In [None]:
sums_by_label = {}

# Iterate through each label and corresponding partition value
for lbl, part in zip(label, partitions):
    if lbl not in sums_by_label:
        sums_by_label[lbl] = 0  # Initialize the sum for the label if it doesn't exist
    sums_by_label[lbl] += part  # Add the partition value to the sum for the label

# Print the sums for each label
for lbl, total_sum in sums_by_label.items():
    print(f"Sum for label {lbl}: {total_sum}")

In [None]:
def time_cal(cluster_index):
    """Total length of all partitions assigned to ``cluster_index``.

    Walks the globals ``label`` and ``partitions`` in parallel and adds up the
    partition lengths whose label equals ``cluster_index``.
    """
    total_sum = 0

    for lbl, part in zip(label, partitions):
        # Compare against the argument; the original compared against the
        # unrelated global loop variable `i`.
        if lbl == cluster_index:
            total_sum += part

    return total_sum

In [None]:
def time_cal(cluster_index):
    """Sum the lengths of the partitions whose cluster label is ``cluster_index``.

    ``label`` and ``partitions`` are read from the notebook's global scope and
    paired position by position.
    """
    return sum(part for lbl, part in zip(label, partitions) if lbl == cluster_index)

In [None]:
# sum of the time durations of each model
time_model = [ 0 for _ in range(len(un))]
for i in range(len(un)):
    #sum1 = [[0]*count1[i] for _ in range(count1[i])]
    time_model[i] = time_cal(i)

In [None]:
# Average partition length per cluster, rounded down (0 for an empty cluster)
t_m = [
    math.floor(time_model[i] / count1[i] if count1[i] != 0 else 0)
    for i in range(len(time_model))
]

In [None]:
# Each model adjacency matrix and its corresponding time duration.
# rep_time[0][c] is the pair (transition tensor of cluster c's representative
# partition, average duration of cluster c). Built for however many clusters
# exist instead of spelling out five entries by hand.
rep_time = []
rep_time.append(tuple((list_adj[s1[c]].numpy(), t_m[c]) for c in range(len(un))))

In [None]:
rep_time

In [None]:
# Markov Model
def onestep_transition_matrix1(transitions,n):
    """First-order transition matrix with additive smoothing.

    Every cell starts at 1, each observed pair ``(a, b)`` increments cell
    ``[a-1][b-1]``, and each row is then rescaled to
    ``(cell + 1) / (row total + n)``.

    Parameters
    ----------
    transitions : sequence of int
        Observed states; state ``s`` maps to index ``s - 1``.
    n : int
        Number of states.
    """
    counts = [[1 for _ in range(n)] for _ in range(n)]

    for src, dst in zip(transitions, transitions[1:]):
        counts[src-1][dst-1] += 1

    # Turn the counts of each row into probabilities
    for row in counts:
        total = sum(row)
        if total > 0:
            row[:] = [(cell+1)/(total+n) for cell in row]
    return counts

In [None]:
def twostep_transition_matrix(transitions, n=5):
    """Estimate second-order Markov transition probabilities from a state sequence.

    This cell redefines the function of the same name from earlier in the
    notebook. ``n`` defaults to 5 so that the earlier one-argument calls keep
    working if their cells are re-run after this definition.

    Parameters
    ----------
    transitions : sequence of int
        Observed states. A state ``s`` is stored at index ``s - 1``, so a
        state of 0 wraps around to the last index (Python negative indexing).
    n : int, optional
        Number of states.

    Returns
    -------
    list
        ``n x n x n`` nested list ``M`` where ``M[i][j][k]`` relates to the
        triple ``(i+1, j+1, k+1)``. Observed rows hold Laplace-smoothed
        probabilities ``(count + 1) / (total + n)``; unobserved rows stay zero.
    """
    M = [[[0]*n for _ in range(n)] for _ in range(n)]

    # Count every consecutive triple of states
    for (i, j, k) in zip(transitions, transitions[1:], transitions[2:]):
        M[i-1][j-1][k-1] += 1

    # Convert counts to probabilities with Laplace smoothing
    for matrix in M:
        for row in matrix:
            s = sum(row)
            if s > 0:
                row[:] = [(f + 1) / (s + n) for f in row]

    return M

In [None]:
label

In [None]:
# Global Transition Matrix
htm = np.array(onestep_transition_matrix1(label,len(un)))

In [None]:
# Global Transition Matrix (second order) over the cluster labels.
# KMeans labels start at 0, while twostep_transition_matrix stores state s at
# index s-1. Shift the labels by one so that htm[a][b][c] refers to the labels
# (a, b, c), which is how the prediction cells index htm.
htm = np.array(twostep_transition_matrix(np.asarray(label) + 1, len(un)))

In [None]:
end_time = time.time()
    
    # Calculate the running time for this repetition
repetition_running_time_hiera = end_time - start_time
repetition_running_time_hiera

In [None]:
# Total training time = hierarchy construction + clustering + embedding + partition creation + matrix construction.
# NOTE(review): the embedding time is divided by 100 here, which differs from the divisor
# used for total_running_time_emb1 in the training cell - confirm which scaling is intended.
tot_training_time = repetition_running_time_hiera+total_running_time_clust+(total_running_time_emb/100)+repetition_running_time_partition_create+total_running_time_partition

In [None]:
tot_training_time

In [None]:
states_higher = np.unique(label)
states = np.unique(df1['Task Domain'])

In [None]:
import numpy as np

def generate_res_num(pred_res):
    """Sample resource-usage values for a sequence of predicted task domains.

    For every predicted state 0-4 this draws one value per feature
    (CPU_util, mem_util, net_in, net_out, disk) from that state's fitted
    distribution: either ``normal(loc, scale)`` or a Cauchy draw computed as
    ``standard_cauchy() * scale + loc``.

    Every value is returned as a scalar. The original called
    ``standard_cauchy(1)``, which produced 1-element arrays mixed with plain
    floats in the same row.

    Parameters
    ----------
    pred_res : sequence
        Predicted states.

    Returns
    -------
    list
        One ``[cpu, mem, net_in, net_out, disk]`` list per input state. A
        state outside 0-4 keeps the placeholder ``[0]``, as before.
    """
    # (distribution, loc, scale) per feature, in column order, for each state
    params = {
        0: [('cauchy', 51.51438521750882, 2.9893542376801),
            ('normal', 89.69712191001122, 1.8488024325537014),
            ('normal', 42.31731355271662, 3.1400327494071583),
            ('normal', 33.527714137443645, 2.5242247874905877),
            ('normal', 13.569423330333947, 4.168251587766369)],
        1: [('cauchy', 36.00421486122699, 4.509662660345478),
            ('normal', 87.51321559499601, 2.2413093129569885),
            ('normal', 40.21508119726604, 3.206382354655931),
            ('cauchy', 31.310652708648206, 1.5318158079539732),
            ('normal', 9.847255264104716, 1.5022002841723094)],
        2: [('cauchy', 26.969217374984567, 1.8027742048610422),
            ('normal', 87.60243106540257, 2.5061420048894583),
            ('normal', 40.28278255700421, 3.4983807963924822),
            ('normal', 31.92869773202124, 2.7876681456268133),
            ('cauchy', 5.013670673760187, 0.5882536377766454)],
        3: [('normal', 40.36470039752409, 5.080138827582591),
            ('normal', 89.97972609963598, 1.544912841589661),
            ('normal', 43.76704290596112, 3.0172242609405355),
            ('normal', 34.720854795085714, 2.4402083661714475),
            ('cauchy', 6.426441255901989, 1.0934495398530535)],
        4: [('normal', 37.24384121935113, 5.115999644431855),
            ('normal', 86.13647320958988, 1.9829845423315906),
            ('normal', 39.08362029960354, 3.433479664822717),
            ('normal', 30.97130589815287, 2.738960247710485),
            ('cauchy', 5.991651572577391, 0.9102716413582077)],
    }

    def _draw(kind, loc, scale):
        # One scalar sample from the requested distribution
        if kind == 'cauchy':
            return np.random.standard_cauchy() * scale + loc
        return np.random.normal(loc, scale)

    res_num = []
    for state in pred_res:
        spec = params.get(state)
        if spec is None:
            res_num.append([0])
        else:
            # Draw in column order so the random stream is consumed exactly
            # as in the original if/elif chain
            res_num.append([_draw(kind, loc, scale) for kind, loc, scale in spec])
    return res_num

In [None]:
# Minute Level
running_times=[]

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
import time

# Forecast horizons (number of future steps) to evaluate
prediction_lengths = [100]
# One entry per repetition: averaged metrics, and per-feature metrics
results = []
results_mse = []
results_rmse = []
results_mae = []
results_mape = []
# Starting point of every forecast: the last two cluster labels and the last
# two states of the training sequence
init_higher_state1 = label[len(label)-1]
init_higher_state2 = label[len(label)-2]
init_state = la[len(train_data)-1]
init_state1 = la[len(train_data)-2]
# Number of repetitions per prediction length
num_experiments = 100
feature_columns = ['CPU_util','mem_util','net_in','net_out','disk']
for prediction_length in prediction_lengths:
    # Inner loop for repeating the experiment
    for _ in range(num_experiments):
        # Time each repetition on its own; with a single start time taken
        # before the loops, the recorded "repetition" times were cumulative.
        start_time = time.time()
        # Every repetition is compared with the same test window, so each one
        # restarts from the end of the training data instead of continuing
        # from wherever the previous repetition stopped.
        prev_higher_state1, prev_higher_state2 = init_higher_state1, init_higher_state2
        prev_state, prev_state1 = init_state, init_state1
        no_predict = prediction_length
        pred_res = []
        while no_predict > 0:
            # Predict the next state at the higher level
            higher_transition_probs = htm[prev_higher_state2][prev_higher_state1]

            # Normalise into a new array; an in-place `/=` would write through
            # the view and modify htm itself.
            higher_total = np.sum(higher_transition_probs)
            if higher_total > 0:
                higher_transition_probs = higher_transition_probs / higher_total
            else:
                # No information for this context: assign equal probabilities
                higher_transition_probs = np.ones(len(states_higher)) / len(states_higher)

            next_higher_state = np.random.choice(states_higher, p=higher_transition_probs)

            # Model (transition tensor, time duration) of the selected higher-level state
            selected_model = rep_time[0][next_higher_state]
            model_adj_matrix, model_time_duration = selected_model

            # Transition probabilities given the previous two states.
            # NOTE(review): the tensor is indexed with the raw state values,
            # while twostep_transition_matrix stores state s at index s-1 -
            # confirm how the task domains are numbered.
            transition_probs = model_adj_matrix[prev_state1][prev_state]

            lower_total = np.sum(transition_probs)
            if lower_total > 0:
                transition_probs = transition_probs / lower_total
                # Sample the next state based on transition probabilities
                next_state = np.random.choice(states, p=transition_probs)
            else:
                # No information for this context: choose uniformly at random
                next_state = np.random.choice(states)

            # Append the predicted state to the result
            pred_res.append(next_state)

            # Update the state variables for the next iteration
            prev_higher_state2 = prev_higher_state1
            prev_higher_state1 = next_higher_state
            prev_state1 = prev_state
            prev_state = next_state
            no_predict -= 1

        res_num = generate_res_num(pred_res)
        end_time = time.time()

        # Running time of this repetition (state prediction + value sampling)
        repetition_running_time = end_time - start_time
        running_times.append(repetition_running_time)

        # Predicted data and the matching window of actual data
        df_pred = pd.DataFrame(res_num, columns=feature_columns)
        ori = df_actual[len(train_data):len(train_data)+prediction_length]
        # Both frames are rescaled independently to the range [0.15, 0.20]
        norm = MinMaxScaler(feature_range=(0.15,0.20))
        df_pred_norm = pd.DataFrame(norm.fit_transform(df_pred), columns=df_pred.columns)
        ori_norm = pd.DataFrame(norm.fit_transform(ori), columns=ori.columns)
        actual_values = ori_norm[feature_columns]
        predicted_values = df_pred_norm

        # Mean Squared Error per variable and its average
        mse_per_variable = mean_squared_error(actual_values, predicted_values, multioutput='raw_values')
        average_mse = np.mean(mse_per_variable)

        # Mean Absolute Error per variable and its average
        mae_per_variable = mean_absolute_error(actual_values, predicted_values, multioutput='raw_values')
        average_mae = np.mean(mae_per_variable)

        # Mean Absolute Percentage Error per variable and its average
        mape_per_variable = mean_absolute_percentage_error(actual_values, predicted_values, multioutput='raw_values')
        average_mape = np.mean(mape_per_variable)

        # Root Mean Squared Error per variable and its average. Taken as the
        # square root of the per-variable MSE, which equals the former
        # mean_squared_error(..., squared=False) without using the `squared`
        # argument that recent scikit-learn releases no longer accept.
        rms_per_variable = np.sqrt(mse_per_variable)
        average_rmse = np.mean(rms_per_variable)

        mse_cpu_util, mse_mem_util, mse_net_in, mse_net_out, mse_disk = mse_per_variable
        results_mse.append([prediction_length,mse_cpu_util, mse_mem_util, mse_net_in, mse_net_out, mse_disk])

        mae_cpu_util, mae_mem_util, mae_net_in, mae_net_out, mae_disk = mae_per_variable
        results_mae.append([prediction_length, mae_cpu_util, mae_mem_util, mae_net_in, mae_net_out, mae_disk])

        rmse_cpu_util, rmse_mem_util, rmse_net_in, rmse_net_out, rmse_disk = rms_per_variable
        results_rmse.append([prediction_length, rmse_cpu_util, rmse_mem_util, rmse_net_in, rmse_net_out, rmse_disk])

        mape_cpu_util, mape_mem_util, mape_net_in, mape_net_out, mape_disk = mape_per_variable
        results_mape.append([prediction_length,mape_cpu_util, mape_mem_util, mape_net_in, mape_net_out, mape_disk])
        results.append([prediction_length, average_mse, average_mae, average_rmse, average_mape])

In [None]:
# Convert the list of running times to a NumPy array
running_times = np.array(running_times)

# Calculate the average running time
average_running_time = np.mean(running_times)
print(f"Average running time for repetitions: {average_running_time} seconds")

In [None]:
results_df_results = pd.DataFrame(results, columns=['Prediction Length','average MSE','average MAE','average RMSE','average MAPE'])
results_df_results.to_csv('F:\\TCC_revision\\Feature_Wise_Results\\Alibaba 2018\\Average_Order_2_Minutes.csv', index=False)

In [None]:
import pandas as pd

data = pd.read_csv('F:\\TCC_revision\\Feature_Wise_Results\\Alibaba 2018\\Average_Order_2_Minutes.csv')
df = pd.DataFrame(data)

# Repetitions (rows) to include in the summary
hours_list = [3,18,20,25,26,45,52,51,50]
#hours_list = list(range(2, 20))
# The results CSV is written with index=False, so it normally has no 'Index'
# column and df['Index'] raises KeyError. Select by row position in that case;
# an explicit 'Index' column is still honoured when the file provides one.
if 'Index' in df.columns:
    filtered_df = df[df['Index'].isin(hours_list)]
else:
    filtered_df = df[df.index.isin(hours_list)]

# Calculate mean and standard error for each column
mean_values = filtered_df.mean()
std_values = filtered_df.std() / np.sqrt(len(filtered_df))

# Create the "x ± y" representation for every metric column. Columns are
# chosen by name: slicing with columns[2:] dropped 'average MSE' whenever the
# file had no leading index column.
metric_columns = [c for c in df.columns if c not in ('Index', 'Prediction Length')]
result = {}
for column in metric_columns:
    x = mean_values[column]
    y = std_values[column]
    result[column] = f"{x:.4f} ± {y:.6f}"

# Display the "x ± y" representation for each column
for column, value in result.items():
    print(f"{column}: {value}")

In [None]:
# Hour Level (1-12)

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

# Hour-level horizons (1-12): for every prediction length, sample a state sequence from the
# two-level, order-2 transition model `num_experiments` times and score each sampled run
# against the rows of df_actual that follow the training split.
prediction_lengths = [12,18,24,30,36,42,48,54,60,66,72,78,84,90,96,98,108,114,120, 132, 144 ]
# NOTE(review): 98 breaks the otherwise regular step of 6 (102 expected?) — confirm.

feature_columns = ['CPU_util','mem_util','net_in','net_out','disk']

# One row per run: [prediction_length, <value per feature>] in the per-metric lists, and
# [prediction_length, MSE, MAE, RMSE, MAPE] averaged over the features in `results`.
results = []
results_mse = []
results_rmse = []
results_mae = []
results_mape = []

# Initial two-state history for both levels (transitions are indexed by the last two states).
# NOTE(review): the higher-level history comes from the end of the full label series while
# the lower-level history comes from the end of the training split — confirm.
prev_higher_state1 = label[len(label)-1]
prev_higher_state2 = label[len(label)-2]
prev_state =  la[len(train_data)-1]
prev_state1=  la[len(train_data)-2]

# Number of repetitions per prediction length.
# NOTE(review): the prev_* history is not reset between repetitions, so each run continues
# from where the previous one stopped — confirm this is intended.
num_experiments = 100
for prediction_length in prediction_lengths:
    for _ in range(num_experiments):
        pred_res = []
        for _step in range(prediction_length):
            # Higher level: weights of the next higher-level state given the last two.
            # Normalised out of place so the stored table `htm` is never modified.
            higher_transition_probs = htm[prev_higher_state2][prev_higher_state1]
            higher_total = np.sum(higher_transition_probs)
            if higher_total > 0:
                higher_transition_probs = higher_transition_probs / higher_total
            else:
                # No positive weight at all: fall back to a uniform distribution
                higher_transition_probs = np.ones(len(states_higher)) / len(states_higher)
            next_higher_state = np.random.choice(states_higher, p=higher_transition_probs)

            # Model attached to the sampled higher-level state: (adjacency matrix, time duration)
            model_adj_matrix, model_time_duration = rep_time[0][next_higher_state]

            # Lower level: weights of the next state given the last two states
            transition_probs = model_adj_matrix[prev_state1][prev_state]
            lower_total = np.sum(transition_probs)
            if lower_total > 0:
                # Normalised copy; the model's adjacency matrix stays untouched
                next_state = np.random.choice(states, p=transition_probs / lower_total)
            else:
                # No positive weight at all: pick the next state uniformly
                next_state = np.random.choice(states)

            pred_res.append(next_state)

            # Shift the two-state histories
            prev_higher_state2 = prev_higher_state1
            prev_higher_state1 = next_higher_state
            prev_state1 = prev_state
            prev_state = next_state

        res_num = generate_res_num(pred_res)

        # Predicted vs. actual feature values for this horizon
        df_pred = pd.DataFrame(res_num, columns=feature_columns)
        ori = df_actual[len(train_data):len(train_data)+prediction_length]
        # NOTE(review): the scaler is re-fitted separately on the predictions and on the
        # ground truth, so both are rescaled with their own min/max — confirm this is intended.
        norm = MinMaxScaler(feature_range=(0.15,0.20))
        df_pred_norm = pd.DataFrame(norm.fit_transform(df_pred), columns=df_pred.columns)
        ori_norm = pd.DataFrame(norm.fit_transform(ori), columns=ori.columns)
        actual_values = ori_norm[feature_columns]
        predicted_values = df_pred_norm

        # Per-feature errors
        mse_per_variable = mean_squared_error(actual_values, predicted_values, multioutput='raw_values')
        mae_per_variable = mean_absolute_error(actual_values, predicted_values, multioutput='raw_values')
        mape_per_variable = mean_absolute_percentage_error(actual_values, predicted_values, multioutput='raw_values')
        # RMSE is the square root of the per-feature MSE; computed directly because the
        # `squared=False` argument of mean_squared_error no longer exists in recent scikit-learn.
        rms_per_variable = np.sqrt(mse_per_variable)

        # Averages over the features
        average_mse = np.mean(mse_per_variable)
        average_mae = np.mean(mae_per_variable)
        average_mape = np.mean(mape_per_variable)
        average_rmse = np.mean(rms_per_variable)

        mse_cpu_util, mse_mem_util, mse_net_in, mse_net_out, mse_disk = mse_per_variable
        results_mse.append([prediction_length,mse_cpu_util, mse_mem_util, mse_net_in, mse_net_out, mse_disk])

        mae_cpu_util, mae_mem_util, mae_net_in, mae_net_out, mae_disk = mae_per_variable
        results_mae.append([prediction_length, mae_cpu_util, mae_mem_util, mae_net_in, mae_net_out, mae_disk])

        rmse_cpu_util, rmse_mem_util, rmse_net_in, rmse_net_out, rmse_disk = rms_per_variable
        results_rmse.append([prediction_length, rmse_cpu_util, rmse_mem_util, rmse_net_in, rmse_net_out, rmse_disk])

        mape_cpu_util, mape_mem_util, mape_net_in, mape_net_out, mape_disk = mape_per_variable
        results_mape.append([prediction_length,mape_cpu_util, mape_mem_util, mape_net_in, mape_net_out, mape_disk])
        results.append([prediction_length, average_mse, average_mae, average_rmse, average_mape])

In [None]:
# Tabulate the per-run averaged metrics (one row per repetition) and persist them as CSV
# without the row index.
results_df_results = pd.DataFrame(
    data=results,
    columns=['Prediction Length','average MSE','average MAE','average RMSE','average MAPE'],
)
results_df_results.to_csv(
    'F:\\TCC_revision\\Feature_Wise_Results\\Alibaba 2018\\Average_Order_2_Hours_1_12.csv',
    index=False,
)

In [None]:
import pandas as pd

# Summarise selected runs of the saved results as "mean ± standard error" per metric.
data = pd.read_csv('F:\\TCC_revision\\Feature_Wise_Results\\Alibaba 2018\\Average_Order_2_Hours_1_12.csv')
df = data.copy()

# The results file is saved with index=False, so it has no 'Index' column and the filter
# below would raise KeyError. When the column is absent, synthesise it from the row position.
# NOTE(review): assumes the selection numbers are 0-based row positions — confirm.
if 'Index' not in df.columns:
    df.insert(0, 'Index', np.arange(len(df)))

# Rows (runs) to include in the summary
hours_list = [3,18,20,25,26,45,52,51,50]
filtered_df = df[df['Index'].isin(hours_list)]

# Column-wise mean and standard error of the mean (std / sqrt(n)) over the selected rows
mean_values = filtered_df.mean()
std_values = filtered_df.std() / np.sqrt(len(filtered_df))

# Build the "x ± y" text for every metric column; the first two columns
# ('Index' and the prediction length) are not metrics and are skipped.
result = {
    column: f"{mean_values[column]:.4f} ± {std_values[column]:.6f}"
    for column in df.columns[2:]
}

# Display the "x ± y" representation for each column
for column, value in result.items():
    print(f"{column}: {value}")

In [None]:
# Hour Level (13-24)

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

# Hour-level horizons (13-24): for every prediction length, sample a state sequence from the
# two-level, order-2 transition model `num_experiments` times and score each sampled run
# against the rows of df_actual that follow the training split.
prediction_lengths = [144, 156, 168, 180, 192, 204, 216, 228, 240, 252, 264, 276]

feature_columns = ['CPU_util','mem_util','net_in','net_out','disk']

# One row per run: [prediction_length, <value per feature>] in the per-metric lists, and
# [prediction_length, MSE, MAE, RMSE, MAPE] averaged over the features in `results`.
results = []
results_mse = []
results_rmse = []
results_mae = []
results_mape = []

# Initial two-state history for both levels (transitions are indexed by the last two states).
# NOTE(review): the higher-level history comes from the end of the full label series while
# the lower-level history comes from the end of the training split — confirm.
prev_higher_state1 = label[len(label)-1]
prev_higher_state2 = label[len(label)-2]
prev_state =  la[len(train_data)-1]
prev_state1=  la[len(train_data)-2]

# Number of repetitions per prediction length.
# NOTE(review): the prev_* history is not reset between repetitions, so each run continues
# from where the previous one stopped — confirm this is intended.
num_experiments = 100
for prediction_length in prediction_lengths:
    for _ in range(num_experiments):
        pred_res = []
        for _step in range(prediction_length):
            # Higher level: weights of the next higher-level state given the last two.
            # Normalised out of place so the stored table `htm` is never modified.
            higher_transition_probs = htm[prev_higher_state2][prev_higher_state1]
            higher_total = np.sum(higher_transition_probs)
            if higher_total > 0:
                higher_transition_probs = higher_transition_probs / higher_total
            else:
                # No positive weight at all: fall back to a uniform distribution
                higher_transition_probs = np.ones(len(states_higher)) / len(states_higher)
            next_higher_state = np.random.choice(states_higher, p=higher_transition_probs)

            # Model attached to the sampled higher-level state: (adjacency matrix, time duration)
            model_adj_matrix, model_time_duration = rep_time[0][next_higher_state]

            # Lower level: weights of the next state given the last two states
            transition_probs = model_adj_matrix[prev_state1][prev_state]
            lower_total = np.sum(transition_probs)
            if lower_total > 0:
                # Normalised copy; the model's adjacency matrix stays untouched
                next_state = np.random.choice(states, p=transition_probs / lower_total)
            else:
                # No positive weight at all: pick the next state uniformly
                next_state = np.random.choice(states)

            pred_res.append(next_state)

            # Shift the two-state histories
            prev_higher_state2 = prev_higher_state1
            prev_higher_state1 = next_higher_state
            prev_state1 = prev_state
            prev_state = next_state

        res_num = generate_res_num(pred_res)

        # Predicted vs. actual feature values for this horizon
        df_pred = pd.DataFrame(res_num, columns=feature_columns)
        ori = df_actual[len(train_data):len(train_data)+prediction_length]
        # NOTE(review): the scaler is re-fitted separately on the predictions and on the
        # ground truth, so both are rescaled with their own min/max — confirm this is intended.
        norm = MinMaxScaler(feature_range=(0.15,0.20))
        df_pred_norm = pd.DataFrame(norm.fit_transform(df_pred), columns=df_pred.columns)
        ori_norm = pd.DataFrame(norm.fit_transform(ori), columns=ori.columns)
        actual_values = ori_norm[feature_columns]
        predicted_values = df_pred_norm

        # Per-feature errors
        mse_per_variable = mean_squared_error(actual_values, predicted_values, multioutput='raw_values')
        mae_per_variable = mean_absolute_error(actual_values, predicted_values, multioutput='raw_values')
        mape_per_variable = mean_absolute_percentage_error(actual_values, predicted_values, multioutput='raw_values')
        # RMSE is the square root of the per-feature MSE; computed directly because the
        # `squared=False` argument of mean_squared_error no longer exists in recent scikit-learn.
        rms_per_variable = np.sqrt(mse_per_variable)

        # Averages over the features
        average_mse = np.mean(mse_per_variable)
        average_mae = np.mean(mae_per_variable)
        average_mape = np.mean(mape_per_variable)
        average_rmse = np.mean(rms_per_variable)

        mse_cpu_util, mse_mem_util, mse_net_in, mse_net_out, mse_disk = mse_per_variable
        results_mse.append([prediction_length,mse_cpu_util, mse_mem_util, mse_net_in, mse_net_out, mse_disk])

        mae_cpu_util, mae_mem_util, mae_net_in, mae_net_out, mae_disk = mae_per_variable
        results_mae.append([prediction_length, mae_cpu_util, mae_mem_util, mae_net_in, mae_net_out, mae_disk])

        rmse_cpu_util, rmse_mem_util, rmse_net_in, rmse_net_out, rmse_disk = rms_per_variable
        results_rmse.append([prediction_length, rmse_cpu_util, rmse_mem_util, rmse_net_in, rmse_net_out, rmse_disk])

        mape_cpu_util, mape_mem_util, mape_net_in, mape_net_out, mape_disk = mape_per_variable
        results_mape.append([prediction_length,mape_cpu_util, mape_mem_util, mape_net_in, mape_net_out, mape_disk])
        results.append([prediction_length, average_mse, average_mae, average_rmse, average_mape])

In [None]:
# Tabulate the per-run averaged metrics (one row per repetition) and persist them as CSV
# without the row index.
results_df_results = pd.DataFrame(
    data=results,
    columns=['Prediction Length','average MSE','average MAE','average RMSE','average MAPE'],
)
results_df_results.to_csv(
    'F:\\TCC_revision\\Feature_Wise_Results\\Alibaba 2018\\Average_Order_2_Hours_13_24.csv',
    index=False,
)

In [None]:
import pandas as pd

# Summarise selected runs of the saved results as "mean ± standard error" per metric.
data = pd.read_csv('F:\\TCC_revision\\Feature_Wise_Results\\Alibaba 2018\\Average_Order_2_Hours_13_24.csv')
df = data.copy()

# The results file is saved with index=False, so it has no 'Index' column and the filter
# below would raise KeyError. When the column is absent, synthesise it from the row position.
# NOTE(review): assumes the selection numbers are 0-based row positions — confirm.
if 'Index' not in df.columns:
    df.insert(0, 'Index', np.arange(len(df)))

# Rows (runs) to include in the summary
hours_list = [3,18,20,25,26,45,52,51,50]
filtered_df = df[df['Index'].isin(hours_list)]

# Column-wise mean and standard error of the mean (std / sqrt(n)) over the selected rows
mean_values = filtered_df.mean()
std_values = filtered_df.std() / np.sqrt(len(filtered_df))

# Build the "x ± y" text for every metric column; the first two columns
# ('Index' and the prediction length) are not metrics and are skipped.
result = {
    column: f"{mean_values[column]:.4f} ± {std_values[column]:.6f}"
    for column in df.columns[2:]
}

# Display the "x ± y" representation for each column
for column, value in result.items():
    print(f"{column}: {value}")

In [None]:
# Day Level

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

# Day-level horizons: for every prediction length, sample a state sequence from the
# two-level, order-2 transition model `num_experiments` times and score each sampled run
# against the rows of df_actual that follow the training split.
prediction_lengths = [288, 432, 576, 720, 864, 1008, 1152]

feature_columns = ['CPU_util','mem_util','net_in','net_out','disk']

# One row per run: [prediction_length, <value per feature>] in the per-metric lists, and
# [prediction_length, MSE, MAE, RMSE, MAPE] averaged over the features in `results`.
results = []
results_mse = []
results_rmse = []
results_mae = []
results_mape = []

# Initial two-state history for both levels (transitions are indexed by the last two states).
# NOTE(review): the higher-level history comes from the end of the full label series while
# the lower-level history comes from the end of the training split — confirm.
prev_higher_state1 = label[len(label)-1]
prev_higher_state2 = label[len(label)-2]
prev_state =  la[len(train_data)-1]
prev_state1=  la[len(train_data)-2]

# Number of repetitions per prediction length.
# NOTE(review): the prev_* history is not reset between repetitions, so each run continues
# from where the previous one stopped — confirm this is intended.
num_experiments = 100
for prediction_length in prediction_lengths:
    for _ in range(num_experiments):
        pred_res = []
        for _step in range(prediction_length):
            # Higher level: weights of the next higher-level state given the last two.
            # Normalised out of place so the stored table `htm` is never modified.
            higher_transition_probs = htm[prev_higher_state2][prev_higher_state1]
            higher_total = np.sum(higher_transition_probs)
            if higher_total > 0:
                higher_transition_probs = higher_transition_probs / higher_total
            else:
                # No positive weight at all: fall back to a uniform distribution
                higher_transition_probs = np.ones(len(states_higher)) / len(states_higher)
            next_higher_state = np.random.choice(states_higher, p=higher_transition_probs)

            # Model attached to the sampled higher-level state: (adjacency matrix, time duration)
            model_adj_matrix, model_time_duration = rep_time[0][next_higher_state]

            # Lower level: weights of the next state given the last two states
            transition_probs = model_adj_matrix[prev_state1][prev_state]
            lower_total = np.sum(transition_probs)
            if lower_total > 0:
                # Normalised copy; the model's adjacency matrix stays untouched
                next_state = np.random.choice(states, p=transition_probs / lower_total)
            else:
                # No positive weight at all: pick the next state uniformly
                next_state = np.random.choice(states)

            pred_res.append(next_state)

            # Shift the two-state histories
            prev_higher_state2 = prev_higher_state1
            prev_higher_state1 = next_higher_state
            prev_state1 = prev_state
            prev_state = next_state

        res_num = generate_res_num(pred_res)

        # Predicted vs. actual feature values for this horizon
        df_pred = pd.DataFrame(res_num, columns=feature_columns)
        ori = df_actual[len(train_data):len(train_data)+prediction_length]
        # NOTE(review): the scaler is re-fitted separately on the predictions and on the
        # ground truth, so both are rescaled with their own min/max — confirm this is intended.
        norm = MinMaxScaler(feature_range=(0.15,0.20))
        df_pred_norm = pd.DataFrame(norm.fit_transform(df_pred), columns=df_pred.columns)
        ori_norm = pd.DataFrame(norm.fit_transform(ori), columns=ori.columns)
        actual_values = ori_norm[feature_columns]
        predicted_values = df_pred_norm

        # Per-feature errors
        mse_per_variable = mean_squared_error(actual_values, predicted_values, multioutput='raw_values')
        mae_per_variable = mean_absolute_error(actual_values, predicted_values, multioutput='raw_values')
        mape_per_variable = mean_absolute_percentage_error(actual_values, predicted_values, multioutput='raw_values')
        # RMSE is the square root of the per-feature MSE; computed directly because the
        # `squared=False` argument of mean_squared_error no longer exists in recent scikit-learn.
        rms_per_variable = np.sqrt(mse_per_variable)

        # Averages over the features
        average_mse = np.mean(mse_per_variable)
        average_mae = np.mean(mae_per_variable)
        average_mape = np.mean(mape_per_variable)
        average_rmse = np.mean(rms_per_variable)

        mse_cpu_util, mse_mem_util, mse_net_in, mse_net_out, mse_disk = mse_per_variable
        results_mse.append([prediction_length,mse_cpu_util, mse_mem_util, mse_net_in, mse_net_out, mse_disk])

        mae_cpu_util, mae_mem_util, mae_net_in, mae_net_out, mae_disk = mae_per_variable
        results_mae.append([prediction_length, mae_cpu_util, mae_mem_util, mae_net_in, mae_net_out, mae_disk])

        rmse_cpu_util, rmse_mem_util, rmse_net_in, rmse_net_out, rmse_disk = rms_per_variable
        results_rmse.append([prediction_length, rmse_cpu_util, rmse_mem_util, rmse_net_in, rmse_net_out, rmse_disk])

        mape_cpu_util, mape_mem_util, mape_net_in, mape_net_out, mape_disk = mape_per_variable
        results_mape.append([prediction_length,mape_cpu_util, mape_mem_util, mape_net_in, mape_net_out, mape_disk])
        results.append([prediction_length, average_mse, average_mae, average_rmse, average_mape])

In [None]:
# Tabulate the per-run averaged metrics (one row per repetition) and persist them as CSV
# without the row index.
results_df_results = pd.DataFrame(
    data=results,
    columns=['Prediction Length','average MSE','average MAE','average RMSE','average MAPE'],
)
results_df_results.to_csv(
    'F:\\TCC_revision\\Feature_Wise_Results\\Alibaba 2018\\Average_Order_2_Day.csv',
    index=False,
)

In [None]:
import pandas as pd

# Summarise selected runs of the saved results as "mean ± standard error" per metric.
data = pd.read_csv('F:\\TCC_revision\\Feature_Wise_Results\\Alibaba 2018\\Average_Order_2_Day.csv')
df = data.copy()

# The results file is saved with index=False, so it has no 'Index' column and the filter
# below would raise KeyError. When the column is absent, synthesise it from the row position.
# NOTE(review): assumes the selection numbers are 0-based row positions — confirm.
if 'Index' not in df.columns:
    df.insert(0, 'Index', np.arange(len(df)))

# Rows (runs) to include in the summary
hours_list = [3,18,20,25,26,45,52,51,50]
filtered_df = df[df['Index'].isin(hours_list)]

# Column-wise mean and standard error of the mean (std / sqrt(n)) over the selected rows
mean_values = filtered_df.mean()
std_values = filtered_df.std() / np.sqrt(len(filtered_df))

# Build the "x ± y" text for every metric column; the first two columns
# ('Index' and the prediction length) are not metrics and are skipped.
result = {
    column: f"{mean_values[column]:.4f} ± {std_values[column]:.6f}"
    for column in df.columns[2:]
}

# Display the "x ± y" representation for each column
for column, value in result.items():
    print(f"{column}: {value}")

In [None]:
# CPU Utilization

In [None]:
# Feature-Wise Analysis

In [None]:
# CPU Utilization (Minute Level)

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

# CPU utilisation, minute-level horizons: for every prediction length, sample a state
# sequence from the two-level, order-2 transition model `num_experiments` times, score each
# sampled run against the rows of df_actual that follow the training split, and keep the
# CPU_util metrics of every run in `results_metrics_cpu`.
prediction_lengths = [2,3,4,5,6,7,8,9,10,11]

feature_columns = ['CPU_util','mem_util','net_in','net_out','disk']

# One row per run: [prediction_length, <value per feature>] in the per-metric lists, and
# [prediction_length, MSE, MAE, RMSE, MAPE] averaged over the features in `results`.
results = []
results_mse = []
results_rmse = []
results_mae = []
results_mape = []
# Per-feature metric rows; only results_metrics_cpu is filled in this cell.
results_metrics_cpu = []
results_metrics_canonical_mem = []
results_metrics_assigned_mem = []
results_metrics_cache_mem =[]
results_metrics_disk = []

# Initial two-state history for both levels (transitions are indexed by the last two states).
# NOTE(review): the higher-level history comes from the end of the full label series while
# the lower-level history comes from the end of the training split — confirm.
prev_higher_state1 = label[len(label)-1]
prev_higher_state2 = label[len(label)-2]
prev_state =  la[len(train_data)-1]
prev_state1=  la[len(train_data)-2]

# Number of repetitions per prediction length.
# NOTE(review): the prev_* history is not reset between repetitions, so each run continues
# from where the previous one stopped — confirm this is intended.
num_experiments = 50
for prediction_length in prediction_lengths:
    for _ in range(num_experiments):
        pred_res = []
        for _step in range(prediction_length):
            # Higher level: weights of the next higher-level state given the last two.
            # Normalised out of place so the stored table `htm` is never modified.
            higher_transition_probs = htm[prev_higher_state2][prev_higher_state1]
            higher_total = np.sum(higher_transition_probs)
            if higher_total > 0:
                higher_transition_probs = higher_transition_probs / higher_total
            else:
                # No positive weight at all: fall back to a uniform distribution
                higher_transition_probs = np.ones(len(states_higher)) / len(states_higher)
            next_higher_state = np.random.choice(states_higher, p=higher_transition_probs)

            # Model attached to the sampled higher-level state: (adjacency matrix, time duration)
            model_adj_matrix, model_time_duration = rep_time[0][next_higher_state]

            # Lower level: weights of the next state given the last two states
            transition_probs = model_adj_matrix[prev_state1][prev_state]
            lower_total = np.sum(transition_probs)
            if lower_total > 0:
                # Normalised copy; the model's adjacency matrix stays untouched
                next_state = np.random.choice(states, p=transition_probs / lower_total)
            else:
                # No positive weight at all: pick the next state uniformly
                next_state = np.random.choice(states)

            pred_res.append(next_state)

            # Shift the two-state histories
            prev_higher_state2 = prev_higher_state1
            prev_higher_state1 = next_higher_state
            prev_state1 = prev_state
            prev_state = next_state

        res_num = generate_res_num(pred_res)

        # Predicted vs. actual feature values for this horizon
        df_pred = pd.DataFrame(res_num, columns=feature_columns)
        ori = df_actual[len(train_data):len(train_data)+prediction_length]
        # NOTE(review): the scaler is re-fitted separately on the predictions and on the
        # ground truth, so both are rescaled with their own min/max — confirm this is intended.
        norm = MinMaxScaler(feature_range=(0.15,0.20))
        df_pred_norm = pd.DataFrame(norm.fit_transform(df_pred), columns=df_pred.columns)
        ori_norm = pd.DataFrame(norm.fit_transform(ori), columns=ori.columns)
        actual_values = ori_norm[feature_columns]
        predicted_values = df_pred_norm

        # Per-feature errors
        mse_per_variable = mean_squared_error(actual_values, predicted_values, multioutput='raw_values')
        mae_per_variable = mean_absolute_error(actual_values, predicted_values, multioutput='raw_values')
        mape_per_variable = mean_absolute_percentage_error(actual_values, predicted_values, multioutput='raw_values')
        # RMSE is the square root of the per-feature MSE; computed directly because the
        # `squared=False` argument of mean_squared_error no longer exists in recent scikit-learn.
        rms_per_variable = np.sqrt(mse_per_variable)

        # Averages over the features
        average_mse = np.mean(mse_per_variable)
        average_mae = np.mean(mae_per_variable)
        average_mape = np.mean(mape_per_variable)
        average_rmse = np.mean(rms_per_variable)

        mse_cpu_util, mse_mem_util, mse_net_in, mse_net_out, mse_disk = mse_per_variable
        results_mse.append([prediction_length,mse_cpu_util, mse_mem_util, mse_net_in, mse_net_out, mse_disk])

        mae_cpu_util, mae_mem_util, mae_net_in, mae_net_out, mae_disk = mae_per_variable
        results_mae.append([prediction_length, mae_cpu_util, mae_mem_util, mae_net_in, mae_net_out, mae_disk])

        rmse_cpu_util, rmse_mem_util, rmse_net_in, rmse_net_out, rmse_disk = rms_per_variable
        results_rmse.append([prediction_length, rmse_cpu_util, rmse_mem_util, rmse_net_in, rmse_net_out, rmse_disk])

        mape_cpu_util, mape_mem_util, mape_net_in, mape_net_out, mape_disk = mape_per_variable
        results_mape.append([prediction_length,mape_cpu_util, mape_mem_util, mape_net_in, mape_net_out, mape_disk])
        results.append([prediction_length, average_mse, average_mae, average_rmse, average_mape])

        # CPU_util row, in the order: prediction length, MSE, MAE, RMSE, MAPE
        results_metrics_cpu.append([prediction_length, mse_cpu_util,mae_cpu_util, rmse_cpu_util,mape_cpu_util])

In [None]:
# Save the per-run CPU metrics. Each row of results_metrics_cpu is built as
# [prediction_length, MSE, MAE, RMSE, MAPE], so the column names must follow that order
# (the MAE and RMSE labels were previously swapped relative to the data).
results_df_cpu_metrics = pd.DataFrame(results_metrics_cpu, columns=['Prediction Length','MSE_mean_cpu_usage','MAE_mean_cpu_usage','RMSE_mean_cpu_usage','MAPE_mean_cpu_usage'])
results_df_cpu_metrics.to_csv('F:\\TCC_revision\\Feature_Wise_Results\\Alibaba 2018\\CPU_Metrics_SMPM_Order2_Minutes.csv', index=False)

In [None]:
import pandas as pd

# Summarise selected runs of the saved CPU metrics as "mean ± standard error" per metric.
data = pd.read_csv('F:\\TCC_revision\\Feature_Wise_Results\\Alibaba 2018\\CPU_Metrics_SMPM_Order2_Minutes.csv')
df = data.copy()

# The results file is saved with index=False, so it has no 'Index' column and the filter
# below would raise KeyError. When the column is absent, synthesise it from the row position.
# NOTE(review): assumes the selection numbers are 0-based row positions — confirm.
if 'Index' not in df.columns:
    df.insert(0, 'Index', np.arange(len(df)))

# Rows (runs) to include in the summary
hours_list = list(range(37,60))
filtered_df = df[df['Index'].isin(hours_list)]

# Column-wise mean and standard error of the mean (std / sqrt(n)) over the selected rows
mean_values = filtered_df.mean()
std_values = filtered_df.std() / np.sqrt(len(filtered_df))

# Build the "x ± y" text for every metric column; the first two columns
# ('Index' and the prediction length) are not metrics and are skipped.
result = {
    column: f"{mean_values[column]:.4f} ± {std_values[column]:.6f}"
    for column in df.columns[2:]
}

# Display the "x ± y" representation for each column
for column, value in result.items():
    print(f"{column}: {value}")

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

# CPU utilisation, hour-level horizons (1-12): for every prediction length, sample a state
# sequence from the two-level, order-2 transition model `num_experiments` times, score each
# sampled run against the rows of df_actual that follow the training split, and keep the
# CPU_util metrics of every run in `results_metrics_cpu`.
prediction_lengths = [12,18,24,30,36,42,48,54,60,66,72,78,84,90,96,98,108,114,120, 132, 144 ]
# NOTE(review): 98 breaks the otherwise regular step of 6 (102 expected?) — confirm.

feature_columns = ['CPU_util','mem_util','net_in','net_out','disk']

# One row per run: [prediction_length, <value per feature>] in the per-metric lists, and
# [prediction_length, MSE, MAE, RMSE, MAPE] averaged over the features in `results`.
results = []
results_mse = []
results_rmse = []
results_mae = []
results_mape = []
# Per-feature metric rows; only results_metrics_cpu is filled in this cell.
results_metrics_cpu = []
results_metrics_canonical_mem = []
results_metrics_assigned_mem = []
results_metrics_cache_mem =[]
results_metrics_disk = []

# Initial two-state history for both levels (transitions are indexed by the last two states).
# NOTE(review): the higher-level history comes from the end of the full label series while
# the lower-level history comes from the end of the training split — confirm.
prev_higher_state1 = label[len(label)-1]
prev_higher_state2 = label[len(label)-2]
prev_state =  la[len(train_data)-1]
prev_state1=  la[len(train_data)-2]

# Number of repetitions per prediction length.
# NOTE(review): the prev_* history is not reset between repetitions, so each run continues
# from where the previous one stopped — confirm this is intended.
num_experiments = 50
for prediction_length in prediction_lengths:
    for _ in range(num_experiments):
        pred_res = []
        for _step in range(prediction_length):
            # Higher level: weights of the next higher-level state given the last two.
            # Normalised out of place so the stored table `htm` is never modified.
            higher_transition_probs = htm[prev_higher_state2][prev_higher_state1]
            higher_total = np.sum(higher_transition_probs)
            if higher_total > 0:
                higher_transition_probs = higher_transition_probs / higher_total
            else:
                # No positive weight at all: fall back to a uniform distribution
                higher_transition_probs = np.ones(len(states_higher)) / len(states_higher)
            next_higher_state = np.random.choice(states_higher, p=higher_transition_probs)

            # Model attached to the sampled higher-level state: (adjacency matrix, time duration)
            model_adj_matrix, model_time_duration = rep_time[0][next_higher_state]

            # Lower level: weights of the next state given the last two states
            transition_probs = model_adj_matrix[prev_state1][prev_state]
            lower_total = np.sum(transition_probs)
            if lower_total > 0:
                # Normalised copy; the model's adjacency matrix stays untouched
                next_state = np.random.choice(states, p=transition_probs / lower_total)
            else:
                # No positive weight at all: pick the next state uniformly
                next_state = np.random.choice(states)

            pred_res.append(next_state)

            # Shift the two-state histories
            prev_higher_state2 = prev_higher_state1
            prev_higher_state1 = next_higher_state
            prev_state1 = prev_state
            prev_state = next_state

        res_num = generate_res_num(pred_res)

        # Predicted vs. actual feature values for this horizon
        df_pred = pd.DataFrame(res_num, columns=feature_columns)
        ori = df_actual[len(train_data):len(train_data)+prediction_length]
        # NOTE(review): the scaler is re-fitted separately on the predictions and on the
        # ground truth, so both are rescaled with their own min/max — confirm this is intended.
        norm = MinMaxScaler(feature_range=(0.15,0.20))
        df_pred_norm = pd.DataFrame(norm.fit_transform(df_pred), columns=df_pred.columns)
        ori_norm = pd.DataFrame(norm.fit_transform(ori), columns=ori.columns)
        actual_values = ori_norm[feature_columns]
        predicted_values = df_pred_norm

        # Per-feature errors
        mse_per_variable = mean_squared_error(actual_values, predicted_values, multioutput='raw_values')
        mae_per_variable = mean_absolute_error(actual_values, predicted_values, multioutput='raw_values')
        mape_per_variable = mean_absolute_percentage_error(actual_values, predicted_values, multioutput='raw_values')
        # RMSE is the square root of the per-feature MSE; computed directly because the
        # `squared=False` argument of mean_squared_error no longer exists in recent scikit-learn.
        rms_per_variable = np.sqrt(mse_per_variable)

        # Averages over the features
        average_mse = np.mean(mse_per_variable)
        average_mae = np.mean(mae_per_variable)
        average_mape = np.mean(mape_per_variable)
        average_rmse = np.mean(rms_per_variable)

        mse_cpu_util, mse_mem_util, mse_net_in, mse_net_out, mse_disk = mse_per_variable
        results_mse.append([prediction_length,mse_cpu_util, mse_mem_util, mse_net_in, mse_net_out, mse_disk])

        mae_cpu_util, mae_mem_util, mae_net_in, mae_net_out, mae_disk = mae_per_variable
        results_mae.append([prediction_length, mae_cpu_util, mae_mem_util, mae_net_in, mae_net_out, mae_disk])

        rmse_cpu_util, rmse_mem_util, rmse_net_in, rmse_net_out, rmse_disk = rms_per_variable
        results_rmse.append([prediction_length, rmse_cpu_util, rmse_mem_util, rmse_net_in, rmse_net_out, rmse_disk])

        mape_cpu_util, mape_mem_util, mape_net_in, mape_net_out, mape_disk = mape_per_variable
        results_mape.append([prediction_length,mape_cpu_util, mape_mem_util, mape_net_in, mape_net_out, mape_disk])
        results.append([prediction_length, average_mse, average_mae, average_rmse, average_mape])

        # CPU_util row, in the order: prediction length, MSE, MAE, RMSE, MAPE
        results_metrics_cpu.append([prediction_length, mse_cpu_util,mae_cpu_util, rmse_cpu_util,mape_cpu_util])

In [None]:
# Save the per-run CPU metrics. Each row of results_metrics_cpu is built as
# [prediction_length, MSE, MAE, RMSE, MAPE], so the column names must follow that order
# (the MAE and RMSE labels were previously swapped relative to the data).
results_df_cpu_metrics = pd.DataFrame(results_metrics_cpu, columns=['Prediction Length','MSE_mean_cpu_usage','MAE_mean_cpu_usage','RMSE_mean_cpu_usage','MAPE_mean_cpu_usage'])
results_df_cpu_metrics.to_csv('F:\\TCC_revision\\Feature_Wise_Results\\Alibaba 2018\\CPU_Metrics_SMPM_Order2_Hours_1_12.csv', index=False)

In [None]:
import pandas as pd

# Summarise the exported CPU metrics as "mean ± standard error" per metric.
data = pd.read_csv('F:\\TCC_revision\\Feature_Wise_Results\\Alibaba 2018\\CPU_Metrics_SMPM_Order2_Hours_1_12.csv')
df = pd.DataFrame(data)

# Row identifiers to include in the summary.
hours_list = list(range(37,60))

# The export cell writes this CSV with index=False and without an 'Index'
# column, so df['Index'] raises KeyError on a freshly written file.
# Use the column when it exists (e.g. added by hand), otherwise fall back to
# the row position.
# NOTE(review): confirm that row position is the intended meaning of 'Index'.
if 'Index' in df.columns:
    filtered_df = df[df['Index'].isin(hours_list)]
else:
    filtered_df = df[df.index.isin(hours_list)]

# Calculate mean and standard error (std / sqrt(n)) for each column
mean_values = filtered_df.mean()
std_values = filtered_df.std() / np.sqrt(len(filtered_df))

# Create the "x ± y" representation for every metric column.
# Identifier columns are excluded by name; a positional df.columns[2:] would
# silently drop the first metric whenever the 'Index' column is absent.
identifier_columns = ('Index', 'Prediction Length')
result = {}
for column in df.columns:
    if column in identifier_columns:
        continue
    x = mean_values[column]
    y = std_values[column]
    result[column] = f"{x:.4f} ± {y:.6f}"

# Display the "x ± y" representation for each column
for column, value in result.items():
    print(f"{column}: {value}")

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

# Repeated forecasting experiment.
# For every prediction length, a state sequence is sampled num_experiments
# times from the two-level, second-order transition model (htm at the higher
# level, the per-state matrices in rep_time[0] at the lower level). The sampled
# states are converted to resource values by generate_res_num and scored
# against the actual data that follows the training split.

feature_columns = ['CPU_util', 'mem_util', 'net_in', 'net_out', 'disk']

# Prediction lengths (number of predicted steps) to evaluate
prediction_lengths = [144, 156, 168, 180, 192, 204, 216, 228, 240, 252, 264, 276]

# Initialize lists to store results for different prediction lengths
results = []
results_mse = []
results_rmse = []
results_mae = []
results_mape = []
results_metrics_cpu = []
results_metrics_canonical_mem = []
results_metrics_assigned_mem = []
results_metrics_cache_mem =[]
results_metrics_disk = []

# Seed states for every experiment.
# Positional access is used on purpose: label[len(label)-1] is a label-based
# lookup on a Series and picks the wrong row (or raises KeyError) when the
# index has gaps, e.g. after dropna().
# NOTE(review): the higher-level seeds come from the end of the full label
# sequence while the lower-level seeds come from the end of the training
# split - confirm this asymmetry is intended.
label_values = np.asarray(label)
train_end = len(train_data)
initial_higher_state1 = label_values[-1]
initial_higher_state2 = label_values[-2]
initial_state = la[train_end - 1]
initial_state1 = la[train_end - 2]

# Define the number of times you want to repeat the experiment
num_experiments = 50
for prediction_length in prediction_lengths:
    # Inner loop for repeating the experiment
    for _ in range(num_experiments):
        # Every repetition is scored against the same window that starts right
        # after the training data, so it has to start from the same seed states
        # instead of continuing the chain left over from the previous run.
        prev_higher_state1 = initial_higher_state1
        prev_higher_state2 = initial_higher_state2
        prev_state = initial_state
        prev_state1 = initial_state1

        no_predict = prediction_length
        pred_res = []
        while no_predict > 0:
            # Predict the next state at the higher level.
            # Work on a float copy: an in-place "/=" would overwrite the stored
            # model and fails outright on integer count matrices.
            higher_transition_probs = np.asarray(htm[prev_higher_state2][prev_higher_state1], dtype=float)
            higher_total = np.sum(higher_transition_probs)
            if higher_total > 0:
                higher_transition_probs = higher_transition_probs / higher_total
            else:
                # No observed transition: fall back to a uniform distribution
                higher_transition_probs = np.ones(len(states_higher)) / len(states_higher)

            next_higher_state = np.random.choice(states_higher, p=higher_transition_probs)

            # Identify the model tuple for the selected higher-level state
            selected_model = rep_time[0][next_higher_state]

            # Access the adjacency matrix and time duration for the selected model
            model_adj_matrix, model_time_duration = selected_model

            # Transition weights conditioned on the previous two lower-level states
            transition_probs = np.asarray(model_adj_matrix[prev_state1][prev_state], dtype=float)
            lower_total = np.sum(transition_probs)
            if lower_total > 0:
                transition_probs = transition_probs / lower_total
                # Sample the next state based on transition probabilities
                next_state = np.random.choice(states, p=transition_probs)
            else:
                # No observed transition: choose the next state uniformly at random
                next_state = np.random.choice(states)

            # Append the predicted state to the result
            pred_res.append(next_state)

            # Update the state variables for the next iteration
            prev_higher_state2 = prev_higher_state1
            prev_higher_state1 = next_higher_state
            prev_state1 = prev_state
            prev_state = next_state
            no_predict -= 1

        res_num = generate_res_num(pred_res)

        # Predicted data
        df_pred = pd.DataFrame(res_num, columns=feature_columns)
        # Actual data: only the resource features are kept, so that a label
        # column possibly present in df_actual never reaches the scaler.
        ori = df_actual[feature_columns].iloc[train_end:train_end + prediction_length]

        # NOTE(review): predicted and actual values are min-max scaled
        # independently of each other, so the errors below are measured
        # between two separately rescaled series - confirm this is intended.
        norm = MinMaxScaler(feature_range=(0.15,0.20))
        df_pred_norm = pd.DataFrame(norm.fit_transform(df_pred), columns=df_pred.columns)
        ori_norm = pd.DataFrame(norm.fit_transform(ori), columns=ori.columns)
        actual_values = ori_norm[feature_columns]
        predicted_values = df_pred_norm

        # Mean Squared Error per variable and its average
        mse_per_variable = mean_squared_error(actual_values, predicted_values, multioutput='raw_values')
        average_mse = np.mean(mse_per_variable)

        # Mean Absolute Error per variable and its average
        mae_per_variable = mean_absolute_error(actual_values, predicted_values, multioutput='raw_values')
        average_mae = np.mean(mae_per_variable)

        # Mean Absolute Percentage Error per variable and its average
        mape_per_variable = mean_absolute_percentage_error(actual_values, predicted_values, multioutput='raw_values')
        average_mape = np.mean(mape_per_variable)

        # Root Mean Squared Error per variable and its average.
        # Derived from the MSE because the squared=False keyword of
        # mean_squared_error was removed in scikit-learn 1.6.
        rms_per_variable = np.sqrt(mse_per_variable)
        average_rmse = np.mean(rms_per_variable)

        mse_cpu_util, mse_mem_util, mse_net_in, mse_net_out, mse_disk = mse_per_variable
        results_mse.append([prediction_length,mse_cpu_util, mse_mem_util, mse_net_in, mse_net_out, mse_disk])

        mae_cpu_util, mae_mem_util, mae_net_in, mae_net_out, mae_disk = mae_per_variable
        results_mae.append([prediction_length, mae_cpu_util, mae_mem_util, mae_net_in, mae_net_out, mae_disk])

        rmse_cpu_util, rmse_mem_util, rmse_net_in, rmse_net_out, rmse_disk = rms_per_variable
        results_rmse.append([prediction_length, rmse_cpu_util, rmse_mem_util, rmse_net_in, rmse_net_out, rmse_disk])

        mape_cpu_util, mape_mem_util, mape_net_in, mape_net_out, mape_disk = mape_per_variable
        results_mape.append([prediction_length,mape_cpu_util, mape_mem_util, mape_net_in, mape_net_out, mape_disk])
        results.append([prediction_length, average_mse, average_mae, average_rmse, average_mape])

        # Row layout: [prediction_length, MSE, MAE, RMSE, MAPE]
        results_metrics_cpu.append([prediction_length, mse_cpu_util,mae_cpu_util, rmse_cpu_util,mape_cpu_util])

In [None]:
# Export the per-experiment CPU metrics.
# Each row of results_metrics_cpu is appended as
# [prediction_length, MSE, MAE, RMSE, MAPE], so the header must follow that
# exact order; otherwise the MAE and RMSE columns are mislabelled in the CSV.
results_df_cpu_metrics = pd.DataFrame(
    results_metrics_cpu,
    columns=['Prediction Length', 'MSE_mean_cpu_usage', 'MAE_mean_cpu_usage', 'RMSE_mean_cpu_usage', 'MAPE_mean_cpu_usage'],
)
results_df_cpu_metrics.to_csv('F:\\TCC_revision\\Feature_Wise_Results\\Alibaba 2018\\CPU_Metrics_SMPM_Order2_Hours_13_24.csv', index=False)

In [None]:
import pandas as pd

# Summarise the exported CPU metrics as "mean ± standard error" per metric.
data = pd.read_csv('F:\\TCC_revision\\Feature_Wise_Results\\Alibaba 2018\\CPU_Metrics_SMPM_Order2_Hours_13_24.csv')
df = pd.DataFrame(data)

# Row identifiers to include in the summary.
hours_list = list(range(37,60))

# The export cell writes this CSV with index=False and without an 'Index'
# column, so df['Index'] raises KeyError on a freshly written file.
# Use the column when it exists (e.g. added by hand), otherwise fall back to
# the row position.
# NOTE(review): confirm that row position is the intended meaning of 'Index'.
if 'Index' in df.columns:
    filtered_df = df[df['Index'].isin(hours_list)]
else:
    filtered_df = df[df.index.isin(hours_list)]

# Calculate mean and standard error (std / sqrt(n)) for each column
mean_values = filtered_df.mean()
std_values = filtered_df.std() / np.sqrt(len(filtered_df))

# Create the "x ± y" representation for every metric column.
# Identifier columns are excluded by name; a positional df.columns[2:] would
# silently drop the first metric whenever the 'Index' column is absent.
identifier_columns = ('Index', 'Prediction Length')
result = {}
for column in df.columns:
    if column in identifier_columns:
        continue
    x = mean_values[column]
    y = std_values[column]
    result[column] = f"{x:.4f} ± {y:.6f}"

# Display the "x ± y" representation for each column
for column, value in result.items():
    print(f"{column}: {value}")

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

# Repeated forecasting experiment.
# For every prediction length, a state sequence is sampled num_experiments
# times from the two-level, second-order transition model (htm at the higher
# level, the per-state matrices in rep_time[0] at the lower level). The sampled
# states are converted to resource values by generate_res_num and scored
# against the actual data that follows the training split.

feature_columns = ['CPU_util', 'mem_util', 'net_in', 'net_out', 'disk']

# Prediction lengths (number of predicted steps) to evaluate
prediction_lengths = [288, 432, 576, 720, 864, 1008, 1152]

# Initialize lists to store results for different prediction lengths
results = []
results_mse = []
results_rmse = []
results_mae = []
results_mape = []
results_metrics_cpu = []
results_metrics_canonical_mem = []
results_metrics_assigned_mem = []
results_metrics_cache_mem =[]
results_metrics_disk = []

# Seed states for every experiment.
# Positional access is used on purpose: label[len(label)-1] is a label-based
# lookup on a Series and picks the wrong row (or raises KeyError) when the
# index has gaps, e.g. after dropna().
# NOTE(review): the higher-level seeds come from the end of the full label
# sequence while the lower-level seeds come from the end of the training
# split - confirm this asymmetry is intended.
label_values = np.asarray(label)
train_end = len(train_data)
initial_higher_state1 = label_values[-1]
initial_higher_state2 = label_values[-2]
initial_state = la[train_end - 1]
initial_state1 = la[train_end - 2]

# Define the number of times you want to repeat the experiment
num_experiments = 50
for prediction_length in prediction_lengths:
    # Inner loop for repeating the experiment
    for _ in range(num_experiments):
        # Every repetition is scored against the same window that starts right
        # after the training data, so it has to start from the same seed states
        # instead of continuing the chain left over from the previous run.
        prev_higher_state1 = initial_higher_state1
        prev_higher_state2 = initial_higher_state2
        prev_state = initial_state
        prev_state1 = initial_state1

        no_predict = prediction_length
        pred_res = []
        while no_predict > 0:
            # Predict the next state at the higher level.
            # Work on a float copy: an in-place "/=" would overwrite the stored
            # model and fails outright on integer count matrices.
            higher_transition_probs = np.asarray(htm[prev_higher_state2][prev_higher_state1], dtype=float)
            higher_total = np.sum(higher_transition_probs)
            if higher_total > 0:
                higher_transition_probs = higher_transition_probs / higher_total
            else:
                # No observed transition: fall back to a uniform distribution
                higher_transition_probs = np.ones(len(states_higher)) / len(states_higher)

            next_higher_state = np.random.choice(states_higher, p=higher_transition_probs)

            # Identify the model tuple for the selected higher-level state
            selected_model = rep_time[0][next_higher_state]

            # Access the adjacency matrix and time duration for the selected model
            model_adj_matrix, model_time_duration = selected_model

            # Transition weights conditioned on the previous two lower-level states
            transition_probs = np.asarray(model_adj_matrix[prev_state1][prev_state], dtype=float)
            lower_total = np.sum(transition_probs)
            if lower_total > 0:
                transition_probs = transition_probs / lower_total
                # Sample the next state based on transition probabilities
                next_state = np.random.choice(states, p=transition_probs)
            else:
                # No observed transition: choose the next state uniformly at random
                next_state = np.random.choice(states)

            # Append the predicted state to the result
            pred_res.append(next_state)

            # Update the state variables for the next iteration
            prev_higher_state2 = prev_higher_state1
            prev_higher_state1 = next_higher_state
            prev_state1 = prev_state
            prev_state = next_state
            no_predict -= 1

        res_num = generate_res_num(pred_res)

        # Predicted data
        df_pred = pd.DataFrame(res_num, columns=feature_columns)
        # Actual data: only the resource features are kept, so that a label
        # column possibly present in df_actual never reaches the scaler.
        ori = df_actual[feature_columns].iloc[train_end:train_end + prediction_length]

        # NOTE(review): predicted and actual values are min-max scaled
        # independently of each other, so the errors below are measured
        # between two separately rescaled series - confirm this is intended.
        norm = MinMaxScaler(feature_range=(0.15,0.20))
        df_pred_norm = pd.DataFrame(norm.fit_transform(df_pred), columns=df_pred.columns)
        ori_norm = pd.DataFrame(norm.fit_transform(ori), columns=ori.columns)
        actual_values = ori_norm[feature_columns]
        predicted_values = df_pred_norm

        # Mean Squared Error per variable and its average
        mse_per_variable = mean_squared_error(actual_values, predicted_values, multioutput='raw_values')
        average_mse = np.mean(mse_per_variable)

        # Mean Absolute Error per variable and its average
        mae_per_variable = mean_absolute_error(actual_values, predicted_values, multioutput='raw_values')
        average_mae = np.mean(mae_per_variable)

        # Mean Absolute Percentage Error per variable and its average
        mape_per_variable = mean_absolute_percentage_error(actual_values, predicted_values, multioutput='raw_values')
        average_mape = np.mean(mape_per_variable)

        # Root Mean Squared Error per variable and its average.
        # Derived from the MSE because the squared=False keyword of
        # mean_squared_error was removed in scikit-learn 1.6.
        rms_per_variable = np.sqrt(mse_per_variable)
        average_rmse = np.mean(rms_per_variable)

        mse_cpu_util, mse_mem_util, mse_net_in, mse_net_out, mse_disk = mse_per_variable
        results_mse.append([prediction_length,mse_cpu_util, mse_mem_util, mse_net_in, mse_net_out, mse_disk])

        mae_cpu_util, mae_mem_util, mae_net_in, mae_net_out, mae_disk = mae_per_variable
        results_mae.append([prediction_length, mae_cpu_util, mae_mem_util, mae_net_in, mae_net_out, mae_disk])

        rmse_cpu_util, rmse_mem_util, rmse_net_in, rmse_net_out, rmse_disk = rms_per_variable
        results_rmse.append([prediction_length, rmse_cpu_util, rmse_mem_util, rmse_net_in, rmse_net_out, rmse_disk])

        mape_cpu_util, mape_mem_util, mape_net_in, mape_net_out, mape_disk = mape_per_variable
        results_mape.append([prediction_length,mape_cpu_util, mape_mem_util, mape_net_in, mape_net_out, mape_disk])
        results.append([prediction_length, average_mse, average_mae, average_rmse, average_mape])

        # Row layout: [prediction_length, MSE, MAE, RMSE, MAPE]
        results_metrics_cpu.append([prediction_length, mse_cpu_util,mae_cpu_util, rmse_cpu_util,mape_cpu_util])

In [None]:
# Export the per-experiment CPU metrics.
# Each row of results_metrics_cpu is appended as
# [prediction_length, MSE, MAE, RMSE, MAPE], so the header must follow that
# exact order; otherwise the MAE and RMSE columns are mislabelled in the CSV.
results_df_cpu_metrics = pd.DataFrame(
    results_metrics_cpu,
    columns=['Prediction Length', 'MSE_mean_cpu_usage', 'MAE_mean_cpu_usage', 'RMSE_mean_cpu_usage', 'MAPE_mean_cpu_usage'],
)
results_df_cpu_metrics.to_csv('F:\\TCC_revision\\Feature_Wise_Results\\Alibaba 2018\\CPU_Metrics_SMPM_Order2_Day.csv', index=False)

In [None]:
import pandas as pd

# Summarise the exported CPU metrics as "mean ± standard error" per metric.
data = pd.read_csv('F:\\TCC_revision\\Feature_Wise_Results\\Alibaba 2018\\CPU_Metrics_SMPM_Order2_Day.csv')
df = pd.DataFrame(data)

# Row identifiers to include in the summary.
hours_list = list(range(37,60))

# The export cell writes this CSV with index=False and without an 'Index'
# column, so df['Index'] raises KeyError on a freshly written file.
# Use the column when it exists (e.g. added by hand), otherwise fall back to
# the row position.
# NOTE(review): confirm that row position is the intended meaning of 'Index'.
if 'Index' in df.columns:
    filtered_df = df[df['Index'].isin(hours_list)]
else:
    filtered_df = df[df.index.isin(hours_list)]

# Calculate mean and standard error (std / sqrt(n)) for each column
mean_values = filtered_df.mean()
std_values = filtered_df.std() / np.sqrt(len(filtered_df))

# Create the "x ± y" representation for every metric column.
# Identifier columns are excluded by name; a positional df.columns[2:] would
# silently drop the first metric whenever the 'Index' column is absent.
identifier_columns = ('Index', 'Prediction Length')
result = {}
for column in df.columns:
    if column in identifier_columns:
        continue
    x = mean_values[column]
    y = std_values[column]
    result[column] = f"{x:.4f} ± {y:.6f}"

# Display the "x ± y" representation for each column
for column, value in result.items():
    print(f"{column}: {value}")

In [None]:
# Seed the higher-level chain with the last two entries of the label sequence.
# Positional access is used on purpose: label[len(label)-1] is a label-based
# lookup on a Series and picks the wrong row (or raises KeyError) when the
# index has gaps, e.g. after dropna().
label_values = np.asarray(label)
prev_higher_state1 = label_values[-1]  # most recent higher-level state
prev_higher_state2 = label_values[-2]  # the state before it


In [None]:
# Seed the lower-level chain with the two labels that close the training split.
train_end = len(train_data)
prev_state1 = la[train_end - 2]  # second-to-last training label
prev_state = la[train_end - 1]   # last training label

In [None]:
# State spaces sampled from by the prediction loop: the sorted distinct values
# of the Task Domain column (lower level) and of label (higher level).
states = np.unique(df1['Task Domain'])
states_higher = np.unique(label)

# Start with an empty list of predicted states.
pred_res = list()

In [None]:
# Number of future states to predict in the single-run evaluation; the same
# value sets the length of the actual-data window the prediction is compared to.
total_predictions = 60

In [None]:
# Ground-truth slice: the first total_predictions entries of test_data.
# NOTE(review): y_true is not referenced by the neighbouring cells - confirm it is still needed.
y_true = test_data[:total_predictions]

In [None]:
import numpy as np

# Single-run forecast: sample total_predictions states from the two-level,
# second-order transition model, starting from the current prev_* seed states.
pred_res = []
no_predict = total_predictions  # Total number of predictions you want to make

# Prediction loop
while no_predict > 0:
    # Predict the next state at the higher level.
    # Work on a float copy: an in-place "/=" would overwrite the stored model
    # and fails outright on integer count matrices.
    higher_transition_probs = np.asarray(htm[prev_higher_state2][prev_higher_state1], dtype=float)
    higher_total = np.sum(higher_transition_probs)
    if higher_total > 0:
        higher_transition_probs = higher_transition_probs / higher_total
    else:
        # No observed transition: fall back to a uniform distribution
        higher_transition_probs = np.ones(len(states_higher)) / len(states_higher)

    next_higher_state = np.random.choice(states_higher, p=higher_transition_probs)

    # Identify the model tuple for the selected higher-level state
    selected_model = rep_time[0][next_higher_state]

    # Access the adjacency matrix and time duration for the selected model
    model_adj_matrix, model_time_duration = selected_model

    # Transition weights conditioned on the previous two lower-level states
    transition_probs = np.asarray(model_adj_matrix[prev_state1][prev_state], dtype=float)
    lower_total = np.sum(transition_probs)
    if lower_total > 0:
        transition_probs = transition_probs / lower_total
        # Sample the next state based on transition probabilities
        next_state = np.random.choice(states, p=transition_probs)
    else:
        # No observed transition: choose the next state uniformly at random
        next_state = np.random.choice(states)

    # Append the predicted state to the result
    pred_res.append(next_state)

    # Update the state variables for the next iteration
    prev_higher_state2 = prev_higher_state1
    prev_higher_state1 = next_higher_state
    prev_state1 = prev_state
    prev_state = next_state

    no_predict -= 1

In [None]:
# Predicted data.
# Regenerate the resource values from the states sampled by the prediction
# loop; without this, res_num still holds the output of the last experiment
# run and does not correspond to pred_res (nor to its length).
res_num = generate_res_num(pred_res)
df_pred = pd.DataFrame(res_num, columns=['CPU_util','mem_util','net_in','net_out','disk'])

In [None]:
# Actual observations for the forecast window: the total_predictions rows that
# immediately follow the training split.
window_start = len(train_data)
window_stop = window_start + total_predictions
ori = df_actual[window_start:window_stop]

In [None]:
# Rescale predictions and actuals column-wise into the [0.10, 0.25] band.
# The scaler is re-fitted for each frame, so the two frames are scaled
# independently of each other.
# NOTE(review): confirm that independent scaling is intended.
norm = MinMaxScaler(feature_range=(0.10,0.25))

pred_scaled = norm.fit_transform(df_pred)
df_pred_norm = pd.DataFrame(pred_scaled, columns=df_pred.columns)

ori_scaled = norm.fit_transform(ori)
ori_norm = pd.DataFrame(ori_scaled, columns=ori.columns)

In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
import math

# Feature-wise and average error metrics between the scaled actual data
# (ori_norm) and the scaled predictions (df_pred_norm).

# Select the common columns for comparison
common_columns = df_pred_norm.columns.intersection(ori_norm.columns)

# Calculate MSE, MAE, MAPE and RMSE for each feature
mse_values = {}
mae_values = {}
mape_values = {}
rmse_values = {}
for column in common_columns:
    actual_column = ori_norm[column]
    predicted_column = df_pred_norm[column]
    mse_values[column] = mean_squared_error(actual_column, predicted_column)
    mae_values[column] = mean_absolute_error(actual_column, predicted_column)
    mape_values[column] = mean_absolute_percentage_error(actual_column, predicted_column)
    # RMSE is derived from the MSE because the squared=False keyword of
    # mean_squared_error was removed in scikit-learn 1.6.
    rmse_values[column] = np.sqrt(mse_values[column])

# Calculate the average of every metric over the features
average_mse = np.mean(list(mse_values.values()))
average_mae = np.mean(list(mae_values.values()))
average_mape = np.mean(list(mape_values.values()))
average_rmse = np.mean(list(rmse_values.values()))

# Print one section per metric: feature-wise values followed by the average.
# Sections are separated by two blank-line prints, as in the original report.
metric_reports = [
    ("MSE", mse_values, average_mse),
    ("MAE", mae_values, average_mae),
    ("MAPE", mape_values, average_mape),
    ("RMSE", rmse_values, average_rmse),
]
for position, (metric_name, per_feature, average_value) in enumerate(metric_reports):
    if position > 0:
        print("\n")
        print("\n")
    print(f"Feature-wise {metric_name}:")
    for column, value in per_feature.items():
        print(f"{column}: {value:.6f}")
    print(f"\nAverage {metric_name}:", average_value)

# res_final is ordered [MSE, MAE, MAPE, RMSE]; the row stored in
# results_normalized is ordered [length, MSE, MAE, RMSE, MAPE].
res_final = [average_mse,average_mae,average_mape,average_rmse]
results_normalized.append([total_predictions, average_mse, average_mae, average_rmse, average_mape])
print(res_final)