In [230]:
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.api import VAR
import os
import copy
import pickle

from statsmodels.graphics.api import qqplot
%matplotlib inline

# Data Modeling
Read the instances that belong to a particular group and collect them into a DataFrame.

In [207]:
# Empty Group/Instance frame; it is rebuilt from the directory listing further down.
group_data = pd.DataFrame(columns=["Group", "Instance"])

In [208]:
directory = r'D:\Practice2'
# Every entry name is split as "<first 8 characters = group><remainder = instance id>".
filenames = list(os.listdir(directory))
groupnames = [entry[:8] for entry in filenames]
instancenames = [entry[8:] for entry in filenames]

In [209]:
# One row per directory entry: its group prefix and its instance id.
group_data = pd.DataFrame(
    {
        'Group': groupnames,
        'Instance': instancenames,
    }
)

In [210]:
group_data.shape

(3, 2)

In [211]:
group_data.head()

Unnamed: 0,Group,Instance
0,group_1_,1b6ffb4a-b7bc-48d0-ab60-b43f64b7c6f4
1,group_1_,1bda6faa-cb1c-4192-94b6-0ff8b19c8caa
2,group_1_,2d2122c0-4f06-4e3e-ade0-6394ba46f505


We now have the list of instances for which a forecast is required. Each instance has its own pickled model, which we will use to produce the forecast; before making predictions, the input data must be preprocessed.

Creating the Dataframes for Each Instance Input

In [212]:
# Load each instance's raw memory log into a two-column frame keyed by instance id.
dfs = {}
for instance, file in zip(instancenames, filenames):
    # NOTE(review): the folder names were listed from D:\Practice2 but the logs are
    # read from D:\Practice - confirm that both directories hold the same folders.
    # A raw string plus os.path.join avoids the invalid "\P" escape of the old literal.
    log_path = os.path.join(r"D:\Practice", file, "mem.log")
    dfs[instance] = pd.read_csv(
        log_path,
        doublequote=False,
        sep=":",
        header=None,
        names=['timestamp', 'Utilization'],
    )

In [213]:
dfs['1b6ffb4a-b7bc-48d0-ab60-b43f64b7c6f4']

Unnamed: 0,timestamp,Utilization
0,Wed Aug 28 16:59:45 IST 2019,65664:65435:14:0.30:0.00:4014G
1,Wed Aug 28 17:02:28 IST 2019,65664:65435:14:0.31:0.00:4014G
2,Wed Aug 28 17:05:32 IST 2019,65664:65435:14:0.32:0.00:4014G
3,Wed Aug 28 17:08:27 IST 2019,65664:65435:14:0.29:0.00:4014G
4,Wed Aug 28 17:11:47 IST 2019,65664:65435:14:0.28:0.00:4014G
...,...,...
47041,Tue Jun 23 19:02:01 IST 2020,65664:61229:14:0.34:0.00:4014G
47042,Tue Jun 23 19:11:59 IST 2020,65664:61229:14:0.19:0.00:4014G
47043,Tue Jun 23 19:21:56 IST 2020,65664:61229:14:0.19:0.00:4014G
47044,Tue Jun 23 19:31:58 IST 2020,65664:61229:14:0.35:0.00:4014G


Preprocessing the data before applying it to the model

In [214]:
#Updating the columns for data model
def update_columns(frames=None):
    """Split the colon-separated ``Utilization`` field into one column per metric.

    Each frame is modified in place: six metric columns are added (as strings),
    the raw ``Utilization`` column is dropped, and the last character of every
    ``Storage space utilization`` value is removed (e.g. ``"4014G"`` -> ``"4014"``).

    Parameters
    ----------
    frames : dict of DataFrame, optional
        Frames to update. Defaults to the notebook-level ``dfs`` dictionary.

    Notes
    -----
    Frames that no longer have a ``Utilization`` column are skipped, so the cell
    can be re-run without raising.
    """
    metric_columns = [
        'Memory Allocated',
        'Memory Used',
        'CPU Allocated',
        'CPU Used',
        'Network bandwidth utilization',
        'Storage space utilization',
    ]
    if frames is None:
        frames = dfs
    for frame in frames.values():
        if 'Utilization' not in frame.columns:
            # Already split by an earlier run of this cell.
            continue
        # Vectorised split; building one pd.Series per row via apply() is very slow
        # on logs with tens of thousands of rows.
        parts = frame['Utilization'].astype(str).str.split(":", expand=True)
        frame[metric_columns] = parts
        frame.drop(['Utilization'], axis=1, inplace=True)
        # Drop the trailing one-character suffix so the value can be cast to a number later.
        frame['Storage space utilization'] = frame['Storage space utilization'].str[:-1]

In [215]:
update_columns()

In [327]:
dfs['1b6ffb4a-b7bc-48d0-ab60-b43f64b7c6f4']

Unnamed: 0_level_0,Memory Allocated,Memory Used,CPU Allocated,CPU Used,Network bandwidth utilization,Storage space utilization
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-08-28 16:59:45,65664,65435,14,0.30,0.0,4014
2019-08-28 17:02:28,65664,65435,14,0.31,0.0,4014
2019-08-28 17:05:32,65664,65435,14,0.32,0.0,4014
2019-08-28 17:08:27,65664,65435,14,0.29,0.0,4014
2019-08-28 17:11:47,65664,65435,14,0.28,0.0,4014
...,...,...,...,...,...,...
2020-06-23 19:02:01,65664,61229,14,0.34,0.0,4014
2020-06-23 19:11:59,65664,61229,14,0.19,0.0,4014
2020-06-23 19:21:56,65664,61229,14,0.19,0.0,4014
2020-06-23 19:31:58,65664,61229,14,0.35,0.0,4014


In [217]:

#Changing the data types of the data frames
def change_data_type():
    """Cast every frame in ``dfs`` to typed columns and index it by timestamp.

    Works in place on each frame: ``timestamp`` is parsed with ``pd.to_datetime``,
    the metric columns are cast to ``int``/``float`` as listed below, and
    ``timestamp`` then becomes the index.
    """
    # (column, target dtype), applied in this order.
    casts = (
        ('Memory Used', 'int'),
        ('Memory Allocated', 'int'),
        ('CPU Used', 'float'),
        ('CPU Allocated', 'int'),
        ('Storage space utilization', 'int'),
        ('Network bandwidth utilization', 'float'),
    )
    for frame in dfs.values():
        frame['timestamp'] = pd.to_datetime(frame['timestamp'])
        for column, dtype in casts:
            frame[column] = frame[column].astype(dtype)
        frame.set_index('timestamp', inplace=True)

In [218]:
change_data_type()



In [219]:
# Work on independent copies: constant columns are dropped from `processed` below,
# while `dfs` keeps them so they can be added back to the forecasts afterwards.
processed = {name: frame.copy(deep=True) for name, frame in dfs.items()}

# Pre Processing The Data

In [220]:
#Removing the Constant Columns
def pre_process():
    """Drop, in place, each column of every ``processed`` frame that has exactly one distinct value."""
    for frame in processed.values():
        constant_columns = [
            name for name in frame.columns if frame[name].nunique() == 1
        ]
        frame.drop(constant_columns, axis=1, inplace=True)

In [221]:
pre_process()

In [225]:
processed['1b6ffb4a-b7bc-48d0-ab60-b43f64b7c6f4']

Unnamed: 0_level_0,Memory Allocated,Memory Used,CPU Used
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-08-28 16:59:45,65664,65435,0.30
2019-08-28 17:02:28,65664,65435,0.31
2019-08-28 17:05:32,65664,65435,0.32
2019-08-28 17:08:27,65664,65435,0.29
2019-08-28 17:11:47,65664,65435,0.28
...,...,...,...
2020-06-23 19:02:01,65664,61229,0.34
2020-06-23 19:11:59,65664,61229,0.19
2020-06-23 19:21:56,65664,61229,0.19
2020-06-23 19:31:58,65664,61229,0.35


# Predicting The Data

In [256]:
#Fetching the trained models using pickle
directory = r'D:\InstanceModel'
models = []
# os.listdir returns entries in arbitrary order; sorting keeps models[i] stable
# across runs, which matters because the models are picked by position below.
for filename in sorted(os.listdir(directory)):
    # NOTE: pickle.load can execute arbitrary code - only load model files from a trusted source.
    # The context manager closes every file, not just the last one, and avoids a
    # NameError on `infile` when the directory is empty.
    with open(os.path.join(directory, filename), 'rb') as infile:
        models.append(pickle.load(infile))

In [276]:
# Name the first two loaded models; the pairing with instances relies on load order.
Instance1_Model, Instance2_Model = models[0], models[1]

In [281]:
lag_order = Instance1_Model.k_ar
print(lag_order)  # prints 5 in the recorded run

# Input data for forecasting: the last `lag_order` rows of the processed history
forecast_input = processed['1b6ffb4a-b7bc-48d0-ab60-b43f64b7c6f4'].values[-lag_order:]
forecast_input

5


array([[6.5664e+04, 6.1229e+04, 3.4000e-01],
       [6.5664e+04, 6.1229e+04, 1.9000e-01],
       [6.5664e+04, 6.1229e+04, 1.9000e-01],
       [6.5664e+04, 6.1229e+04, 3.5000e-01],
       [6.5664e+04, 6.1229e+04, 1.8000e-01]])

In [282]:
lag_order2 = Instance2_Model.k_ar
print(lag_order2)  # prints 1 in the recorded run

# Input data for forecasting: the last `lag_order2` rows of the processed history
# NOTE(review): this model's forecast is later rolled back with a cumulative sum
# ("Roll back 1st Diff"), which suggests it was fitted on first differences, yet
# raw levels are passed here - confirm whether the input should be differenced first.
forecast_input2 = processed['1bda6faa-cb1c-4192-94b6-0ff8b19c8caa'].values[-lag_order2:]
forecast_input2

1


array([[3.2768e+04, 3.2768e+04, 4.0000e+00, 4.0000e+00, 3.0500e+02]])

In [334]:
nobs = 5000
instance1_history = processed['1b6ffb4a-b7bc-48d0-ab60-b43f64b7c6f4']
fc = Instance1_Model.forecast(y=forecast_input, steps=nobs)
# NOTE(review): the forecast rows are labelled with the last `nobs` timestamps of the
# history, although they are seeded from the final rows of that history - confirm
# whether a future date index was intended.
df_forecast = pd.DataFrame(
    fc,
    index=instance1_history.index[-nobs:],
    columns=instance1_history.columns,
)

df_forecast.head()

Unnamed: 0_level_0,Memory Allocated,Memory Used,CPU Used
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-02-22 11:39:08,65663.99292,62430.945173,0.262153
2020-02-22 11:45:45,65663.992984,62759.364013,0.279213
2020-02-22 11:52:22,65663.991595,63014.183576,0.31176
2020-02-22 11:59:00,65663.993883,63236.621634,0.326682
2020-02-22 12:05:37,65663.993254,63586.454174,0.347273


In [286]:
nobs = 1000
instance2_history = processed['1bda6faa-cb1c-4192-94b6-0ff8b19c8caa']
fc = Instance2_Model.forecast(y=forecast_input2, steps=nobs)
# NOTE(review): the forecast rows are labelled with the last `nobs` timestamps of the
# history, although they are seeded from the final rows of that history - confirm
# whether a future date index was intended.
df_forecast2 = pd.DataFrame(
    fc,
    index=instance2_history.index[-nobs:],
    columns=instance2_history.columns,
)

df_forecast2.head()

Unnamed: 0_level_0,Memory Allocated,Memory Used,CPU Allocated,CPU Used,Storage space utilization
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-06-16 11:50:19,1683.035407,914.847314,0.205449,-1.725354,-0.1523
2020-06-16 11:59:36,-293.362454,11.446798,-0.035811,0.670246,0.052699
2020-06-16 12:09:34,144.403011,13.585036,0.017627,-0.262134,0.056703
2020-06-16 12:20:42,-52.521742,2.470462,-0.006411,0.102496,0.05676
2020-06-16 12:30:23,31.124392,6.950642,0.003799,-0.040061,0.056781


Inverting the first difference to bring the forecast back to the original scale

In [300]:
columns = processed['1bda6faa-cb1c-4192-94b6-0ff8b19c8caa'].columns
# Roll back 1st Diff: last observed level of each column plus the running total
# of the forecast values for that column.
last_observed = processed['1bda6faa-cb1c-4192-94b6-0ff8b19c8caa'].iloc[-1]
df_final = df_forecast2.copy()
df_final[columns] = df_forecast2[columns].cumsum() + last_observed[columns]

In [326]:
df_final.head()

Unnamed: 0_level_0,Memory Allocated,Memory Used,CPU Allocated,CPU Used,Storage space utilization
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-06-16 11:50:19,34451.035407,33682.847314,4.205449,2.274646,304.8477
2020-06-16 11:59:36,34157.672953,33694.294113,4.169638,2.944893,304.900398
2020-06-16 12:09:34,34302.075964,33707.879148,4.187265,2.682759,304.957101
2020-06-16 12:20:42,34249.554222,33710.349611,4.180854,2.785255,305.013861
2020-06-16 12:30:23,34280.678614,33717.300252,4.184653,2.745194,305.070642


In [331]:
#Adding Constant Columns Back
def add_constant(instance,df_forecast):
    """Add the constant columns of an instance's history to its forecast frame.

    A column counts as constant when it has exactly one distinct value in
    ``dfs[instance]``. That value is written to every row of ``df_forecast``
    (modified in place).

    Parameters
    ----------
    instance : str
        Key of the instance in the notebook-level ``dfs`` dictionary. Unknown
        keys are ignored and ``df_forecast`` is left untouched.
    df_forecast : DataFrame
        Forecast frame that receives the constant columns.
    """
    history = dfs.get(instance)
    if history is None:
        return
    for column in history.columns:
        values = history[column]
        if values.nunique() == 1:
            # Broadcast the scalar instead of assigning the Series: a Series is
            # aligned on the index and would produce NaN for every forecast
            # timestamp that is absent from the history.
            df_forecast[column] = values.dropna().iloc[0]

In [338]:
add_constant('1b6ffb4a-b7bc-48d0-ab60-b43f64b7c6f4',df_forecast)
add_constant('1bda6faa-cb1c-4192-94b6-0ff8b19c8caa',df_final)

In [339]:
df_forecast.head()

Unnamed: 0_level_0,Memory Allocated,Memory Used,CPU Used,CPU Allocated,Network bandwidth utilization,Storage space utilization
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-02-22 11:39:08,65663.99292,62430.945173,0.262153,14,0.0,4014
2020-02-22 11:45:45,65663.992984,62759.364013,0.279213,14,0.0,4014
2020-02-22 11:52:22,65663.991595,63014.183576,0.31176,14,0.0,4014
2020-02-22 11:59:00,65663.993883,63236.621634,0.326682,14,0.0,4014
2020-02-22 12:05:37,65663.993254,63586.454174,0.347273,14,0.0,4014


In [340]:
df_final.head()

Unnamed: 0_level_0,Memory Allocated,Memory Used,CPU Allocated,CPU Used,Storage space utilization,Network bandwidth utilization
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-06-16 11:50:19,34451.035407,33682.847314,4.205449,2.274646,304.8477,0.0
2020-06-16 11:59:36,34157.672953,33694.294113,4.169638,2.944893,304.900398,0.0
2020-06-16 12:09:34,34302.075964,33707.879148,4.187265,2.682759,304.957101,0.0
2020-06-16 12:20:42,34249.554222,33710.349611,4.180854,2.785255,305.013861,0.0
2020-06-16 12:30:23,34280.678614,33717.300252,4.184653,2.745194,305.070642,0.0
