In [None]:
import csv

import numpy as np
import pandas as pd

In [None]:
# Location of the trace CSV files that are loaded below (note the trailing slash).
mdir = (
    "/home/sandeep/datasets/MaternaDataset/"
    "GWA-T-13_Materna-Workload-Traces/Materna-Trace-3/"
)


In [None]:
def set_ts_index(df):
    """Return a copy of ``df`` indexed by its parsed ``'Timestamp'`` column.

    The ``'Timestamp'`` strings are parsed with the fixed format
    ``%d.%m.%Y %H:%M:%S``; a value that does not match raises, because
    ``errors='raise'`` is passed. The ``'Timestamp'`` column itself stays in
    the frame, and the caller's frame is left untouched since ``set_index``
    returns a new object.
    """
    parsed = pd.to_datetime(df['Timestamp'], format='%d.%m.%Y %H:%M:%S', errors='raise')
    return df.set_index(pd.DatetimeIndex(parsed))

In [None]:
import os

from glob import glob

# One DataFrame per CSV file found in `mdir`, each indexed by timestamp.
dataframes = []
# glob() returns matches in arbitrary, filesystem-dependent order; sorting keeps the
# order of `dataframes` (and of every column derived from it) the same on every run.
filenames = sorted(glob(os.path.join(mdir, '*.csv')))
for f in filenames:
    # The trace files are ';'-separated with quoted fields.
    df = pd.read_csv(f, sep=';', quoting=csv.QUOTE_ALL)
    df = set_ts_index(df)
    # Shorten the verbose metric headers to identifier-friendly column names.
    df = df.rename(columns={
        "Disk read throughput [KB/s]": "disk_read",
        "Disk write throughput [KB/s]": "disk_write",
        "Network received throughput [KB/s]": "net_read",
        "Network transmitted throughput [KB/s]": "net_write",
        "CPU usage [MHZ]": "cpu_usage",
        "Memory usage [KB]": "mem_usage",
    })
    # Remember which file the frame came from; the file name identifies the VM.
    df.dataframeName = os.path.basename(f)
    dataframes.append(df)

### new dataframe with one column from each VM

In [None]:
# Build one wide frame with a single `net_write_<file name>` column per VM.
# The columns are collected first and concatenated once: inserting them one at a time
# fragments the frame and aligns every VM to the timestamps of whichever file happened
# to be first, silently dropping rows that file lacks.
net_write_by_vm = {
    'net_write_' + vm_df.dataframeName: vm_df['net_write']
    for vm_df in dataframes
}
# concat() outer-joins on the timestamp index, so a timestamp present in any VM is kept
# (NaN where a VM has no sample). It raises on empty input, hence the fallback.
new_df = pd.concat(net_write_by_vm, axis=1) if net_write_by_vm else pd.DataFrame()

print(new_df.shape)


In [None]:
# Continue with the wide per-VM net_write frame under the shorter name `df`
# (this binds the same object, it does not copy it).
df = new_df

In [None]:
# Summary statistics for each per-VM column.
df.describe()


In [None]:
# Inspect the row index of the wide frame.
df.index

In [None]:
# Check the dtype of every per-VM column.
df.dtypes

In [None]:
# Preview the first rows of the wide frame.
df.head()

### Inf columns

In [None]:
# Names of the columns that contain at least one +inf or -inf value.
has_inf_by_column = np.isinf(df).any()
df.columns.to_series()[has_inf_by_column]


In [None]:
# Row labels (timestamps) of the rows that contain at least one +inf or -inf value.
# The axis is passed by keyword: recent pandas releases no longer accept it
# positionally in DataFrame.any().
df.index[np.isinf(df).any(axis=1)]


In [None]:
import numpy as np

# Turn +/-inf into NaN so the null handling below also covers those cells.
# DataFrame.replace returns a new frame, so `df` must be rebound; without the
# assignment the substitution is computed and then thrown away.
df = df.replace([np.inf, -np.inf], np.nan)


### Null columns

In [None]:
# True if at least one cell anywhere in the frame is null.
df.isnull().values.any()

In [None]:
# Show only the rows that contain at least one null cell.
df[df.isnull().any(axis=1)] 

In [None]:
# Fill NaN cells by interpolating along each row, i.e. between neighbouring VM
# columns at the same timestamp.
# NOTE(review): interpolating along time (axis=0) may have been intended for a
# time series - confirm before relying on the filled values.
df = df.interpolate(axis=1)

In [None]:
# Drop the rows that still contain NaN after interpolation. dropna() returns a
# new frame, so `df` is rebound; otherwise the rows stay in `df` and the shape
# shown below (and the PCA input) would still include them.
df = df.dropna()
df.shape

### mean throughput over time per VM

In [None]:
# One value per VM column: its mean over all rows (timestamps).
ax = df.mean(axis=0).plot(grid=False)


In [None]:
### mean throughput across VMs at any time

In [None]:
# One value per timestamp: transposing makes the timestamps the columns, so
# mean() averages over all VM columns for each timestamp.
ax = df.T.mean().plot(grid=False)


### Multivariate PCA
Based on the statsmodels PCA example notebook: https://www.statsmodels.org/stable/examples/notebooks/generated/pca_fertility_factors.html

In [None]:
import statsmodels.api as sm
from statsmodels.multivariate.pca import PCA

# Fit PCA with rows of `df` as observations and its columns as variables, so the
# factors are indexed like df's rows and the loadings like df's columns.
# The data are demeaned but not rescaled to unit variance.
pca_model = PCA(data=df, demean=True, standardize=False)


In [None]:
# Scree plot of the fitted PCA, drawn on a linear (non-log) scale.
fig = pca_model.plot_scree(log_scale=False)


In [None]:
%matplotlib inline

import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(8, 4))
# Plot the first three principal-component factors against their own index,
# which is the row index of the frame the PCA was fitted on.
leading_factors = pca_model.factors.iloc[:, :3]
lines = ax.plot(leading_factors.index, leading_factors.to_numpy(), lw=4, alpha=.6)
# No manual tick labels or x limits: a fixed 0..51 window with hand-picked labels
# only suits a 52-point integer axis. On a timestamp axis it would hide the data
# and mislabel the ticks, so matplotlib's automatic limits and ticks are used.
ax.set_xlabel("time", size=17)
fig.subplots_adjust(.1, .1, .85, .9)
# Rotate the tick labels so long date strings do not overlap.
fig.autofmt_xdate()
legend = fig.legend(lines, ['PC 1', 'PC 2', 'PC 3'], loc='center right')
legend.draw_frame(False)

In [None]:
idx = pca_model.loadings.iloc[:,0].argsort()


In [None]:
def make_plot(labels):
    fig, ax = plt.subplots(figsize=(9,5))
    ax = df.loc[labels].T.plot(legend=False, grid=False, ax=ax)
    df.T.mean().plot(ax=ax, grid=False, label='Mean')
    ax.set_xlim(0, 51);
    fig.subplots_adjust(.1, .1, .75, .9)
    ax.set_xlabel("time", size=17)
    ax.set_ylabel("vm", size=17);
    legend = ax.legend(*ax.get_legend_handles_labels(), loc='center left', bbox_to_anchor=(1, .5))
    legend.draw_frame(False)

In [None]:
labels = df.index[idx[-5:]]
make_plot(labels)

In [None]:
idx = pca_model.loadings.iloc[:,1].argsort()
make_plot(df.index[idx[-5:]])

In [None]:
make_plot(df.index[idx[:5]])


In [None]:
fig, ax = plt.subplots()
# Take the first two loading columns by position: their generated names are
# zero-padded according to the number of components, so a hard-coded
# 'comp_00' / 'comp_01' only exists for some data sizes.
pc1_col, pc2_col = pca_model.loadings.columns[:2]
pca_model.loadings.plot.scatter(x=pc1_col, y=pc2_col, ax=ax)
ax.set_xlabel("PC 1", size=17)
ax.set_ylabel("PC 2", size=17)
# Columns of `df` whose loading on the second component exceeds 0.2. The loadings
# have one row per column of `df`, so the mask applies to df.columns.
df.columns[(pca_model.loadings.iloc[:, 1] > .2).to_numpy()].values