# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import json
from pathlib import Path
import re
import datetime

# Load JSON function

In [2]:
def loadJson(path):
    '''Read the JSON file at `path` and return the decoded Python object.

    The file is opened explicitly as UTF-8 so the result does not depend on
    the platform's default locale encoding.

    Parameters: path -- anything accepted by open() (str or Path).
    Returns: the object produced by json.load (list, dict, ...).
    Raises: whatever open() or json.load() raise (e.g. a missing file or
    malformed JSON); nothing is caught here.
    '''
    with open(path, 'r', encoding='utf-8') as fp:
        return json.load(fp)

# Save Data Frame function

In [3]:
def saveDataFrame(df,path):
    '''Write `df` to `path` as a comma-separated, UTF-8 encoded CSV file.

    The DataFrame index is not written to the file.
    '''
    csv_options = {'sep': ',', 'index': False, 'encoding': "utf-8"}
    df.to_csv(path, **csv_options)

# JSON to Data Frame function

In [4]:
def getData(l):
    '''Flatten a list of parsed log entries into a long-format DataFrame.

    Each element of `l` is either an empty list or a list whose first item
    is a dict with a 'target' string and a 'datapoints' sequence; only that
    first item is read. Every datapoint is indexed as [0] -> usage value and
    [1] -> timestamp.

    For each datapoint one row is produced with:
      - instance:  the target cut at its first '_com' and re-suffixed with
                   '_com' (i.e. anything after the '_com' suffix is dropped;
                   a target without '_com' simply gets '_com' appended),
      - usage:     datapoint[0],
      - timestamp: datapoint[1].

    An empty entry contributes one all-NaN placeholder row; an entry whose
    'datapoints' is empty contributes no row at all.

    Returns a DataFrame with the columns 'instance', 'usage', 'timestamp'.
    '''
    instance = []
    usage = []
    timestamps = []

    for entry in l:
        if entry != []:
            series = entry[0]
            points = series['datapoints']
            if len(points) == 0:
                # No samples: nothing to emit (and 'target' is not needed).
                continue
            # The instance name is the same for every datapoint of a series,
            # so normalise it once instead of once per datapoint. A raw
            # string is used for the pattern to avoid invalid escape sequences.
            name = re.sub(r'_com.*', '', series['target']) + '_com'
            for point in points:
                instance.append(name)
                usage.append(point[0])
                timestamps.append(point[1])
        else:
            # Placeholder row for an empty entry.
            instance.append(np.nan)
            usage.append(np.nan)
            timestamps.append(np.nan)
    df = pd.DataFrame({'instance': instance, 'usage':usage, 'timestamp':timestamps})
    return (df)

# Loading cpu logs

In [5]:
load_from = './logs/cpu'
save_to = './dataframes/'
# Sort the matches so the row order of the result (and of the saved CSV) is
# the same on every run; glob order is not guaranteed.
pathlist = sorted(Path(load_from).glob('*.json'))
# Parse every file first and concatenate once: DataFrame.append was removed in
# pandas 2.0, and growing a frame inside the loop re-copies all rows each time.
dayDFs = [getData(loadJson(p)) for p in pathlist]
df_cpu = pd.concat(dayDFs) if dayDFs else pd.DataFrame()
# Write the CSV once after everything is loaded instead of rewriting the
# growing file on each iteration. Nothing is written when no log was found.
if dayDFs:
    saveDataFrame(df_cpu,path=save_to+'cpu.csv')

# Loading network in logs

In [6]:
load_from = './logs/network/inb'
save_to = './dataframes/'
# Sort the matches so the row order of the result (and of the saved CSV) is
# the same on every run; glob order is not guaranteed.
pathlist = sorted(Path(load_from).glob('*.json'))
# Parse every file first and concatenate once: DataFrame.append was removed in
# pandas 2.0, and growing a frame inside the loop re-copies all rows each time.
dayDFs = [getData(loadJson(p)) for p in pathlist]
df_nwin = pd.concat(dayDFs) if dayDFs else pd.DataFrame()
# Write the CSV once after everything is loaded instead of rewriting the
# growing file on each iteration. Nothing is written when no log was found.
if dayDFs:
    saveDataFrame(df_nwin,path=save_to+'nwin.csv')

# Loading network out logs

In [7]:
load_from = './logs/network/outb'
save_to = './dataframes/'
# Sort the matches so the row order of the result (and of the saved CSV) is
# the same on every run; glob order is not guaranteed.
pathlist = sorted(Path(load_from).glob('*.json'))
# Parse every file first and concatenate once: DataFrame.append was removed in
# pandas 2.0, and growing a frame inside the loop re-copies all rows each time.
dayDFs = [getData(loadJson(p)) for p in pathlist]
df_nwout = pd.concat(dayDFs) if dayDFs else pd.DataFrame()
# Write the CSV once after everything is loaded instead of rewriting the
# growing file on each iteration. Nothing is written when no log was found.
if dayDFs:
    saveDataFrame(df_nwout,path=save_to+'nwout.csv')

# Clean data and merge

In [8]:
# Drop the rows in which instance, usage and timestamp are all missing
log_columns = ['instance','usage','timestamp']
df_cpu_clean, df_nwin_clean, df_nwout_clean = (
    raw_frame.dropna(axis=0, how='all', subset=log_columns)
    for raw_frame in (df_cpu, df_nwin, df_nwout)
)

In [9]:
# Merge all 3 dataframes on (instance, timestamp); the inner joins keep only
# the samples that are present in the cpu, network-in and network-out logs.
cpu_nwin = pd.merge(df_cpu_clean, df_nwin_clean, how='inner', on= ['instance','timestamp'], suffixes=['_cpu','_nwin'])
# The usage columns of cpu_nwin are already suffixed, so the network-out
# 'usage' column joins without a name clash and is renamed right below.
logData = pd.merge(cpu_nwin, df_nwout_clean, how='inner', on= ['instance','timestamp'])

# rename to usage nwout (reassigned rather than mutated in place)
logData = logData.rename(columns={'usage': 'usage_nwout'})

# create readable date column from timestamp
# NOTE(review): fromtimestamp() converts to the local time zone of the machine
# running the notebook, so this column differs between machines -- confirm
# whether UTC was intended.
logData['date'] = logData.timestamp.apply(lambda x: datetime.datetime.fromtimestamp(x).strftime('%Y-%m-%d %X'))

# clean instance name: drop everything up to and including the 'EC2.' prefix.
# The dot after EC2 is escaped so that it matches a literal '.' only; left
# unescaped it matches any character and the greedy pattern could cut into a
# host name that itself contains 'EC2'.
logData['instance'] = logData['instance'].apply(lambda x: re.sub(r'.+EC2\.','', x))
# then turn the underscores of the host name back into dots
logData['instance'] = logData['instance'].apply(lambda x: x.replace('_','.'))

# reorder
logData = logData[['instance', 'timestamp', 'date', 'usage_cpu', 'usage_nwin', 'usage_nwout']]

# Look at data range 

In [10]:
# number of records (rows) in the merged log data
logData.shape[0]

169923

In [11]:
# number of distinct instances in the merged log data
len(pd.unique(logData['instance']))

372

In [16]:
# number of distinct instances in the cleaned cpu log data
len(pd.unique(df_cpu_clean['instance']))

372

In [17]:
# number of distinct instances in the cleaned network-in log data
len(pd.unique(df_nwin_clean['instance']))

372

In [18]:
# number of distinct instances in the cleaned network-out log data
len(pd.unique(df_nwout_clean['instance']))

372

In [12]:
# Date range: span in days between the newest and the oldest timestamp
aux = logData.sort_values(by='timestamp', ascending=False).reset_index(drop=True)

# rows are sorted newest first, so the first label is the newest sample
seconds_per_day = 3600*24
newest_row, oldest_row = 0, len(aux.timestamp)-1
dlast = aux.timestamp[newest_row]
dfirst = aux.timestamp[oldest_row]
days_range = (dlast-dfirst)/seconds_per_day
days_range

12.958333333333334

## Save dataframe

In [13]:
# Persist the merged log data as a CSV file
saveDataFrame(df=logData, path='./dataframes/logData.csv')

# Tests

In [None]:
# to access a field (first column then row)
# df['instance'][87]

In [None]:
# load_from = "./logs/cpu/cpu_1.json"
# with open(load_from,'r') as fp: 
#         data_json = json.load(fp)

In [None]:
# for i in range(len(data_json)):
#     if data_json[i]!= []:
#         print(data_json[i][0]['datapoints'])

In [None]:
# # Same as above but trying to input it in a Dataframe directly
# def getData(l):
#     df = pd.DataFrame()
#     df['instance'] = np.NaN
#     df['usage'] = np.NaN
#     df['timestamp'] = np.NaN
#     for t in l:
#         if t != []:
#             usage = []
#             timestamps = []
#             idata = t[0]
#             df['instance'] = idata['target']*len(idata['datapoints'])
#             for h in (idata['datapoints']):
#                 usage.append(h[0])
#                 timestamps.append(h[1])
#             df['usage'] = usage
#             df['timestamp'] = timestamps
#     return (df)

In [None]:
# A different test I did

# def getData(l):
#     instance = []
#     usage = []
#     timestamps = []
#     for t in l:
#         if t != []:
#             usage = []
#             timestamps = []
#             idata = t[0]
#             instance = idata['target']
#             for h in (idata['datapoints']):
#                 usage.append(h[0])
#                 timestamps.append(h[1])
#         c = [instance, usage, timestamps]
#     return (c)

In [None]:
# # Same as above but trying to input it in a Dataframe directly
# def getData(l):
# #     df = pd.DataFrame()
# #     df['instance'] = np.NaN
# #     df['usage'] = np.NaN
# #     df['timestamp'] = np.NaN
#     instance = []
#     usage = []
#     timestamps = []
    
#     for t in l:
#         if t != []:
# #             instance = []
# #             usage = []
# #             timestamps = []
#             idata = t[0]
#             #df['instance'] = idata['target']*len(idata['datapoints'])
#            # instance.append(idata['target']*len(idata['datapoints']))
#             for h in (idata['datapoints']):
#                 instance.append(idata['target'])
#                 usage.append(h[0])
#                 timestamps.append(h[1])
#             #df['usage'] = usage
#             #df['timestamp'] = timestamps
# #     df['instance'] = instance
# #     df['usage'] = usage
# #     df['timestamp'] = timestamps        
# #     df = pd.concat([instance, usage, timestamps], axis=1)    
# #    print(len(instance),len(usage), len(timestamps))
#         else:
#             instance.append([])
#             usage.append([])
#             timestamps.append([])
#     df = pd.DataFrame({'instance': instance, 'usage':usage, 'timestamp':timestamps})
#     return (df)

In [None]:
#Regex tests
# target = data_json[87][0]['target']
# instance = re.sub('\_com.*', '', target)
# instance = instance+'.com'
# instance

In [None]:
#tests for date
# ts = logData.timestamp[0]
# ts2 = logData.timestamp[1]


# readable = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %X')
# readable2 = datetime.datetime.fromtimestamp(ts2).strftime('%Y-%m-%d %X')
# #.isoformat()
# # datetime.datetime.strptime(readable, '%Y
# datetime.datetime.strptime(readable2)-datetime.datetime.strptime(readable2)

In [None]:
# more regex testing
# e = 'DEV0.domain.CSSAPPS.infra_service.EC2.cssapps001_da_aws_cccis_com'
# e = re.sub(r'.+EC2.','',e)
# e = re.sub('_', '.', e)
# e