In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

df = pd.read_csv('data/logs.csv')
display(df.shape)
df.head()

(1668950, 16)

Unnamed: 0,id,timeRead,timeDelete,totalTime,timestamp,serverType,clientTotalTime,requestSize,cpuUsage,frequency,instanceType,freeMem,timeWrite,databaseType,totalMem,memUsage
0,--4qETfhcD,-1.0,-1.0,-1.0,-1.0,java,-1.0,50.0,-1.0,15.0,b1,-1.0,-1.0,nosql,-1.0,-1.0
1,--Lo0QBj3fC,-1.0,-1.0,-1.0,-1.0,java,-1.0,200.0,-1.0,40.0,b1,-1.0,-1.0,nosql,-1.0,-1.0
2,--P-Z58M2G,-1.0,-1.0,-1.0,-1.0,java,-1.0,1.0,-1.0,15.0,b1,-1.0,-1.0,nosql,-1.0,-1.0
3,--nUeIfVXJ,-1.0,-1.0,-1.0,-1.0,java,-1.0,100.0,-1.0,15.0,b1,-1.0,-1.0,nosql,-1.0,-1.0
4,--od3lhTv,-1.0,-1.0,-1.0,-1.0,java,-1.0,100.0,-1.0,15.0,b1,-1.0,-1.0,nosql,-1.0,-1.0


In [10]:
servers = sorted(['java', 'nodejs'])
databases = sorted(['nosql', 'sql'])
instances = sorted(['b1', 'b2', 'b4', 'b8'])

combinations = [(servers[0], databases[0]), (servers[1], databases[0]), (servers[0], databases[1]), (servers[1], databases[1])]

sizes = sorted(list(df['requestSize'].unique()))
frequencies = sorted(list(df['frequency'].unique()))
types = np.transpose([np.tile(sizes, len(frequencies)), np.repeat(frequencies, len(sizes))])

In [11]:
types = np.transpose([np.tile(sizes, len(frequencies)), np.repeat(frequencies, len(sizes))])
df.drop(['cpuUsage'], axis=1, inplace=True)
df.dropna(inplace=True)
display(df.shape)

(1668950, 15)

In [12]:
cols = df.dtypes.to_dict()

query = None

for col in cols:
    if cols[col] == 'float64':
        temp_query = (df[col] > 0)
    else:
        temp_query = (df[col] != '')
        
    temp_query = temp_query & (df[col].notnull())
    
    if query is None:
        query = temp_query
        
    else:
        query = query & temp_query
        
        
clean_df = df.loc[query]
clean_df.to_csv('data/clean.csv', index=False)
display(clean_df.shape)
clean_df.head()

(1274926, 15)

Unnamed: 0,id,timeRead,timeDelete,totalTime,timestamp,serverType,clientTotalTime,requestSize,frequency,instanceType,freeMem,timeWrite,databaseType,totalMem,memUsage
218499,VWo2CJrUe,3.0,81.0,236.0,1585877000000.0,java,425.0,1.0,1.0,b1,1887261000.0,8.0,sql,2147484000.0,0.121176
218500,Ge9D-u1oz,3.0,91.0,299.0,1585877000000.0,java,489.0,1.0,1.0,b1,1887121000.0,9.0,sql,2147484000.0,0.121241
218501,W789KeB31,3.0,7.0,154.0,1585877000000.0,java,292.0,1.0,1.0,b1,1886994000.0,10.0,sql,2147484000.0,0.1213
218502,YW4UJ3m4N,86.0,8.0,255.0,1585877000000.0,java,448.0,1.0,1.0,b1,1886646000.0,10.0,sql,2147484000.0,0.121462
218503,RRAbhpXkJ,4.0,6.0,152.0,1585877000000.0,java,288.0,1.0,1.0,b1,1886482000.0,9.0,sql,2147484000.0,0.121538


In [13]:
counts = df.groupby(['instanceType', 'databaseType', 'serverType', 'requestSize', 'frequency']).count().reset_index()
counts.drop(['freeMem', 'timeWrite', 'totalMem', 'timeRead', 'timeDelete', 'totalTime', 'timestamp', 'clientTotalTime', 'memUsage'], axis=1, inplace=True)
counts.rename(columns={'id': 'count'}, inplace=True)
counts.sort_values(by=['count'], ascending=True, inplace=True)

counts['key'] = counts.apply(lambda x: '{} {} {} {}KB {}Hz'.format(x.instanceType, x.serverType, x.databaseType, x.requestSize, x.frequency), axis=1)
counts.head()

Unnamed: 0,instanceType,databaseType,serverType,requestSize,frequency,count,key
8,b1,nosql,java,5.0,1.0,37,b1 java nosql 5.0KB 1.0Hz
237,b1,sql,nodejs,200.0,60.0,51,b1 nodejs sql 200.0KB 60.0Hz
115,b1,nosql,nodejs,200.0,60.0,53,b1 nodejs nosql 200.0KB 60.0Hz
360,b2,nosql,nodejs,200.0,60.0,56,b2 nodejs nosql 200.0KB 60.0Hz
159,b1,sql,java,100.0,2.0,57,b1 java sql 100.0KB 2.0Hz


In [None]:
ax = counts.plot.barh(x='key', y='count', figsize=(30, 300), title='Successfull Requests N = {:,}'.format(clean_df.shape[0]))
ax.get_legend().remove()

for i, v in enumerate(counts['count'].to_numpy()):
    ax.text(v + 30, i - 0.125, 'N = {}'.format(v), fontweight='bold')

In [15]:
low_counts = counts.loc[(counts['count'] < 150)].reset_index()
low_counts = counts.groupby(['instanceType']).agg(['unique']).reset_index()
low_counts.to_csv('data/low-count-requests.csv')
low_counts

Unnamed: 0_level_0,instanceType,databaseType,serverType,requestSize,frequency,count,key
Unnamed: 0_level_1,Unnamed: 1_level_1,unique,unique,unique,unique,unique,unique
0,b1,"[nosql, sql]","[java, nodejs]","[5.0, 200.0, 100.0, 1.0, 500.0, 20.0, 50.0]","[1.0, 60.0, 2.0, 40.0, 10.0, 3.0, 4.0, 15.0, 5...","[37, 51, 53, 57, 61, 65, 66, 70, 71, 73, 75, 7...","[b1 java nosql 5.0KB 1.0Hz, b1 nodejs sql 200...."
1,b2,"[nosql, sql]","[nodejs, java]","[200.0, 100.0, 50.0, 20.0, 5.0, 1.0, 500.0]","[60.0, 40.0, 1.0, 15.0, 10.0, 2.0, 20.0, 3.0, ...","[56, 61, 66, 67, 92, 101, 117, 119, 121, 187, ...","[b2 nodejs nosql 200.0KB 60.0Hz, b2 nodejs sql..."
2,b4,"[sql, nosql]","[nodejs, java]","[200.0, 1.0, 5.0, 20.0, 50.0, 100.0, 500.0]","[60.0, 1.0, 40.0, 2.0, 20.0, 3.0, 4.0, 5.0, 15...","[110, 119, 160, 161, 185, 211, 235, 238, 239, ...","[b4 nodejs sql 200.0KB 60.0Hz, b4 java sql 1.0..."
3,b8,"[sql, nosql]","[java, nodejs]","[20.0, 500.0, 50.0, 200.0, 100.0, 5.0, 1.0]","[1.0, 2.0, 60.0, 3.0, 40.0, 4.0, 5.0, 10.0, 15...","[119, 239, 309, 338, 359, 360, 420, 432, 447, ...","[b8 java sql 20.0KB 1.0Hz, b8 java nosql 500.0..."
