In [1]:
from datetime import datetime, timedelta

import pandas as pd
from firebase_admin import credentials, firestore, get_app, initialize_app
from IPython.display import clear_output

# Make this cell safe to re-run in a live kernel: initialize_app() raises
# ValueError when the default Firebase app already exists, so only create the
# app when get_app() reports that there is none yet.
try:
    get_app()
except ValueError:
    initialize_app(credentials.Certificate('cred.json'))

# Firestore client used by every query cell below.
db = firestore.client()

In [2]:
# Server labels, kept in sorted order (not referenced by the visible cells).
servers = sorted(('java', 'nodejs'))
# `databaseType` values iterated by the error-run queries.
databases = sorted(('nosql', 'sql'))
# `instanceType` values iterated by the error-run queries.
instances = sorted(('b1', 'b2', 'b4', 'b8'))

# Accumulator for every record fetched by the query cells.
logs = list()

def filter_func(x):
    """Return True when record `x` carries every field required downstream.

    A record is kept only if it can be indexed by all of 'id',
    'instanceType', 'serverType' and 'databaseType'.

    Args:
        x: A mapping-like log record (the notebook passes the dicts produced
            by `to_dict()`).

    Returns:
        bool: True if all four keys can be looked up; False if any lookup
        fails or `x` is not subscriptable.
    """
    required_keys = ('id', 'instanceType', 'serverType', 'databaseType')
    try:
        for key in required_keys:
            x[key]
    except (LookupError, TypeError):
        # Missing key, or `x` cannot be indexed at all (e.g. None). Anything
        # else -- notably KeyboardInterrupt -- is allowed to propagate instead
        # of being silently turned into False.
        return False
    return True
    
    
# Get the error runs in chunks (Apr 2 - Apr 6)
# Error runs are selected with timestamp == -1, one query per
# (instance, server, database) combination.
# NOTE(review): the query carries no date bound, so it returns every document
# with timestamp == -1 -- confirm this matches the intended date range.
#
# Each entry: (label shown in the progress output,
#              serverType value queried,
#              optional requestSize restriction or None).
error_run_targets = [
    ('java', 'java', None),
    ('nodejs', 'node', ['1', '5', '20', '50']),
]

for instance in instances:
    for label, server_type, request_sizes in error_run_targets:
        for database in databases:
            clear_output()
            display('{} {} {}'.format(instance, label, database))

            query = (
                db.collection('logs')
                .where('timestamp', '==', -1)
                .where('serverType', '==', server_type)
                .where('instanceType', '==', instance)
                .where('databaseType', '==', database)
            )
            if request_sizes is not None:
                query = query.where('requestSize', 'in', request_sizes)

            # Materialise the chunk first so a failed stream adds nothing,
            # then extend in place rather than rebuilding `logs` each time.
            chunk = [doc.to_dict() for doc in query.stream()]
            logs.extend(chunk)
print(len(logs))

'b8 nodejs sql'

218499


In [3]:
start = datetime(2020, 4, 2)
end = datetime(2020, 4, 6)
# Get the success runs in chunks (Apr 2 - Apr 6)
# The range is walked in 5-minute windows ordered by `timestamp`.
# NOTE(review): these datetimes are naive, so .timestamp() interprets them in
# the machine's local timezone -- confirm that matches the stored values.
while start < end:
    end_interval = start + timedelta(minutes=5)

    # Window bounds scaled by 1000 (presumably `timestamp` is stored as epoch
    # milliseconds -- verify against the writer).
    start_s = start.timestamp() * 1000
    end_s = end_interval.timestamp() * 1000
    clear_output()
    display(start)

    # Half-open window [start_s, end_s): start_at includes the lower bound and
    # end_before excludes the upper one, so a record lying exactly on a window
    # boundary is fetched once instead of by two adjacent windows.
    query = (
        db.collection('logs')
        .order_by('timestamp')
        .start_at({'timestamp': start_s})
        .end_before({'timestamp': end_s})
    )
    temp = [doc.to_dict() for doc in query.stream()]
    start = end_interval
    # Extend in place; `logs + temp` would copy the whole accumulator per window.
    logs.extend(temp)
print(len(logs))

datetime.datetime(2020, 4, 5, 23, 55)

1098595


In [4]:
start = datetime(2020, 4, 6)
end = datetime(2020, 4, 9)
# Get all runs in chunks (Apr 6+)
# The range is walked in 5-minute windows ordered by `timestamp`.
# NOTE(review): these datetimes are naive, so .timestamp() interprets them in
# the machine's local timezone -- confirm that matches the stored values.
while start < end:
    end_interval = start + timedelta(minutes=5)

    # Window bounds scaled by 1000 (presumably `timestamp` is stored as epoch
    # milliseconds -- verify against the writer).
    start_s = start.timestamp() * 1000
    end_s = end_interval.timestamp() * 1000
    clear_output()
    display(start)

    # Half-open window [start_s, end_s): start_at includes the lower bound and
    # end_before excludes the upper one, so a record lying exactly on a window
    # boundary is fetched once instead of by two adjacent windows.
    query = (
        db.collection('logs')
        .order_by('timestamp')
        .start_at({'timestamp': start_s})
        .end_before({'timestamp': end_s})
    )
    temp = [doc.to_dict() for doc in query.stream()]
    start = end_interval
    # Extend in place; `logs + temp` would copy the whole accumulator per window.
    logs.extend(temp)

# Keep only records that carry all the fields checked by filter_func.
logs = [record for record in logs if filter_func(record)]
print(len(logs))

datetime.datetime(2020, 4, 8, 23, 55)

2065516


In [7]:
def force_str(val):
    """Coerce `val` to a string, normalising any node variant to "nodejs".

    The value is converted with `str()` before the substring check so that
    non-string inputs (None, NaN, numbers) no longer raise TypeError.

    Args:
        val: Any value; typically a string read from a log record.

    Returns:
        str: "nodejs" if the text contains "node", otherwise `str(val)`.
    """
    text = str(val)
    if "node" in text:
        return "nodejs"
    return text

def force_num(val):
    """Coerce `val` to a float, using -1.0 as the "missing/invalid" sentinel.

    Args:
        val: Any value; typically a number or numeric string from a log record.

    Returns:
        float: `float(val)` when the conversion succeeds and is not NaN;
        -1.0 when `val` is null, cannot be converted, or converts to NaN.
    """
    try:
        # The null check lives inside the try because pd.isnull() returns an
        # array for list-like input, whose truth value raises ValueError.
        if pd.isnull(val):
            return float(-1)
        number = float(val)
    except (TypeError, ValueError, OverflowError):
        return float(-1)
    # Strings such as 'nan' parse to NaN; map them to the sentinel as well so
    # no NaN slips past the null check above.
    if pd.isnull(number):
        return float(-1)
    return number
    
def mem_calc(x):
    """Compute the fraction of memory in use for one record.

    Args:
        x: An object exposing `freeMem` and `totalMem` attributes (the
            notebook passes DataFrame rows via `df.apply(..., axis=1)`).

    Returns:
        float: `(totalMem - freeMem) / totalMem`, or -1.0 when either value
        is the -1 sentinel or `totalMem` is 0 (the ratio is undefined).
    """
    # A zero total would divide by zero; treat it like a missing measurement.
    if x.freeMem == -1 or x.totalMem == -1 or x.totalMem == 0:
        return float(-1)

    return float((x.totalMem - x.freeMem) / x.totalMem)
    

# Columns coerced to float (with -1 as the sentinel) via force_num.
numeric_columns = [
    'clientTotalTime', 'requestSize', 'frequency', 'freeMem', 'timeWrite',
    'totalMem', 'timeRead', 'timeDelete', 'totalTime', 'timestamp',
]
# Columns coerced to normalised strings via force_str.
text_columns = ['instanceType', 'databaseType', 'serverType']

df = pd.DataFrame(logs)

for column in numeric_columns:
    df[column] = df[column].apply(force_num)
for column in text_columns:
    df[column] = df[column].apply(force_str)

# memUsage is derived from the already-coerced freeMem/totalMem columns.
df['memUsage'] = df.apply(mem_calc, axis=1)
df = df.drop(columns=['cpuUsage'])

# Sanity output: frame shape, dtypes, columns and the distinct load settings.
display(df.shape, df.dtypes, df.columns)
display(sorted(df['frequency'].unique()), sorted(df['requestSize'].unique()))

df.to_csv('data/logs.csv', index=False)

(2065516, 15)

timeRead           float64
timeDelete         float64
id                  object
totalTime          float64
timestamp          float64
serverType          object
clientTotalTime    float64
requestSize        float64
frequency          float64
instanceType        object
freeMem            float64
timeWrite          float64
databaseType        object
totalMem           float64
memUsage           float64
dtype: object

Index(['timeRead', 'timeDelete', 'id', 'totalTime', 'timestamp', 'serverType',
       'clientTotalTime', 'requestSize', 'frequency', 'instanceType',
       'freeMem', 'timeWrite', 'databaseType', 'totalMem', 'memUsage'],
      dtype='object')

[1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 15.0, 20.0, 40.0, 60.0]

[1.0, 5.0, 20.0, 50.0, 100.0, 200.0, 500.0]

In [6]:
# Display the `df` frame; the rendered output abbreviates the middle rows.
df

Unnamed: 0,timeRead,timeDelete,id,totalTime,timestamp,serverType,clientTotalTime,requestSize,frequency,instanceType,freeMem,timeWrite,databaseType,totalMem,memUsage
0,-1.0,-1.0,--4qETfhcD,-1.0,-1.000000e+00,java,-1.0,50.0,15.0,b1,-1.000000e+00,-1.0,nosql,-1.000000e+00,-1.000000
1,-1.0,-1.0,--Lo0QBj3fC,-1.0,-1.000000e+00,java,-1.0,200.0,40.0,b1,-1.000000e+00,-1.0,nosql,-1.000000e+00,-1.000000
2,-1.0,-1.0,--P-Z58M2G,-1.0,-1.000000e+00,java,-1.0,1.0,15.0,b1,-1.000000e+00,-1.0,nosql,-1.000000e+00,-1.000000
3,-1.0,-1.0,--nUeIfVXJ,-1.0,-1.000000e+00,java,-1.0,100.0,15.0,b1,-1.000000e+00,-1.0,nosql,-1.000000e+00,-1.000000
4,-1.0,-1.0,--od3lhTv,-1.0,-1.000000e+00,java,-1.0,100.0,15.0,b1,-1.000000e+00,-1.0,nosql,-1.000000e+00,-1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2065511,39.0,62.0,8QIprzVld,387.0,1.586361e+12,nodejs,581.0,50.0,1.0,b8,2.041418e+09,155.0,nosql,2.147484e+09,0.049391
2065512,37.0,90.0,B4QMyuj_U,338.0,1.586361e+12,nodejs,515.0,50.0,1.0,b8,2.041123e+09,86.0,nosql,2.147484e+09,0.049528
2065513,26.0,46.0,CoW098SF1,339.0,1.586361e+12,nodejs,527.0,50.0,1.0,b8,2.041532e+09,166.0,nosql,2.147484e+09,0.049337
2065514,59.0,45.0,_useqzNqg,282.0,1.586361e+12,nodejs,474.0,50.0,1.0,b8,2.042364e+09,90.0,nosql,2.147484e+09,0.048950
