In [1]:
import pandas as pd
from firebase_admin import credentials, firestore, initialize_app
from datetime import datetime, timedelta
from IPython.display import clear_output

# Load the credential from the local 'cred.json' file and use it to
# initialise the default Firebase app.
service_account = credentials.Certificate('cred.json')
initialize_app(service_account)

# Firestore client bound to the app initialised above.
db = firestore.client()

In [2]:
# Field names listed for a log document, and how many there are.
keys_expected = 15
all_keys = [
    'clientTotalTime',
    'cpuUsage',
    'databaseType',
    'freeMem',
    'frequency',
    'id',
    'instanceType',
    'requestSize',
    'serverType',
    'timeDelete',
    'timeRead',
    'timeWrite',
    'timestamp',
    'totalMem',
    'totalTime',
]

# Dimension values, each kept in ascending order.
servers = ['java', 'nodejs']
servers.sort()
databases = ['nosql', 'sql']
databases.sort()
instances = ['b1', 'b2', 'b4', 'b8']
instances.sort()

# Accumulator for every log dict fetched from Firestore.
logs = []

def filter_func(x):
    """Return True when `x` carries every identifying field, else False.

    The required fields are 'id', 'instanceType', 'serverType' and
    'databaseType'. Only their presence is checked, not their values.

    Parameters:
        x: a mapping-like log entry (normally the dict built from a
           Firestore document).

    Returns:
        bool: True if all four keys can be looked up on `x`; False if any
        lookup fails or `x` does not support key lookup at all (e.g. None).
    """
    required_keys = ('id', 'instanceType', 'serverType', 'databaseType')
    try:
        for key in required_keys:
            x[key]
    # Only lookup failures mean "entry is incomplete". A bare `except:` would
    # also swallow KeyboardInterrupt/SystemExit and hide unrelated bugs.
    except (KeyError, IndexError, TypeError):
        return False
    return True
    
    
# Get the error runs in chunks (Apr 2 - Apr 6)
# Error runs are selected by the sentinel value timestamp == -1, one query per
# (instance, server, database) combination.
# Each entry: (label shown in the progress line,
#              serverType value used in the query,
#              optional list of requestSize values to restrict the query to).
# NOTE(review): the Node.js query matches serverType 'node' (not 'nodejs') and
# is the only one restricted by requestSize (string values) -- confirm both
# are intentional.
error_run_queries = [
    ('java', 'java', None),
    ('nodejs', 'node', ['1', '5', '20', '50']),
]

for instance in instances:
    for label, server_type, request_sizes in error_run_queries:
        for database in databases:
            # Progress indicator: replace the previous cell output.
            clear_output()
            display('{} {} {}'.format(instance, label, database))

            query = (
                db.collection('logs')
                .where('timestamp', '==', -1)
                .where('serverType', '==', server_type)
                .where('instanceType', '==', instance)
                .where('databaseType', '==', database)
            )
            if request_sizes is not None:
                query = query.where('requestSize', 'in', request_sizes)

            # Extend in place rather than `logs = logs + temp`, which copies
            # the whole accumulated list on every iteration.
            logs.extend(doc.to_dict() for doc in query.stream())


def fetch_logs_between(range_start, range_end, step=timedelta(minutes=15)):
    """Fetch log documents with range_start <= timestamp < range_end.

    The range is walked in `step`-sized windows so that no single Firestore
    query has to stream the whole range. The start of each window is shown as
    a progress indicator, replacing the previous cell output.

    Windows are half-open (`start_at` inclusive, `end_before` exclusive).
    Using an inclusive `end_at` would fetch a document whose timestamp sits
    exactly on a window boundary twice: once as the end of one window and
    once as the start of the next.

    Parameters:
        range_start (datetime): inclusive lower bound.
        range_end (datetime): exclusive upper bound.
        step (timedelta): window size; defaults to 15 minutes.

    Returns:
        list[dict]: one dict per fetched document, in query order.
    """
    fetched = []
    window_start = range_start
    while window_start < range_end:
        # Clamp the last window so it never reaches past range_end.
        window_end = min(window_start + step, range_end)

        clear_output()
        display(window_start)

        # Bounds are passed as epoch milliseconds.
        # NOTE(review): the datetimes used below are naive, so .timestamp()
        # interprets them in the machine's local timezone -- confirm that
        # matches how the logs were written.
        query = (
            db.collection('logs')
            .order_by('timestamp')
            .start_at({'timestamp': window_start.timestamp() * 1000})
            .end_before({'timestamp': window_end.timestamp() * 1000})
        )
        fetched.extend(doc.to_dict() for doc in query.stream())
        window_start = window_end
    return fetched


# Get the success runs in chunks (Apr 2 - Apr 6)
logs.extend(fetch_logs_between(datetime(2020, 4, 2), datetime(2020, 4, 6)))

# Get all runs in chunks (Apr 6+)
logs.extend(fetch_logs_between(datetime(2020, 4, 6), datetime(2020, 4, 8)))

# Keep only the entries accepted by filter_func, then report how many remain.
logs = [entry for entry in logs if filter_func(entry)]
print(len(logs))

datetime.datetime(2020, 4, 7, 23, 45)

1491154


In [3]:
def force_str(val):
    """Coerce `val` to a string, normalising Node.js labels to "nodejs".

    Any value whose string form contains "node" (e.g. "node", "nodejs") is
    returned as "nodejs"; everything else is returned as `str(val)`.

    Parameters:
        val: any value; it does not have to be a string.

    Returns:
        str: "nodejs" or the string form of `val`.
    """
    # Stringify first: `"node" in val` raises TypeError for non-iterable
    # values such as None, NaN or plain numbers.
    text = str(val)
    if "node" in text:
        return "nodejs"
    return text

def force_num(val):
    """Coerce `val` to a float, using -1.0 for missing or unparsable values.

    Parameters:
        val: any value (number, numeric string, None, NaN, ...).

    Returns:
        float: `float(val)` when the conversion succeeds and `val` is not
        null; otherwise the sentinel -1.0.
    """
    try:
        # The null check comes first because float(nan) succeeds and would
        # return NaN instead of the sentinel. It sits inside the try because
        # pd.isnull on a list-like returns an array whose truth value raises
        # ValueError.
        if pd.isnull(val):
            return float(-1)
        return float(val)
    # Only conversion failures map to the sentinel; a bare `except:` would
    # also swallow KeyboardInterrupt/SystemExit.
    except (TypeError, ValueError, OverflowError):
        return float(-1)
    
def mem_calc(x):
    """Return the fraction of memory in use for one log row.

    Computed as (totalMem - freeMem) / totalMem.

    Parameters:
        x: a row-like object exposing `freeMem` and `totalMem` attributes
           (e.g. the Series passed by `DataFrame.apply(..., axis=1)`).

    Returns:
        float: the used-memory fraction, or the sentinel -1.0 when either
        input is the -1 "missing" sentinel or `totalMem` is 0.
    """
    if x.freeMem == -1 or x.totalMem == -1:
        return float(-1)
    # A total of 0 cannot yield a ratio; return the sentinel instead of
    # dividing by zero.
    if x.totalMem == 0:
        return float(-1)

    return float((x.totalMem - x.freeMem) / x.totalMem)
    

# Columns coerced to float via force_num.
numeric_columns = [
    'clientTotalTime', 'requestSize', 'cpuUsage', 'frequency', 'freeMem',
    'timeWrite', 'totalMem', 'timeRead', 'timeDelete', 'totalTime',
    'timestamp',
]
# Columns coerced to str via force_str.
text_columns = ['instanceType', 'databaseType', 'serverType']

# One row per log entry, indexed by the entry's 'id'.
df = pd.DataFrame(logs).set_index('id')

# Assigning to an existing column keeps its position, so the column order of
# the frame is unaffected by the order of these loops.
for column in numeric_columns:
    df[column] = df[column].apply(force_num)
for column in text_columns:
    df[column] = df[column].apply(force_str)

# Derived column; computed after freeMem/totalMem have been coerced.
df['memUsage'] = df.apply(mem_calc, axis=1)


display(df.shape, df.dtypes)

# Distinct values present in the data, in ascending order.
frequencies = sorted(df['frequency'].unique())
sizes = sorted(df['requestSize'].unique())

display(frequencies, sizes)

df.to_csv('logs.csv')
df.head()

(1491154, 15)

instanceType        object
freeMem            float64
timeWrite          float64
databaseType        object
totalMem           float64
timeRead           float64
timeDelete         float64
totalTime          float64
timestamp          float64
serverType          object
clientTotalTime    float64
requestSize        float64
cpuUsage           float64
frequency          float64
memUsage           float64
dtype: object

[1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 15.0, 20.0, 40.0, 60.0]

[1.0, 5.0, 20.0, 50.0, 100.0, 200.0, 500.0]

Unnamed: 0_level_0,instanceType,freeMem,timeWrite,databaseType,totalMem,timeRead,timeDelete,totalTime,timestamp,serverType,clientTotalTime,requestSize,cpuUsage,frequency,memUsage
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
--4qETfhcD,b1,-1.0,-1.0,nosql,-1.0,-1.0,-1.0,-1.0,-1.0,java,-1.0,50.0,-1.0,15.0,-1.0
--Lo0QBj3fC,b1,-1.0,-1.0,nosql,-1.0,-1.0,-1.0,-1.0,-1.0,java,-1.0,200.0,-1.0,40.0,-1.0
--P-Z58M2G,b1,-1.0,-1.0,nosql,-1.0,-1.0,-1.0,-1.0,-1.0,java,-1.0,1.0,-1.0,15.0,-1.0
--nUeIfVXJ,b1,-1.0,-1.0,nosql,-1.0,-1.0,-1.0,-1.0,-1.0,java,-1.0,100.0,-1.0,15.0,-1.0
--od3lhTv,b1,-1.0,-1.0,nosql,-1.0,-1.0,-1.0,-1.0,-1.0,java,-1.0,100.0,-1.0,15.0,-1.0
