In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
%matplotlib inline
from Tkinter import Tk
from tkFileDialog import askopenfilename
import os
from os import path
from IPython.display import HTML,display

In [46]:
def get_avg_write(scenarios,event,databases):
    """Build a table of mean write times per frequency row and database.

    Six rows are produced (two per frequency: 30hz, 60hz, 120hz). Rows
    alternate between the log found under ``scenarios[0]`` (first row of
    each frequency) and the one found under ``scenarios[1]`` (second row).

    Parameters
    ----------
    scenarios : sequence of str
        ``scenarios[0]`` and ``scenarios[1]`` are the scenario names handed
        to ``get_directory_path``.
    event : sequence of str
        ``event[0]`` is the marker looked for in "neo4j" log lines (the time
        is then read from field 7 of the space-split line); ``event[1]`` is
        the marker looked for otherwise (time read from field 8).
    databases : list of str
        Database names; they become the columns of the result.

    Returns
    -------
    pandas.DataFrame
        Means rounded to 4 decimals after ``outlier_removal``; NaN where no
        sample was found.
    """
    herzs = ["30hz","30hz","60hz","60hz","120hz","120hz"]
    column_labels = ["write(30hz)","write_with_read(30hz)",
                     "write(60hz)","write_with_read(60hz)",
                     "write(120hz)","write_with_read(120hz)"]

    rows = []
    for position, herz in enumerate(herzs):
        # Even positions are the write-only rows, odd ones the mixed rows.
        scenario_index = position % 2
        row = []
        for database in databases:
            directory = get_directory_path(database,herz,scenarios[scenario_index])
            log_path = directory+get_log_file_name(directory)[0]
            with open(log_path) as fp:
                lines = fp.readlines()

            times = []
            for line in lines:
                if "Robot_id" not in line:
                    continue
                # Compare by value: ``is`` only tests object identity and
                # matches equal strings merely by interning accident.
                if database == "neo4j" and event[0] in line:
                    times.append(float(line.split(" ")[7]))
                elif event[1] in line:
                    times.append(float(line.split(" ")[8]))

            times = outlier_removal(times)
            # An empty sample yields NaN explicitly instead of a numpy warning.
            mean_time = np.mean(times) if len(times) else np.nan
            row.append(round(mean_time,4))
        rows.append(row)

    avgs = np.array(rows, dtype=float).reshape(len(herzs), len(databases))
    return pd.DataFrame(avgs, columns=databases, index=column_labels)

def get_avg_read(scenarios,query,databases):
    """Build a table of mean read times per frequency row and database.

    Rows alternate between the log under ``scenarios[0]`` and the log under
    ``scenarios[1]`` for each of the 30hz, 60hz and 120hz frequencies. A log
    line contributes when it contains both "Robot_id" and ``query``; the
    time is read from field 8 of the space-split line. Samples go through
    ``outlier_removal`` and their mean is rounded to 4 decimals.

    Returns a pandas.DataFrame indexed by row label with one column per
    entry of ``databases``.
    """
    frequencies = ["30hz","30hz","60hz","60hz","120hz","120hz"]
    row_labels = ["read(30hz)","read_with_write(30hz)",
                  "read(60hz)","read_with_write(60hz)",
                  "read(120hz)","read_with_write(120hz)"]

    table = np.zeros((0,len(databases)))
    use_first_scenario = True

    for frequency in frequencies:
        row_values = []
        for db_name in databases:
            scenario = scenarios[0] if use_first_scenario else scenarios[1]
            log_dir = get_directory_path(db_name,frequency,scenario)
            log_path = log_dir+get_log_file_name(log_dir)[0]
            with open(log_path) as handle:
                log_lines = handle.readlines()
            samples = [float(entry.split(" ")[8])
                       for entry in log_lines
                       if "Robot_id" in entry and query in entry]
            samples = outlier_removal(samples)
            row_values.append(round(np.mean(samples),4))
        use_first_scenario = not use_first_scenario
        new_row = np.array(row_values, dtype=float).reshape(1,len(databases))
        table = np.vstack([table,new_row])

    return pd.DataFrame(table, index=row_labels, columns=databases)


def _read_replica_timestamps(log_path, marker):
    """Map record key (field 7) to timestamp for lines containing ``marker``."""
    stamps = {}
    with open(log_path) as fp:
        for line in fp:
            if marker not in line:
                continue
            fields = line.split(" ")
            # Fields 9 and 10 are joined into "<date> <time>"; anything from
            # a newline onwards in field 10 is dropped.
            timestamp = fields[9] + " " + fields[10].split("\n")[0]
            stamps[fields[7]] = datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S.%f')
    return stamps


def get_replica_results():
    """Plot the average replication delay of every database as a bar chart.

    For each database the two files in ``./logs/<database>/replication/``
    are used: the one whose name contains "write" supplies write timestamps
    ("replica_test_write" lines), the other supplies read timestamps
    ("replica_test_read" lines). The delay of a record is its read timestamp
    minus its write timestamp, in seconds; the per-database average is
    rounded to 4 decimals.

    Returns None; the chart is drawn on a new matplotlib figure.

    Raises
    ------
    KeyError
        If a key found in the read log has no entry in the write log.
    """
    databases = ["neo4j","orientdb","couchdb","mongodb","cassandra","arangodb","mysql"]
    averages = []

    for database in databases:
        directory_path = "./logs/"+database+"/replication/"
        file_names = get_log_file_name(directory_path)
        write_file = file_names[0] if "write" in file_names[0] else file_names[1]
        read_file = file_names[0] if "read" in file_names[0] else file_names[1]

        write_vals = _read_replica_timestamps(directory_path+write_file, "replica_test_write")
        read_vals = _read_replica_timestamps(directory_path+read_file, "replica_test_read")

        # total_seconds() keeps whole seconds and works when the microsecond
        # part is zero; slicing the digits after "." in str(timedelta) did
        # neither.
        delays = [(read_time - write_vals[key]).total_seconds()
                  for key, read_time in read_vals.items()]

        average_delay = np.average(delays) if delays else np.nan
        averages.append([round(average_delay,4)])

    df = pd.DataFrame(averages,
               index=databases,columns=["Average data replication time"])
    fig, axes = plt.subplots()
    fig.set_figheight(8)
    fig.set_figwidth(12)

    bar_axes = df.plot(ax=axes,kind='bar',table=True,grid=True)

    axes.set_xticklabels([])
    bar_axes.set_ylabel("Average replication time (Seconds)")

def get_directory_path(db_name,herz,scenario):
    """Return the log directory path ``./logs/<db_name>/<scenario>/<herz>/``."""
    # The trailing empty segment produces the closing "/".
    segments = ["./logs", db_name, scenario, herz, ""]
    return "/".join(segments)

def get_log_file_name(directory):
    """Return the names of all entries in ``directory``, sorted by name.

    ``os.listdir`` gives entries in arbitrary order, yet callers index into
    the result (e.g. ``[0]``); sorting makes that choice reproducible.
    """
    return sorted(os.listdir(directory))

def plot(df,title):
    """Draw ``df`` as a titled bar chart with a data table attached.

    A new 17x8 figure is created, ``df`` is plotted on it with grid and
    table enabled, the x tick labels are cleared and the y axis is
    labelled. Nothing is returned.
    """
    figure, axis = plt.subplots()
    figure.set_figwidth(17)
    figure.set_figheight(8)

    chart = df.plot(kind='bar', ax=axis, title=title, grid=True, table=True)

    # Tick labels are blanked; presumably the attached table labels the
    # bars instead.
    axis.set_xticklabels([])
    chart.set_ylabel("Average query execution time (Seconds)")
    
     
def outlier_removal(times, num_std=2):
    """Return the values of ``times`` lying within ``num_std`` standard deviations of the mean.

    Parameters
    ----------
    times : sequence of numbers
        Samples to filter.
    num_std : number, optional
        Width of the accepted band in standard deviations (default 2).

    Returns
    -------
    list
        Values whose distance to the mean is strictly less than
        ``num_std * std``, in their original order. An empty input gives an
        empty list. When the samples have zero spread (a single value or
        all values equal) nothing can be an outlier, so all are returned.
    """
    if len(times) == 0:
        # Avoids numpy's mean-of-empty warning; there is nothing to filter.
        return []

    mean = np.mean(times)
    std = np.std(times)

    if std == 0:
        # The strict "<" test below would otherwise reject every sample.
        return list(times)

    limit = num_std*std
    return [value for value in times if abs(value-mean) < limit]

def get_avg(event):
    """Print per-database averages of the write-only and the mixed rows.

    Rows of ``event`` are taken alternately: the 1st, 3rd, 5th, ... rows
    are "write only" and the 2nd, 4th, 6th, ... rows are "write and read
    together". Each cell is rounded to 4 decimals, summed per column and
    divided by the number of rows in its group. Both groups are printed
    sorted by ascending average (ties broken by column name).

    Parameters
    ----------
    event : pandas.DataFrame
        One numeric column per database.

    Returns
    -------
    None
    """
    # Use the frame's own columns and row counts rather than a fixed
    # database list and a fixed divisor, so any database set or number of
    # frequencies is handled.
    databases = list(event.columns)
    write_only = dict.fromkeys(databases, 0)
    write_read = dict.fromkeys(databases, 0)
    totals = (write_only, write_read)
    row_counts = [0, 0]

    for position, (_, row) in enumerate(event.iterrows()):
        bucket = position % 2
        row_counts[bucket] += 1
        for database in databases:
            totals[bucket][database] += round(float(row[database]),4)

    for bucket, sums in enumerate(totals):
        # "or 1" guards against dividing by zero when a group has no rows.
        divisor = float(row_counts[bucket] or 1)
        for database in databases:
            sums[database] = sums[database]/divisor

    by_value_then_name = lambda item: (item[1], item[0])

    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("Write only")
    for key, value in sorted(write_only.items(), key=by_value_then_name):
        print("%s: %s" % (key, value))

    print("\n"+"Write and read together")
    for key, value in sorted(write_read.items(), key=by_value_then_name):
        print("%s: %s" % (key, value))

In [5]:
# def outlier_removal(file_path,event):
# # mu  = mean of the data
# # std = standard deviation of the data
# # IF abs(x-mu) > 2*std  THEN  x is outlier
#     with open(file_path) as fp:
#         lines = fp.readlines()
    
# #     times = np.zeros((1,0))
#     times = []
#     for line in lines:
#         if "Robot_id" in line and event in line:
# #             times = np.hstack([times,[[float(line.split(" ")[8])]]])
#             times.append(float(line.split(" ")[8]))
#     mean = np.mean(times)
#     std = np.std(times)
#     print mean,std
    
#     counter = 0
#     for time in times:
#         if abs(time-mean) > 2*std:
#             print time
#             counter +=1
#     print counter

# directory = get_directory_path("cassandra","120hz","rw_read")
# file_path = directory+get_log_file_name(directory)
# outlier_removal(file_path,"Query_id : 0")

0.00167591486082 0.00149422526985
0.00548791885376
0.00487780570984
0.0068039894104
0.00650691986084
0.00738501548767
0.00468015670776
0.00630211830139
0.00553703308105
0.00644207000732
0.0114879608154
0.00940895080566
0.00846695899963
0.00470280647278
0.00631213188171
0.00641202926636
0.00892210006714
0.00806593894958
0.00604319572449
0.0052318572998
0.00606393814087
0.00823497772217
0.00693511962891
0.006432056427
0.00688099861145
0.00574111938477
0.00668287277222
0.0110430717468
0.00482106208801
0.00498700141907
0.00867390632629
0.0046648979187
0.00627112388611
0.00658583641052
0.00612878799438
0.00821614265442
0.00486397743225
0.0046648979187
0.00613594055176
0.00648593902588
0.00502419471741
0.00680685043335
0.0106899738312
0.00635504722595
0.010055065155
0.00980496406555
0.00858092308044
0.0106289386749
0.00603413581848
0.00552701950073
0.00602412223816
0.0150949954987
0.00486397743225
0.008465051651
0.00655007362366
0.00555801391602
0.00594305992126
0.00477194786072
0.0072250366