In [1]:
import json
import numpy as np
import pandas as pd
import regex
import os

In [2]:
def get_tracing(path):
    """Load an omptracing JSON file into a DataFrame of trace events.

    Each entry of the JSON document is read through its "name", "pid",
    "tid" and "ts" fields and mapped to one row:

    * pid "All" and name "Initialize"      -> Event "Start"
    * pid "All" and name "Finalize"        -> Event "End"
    * pid "All" and tid "Parallel Region"  -> Event "Parallel Region"
    * anything else is handled as a per-thread entry: Thread is the integer
      found after the first space of pid, Event is the entry's tid.

    Input: path of the json file
    Output: a dataframe with the columns Thread, Time and Event, one row per
            JSON entry, in file order, indexed 0..n-1
    """
    columns = ["Thread", "Time", "Event"]
    with open(path, 'r') as f:
        json_data = json.load(f)

    def _get_df_row(term):
        pid = term["pid"]
        if pid == "All":
            if term["name"] == "Initialize":
                return {"Thread": pid, "Time": term["ts"], "Event": "Start"}
            if term["name"] == "Finalize":
                return {"Thread": pid, "Time": term["ts"], "Event": "End"}
            if term["tid"] == "Parallel Region":
                return {"Thread": pid, "Time": term["ts"], "Event": term["tid"]}
        # Per-thread entry: the thread id is the second space-separated
        # field of pid. An unrecognised "All" entry also ends up here and
        # fails on the missing second field.
        return {
            "Thread": int(pid.split(" ")[1]),
            "Time": term["ts"],
            "Event": term["tid"],
        }

    # Build every row first and create the frame once; adding rows one at a
    # time with .loc re-allocates the frame on every insertion.
    rows = [_get_df_row(term) for term in json_data]
    # object dtype: the Thread column mixes integer ids with the string "All".
    return pd.DataFrame(rows, columns=columns, dtype=object)

In [3]:
def thread_ratio(df):
    """Longest worker-thread time divided by the time of thread 0.

    Only rows whose Event is "Thread Region" are used. For one thread, its
    rows are ordered by Time and read as consecutive (first, second) pairs;
    the thread's time is the sum of second - first over those pairs.

    The numerator is the largest such time among the worker threads, i.e.
    every thread id except the smallest one. The denominator is the time of
    thread id 0.

    Input: df, the dataframe of the tracing file
    Output: the ratio, or 0 when there is no worker thread
    Raises: AssertionError when a thread has an odd number of "Thread Region"
            rows; ZeroDivisionError when workers exist but thread 0 has
            accumulated no time
    """
    regions = df[df["Event"] == "Thread Region"].sort_values(by=["Thread", "Time"])
    thread_ids = np.sort(np.int64(regions["Thread"].unique()))

    def _thread_time(thread_id):
        stamps = regions.loc[regions["Thread"] == thread_id, "Time"].tolist()
        assert len(stamps) % 2 == 0, (
            "thread %s has an odd number of 'Thread Region' events" % thread_id
        )
        total = 0
        for begin, end in zip(stamps[0::2], stamps[1::2]):
            total += end - begin
        return total

    seq_time = _thread_time(0)
    # The smallest thread id is skipped: it is taken to be the sequential thread.
    worker_times = [_thread_time(thread_id) for thread_id in thread_ids[1:]]
    if not worker_times:
        return 0
    return max(worker_times) / seq_time

In [4]:
def overhead_ratio(df):
    """Largest per-thread overhead time divided by the total parallel time.

    "Parallel Region" and "Work Region" rows are each ordered by Thread and
    then Time. The parallel time is the sum, over consecutive row pairs of
    the parallel rows, of second Time - first Time.

    For every thread id that has "Thread Region" rows, the overhead is the
    sum over the even positions j of the parallel rows of
    (Time of that thread's j-th work row) - (Time of the j-th parallel row).
    Even positions are presumably the region starts - verify against the
    trace format.

    Input: dataframe of the tracing file
    Output: max overhead / parallel time, or 0 when no thread has
            "Thread Region" rows
    """
    def _events_named(label):
        # Rows of one event type, ordered by thread then time, renumbered from 0.
        picked = df[df["Event"] == label]
        return picked.sort_values(by=["Thread", "Time"]).reset_index(drop=True)

    parallel_events = _events_named("Parallel Region")
    work_events = _events_named("Work Region")

    thread_rows = df[df["Event"] == "Thread Region"]
    thread_ids = np.sort(np.int64(thread_rows["Thread"].unique()))

    n_parallel = len(parallel_events.index)

    # Total time covered by the parallel regions.
    parallel_total = 0
    for pos in range(0, n_parallel, 2):
        parallel_total += (
            parallel_events.iloc[pos + 1]["Time"] - parallel_events.iloc[pos]["Time"]
        )

    # Accumulated gap, per thread, between a parallel row and the thread's
    # work row at the same (even) position.
    per_thread = []
    for thread_id in thread_ids:
        own_work = work_events[work_events["Thread"] == thread_id]
        gap = 0
        for pos in range(0, n_parallel, 2):
            gap += own_work.iloc[pos]["Time"] - parallel_events.iloc[pos]["Time"]
        per_thread.append(gap)

    if not per_thread:
        return 0
    return max(per_thread) / parallel_total
    

In [5]:
def sync_ratio(df):
    """Largest per-thread sync time divided by the total parallel time.

    "Parallel Region" and "Work Region" rows are each ordered by Thread and
    then Time. The parallel time is the sum, over consecutive row pairs of
    the parallel rows, of second Time - first Time.

    For every thread id that has "Thread Region" rows, the sync time is the
    sum over the odd positions j of the parallel rows of
    (Time of the j-th parallel row) - (Time of that thread's j-th work row).
    Odd positions are presumably the region ends - verify against the trace
    format.

    Input: dataframe, the omptracing result
    Output: max sync time / parallel time, or 0 when no thread has
            "Thread Region" rows
    """
    order = ["Thread", "Time"]
    par_rows = df[df["Event"] == "Parallel Region"]
    par_rows = par_rows.sort_values(by=order).reset_index(drop=True)
    work_rows = df[df["Event"] == "Work Region"]
    work_rows = work_rows.sort_values(by=order).reset_index(drop=True)

    thread_rows = df[df["Event"] == "Thread Region"]
    tids = np.sort(np.int64(thread_rows["Thread"].unique()))
    par_count = len(par_rows.index)

    def _wait_after_work(tid):
        # Sum of (parallel row time - this thread's work row time) at odd positions.
        mine = work_rows[work_rows["Thread"] == tid]
        waited = 0
        for end_pos in range(1, par_count, 2):
            waited += par_rows.iloc[end_pos]["Time"] - mine.iloc[end_pos]["Time"]
        return waited

    span = 0
    for first in range(0, par_count, 2):
        span += par_rows.iloc[first + 1]["Time"] - par_rows.iloc[first]["Time"]

    waits = [_wait_after_work(tid) for tid in tids]
    if len(waits) == 0:
        return 0
    return max(waits) / span

In [6]:
def parallel_region_ratio(df):
    """Total parallel-region time divided by the total run time.

    The run time is the Time of the first "End" row minus the Time of the
    first "Start" row. The parallel time is computed from the
    "Parallel Region" rows of thread "All": ordered by Time, they are read as
    consecutive (begin, end) pairs and the pair durations are added up.

    Input: dataframe, the omptracing result
    Output: float, summed parallel region time / total run time
    Raises: AssertionError when the number of such rows is odd
    """
    events = df["Event"]
    end_stamp = df[events == "End"]["Time"].values[0]
    start_stamp = df[events == "Start"]["Time"].values[0]
    run_time = end_stamp - start_stamp

    is_region = (df["Thread"] == "All") & (events == "Parallel Region")
    regions = df[is_region].sort_values(by=["Time"])
    n_rows = regions.shape[0]
    assert(n_rows % 2 == 0)

    parallel_time = 0
    for begin_pos in range(0, n_rows, 2):
        opened = regions.iloc[begin_pos]["Time"]
        closed = regions.iloc[begin_pos + 1]["Time"]
        parallel_time += closed - opened
    return parallel_time / run_time

In [7]:
def get_valgrind(path):
    """Parse a valgrind log file into a table of counts and a table of ratios.

    The log is expected to contain two header lines, both recognised by an
    exact prefix match (spacing included):

    * the summary header (the nine event names); the line two below it is
      parsed as the overall totals and becomes row 0;
    * the per-function header (the nine event names followed by
      "file:function"); data rows are read starting three lines below it and
      stop at the first line that has fewer than 10 fields.

    A data line holds nine counts, each optionally followed by a percentage
    in parentheses. Counts are stored as ints (thousands separators
    removed), percentages as fractions (0.0 when absent). The Function
    column holds the last space-separated field of the line, or None when
    that field starts with ".".

    Input: path of the log file
    Output: a dataframe of the counts and a dataframe of the ratios, both with
            the columns Ir, I1mr, ILmr, Dr, D1mr, DLmr, Dw, D1mw, DLmw, Function
    Raises: AssertionError when a header or the totals line cannot be found;
            ValueError / IndexError when a data line with 10 or more fields
            does not follow the count/percentage layout
    """
    columns = ["Ir", "I1mr", "ILmr", "Dr", "D1mr", "DLmr", "Dw", "D1mw", "DLmw", "Function"]
    n_events = len(columns) - 1
    summary_header = "Ir                 I1mr           ILmr           Dr               D1mr           DLmr           Dw               D1mw           DLmw"
    function_header = "Ir                 I1mr         ILmr         Dr               D1mr         DLmr         Dw              D1mw           DLmw          file:function"

    # The context manager closes the file even when parsing fails later on.
    with open(path, "r") as logfile:
        line_list = logfile.readlines()
    line_len = len(line_list)

    def _find_header(header, start):
        # Index of the first line at or after `start` beginning with `header`,
        # or None. None (not 0) is the "missing" marker so that a header on
        # the very first line is still accepted.
        for line_index in range(start, line_len):
            if line_list[line_index].startswith(header):
                return line_index
        return None

    def _read_valgrind_line(line):
        # Fields are separated by spaces and parentheses; empty fields are dropped.
        fields = [tok for tok in line.replace("(", " ").replace(")", " ").split(" ") if tok]
        if len(fields) < 10:
            return None, None, None
        counts = []
        ratios = []
        k = 0
        for _ in range(n_events):
            counts.append(int(fields[k].replace(",", "")))
            k += 1
            # A percentage may follow the count; without one the ratio is 0.0.
            candidate = fields[k]
            if candidate[-1] == "%":
                ratios.append(float(candidate[:-1]) / 100)
                k += 1
            else:
                ratios.append(0.0)
        func = fields[-1].replace("\n", "")
        if len(func) > 0 and func[0] == ".":
            func = None
        return counts, ratios, func

    count_rows = []
    ratio_rows = []

    # The stats of all data
    head_index = _find_header(summary_header, 0)
    assert head_index is not None, "summary header not found in valgrind log"
    counts, ratios, func = _read_valgrind_line(line_list[head_index + 2])
    assert counts is not None, "totals line below the summary header could not be parsed"
    count_rows.append(counts + [func])
    ratio_rows.append(ratios + [func])

    # The stats of functions
    begin_index = _find_header(function_header, head_index + 2)
    assert begin_index is not None, "per-function header not found in valgrind log"
    for line in line_list[begin_index + 3:]:
        counts, ratios, func = _read_valgrind_line(line)
        if counts is None:
            break
        count_rows.append(counts + [func])
        ratio_rows.append(ratios + [func])

    # Whole rows (function name included) are assembled first and the frames
    # built once: writing through df.loc[k]["Function"] is chained assignment
    # and may only modify a temporary copy of the row.
    df_counts = pd.DataFrame(count_rows, columns=columns, dtype='object')
    df_ratios = pd.DataFrame(ratio_rows, columns=columns, dtype='object')
    return df_counts, df_ratios


In [8]:
def get_ratio_sync_paral(test_case):
    """Print the tracing ratios of every trace in a directory.

    For each ".json" file directly inside `test_case` (visited in sorted
    name order) the trace is loaded with get_tracing and the parallel region
    ratio, sync ratio, thread ratio and sync / parallel quotient are printed.
    The list of quotients is printed at the end.

    input test_case: the path of the test case
    output: the mean of the sync / parallel quotients over all traces, or
            NaN when the directory holds no ".json" file
    """
    # os.listdir gives no ordering guarantee; sorting keeps the printed
    # report and the order of `ratios` reproducible between runs.
    jsonfiles = sorted(file for file in os.listdir(test_case) if file.endswith(".json"))
    ratios = []
    for jsonfile in jsonfiles:
        print(jsonfile)
        result = get_tracing(os.path.join(test_case, jsonfile))
        parallel_ratio = parallel_region_ratio(result)
        s_ratio = sync_ratio(result)
        sync_per_parallel = s_ratio / parallel_ratio
        print("parllel region ratio:", parallel_ratio)
        print("sync region ratio   :", s_ratio)
        print("thread ratio        :", thread_ratio(result))
        print("sync / parallel:", sync_per_parallel)
        ratios.append(sync_per_parallel)
    print(ratios)
    if not ratios:
        # Nothing to average: return NaN directly rather than through the
        # mean of an empty array.
        return np.nan
    avg_ratio = np.array(ratios).mean()
    return avg_ratio

In [9]:
# Load the four sample omptracing traces: two genprime variants and two sum variants.
(
    result_genprime_atomic,
    result_genprime_critical,
    result_sum_reduction,
    result_sum_critical,
) = [
    get_tracing(trace_path)
    for trace_path in (
        "lack_of_parallelism/genprime/omptracing_atomic.json",
        "lack_of_parallelism/genprime/omptracing_critical.json",
        "lack_of_parallelism/sum/omptracing_reduction.json",
        "lack_of_parallelism/sum/omptracing_critical.json",
    )
]

In [10]:
# Print the per-trace ratios for both test-case directories.
# Only the second call is the last expression of the cell, so only its return
# value is shown as the cell result; the first call's return value is discarded.
get_ratio_sync_paral("lack_of_parallelism/genprime/")
get_ratio_sync_paral("lack_of_parallelism/sum")

omptracing_atomic.json
parllel region ratio: 0.9136004699041345
sync region ratio   : 4.006610293051377e-06
thread ratio        : 0
sync / parallel: 4.385516891723793e-06
omptracing_critical.json
parllel region ratio: 0.9794384503878646
sync region ratio   : 9.517424085682261e-07
thread ratio        : 0
sync / parallel: 9.717225295691928e-07
[4.385516891723793e-06, 9.717225295691928e-07]
omptracing_reduction.json
parllel region ratio: 0.12537657803997282
sync region ratio   : 0.027854393688330828
thread ratio        : 0
sync / parallel: 0.2221658472721295
omptracing_critical.json
parllel region ratio: 0.7678902588721925
sync region ratio   : 3.959937115212455e-06
thread ratio        : 0
sync / parallel: 5.156904999717604e-06
[0.2221658472721295, 5.156904999717604e-06]


0.11108550208856462

In [11]:
# Parse the sample valgrind log into its counts table and its ratios table.
(result_valgrind_counts, result_valgrind_ratios) = get_valgrind(
    path="valgrind_sample.txt"
)