In [1]:
from datetime import datetime
import re
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio

def time_translator(time):
    # translate time(yyyy-mm-ddThh:mm:ss) into time(sec)
    return int(datetime.strptime(time, "%Y-%m-%dT%H:%M:%S").timestamp())

def time_to_seconds(time):
    # translate time(hh:mm:ss) into time(sec)
    t = datetime.strptime(time, "%H:%M:%S")
    return t.hour * 3600 + t.minute * 60 + t.second

def trans_x_time(df, unit, unittime, trans_firsttime):
    # cpu_use_rate.index * unittime + trans_firsttime
    # translate df.index(time block) into time(%Y-%m-%dT%H:%M:%S)
    time = [datetime.fromtimestamp(i * unittime + trans_firsttime).strftime('%Y-%m-%dT%H:%M:%S') for i in range(unit)]
    df.index = time
    

def init_the_time(firsttime, lasttime, unit):
    # set the time data, return trans_firsttime, trans_lasttime, unittime
    trans_lasttime = time_translator(lasttime)
    trans_firsttime = time_translator(firsttime)
    unittime = (trans_lasttime - trans_firsttime) // unit
    return trans_firsttime, trans_lasttime, unittime

def extend_node_list(nodelist):
    # icpnp[101-103] -> [icpnp101, icpnp102, icpnp103]
    # 使用正規表達式來解析字串
    pattern = r'([a-zA-Z]+)(\[\d+-\d+(?:,\d+-\d+)*\])'
    matches = re.findall(pattern, nodelist)

    # 將符合的結果整理成 list
    result = []
    for match in matches:
        ranges = match[1][1:-1].split(',')
        for r in ranges:
            start, end = map(int, r.split('-'))
            for i in range(start, end + 1):
                result.append(f"{match[0]}{i}")
    return result

def is_valid_datetime_format(date_string):
    try:
        datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%S")
        return True
    except ValueError:
        return False

def job_processor(row, trans_firsttime, unittime):
    """
    Process the job imformation into:
        jobstart (int):                *Begin unit* of the job
        jobend (int):                  *End unit* of the job
        nodelist (list[str]):          All nodes the job works
        allocated_cpu (list[float]):   Number of allocated CPU
        cpu_useratio (list[float]):    Utilization of CPU
        wait_time (int):               Waiting time of the job
        allocated_cpu(backfill):       Number of allocated CPU(backfill)
    
    Parameters:
        row (DataFrame):               One job imformation from dataframe
        trans_firsttime (int):         Begin time of timeline(sec)
        unittime (int):                Length of one unit time(sec)
    """
    
    try:
        jobstart = (time_translator(row.Start) - trans_firsttime) // unittime
        if ((time_translator(row.Start) - trans_firsttime) % unittime) != 0:
            jobstart += 1
    except:
        jobstart = None
    try:
        jobend = (time_translator(row.End) - trans_firsttime) // unittime
        if ((time_translator(row.End) - trans_firsttime) % unittime) != 0:
            jobend += 1
    except:
        jobend = None

    nodelist = row.NodeList
    if '[' in nodelist: nodelist = extend_node_list(nodelist)
    result = [jobstart, jobend, nodelist]
    
    # allocated_cpu part
    try: 
        #row.AllocCPUS / int(row.AllocNodes
        allocated_value = int(row.AllocCPUS) / int(row.AllocNodes)
        allocated_cpu = [allocated_value for _ in range(jobend - jobstart + 1)]
        
        if (jobend - jobstart) > 0:
            allocated_cpu[0] = allocated_value*(1 - (((time_translator(row.Start) - trans_firsttime) % unittime) / unittime))
            allocated_cpu[-1] = allocated_value*(((time_translator(row.End) - trans_firsttime) % unittime) / unittime)
        else:
            allocated_cpu[0] = allocated_value*((time_translator(row.End) - time_translator(row.Start)) / unittime)

        result.append(allocated_cpu)
        
    except:
        allocated_cpu = [0]
        result.append(allocated_cpu)
    
    # cpu_useratio part
    try:
        useratio_value = (time_to_seconds(row.TotalCPU) / (time_translator(row.End) - time_translator(row.Start)) / int(row.AllocCPUS))
        allocated_cpu = [useratio_value for _ in range(jobend - jobstart + 1)]
        
        if (jobend - jobstart) > 0:
            allocated_cpu[0] = allocated_value*(1 - (((time_translator(row.Start) - trans_firsttime) % unittime) / unittime))
            allocated_cpu[-1] = allocated_value*(((time_translator(row.End) - trans_firsttime) % unittime) / unittime)
        else:
            allocated_cpu[0] = allocated_value*((time_translator(row.End) - time_translator(row.Start)) / unittime)

        result.append(allocated_cpu)
    except:
        cpu_useratio = [0]
        result.append(cpu_useratio)
    
    # wait_time part
    try:
        wait_time = time_translator(row.Start) - time_translator(row.Submit)
        result.append(wait_time)
    except:
        wait_time = None
        result.append(wait_time)

    #AllocCPUS
    result.append(row.AllocCPUS)
    
    #work time
    try:
        work_time = time_translator(row.End) - time_translator(row.Start)
        result.append(work_time)
    except:
        work_time = None
        result.append(work_time)
        
    #cancel time
    # if 'CANCELLED' in row.State:
    #     try:
    #         cancel_time = time_translator(row.End) - time_translator(row.Submit)
    #         result.append(cancel_time)
    #     except:
    #         cancel_time = None
    #         result.append(cancel_time)
    # else:
    #     cancel_time = None
    #     result.append(cancel_time) 

    return result

def make_data(unit):
    # Creat the zero matrix
    data = np.zeros((unit, 598))
    # Translate into dataFrame
    data = pd.DataFrame(data)
    # Set the node name
    columns = []
    for i in range(101, 157):
        columns.append(f'icpnq{i}')
    for i in range(201, 257):
        columns.append(f'icpnq{i}')
    for i in range(301, 357):
        columns.append(f'icpnq{i}')
    for i in range(401, 457):
        columns.append(f'icpnq{i}')
    for i in range(501, 557):
        columns.append(f'icpnq{i}')
    for i in range(601, 657):
        columns.append(f'icpnq{i}')
    for i in range(701, 757):
        columns.append(f'icpnq{i}')
    for i in range(101, 157):
        columns.append(f'icpnp{i}')
    for i in range(201, 257):
        columns.append(f'icpnp{i}')
    for i in range(301, 349):
        columns.append(f'icpnp{i}')
    for i in range(1, 7):
        columns.append(f'gpn0{i}')
    for i in range(1, 41):
        columns.append(f'ncpn{i}')
    # Renew column name of dataFrame
    data.columns = columns
    return data

def add_value_to_data(data, timestart, timeend, nodelist, value):
    """
    Add value on data[timestart:timeend+1, nodelist]
    
    Parameters:
        data (DataFrame): DataFrame to catch the value(from make_data())
        timestart (int):  Begin time of the job
        timeend (int):    End time of the job
        nodelist (list):  Node list of the job
        value (float or list):    value you want to add
    """
    if isinstance(value, list):
        for i in range(len(value)):
            data.loc[timestart+i, nodelist] += value[i]
    else:
        data.loc[timestart:timeend+1, nodelist] += value
    return data

def plot_heatmap(df, title):
    # Draw heatmap
    fig = go.Figure(data=go.Heatmap(
        z=df.values,
        x=df.columns,
        y=df.index,
        colorscale=[[0, 'rgb(255,255,255)'], [0.0001, 'rgb(200,200,255)'], [1, 'rgb(0,0,255)']],
        showscale=False,
        colorbar=dict(thickness=20, ticklen=4),
        zmin=0,  # 最小值為0
        zmax=df.values.max()
    ))

    fig.update_layout(
        title={'text':title, 'font':{'size': 70}},
        xaxis_nticks=36,
        plot_bgcolor='White',  # 將背景設置為白色
        #width=500,  # 圖的寬度
        height=1800,
        yaxis={'tickfont':{'size':60}},  # 調整y軸標籤字體大小
        xaxis={'tickfont':{'size':60}},
        #xaxis={'tickfont':{'size':60}, 'range':['2024-07-01T00:00:00','2024-07-17T00:00:00']},  # 調整x軸標籤字體大小
        showlegend=False
    )

    fig.update_traces(hoverongaps=False)  # 不顯示空值的tooltip
    fig.update_traces(zmid=0, colorbar=dict(
        tickvals=[0, df.values.max()],
        ticktext=['0', str(df.values.max())]
    ))

    fig.add_shape(
        type='line', line=dict(dash='solid'),
        name = 'ct112,448 vs ct4k,8k',
        #%Y-%m-%dT%H:%M:%S
        x0 = df.columns[0],
        x1 = df.columns[-1],
        y0 = "icpnp101",
        y1 = "icpnp101"
    )

    fig.add_shape(
        type='line', line=dict(dash='solid'),
        name = 'ct4k,8k vs ct1k,2k',
        #%Y-%m-%dT%H:%M:%S
        x0 = df.columns[0],
        x1 = df.columns[-1],
        y0 = "icpnq256",
        y1 = "icpnq256"
    )

    fig.show()
    pio.write_image(fig, f'{title}.png', width=24*200, height=16*200, scale=2)

def plot_bar(data, x_title, y_title, title):
    
    df = pd.DataFrame({'id':[i for i in range(len(data))], 'wait':[time/3600 for time in data]})
    fig = px.bar(df, x='id', y='wait')
    fig.update_layout(
        title=title,
        xaxis_title=x_title,
        yaxis_title=y_title
    )
    fig.update_traces(marker_color='rgb(0, 0, 0)', marker_line_color='rgb(0, 0, 0)',
                      marker_line_width=1.5, opacity=0.6)
    
    fig.show()
    pio.write_image(fig, f'{title}.png', scale=2)

def plot_scatter(data, y_title, title):
    fig = go.Figure(go.Scatter(
        x=data.iloc[:, 0],
        y=data.iloc[:, 1],
        name=y_title,
        mode='markers',
        marker={'opacity':0.4}
        ))
    
    average_time = data.groupby('#CPU').describe().iloc[:,1]

    fig.update_traces(marker_color='rgb(0, 0, 0)', marker_line_color='rgb(0, 0, 0)',
                        marker_line_width=1.5, opacity=0.3)

    fig.add_trace(go.Scatter(x=average_time[average_time > 1].index, y=average_time[average_time > 1], name='average'))

    fig.update_layout(
        plot_bgcolor='White',
        yaxis={
            'type':'log'
        },
        xaxis={
            'title':'#CPU',
            'type':'log',
            'range':[-0.1,4.8]
        },
        title={'text':title, 'font':{'size':30}},
        xaxis_title={'text':'#CPU', 'font':{'size': 30}},
        yaxis_title={'text':'Seconds', 'font':{'size':30}},
        height=600,
        width=1700
    )

    fig.add_shape(
            type='line', line=dict(dash='dash'),
            name = 'min',
            label = {'text':'min'},
            x0 = 0,
            x1 = 60000,
            y0 = 60,
            y1 = 60
        )

    fig.add_shape(
            type='line', line=dict(dash='dash'),
            name = 'hour',
            label = {'text':'hour'},
            x0 = 0,
            x1 = 60000,
            y0 = 3600,
            y1 = 3600
        )

    fig.add_shape(
            type='line', line=dict(dash='dash'),
            name = 'day',
            label = {'text':'day'},
            x0 = 0,
            x1 = 60000,
            y0 = 3600*24,
            y1 = 3600*24
        )

    fig.add_shape(
            type='line', line=dict(dash='dash'),
            name = 'week',
            label = {'text':'week'},
            x0 = 0,
            x1 = 60000,
            y0 = 3600*24*7,
            y1 = 3600*24*7
        )

    fig.show()
    #pio.write_image(fig, f'{title}.png', scale=2)

def main():
    """
    Load dataframe and time setting -> Read row and calculate the value -> Draw pictures with value.
    You can set the dataframe(log file) path, time setting and workflow in main function.

    *If you want to analyze more information from the dataframe, it is easy to achieve by modifying the job_processor() and workflow in the main().*

    Parameters:
        log (DataFrame):  Target dataFrame
        firsttime (date): Begin of the data (yyyy-mm-ddThh:mm:ss)
        unit (int):       Number you want to cut the timeline (Resolution)
        NODEMAX (int):    Number of nodes
    """
    # File path and time setting
    log = pd.read_parquet('./data/20240801_F1_log.parquet')
    firsttime = log.query('Submit != "Unknown"').End.sort_values(ascending=1).iloc[0]
    unit = 300
    #NODEMAX = 598
    normal_job = 0 # see only not fail jobs or not (0 = all jobs)

    # Prepare dataframe for catching the value from job_processor
    # more picture -> more make_data()
    lasttime = log.query('End != "Unknown"').End.sort_values(ascending=0).iloc[0]
    trans_firsttime, trans_lasttime, unittime = init_the_time(firsttime, lasttime, unit)
    cpu_use_rate = make_data(unit)
    cpu_occupy = make_data(unit)
    cpu_occupy_backfill = make_data(unit)
    wait_time = []
    work_time = []

    
    for index, row in log.iterrows():
        if row.Group != '': #one job count once
            if is_valid_datetime_format(row.Start) and is_valid_datetime_format(row.End):
                if row.NodeList != 'None assigned':
                    if normal_job:
                        if row.State not in ['FAILED', 'TIMEOUT']:
                            result = job_processor(row, trans_firsttime, unittime)
                            # result = [jobstart(int), jobend(int), nodelist(list), allocated_cpu(int or list), cpu_useratio(float), wait_time(int), NCPUS, work_time(int)]
                            try:
                                # add_value_to_data(data, timestart, timeend, nodelist, value)
                                cpu_use_rate = add_value_to_data(cpu_use_rate, result[0], result[1], result[2], result[4])
                            except:
                                pass

                            try:
                                # add_value_to_data(data, timestart, timeend, nodelist, value)
                                cpu_occupy = add_value_to_data(cpu_occupy, result[0], result[1], result[2], result[3])
                                cpu_occupy_backfill = add_value_to_data(cpu_occupy_backfill, result[0], result[1], result[2], result[6])
                            except:
                                pass
                            if result[5] != None:
                                wait_time.append(result[5])

                            
                    
                    else:
                        result = job_processor(row, trans_firsttime, unittime)
                        # result = [jobstart(int), jobend(int), nodelist(list), allocated_cpu(int), cpu_useratio(float), wait_time(int), NCPUS, work_time(int)]
                        try:
                            # add_value_to_data(data, timestart, timeend, nodelist, value)
                            pass
                            #cpu_use_rate = add_value_to_data(cpu_use_rate, result[0], result[1], result[2], result[4])
                        except:
                            pass

                        try:
                            # add_value_to_data(data, timestart, timeend, nodelist, value)
                            cpu_occupy = add_value_to_data(cpu_occupy, result[0], result[1], result[2], result[3])
                        except:
                            pass
                        
                        # wait time
                        if result[5] != None:
                            wait_time.append([int(result[6]), int(result[5])])
                            
                        #work time
                        if result[7] != None:
                            work_time.append([int(result[6]), result[7]])

    #transform the x-axis of dfs ( unit -> yyyy-mm-ddThh:mm:ss )
    #trans_x_time(cpu_use_rate.index * unittime + trans_firsttime)
    trans_x_time(cpu_use_rate, unit, unittime, trans_firsttime)
    trans_x_time(cpu_occupy, unit, unittime, trans_firsttime)
    trans_x_time(cpu_occupy_backfill, unit, unittime, trans_firsttime)

    return cpu_use_rate, cpu_occupy, wait_time, work_time, cpu_occupy_backfill





if __name__ == '__main__':
    cpu_use_rate, cpu_occupy, wait_time, work_time, cpu_occupy_backfill = main()
    #plot_heatmap(cpu_use_rate.T, 'Utilization of CPU (Not fail job)')
    #plot_heatmap(cpu_occupy.T, 'Allocated CPU (July jobs)')
    #plot_scatter(data, y_title, title)
    #plot_scatter(wait_time, 'second', 'test')
    pass


In [5]:
df1 = cpu_occupy.T
df2 = cpu_occupy_backfill.T
fig = go.Figure(data=go.Heatmap(
    z=df1.values,
    x=df1.columns,
    y=df1.index,
    colorscale=[[0, 'rgb(255,255,255)'], [0.0001, 'rgb(200,200,255)'], [1, 'rgb(0,0,255)']],
    showscale=False,
    colorbar=dict(thickness=20, ticklen=4),
    zmin=0,  # 最小值為0
    zmax=df1.values.max()
))

fig.add_trace(
    go.Heatmap(
    z=df2.values,
    x=df2.columns,
    y=df2.index,
    colorscale=[[0, 'rgb(255,255,255)'], [0.0001, 'rgb(255,200,200)'], [1, 'rgb(255,0,0)']],
    showscale=False,
    colorbar=dict(thickness=20, ticklen=4),
    zmin=0,  # 最小值為0
    zmax=df2.values.max()
)
)

fig.update_layout(
    title={'text':'TEST', 'font':{'size': 70}},
    xaxis_nticks=36,
    plot_bgcolor='White',  # 將背景設置為白色
    #width=500,  # 圖的寬度
    height=1800,
    yaxis={'tickfont':{'size':60}},  # 調整y軸標籤字體大小
    xaxis={'tickfont':{'size':60}},
    #xaxis={'tickfont':{'size':60}, 'range':['2024-07-01T00:00:00','2024-07-17T00:00:00']},  # 調整x軸標籤字體大小
    showlegend=False
)

fig.update_traces(hoverongaps=False)  # 不顯示空值的tooltip
fig.update_traces(zmid=0, colorbar=dict(
    tickvals=[0, df1.values.max()],
    ticktext=['0', str(df1.values.max())]
))

fig.add_shape(
    type='line', line=dict(dash='solid'),
    name = 'ct112,448 vs ct4k,8k',
    #%Y-%m-%dT%H:%M:%S
    x0 = df1.columns[0],
    x1 = df1.columns[-1],
    y0 = "icpnp101",
    y1 = "icpnp101"
)

fig.add_shape(
    type='line', line=dict(dash='solid'),
    name = 'ct4k,8k vs ct1k,2k',
    #%Y-%m-%dT%H:%M:%S
    x0 = df1.columns[0],
    x1 = df1.columns[-1],
    y0 = "icpnq256",
    y1 = "icpnq256"
)

fig.show()

In [17]:
firsttime = '2024-01-01T00:00:00'
unit = 300
lasttime = log.query('End != "Unknown"').End.sort_values(ascending=0).iloc[0]
trans_firsttime, trans_lasttime, unittime = init_the_time(firsttime, lasttime, unit)
log = pd.read_parquet('./data/20240702_F1_log.parquet')
log = log[log.Submit < '2024-07-01T00:00:00']
row = log.iloc[5124:5150,:]
#row = row.drop_duplicates(subset='AssocID', keep='first')
row

Unnamed: 0,Account,AllocCPUS,AllocNodes,AllocTRES,AssocID,CPUTimeRAW,ElapsedRaw,Eligible,End,ExitCode,...,Submit,Suspended,SystemCPU,SystemComment,TimelimitRaw,TotalCPU,UID,User,UserCPU,WorkDir
5124,gov113006,224,2,"billing=224,cpu=224,mem=965164M,node=2",15911,124992,558,2024-02-05T15:18:21,2024-02-05T15:27:40,0:0,...,2024-02-05T15:18:21,00:00:00,00:00:00,,80.0,00:00:00,10070.0,p00lcy01,00:00:00,/home/p00lcy01/nuosc3d_4/bench_t4/pstro7/n002x...
5125,gov113006,112,1,"cpu=112,mem=482582M,node=1",15911,62496,558,2024-02-05T15:18:22,2024-02-05T15:27:40,0:0,...,2024-02-05T15:18:22,00:00:00,00:00:00,,,00:00:00,,,00:00:00,
5126,gov113006,224,2,"billing=224,cpu=224,mem=965164M,node=2",15911,124992,558,2024-02-05T15:18:22,2024-02-05T15:27:40,0:0,...,2024-02-05T15:18:22,00:00:00,00:00:00,,,00:00:00,,,00:00:00,
5127,gov113006,224,2,"cpu=224,mem=965164M,node=2",15911,64064,286,2024-02-05T15:18:22,2024-02-05T15:23:08,0:0,...,2024-02-05T15:18:22,00:00:00,00:00:00,,,00:00:00,,,00:00:00,
5128,gov113006,224,2,"cpu=224,mem=965164M,node=2",15911,60928,272,2024-02-05T15:23:08,2024-02-05T15:27:40,0:0,...,2024-02-05T15:23:08,00:00:00,00:00:00,,,00:00:00,,,00:00:00,
5129,gov113006,57344,512,"billing=57344,cpu=57344,mem=247081984M,node=512",15911,1548288,27,2024-02-05T15:40:38,2024-02-05T15:41:06,0:0,...,2024-02-05T15:40:38,00:00:00,00:00:00,,80.0,00:00:00,10070.0,p00lcy01,00:00:00,/home/p00lcy01/nuosc3d_4/bench_t4/pstro7/n512x...
5130,gov113006,112,1,"cpu=112,mem=482582M,node=1",15911,3024,27,2024-02-05T15:40:39,2024-02-05T15:41:06,0:0,...,2024-02-05T15:40:39,00:00:00,00:00:00,,,00:00:00,,,00:00:00,
5131,gov113006,57344,512,"billing=57344,cpu=57344,mem=247081984M,node=512",15911,1548288,27,2024-02-05T15:40:39,2024-02-05T15:41:06,0:0,...,2024-02-05T15:40:39,00:00:00,00:00:00,,,00:00:00,,,00:00:00,
5132,gov113006,1792,16,"cpu=1792,mem=7721312M,node=16",15911,30464,17,2024-02-05T15:40:39,2024-02-05T15:40:56,0:0,...,2024-02-05T15:40:39,00:00:00,00:00:00,,,00:00:00,,,00:00:00,
5133,gov113006,1792,16,"cpu=1792,mem=7721312M,node=16",15911,28672,16,2024-02-05T15:40:39,2024-02-05T15:40:55,0:0,...,2024-02-05T15:40:39,00:00:00,00:00:00,,,00:00:00,,,00:00:00,


In [30]:
#cpu_occupy.to_parquet('cpu_occupy.parquet')
#waiting_time.to_parquet('waiting_time.parquet')
#working_time.to_parquet('working_time.parquet')
#cpu_occupy = pd.read_parquet('cpu_occupy.parquet')
#plot_heatmap(cpu_occupy.T, 'Allocated CPU (All jobs)')
log = 0
cpu_occupy = 0


In [29]:
test = cpu_occupy.describe()
for i in range(len(test.columns)):
    print(f'{test.columns[i]} : {test.iloc[7, :].iloc[i]}')
    #cpu_occupy.describe().iloc[7, :]

icpnq101 : 122.82707691143932
icpnq102 : 122.82707691143932
icpnq103 : 122.82707691143932
icpnq104 : 122.82707691143932
icpnq105 : 122.82707691143932
icpnq106 : 122.82707691143932
icpnq107 : 122.82707691143932
icpnq108 : 122.82707691143932
icpnq109 : 122.82707691143932
icpnq110 : 122.82707691143932
icpnq111 : 122.82707691143932
icpnq112 : 122.82707691143932
icpnq113 : 122.82707691143932
icpnq114 : 122.82707691143932
icpnq115 : 169.47168693306102
icpnq116 : 122.82707691143932
icpnq117 : 122.82072278220087
icpnq118 : 122.82072278220087
icpnq119 : 122.82072278220087
icpnq120 : 122.82072278220087
icpnq121 : 112.0
icpnq122 : 112.0
icpnq123 : 112.0
icpnq124 : 112.0
icpnq125 : 112.0
icpnq126 : 112.0
icpnq127 : 112.0
icpnq128 : 112.0
icpnq129 : 112.0
icpnq130 : 175.89440042360863
icpnq131 : 112.0
icpnq132 : 112.0
icpnq133 : 169.5785976789778
icpnq134 : 112.0
icpnq135 : 112.0
icpnq136 : 112.0
icpnq137 : 112.0
icpnq138 : 112.0
icpnq139 : 112.0
icpnq140 : 112.0
icpnq141 : 112.0
icpnq142 : 112.0
i

In [2]:
alloc_cpu = []
how_long_it_wait = []
for pair in wait_time:
    alloc_cpu.append(pair[0])
    how_long_it_wait.append(pair[1])

waiting_time = pd.DataFrame({'#CPU':alloc_cpu, 'Waiting Time':how_long_it_wait})
wait_time = 0
waiting_time.to_parquet('waiting_time.parquet')
alloc_cpu = []
how_long_it_work = []
for pair in work_time:
    alloc_cpu.append(pair[0])
    how_long_it_work.append(pair[1])

working_time = pd.DataFrame({'#CPU':alloc_cpu, 'Working Time':how_long_it_work})
work_time = 0
working_time.to_parquet('working_time.parquet')

In [4]:
test = waiting_time.sort_values(by=['#CPU'])

In [15]:
waiting_time.groupby('#CPU').describe()

Unnamed: 0_level_0,Waiting Time,Waiting Time,Waiting Time,Waiting Time,Waiting Time,Waiting Time,Waiting Time,Waiting Time
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
#CPU,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1,808.0,312.381188,2254.078670,0.0,0.00,1.0,7.00,27736.0
2,253.0,1240.628458,15463.496522,0.0,0.00,1.0,1.00,243051.0
3,55.0,7.600000,38.761187,0.0,0.00,0.0,1.00,283.0
4,9810.0,99.696534,1094.035303,0.0,1.00,2.0,12.00,36581.0
5,59.0,3.220339,7.683542,0.0,0.00,0.0,1.00,30.0
...,...,...,...,...,...,...,...,...
57344,195.0,4424.169231,9299.140640,0.0,1.00,210.0,3404.50,48500.0
58240,2.0,25625.500000,35098.659298,807.0,13216.25,25625.5,38034.75,50444.0
59584,1.0,1.000000,,1.0,1.00,1.0,1.00,1.0
60480,14.0,714.071429,765.685863,1.0,2.00,426.0,1396.50,2011.0


In [54]:
#df = pd.DataFrame({'#CPU':[i for i in waiting_time.groupby('#CPU').describe().index], 'wait time':[time for time in waiting_time.groupby('#CPU').describe().iloc[:,1]]})
waiting_time = pd.read_parquet('waiting_time.parquet')
fig = px.bar(waiting_time.sort_values(by=['#CPU']), x='#CPU', y='Waiting Time', log_y=True)
fig.update_layout(
    title={'text':'#CPU v.s. wait time(log)', 'font':{'size': 30}},
    xaxis_title={'text':"#CPU", 'font':{'size':30}},
    yaxis_title={'text':'wait time (second)', 'font':{'size':30}},
    height=700,
    width=1500,
    yaxis=dict(tickfont=dict(size=20)),  # 調整y軸標籤字體大小
    xaxis=dict(tickfont=dict(size=20))  # 調整x軸標籤字體大小
    )

#fig.add_trace(go.Line(x=waiting_time.groupby('#CPU').describe().index, y=waiting_time.groupby('#CPU').describe().iloc[:,1]))


fig.update_traces(marker_color='rgb(0, 0, 0)', marker_line_color='rgb(0, 0, 0)',
                    marker_line_width=2, opacity=0.6)

fig.add_shape(
        type='line', line=dict(dash='dash'),
        name = 'hour',
        x0 = 0,
        x1 = waiting_time.sort_values(by=['#CPU'])['#CPU'].iloc[-1],
        y0 = 60,
        y1 = 60
    )

fig.add_shape(
        type='line', line=dict(dash='dash'),
        name = 'hour',
        x0 = 0,
        x1 = waiting_time.sort_values(by=['#CPU'])['#CPU'].iloc[-1],
        y0 = 3600,
        y1 = 3600
    )

fig.add_shape(
        type='line', line=dict(dash='dash'),
        name = 'hour',
        x0 = 0,
        x1 = waiting_time.sort_values(by=['#CPU'])['#CPU'].iloc[-1],
        y0 = 3600*24,
        y1 = 3600*24
    )

fig.add_shape(
        type='line', line=dict(dash='dash'),
        name = 'hour',
        x0 = 0,
        x1 = waiting_time.sort_values(by=['#CPU'])['#CPU'].iloc[-1],
        y0 = 3600*24*7,
        y1 = 3600*24*7
    )
    
#fig.show()
pio.write_image(fig, f'CPU_vs_wait_time_log_.png', scale=2)

In [55]:
#waiting_time = pd.read_parquet('waiting_time.parquet')
df = pd.DataFrame({'#CPU':[i for i in waiting_time.groupby('#CPU').describe().index], 'wait time':[time for time in waiting_time.groupby('#CPU').describe().iloc[:,1]]})
fig = px.bar(df, x='#CPU', y='wait time', log_y=True)
fig.update_layout(
    title={'text':'#CPU v.s. ave. wait time(log)', 'font':{'size': 30}},
    xaxis_title={'text':"#CPU", 'font':{'size':30}},
    yaxis_title={'text':'ave. wait time (second)', 'font':{'size':30}},
    height=700,
    width=1500,
    yaxis=dict(tickfont=dict(size=20)),  # 調整y軸標籤字體大小
    xaxis=dict(tickfont=dict(size=20)) 
    )
fig.update_traces(marker_color='rgb(0, 0, 0)', marker_line_color='rgb(0, 0, 0)',
                    marker_line_width=1.5, opacity=0.6)

fig.add_shape(
        type='line', line=dict(dash='dash'),
        name = 'minute',
        x0 = 0,
        x1 = working_time.sort_values(by=['#CPU'])['#CPU'].iloc[-1],
        y0 = 60,
        y1 = 60
    )

fig.add_shape(
        type='line', line=dict(dash='dash'),
        name = 'hour',
        x0 = 0,
        x1 = working_time.sort_values(by=['#CPU'])['#CPU'].iloc[-1],
        y0 = 3600,
        y1 = 3600
    )

fig.add_shape(
        type='line', line=dict(dash='dash'),
        name = 'day',
        x0 = 0,
        x1 = working_time.sort_values(by=['#CPU'])['#CPU'].iloc[-1],
        y0 = 3600*24,
        y1 = 3600*24
    )

fig.add_shape(
        type='line', line=dict(dash='dash'),
        name = 'week',
        x0 = 0,
        x1 = working_time.sort_values(by=['#CPU'])['#CPU'].iloc[-1],
        y0 = 3600*24*7,
        y1 = 3600*24*7
    )
    
#fig.show()
pio.write_image(fig, f'CPU_vs_ave_wait_time_log_.png', scale=2)

In [59]:
df = pd.DataFrame({'#CPU':[i for i in waiting_time.groupby('#CPU').describe().index], 'count':[count for count in waiting_time.groupby('#CPU').describe().iloc[:,0]]})
fig = px.bar(df, x='#CPU', y='count', log_y=True)
fig.update_layout(
    title={'text':'#CPU v.s. count', 'font':{'size': 30}},
    xaxis_title={'text':"#CPU", 'font':{'size':30}},
    yaxis_title={'text':'count', 'font':{'size':30}}
    )
fig.update_traces(marker_color='rgb(0, 0, 0)', marker_line_color='rgb(0, 0, 0)',
                    marker_line_width=1.5, opacity=0.6)
    
fig.show()

In [60]:
df = pd.DataFrame({'#CPU':[i for i in waiting_time.groupby('#CPU').describe().index], 'count':[count for count in waiting_time.groupby('#CPU').describe().iloc[:,0]]})
fig = px.bar(df, x='#CPU', y='count')
fig.update_layout(
    title='#CPU v.s. count',
    xaxis_title="#CPU",
    yaxis_title='count'
    )
fig.update_traces(marker_color='rgb(0, 0, 0)', marker_line_color='rgb(0, 0, 0)',
                    marker_line_width=1.5, opacity=0.6)
    
fig.show()

In [75]:
working_time = 0
df = 0
waiting_time = pd.read_parquet('waiting_time.parquet')
df = pd.DataFrame({'#CPU':[i for i in waiting_time.groupby('#CPU').describe().index], 'count':[count for count in waiting_time.groupby('#CPU').describe().iloc[:,0]]})
i = 0
cumulative = []
for c in df.iloc[:,1]:
    i += c
    cumulative.append(i)
df["cumulative"] = cumulative

fig = go.Figure()
fig_area = px.area(x=df.iloc[:,0], y=df.iloc[:,2])
for trace in fig_area.data:
    fig.add_trace(trace)
fig.add_trace(go.Bar(x=df.iloc[:,0], y=df.iloc[:,2]))
#fig.add_trace(go.Area(x=df.iloc[:,0], y=df.iloc[:,2]))
#fig = px.area(df, x='#CPU', y='cumulative')
#fig = px.bar(df, x='#CPU', y='cumulative')

fig.update_layout(
    title={'text':'#CPU v.s. cumulative count', 'font':{'size':30}},
    xaxis_title={'text':"#CPU", 'font':{'size':30}},
    yaxis_title={'text':'cumulative count', 'font':{'size':30}},
    showlegend=False,
    width=1500,
    yaxis=dict(tickfont=dict(size=20)),  # 調整y軸標籤字體大小
    xaxis=dict(tickfont=dict(size=20))
    )
fig.update_traces(marker_color='rgb(0, 0, 0)', marker_line_color='rgb(0, 0, 0)',
                    marker_line_width=1.5, opacity=0.6)

fig.add_shape(
        type='line', line=dict(dash='dash'),
        name = '95%',
        x0 = 0,
        x1 = df.iloc[-1,0],
        y0 = i*0.95,
        y1 = i*0.95
    )
    
fig.show()
pio.write_image(fig, f'CPU_vs_cumulative_count.png', scale=2)

In [23]:
alloc_cpu = []
how_long_it_work = []
for pair in work_time:
    alloc_cpu.append(pair[0])
    how_long_it_work.append(pair[1])

working_time = pd.DataFrame({'#CPU':alloc_cpu, 'Working Time':how_long_it_work})

In [72]:
waiting_time = 0
working_time = pd.read_parquet('working_time.parquet')
fig = px.bar(working_time.sort_values(by=['#CPU']), x='#CPU', y='Working Time', log_y=True)
fig.update_layout(
    title={'text':'#CPU v.s. work time(log)', 'font':{'size':30}},
    xaxis_title={'text':"#CPU", 'font':{'size':30}},
    yaxis_title={'text':'work time (second)', 'font':{'size':30}},
    height=700,
    width=1500,
    yaxis=dict(tickfont=dict(size=20)),
    xaxis=dict(tickfont=dict(size=20))
    )

#fig.add_trace(go.Line(x=waiting_time.groupby('#CPU').describe().index, y=waiting_time.groupby('#CPU').describe().iloc[:,1]))


fig.update_traces(marker_color='rgb(0, 0, 0)', marker_line_color='rgb(0, 0, 0)',
                    marker_line_width=2, opacity=0.6)

fig.add_shape(
        type='line', line=dict(dash='dash'),
        name = 'hour',
        x0 = 0,
        x1 = working_time.sort_values(by=['#CPU'])['#CPU'].iloc[-1],
        y0 = 60,
        y1 = 60
    )

fig.add_shape(
        type='line', line=dict(dash='dash'),
        name = 'hour',
        x0 = 0,
        x1 = working_time.sort_values(by=['#CPU'])['#CPU'].iloc[-1],
        y0 = 3600,
        y1 = 3600
    )

fig.add_shape(
        type='line', line=dict(dash='dash'),
        name = 'hour',
        x0 = 0,
        x1 = working_time.sort_values(by=['#CPU'])['#CPU'].iloc[-1],
        y0 = 3600*24,
        y1 = 3600*24
    )

fig.add_shape(
        type='line', line=dict(dash='dash'),
        name = 'hour',
        x0 = 0,
        x1 = working_time.sort_values(by=['#CPU'])['#CPU'].iloc[-1],
        y0 = 3600*24*7,
        y1 = 3600*24*7
    )
    
#fig.show()
pio.write_image(fig, f'CPU_vs_work_time_log.png', scale=2)

In [68]:
#working_time.groupby('#CPU').describe()
cumulative = 0
fig_area = 0
fig = 0
trace = 0

In [73]:
df = pd.DataFrame({'#CPU':[i for i in working_time.groupby('#CPU').describe().index], 'work time':[time for time in working_time.groupby('#CPU').describe().iloc[:,1]]})

fig = px.bar(df, x='#CPU', y='work time', log_y=True)
fig.update_layout(
    title={'text':'#CPU v.s. ave. work time(log)', 'font':{'size':30}},
    xaxis_title={'text':"#CPU", 'font':{'size':30}},
    yaxis_title={'text':'ave. work time (second)', 'font':{'size':30}},
    height=700,
    width=1500,
    yaxis=dict(tickfont=dict(size=20)),
    xaxis=dict(tickfont=dict(size=20))
    )

fig.update_traces(marker_color='rgb(0, 0, 0)', marker_line_color='rgb(0, 0, 0)',
                    marker_line_width=1.5, opacity=0.6)

fig.add_shape(
        type='line', line=dict(dash='dash'),
        name = 'minute',
        x0 = 0,
        x1 = working_time.sort_values(by=['#CPU'])['#CPU'].iloc[-1],
        y0 = 60,
        y1 = 60
    )

fig.add_shape(
        type='line', line=dict(dash='dash'),
        name = 'hour',
        x0 = 0,
        x1 = working_time.sort_values(by=['#CPU'])['#CPU'].iloc[-1],
        y0 = 3600,
        y1 = 3600
    )

fig.add_shape(
        type='line', line=dict(dash='dash'),
        name = 'day',
        x0 = 0,
        x1 = working_time.sort_values(by=['#CPU'])['#CPU'].iloc[-1],
        y0 = 3600*24,
        y1 = 3600*24
    )

fig.add_shape(
        type='line', line=dict(dash='dash'),
        name = 'week',
        x0 = 0,
        x1 = working_time.sort_values(by=['#CPU'])['#CPU'].iloc[-1],
        y0 = 3600*24*7,
        y1 = 3600*24*7
    )
    
#fig.show()
pio.write_image(fig, f'CPU_vs_ave_work_time_log.png', scale=2)

In [59]:
working_time

Unnamed: 0,#CPU,Working Time
0,2,0
1,2,99
2,2,379
3,2,2099
4,1,11590
...,...,...
48490,4,19
48491,8,1320
48492,4,25
48493,4,22


In [10]:
working_time = pd.read_parquet('working_time.parquet')
#working_time.groupby('#CPU').describe()
df = working_time.groupby('#CPU').describe().iloc[:, [1,2,3,7]]
df

Unnamed: 0_level_0,Working Time,Working Time,Working Time,Working Time
Unnamed: 0_level_1,mean,std,min,max
#CPU,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,2540.862624,12173.906153,0.0,172823.0
2,2960.051383,18806.317586,0.0,172829.0
3,638.872727,1749.796761,0.0,7201.0
4,154.754027,3792.286452,0.0,345618.0
5,24.508475,78.545039,0.0,412.0
...,...,...,...,...
57344,1995.707692,13104.553382,0.0,182120.0
58240,809.500000,159.099026,697.0,922.0
59584,0.000000,,0.0,0.0
60480,245.642857,158.303801,0.0,566.0


In [60]:
#fig = go.Figure()
# fig = go.Figure()
# t = 200
# fig = px.scatter(x=df.index[:t], y=df.iloc[:t,0], log_x=True)
# fig_line = px.scatter(x=df.index[t:], y=df.iloc[t:,3])
# for trace in fig_line.data:
#     fig.add_trace(trace)
# fig.show()
#fig = go.Figure()
# fig = go.Figure()
# fig.add_trace(go.Scatter(x=df.index, y=df.iloc[:,3],
#                     mode='markers',
#                     name='max')
#                     )
# fig.add_trace(go.Scatter(x=df.index, y=df.iloc[:,0],
#                     mode='markers',
#                     name='mean')
#                     )
# fig.update_yaxes(type="log")
c = ['hsl('+str(h)+',50%'+',50%)' for h in np.linspace(0, 360, len(df.index))]
# fig.show()
fig = go.Figure(data=[go.Box(
    y=working_time[working_time['#CPU'] == df.index[i]].iloc[:, 1],
    marker_color=c[i]
    ) for i in range(len(df.index))])

# format the layout
fig.update_layout(
    xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
    yaxis=dict(zeroline=False, gridcolor='white'),
    paper_bgcolor='rgb(233,233,233)',
    plot_bgcolor='rgb(233,233,233)',
)

fig.show()

In [1]:
import dask.dataframe as dd

log = dd.read_parquet('./20240702_F1_log.parquet')

Unnamed: 0,Account,AllocCPUS,AllocNodes,AllocTRES,AssocID,CPUTimeRAW,ElapsedRaw,Eligible,End,ExitCode,...,Submit,Suspended,SystemCPU,SystemComment,TimelimitRaw,TotalCPU,UID,User,UserCPU,WorkDir
0,root,2,1,"billing=2,cpu=2,mem=515350M,node=1",6,0,0,2024-01-03T11:44:47,2024-01-03T11:44:47,0:0,...,2024-01-03T11:44:47,00:00:00,00:00:00,,UNLIMITED,00:00:00,0,root,00:00:00,/home
1,root,2,1,"billing=2,cpu=2,mem=515350M,node=1",6,0,0,2024-01-03T11:44:47,2024-01-03T11:44:47,0:0,...,2024-01-03T11:44:47,00:00:00,00:00:00,,,00:00:00,,,00:00:00,
2,root,2,1,"cpu=2,mem=515350M,node=1",6,0,0,2024-01-03T11:44:47,2024-01-03T11:44:47,0:0,...,2024-01-03T11:44:47,00:00:00,00:00:00,,,00:00:00,,,00:00:00,
3,root,7,0,,6,0,0,2024-01-03T16:38:08,2024-01-03T17:27:09,0:0,...,2024-01-03T16:38:08,00:00:00,00:00:00,,Partition_Limit,00:00:00,0,root,00:00:00,/root
4,root,7,0,,6,0,0,2024-01-04T10:49:39,2024-01-04T11:40:45,0:0,...,2024-01-04T10:49:39,00:00:00,00:00:00,,Partition_Limit,00:00:00,0,root,00:00:00,/root
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206635,mst110307,1344,12,"billing=1344,cpu=1344,mem=5789952M,node=12",12866,67200,50,2024-04-17T15:46:29,2024-04-17T15:47:38,0:0,...,2024-04-17T15:46:29,00:00:00,22:52.758,,2880,07:07:21,21695,b10606105,06:44:28,/work1/b10606105/pure-electrolyte/F4DEE-DEE/15...
206636,mst110307,112,1,"cpu=112,mem=482496M,node=1",12866,5600,50,2024-04-17T15:46:48,2024-04-17T15:47:38,0:0,...,2024-04-17T15:46:48,00:00:00,00:00.129,,,00:00.207,,,00:00.077,
206637,mst110307,1344,12,"billing=1344,cpu=1344,mem=5789952M,node=12",12866,67200,50,2024-04-17T15:46:48,2024-04-17T15:47:38,0:0,...,2024-04-17T15:46:48,00:00:00,00:00.004,,,00:00.007,,,00:00.003,
206638,mst110307,1344,12,"cpu=1344,mem=5789952M,node=12",12866,61824,46,2024-04-17T15:46:51,2024-04-17T15:47:37,0:0,...,2024-04-17T15:46:51,00:00:00,22:52.623,,,07:07:21,,,06:44:28,


In [26]:
import pandas as pd
import dask.dataframe as dd

d = {'col1': [1, 2, 3, 4], 'col2': [5, 6, 7, 8]}
df = dd.from_pandas(pd.DataFrame(d))
#df = pd.DataFrame(d)

In [29]:
d

{'col1': [1, 2, 3, 4], 'col2': [5, 6, 7, 8]}

In [30]:
for i,r in df.iterrows():
    print(r)
print('-------------')
for i,r in df.iterrows():
    r += 1
    print(r)
print('-------------')
for i,r in df.iterrows():
    r += 1
    print(r)

#dask怪怪的我猜可能是東西留在記憶體裡了所以數值一直累加

col1     8
col2    12
Name: 0, dtype: int64
col1     9
col2    13
Name: 1, dtype: int64
col1    10
col2    14
Name: 2, dtype: int64
col1    11
col2    15
Name: 3, dtype: int64
-------------
col1     9
col2    13
Name: 0, dtype: int64
col1    10
col2    14
Name: 1, dtype: int64
col1    11
col2    15
Name: 2, dtype: int64
col1    12
col2    16
Name: 3, dtype: int64
-------------
col1    10
col2    14
Name: 0, dtype: int64
col1    11
col2    15
Name: 1, dtype: int64
col1    12
col2    16
Name: 2, dtype: int64
col1    13
col2    17
Name: 3, dtype: int64


In [1]:
import sqlite3

In [2]:
con = sqlite3.connect("tutorial.db")
con.close()