In [1]:
import pandas as pd
import numpy as np

In [2]:
from dask.delayed import delayed
import dask.dataframe as dd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
def load_dd_delayed(keyword):
    with open('/data/clusterdata/cluster-trace-gpu-v2020/data/pai_%s.header' % keyword) as fin:
        lines = fin.readlines()
        col_names = lines[0].strip().split(',')

    col_names

    dfs = delayed(pd.read_csv)('/data/clusterdata/cluster-trace-gpu-v2020/data/pai_%s.tar.gz' % keyword, 
                               header=None, index_col=False, names=col_names, delimiter=',')

    dd_delayed = dd.from_delayed(dfs)

    return dd_delayed

In [4]:
def stratified_sample(df, strata, size=None, seed=None, keep_index= True):
    '''
    It samples data from a pandas dataframe using strata. These functions use
    proportionate stratification:
    n1 = (N1/N) * n
    where:
        - n1 is the sample size of stratum 1
        - N1 is the population size of stratum 1
        - N is the total population size
        - n is the sampling size
    Parameters
    ----------
    :df: pandas dataframe from which data will be sampled.
    :strata: list containing columns that will be used in the stratified sampling.
    :size: sampling size. If not informed, a sampling size will be calculated
        using Cochran adjusted sampling formula:
        cochran_n = (Z**2 * p * q) /e**2
        where:
            - Z is the z-value. In this case we use 1.96 representing 95%
            - p is the estimated proportion of the population which has an
                attribute. In this case we use 0.5
            - q is 1-p
            - e is the margin of error
        This formula is adjusted as follows:
        adjusted_cochran = cochran_n / 1+((cochran_n -1)/N)
        where:
            - cochran_n = result of the previous formula
            - N is the population size
    :seed: sampling seed
    :keep_index: if True, it keeps a column with the original population index indicator
    
    Returns
    -------
    A sampled pandas dataframe based in a set of strata.
    Examples
    --------
    >> df.head()
    	id  sex age city 
    0	123 M   20  XYZ
    1	456 M   25  XYZ
    2	789 M   21  YZX
    3	987 F   40  ZXY
    4	654 M   45  ZXY
    ...
    # This returns a sample stratified by sex and city containing 30% of the size of
    # the original data
    >> stratified = stratified_sample(df=df, strata=['sex', 'city'], size=0.3)
    Requirements
    ------------
    - pandas
    - numpy
    '''
    population = len(df)
    size = __smpl_size(population, size)
    tmp = df[strata]
    tmp['size'] = 1
    tmp_grpd = tmp.groupby(strata).count().reset_index()
    tmp_grpd['samp_size'] = round(size/population * tmp_grpd['size']).astype(int)

    # controlling variable to create the dataframe or append to it
    first = True 
    for i in range(len(tmp_grpd)):
        # query generator for each iteration
        qry=''
        for s in range(len(strata)):
            stratum = strata[s]
            value = tmp_grpd.iloc[i][stratum]
            n = tmp_grpd.iloc[i]['samp_size']

            if type(value) == str:
                value = "'" + str(value) + "'"
            
            if s != len(strata)-1:
                qry = qry + stratum + ' == ' + str(value) +' & '
            else:
                qry = qry + stratum + ' == ' + str(value)
        
        # final dataframe
        if first:
            stratified_df = df.query(qry).sample(n=n, random_state=seed).reset_index(drop=(not keep_index))
            first = False
        else:
            tmp_df = df.query(qry).sample(n=n, random_state=seed).reset_index(drop=(not keep_index))
            stratified_df = stratified_df.append(tmp_df, ignore_index=True)
    
    return stratified_df



def stratified_sample_report(df, strata, size=None):
    '''
    Generates a dataframe reporting the counts in each stratum and the counts
    for the final sampled dataframe.
    Parameters
    ----------
    :df: pandas dataframe from which data will be sampled.
    :strata: list containing columns that will be used in the stratified sampling.
    :size: sampling size. If not informed, a sampling size will be calculated
        using Cochran adjusted sampling formula:
        cochran_n = (Z**2 * p * q) /e**2
        where:
            - Z is the z-value. In this case we use 1.96 representing 95%
            - p is the estimated proportion of the population which has an
                attribute. In this case we use 0.5
            - q is 1-p
            - e is the margin of error
        This formula is adjusted as follows:
        adjusted_cochran = cochran_n / 1+((cochran_n -1)/N)
        where:
            - cochran_n = result of the previous formula
            - N is the population size
    Returns
    -------
    A dataframe reporting the counts in each stratum and the counts
    for the final sampled dataframe.
    '''
    population = len(df)
    size = __smpl_size(population, size)
    tmp = df[strata]
    tmp['size'] = 1
    tmp_grpd = tmp.groupby(strata).count().reset_index()
    tmp_grpd['samp_size'] = round(size/population * tmp_grpd['size']).astype(int)
    return tmp_grpd


def __smpl_size(population, size):
    '''
    A function to compute the sample size. If not informed, a sampling 
    size will be calculated using Cochran adjusted sampling formula:
        cochran_n = (Z**2 * p * q) /e**2
        where:
            - Z is the z-value. In this case we use 1.96 representing 95%
            - p is the estimated proportion of the population which has an
                attribute. In this case we use 0.5
            - q is 1-p
            - e is the margin of error
        This formula is adjusted as follows:
        adjusted_cochran = cochran_n / 1+((cochran_n -1)/N)
        where:
            - cochran_n = result of the previous formula
            - N is the population size
    Parameters
    ----------
        :population: population size
        :size: sample size (default = None)
    Returns
    -------
    Calculated sample size to be used in the functions:
        - stratified_sample
        - stratified_sample_report
    '''
    if size is None:
        cochran_n = round(((1.96)**2 * 0.5 * 0.5)/ 0.02**2)
        n = round(cochran_n/(1+((cochran_n -1) /population)))
    elif size >= 0 and size < 1:
        n = round(population * size)
    elif size < 0:
        raise ValueError('Parameter "size" must be an integer or a proportion between 0 and 0.99.')
    elif size >= 1:
        n = size
    return n

## Data Extraction

In [5]:
dd_delayed = load_dd_delayed('job_table')

In [6]:
job_df = dd_delayed.compute()

In [7]:
job_df

Unnamed: 0,job_name,inst_id,user,status,start_time,end_time
0,pai_job_table.csv,38e2d7187762a9241cc1cd5732dbe52a5d8b37ed7dafe1...,74238accb90b,Running,1053513.0,
1,9605ac7cc4c55a193fbe956b,e1cbdf28400847d65d00da4f0522ce7a43275fe9cb5d2a...,61d6b6dd5b15,Running,1097614.0,
2,a2d8872d080eb634a42ea9a6,27ddabc2f7490279c6d5bd95c8a75e5c96d841c6030659...,61d6b6dd5b15,Running,1103820.0,
3,e8baa72bfcd1b723cafab26a,70e4f598c8cdf40f24e0e7d3aba9a9dacde5342e1f2ffd...,61d6b6dd5b15,Terminated,1104396.0,1140852.0
4,4e87f469c73e13613908b658,5bb3500e7f1a42a9ce82544dad5f202547848e9727e221...,61d6b6dd5b15,Terminated,1109911.0,1233527.0
...,...,...,...,...,...,...
1055497,a155d69dca6bfd2dfb78ef0d,267c02fbda06c42fb83fc8034d2be1c51149127deb2292...,a4187a15253d,Terminated,6420116.0,6431658.0
1055498,e4e802c3b63829c1e2e3343b,37c6523c16f1574055fb637a82bb22a913122d58c8368a...,a4187a15253d,Running,6431847.0,
1055499,f11820cc3d678bcdc37faf08,d50242c89f3a25bd20ef700701777823c415ad0321dcb3...,a4187a15253d,Terminated,6431859.0,6431872.0
1055500,ee3480523c1bf79b0d02c260,6c6338d4ac59fd1157e569fd6b5de6b70975c65112e9ba...,a4187a15253d,Terminated,6431969.0,6435635.0


In [8]:
job_df.value_counts('status')

status
Terminated    732355
Failed        256555
Running        62928
Waiting         3663
dtype: int64

In [9]:
dd_delayed = load_dd_delayed('task_table')

In [10]:
task_df = dd_delayed.compute()

In [11]:
task_df[task_df.job_name == 'f11820cc3d678bcdc37faf08']

Unnamed: 0,job_name,task_name,inst_num,status,start_time,end_time,plan_cpu,plan_mem,plan_gpu,gpu_type
840157,f11820cc3d678bcdc37faf08,PyTorchWorker,1.0,Terminated,6431859.0,6431872.0,1800.0,58.59375,100.0,MISC


In [12]:
job_result_df = pd.merge(job_df[['job_name', 'inst_id', 'user']], task_df, on='job_name', how='inner')

In [13]:
job_result_df

Unnamed: 0,job_name,inst_id,user,task_name,inst_num,status,start_time,end_time,plan_cpu,plan_mem,plan_gpu,gpu_type
0,9605ac7cc4c55a193fbe956b,e1cbdf28400847d65d00da4f0522ce7a43275fe9cb5d2a...,61d6b6dd5b15,worker,20.0,Running,1097614.0,,,,,P100
1,a2d8872d080eb634a42ea9a6,27ddabc2f7490279c6d5bd95c8a75e5c96d841c6030659...,61d6b6dd5b15,worker,100.0,Running,1103820.0,,,,,P100
2,e8baa72bfcd1b723cafab26a,70e4f598c8cdf40f24e0e7d3aba9a9dacde5342e1f2ffd...,61d6b6dd5b15,worker,20.0,Terminated,1104396.0,1140852.0,600.0,29.296875,50.0,MISC
3,4e87f469c73e13613908b658,5bb3500e7f1a42a9ce82544dad5f202547848e9727e221...,61d6b6dd5b15,worker,30.0,Terminated,1109911.0,1233527.0,600.0,29.296875,50.0,MISC
4,dcf1bd23bb296a8efb35902b,4fe3a2e132a1ffd068bf3d6dca3906d24a37f95df2c98e...,61d6b6dd5b15,worker,5.0,Terminated,1129126.0,1129627.0,600.0,29.296875,100.0,MISC
...,...,...,...,...,...,...,...,...,...,...,...,...
1261044,a155d69dca6bfd2dfb78ef0d,267c02fbda06c42fb83fc8034d2be1c51149127deb2292...,a4187a15253d,JupyterTask,1.0,Terminated,6420116.0,6431658.0,200.0,3.925781,100.0,MISC
1261045,e4e802c3b63829c1e2e3343b,37c6523c16f1574055fb637a82bb22a913122d58c8368a...,a4187a15253d,PyTorchWorker,1.0,Running,6431847.0,,,,,
1261046,f11820cc3d678bcdc37faf08,d50242c89f3a25bd20ef700701777823c415ad0321dcb3...,a4187a15253d,PyTorchWorker,1.0,Terminated,6431859.0,6431872.0,1800.0,58.593750,100.0,MISC
1261047,ee3480523c1bf79b0d02c260,6c6338d4ac59fd1157e569fd6b5de6b70975c65112e9ba...,a4187a15253d,JupyterTask,1.0,Terminated,6431969.0,6435635.0,200.0,3.925781,100.0,MISC


In [14]:
job_result_df.value_counts('gpu_type')

gpu_type
MISC       696280
T4         226923
P100        72731
V100        28797
V100M32     18579
dtype: int64

In [15]:
dd_delayed = load_dd_delayed('group_tag_table')

In [16]:
group_tag_table_df = dd_delayed.compute()

In [17]:
group_tag_table_df

Unnamed: 0,inst_id,user,gpu_type_spec,group,workload
0,pai_group_tag_table.csv,58540f191766,,ba4e3be6472ded359f4b1ca53d9bd53e,
1,a8c39faded8b9a5b7436a47aff747835795cec39ef33c7...,58540f191766,,81b1eed4c4b0463b7e5a154f7c42db1e,
2,26bf18df11f3a989fa64ed808dc780ab24a7700941f7c1...,58540f191766,,046437ed700bdf7434aa88503cc97a8d,
3,a4fdd0229b8fcc227a8828daaa37500a29e125c1c38d1c...,58540f191766,,d50ee406f93d51ccf485b415861dd8f5,
4,4b4ab6c2a0691b35d8a9abb22ade8164b6be1bdb19c8a7...,58540f191766,,c5ee81877b36fdb5dac829aab7b4146a,
...,...,...,...,...,...
1055028,d072b733e39178d2f893e02ca46553b748afe9982789ff...,970063dba409,,fef6c044de369ba2bf168077064de744,
1055029,b81b2e0033aa5a5e591b908fb92ec79a0ecbd8c69f174e...,16808c52c63a,,7953bc96f287acc3cc7113e1aac89eed,
1055030,b8f69c79ceb61839dc74a742fddbb0e86f76b6d171f96c...,16808c52c63a,,a98575bef3fc5450b8a8b531d60dee7b,
1055031,701711437f4c950197e77f3c04ca23fa3bb6ac3c13dd25...,5a43fb583e5d,,1e32bc94f785407bea8179f77ae6152a,


In [18]:
group_tag_table_df.value_counts('workload')

workload
bert          54887
ctr           27083
nmt           10363
inception      5440
graphlearn     3754
resnet          541
xlnet           415
rl              249
vgg              66
dtype: int64

In [19]:
job_result_df = pd.merge(job_result_df, group_tag_table_df[['inst_id', 'group', 'workload']], on='inst_id', how='inner')

In [20]:
selected_df = job_result_df[(job_result_df.status == 'Terminated') & (job_result_df.workload.notnull())]

In [21]:
selected_df

Unnamed: 0,job_name,inst_id,user,task_name,inst_num,status,start_time,end_time,plan_cpu,plan_mem,plan_gpu,gpu_type,group,workload
515,5c69dd716653aa7be6ee77d3,83a28b86769dfb487f5eea1589862ac8269280a11f8191...,33e4fcf3d314,worker,2.0,Terminated,5452836.0,5452939.0,600.0,29.296875,100.0,V100,fbe13e29cea9d8c18612964c786ab0d7,bert
522,45bfe885631d2fc0f67c2b87,d61dd48fcecea78ffce62735d6b728371fa35a5697f1b1...,33e4fcf3d314,worker,5.0,Terminated,5461078.0,5461299.0,600.0,29.296875,100.0,MISC,fbe13e29cea9d8c18612964c786ab0d7,bert
537,762dade1375e5d7f06c3d637,23d6227dd31bb8affa714bb4ecacaa58d489aabb4a69c4...,33e4fcf3d314,worker,8.0,Terminated,5474754.0,5476760.0,600.0,29.296875,100.0,MISC,fbe13e29cea9d8c18612964c786ab0d7,bert
549,490fa420639aa44fd5b5b463,ae24a5194751c539ed3602978ecfab45eb1f6cb757c0b6...,33e4fcf3d314,worker,30.0,Terminated,5482097.0,5482459.0,600.0,29.296875,100.0,MISC,fbe13e29cea9d8c18612964c786ab0d7,bert
553,f24b418960c0f671a7966499,a79834dd88d13c47ca49c2b8544ccd529fa1a4598ead63...,33e4fcf3d314,worker,30.0,Terminated,5483829.0,5484732.0,600.0,29.296875,100.0,MISC,a91d89e7b403037001204c45a17a7a9a,bert
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1259058,bb86a612f4d545c5fb8836d6,7285648549a5bf5b2d8768428a4fbe1b842baf8dd52da7...,a4187a15253d,TensorboardTask,1.0,Terminated,5722718.0,5743470.0,100.0,1.972656,,,38dd1e31c5d5e26e7da460dbf799f6c7,bert
1259123,0f7e4da20d407697e34506ee,b43221ac6e39770df96b525ebba683a77a1772d564ed47...,61fabd2b354d,worker,20.0,Terminated,6061748.0,6064190.0,600.0,29.296875,100.0,P100,ff0f23df0fc88304a6138e69723ba320,graphlearn
1259124,0f7e4da20d407697e34506ee,b43221ac6e39770df96b525ebba683a77a1772d564ed47...,61fabd2b354d,ps,10.0,Terminated,6060811.0,6064207.0,600.0,29.296875,,,ff0f23df0fc88304a6138e69723ba320,graphlearn
1259155,7ff3e872b8be51537beeefde,b99050ffacf3f321400a836d0e9f9183d49fa4b695aada...,a4187a15253d,PyTorchWorker,1.0,Terminated,6419719.0,6421606.0,1800.0,58.593750,100.0,MISC,cd3e061c2953d442747217685b8cd059,bert


In [22]:
dd_delayed = load_dd_delayed('instance_table')

In [23]:
instance_df = dd_delayed.compute()

In [24]:
selected_df = pd.merge(selected_df, instance_df[['inst_id', 'inst_name', 'worker_name', 'machine']], on='inst_id', how='inner')

In [25]:
selected_df

Unnamed: 0,job_name,inst_id,user,task_name,inst_num,status,start_time,end_time,plan_cpu,plan_mem,plan_gpu,gpu_type,group,workload,inst_name,worker_name,machine
0,5c69dd716653aa7be6ee77d3,83a28b86769dfb487f5eea1589862ac8269280a11f8191...,33e4fcf3d314,worker,2.0,Terminated,5452836.0,5452939.0,600.0,29.296875,100.0,V100,fbe13e29cea9d8c18612964c786ab0d7,bert,d6eeb326a6ac4e4cfa2968387009ed6fa7a11b0ea004ae...,e7066f2ecab463fcc249d49eb618b7bda6f09c40c63e09...,d49058228ddcb1df3619cab2
1,5c69dd716653aa7be6ee77d3,83a28b86769dfb487f5eea1589862ac8269280a11f8191...,33e4fcf3d314,worker,2.0,Terminated,5452836.0,5452939.0,600.0,29.296875,100.0,V100,fbe13e29cea9d8c18612964c786ab0d7,bert,35e48c238a3a44df8402b8051e181efea509fd1ee22e7f...,1bedf5b67dee9db79033a65b3a3e339d85503e4f7c311c...,d49058228ddcb1df3619cab2
2,45bfe885631d2fc0f67c2b87,d61dd48fcecea78ffce62735d6b728371fa35a5697f1b1...,33e4fcf3d314,worker,5.0,Terminated,5461078.0,5461299.0,600.0,29.296875,100.0,MISC,fbe13e29cea9d8c18612964c786ab0d7,bert,2adb8f689b8e9e21923996e1f05698470f047956089914...,01e222fe1cbdb7d6acec5dd262d64d54fec243a647b29f...,78860d44ddf400bdea3a58ba
3,45bfe885631d2fc0f67c2b87,d61dd48fcecea78ffce62735d6b728371fa35a5697f1b1...,33e4fcf3d314,worker,5.0,Terminated,5461078.0,5461299.0,600.0,29.296875,100.0,MISC,fbe13e29cea9d8c18612964c786ab0d7,bert,79a1edb34552f3e7a3b197827c7dd2dea1b1b2f5ec218c...,c9fe7091f0d5b29b2747b4299de40184c8b38101e34a96...,0a77ce47d2dc5f1a13fa9075
4,45bfe885631d2fc0f67c2b87,d61dd48fcecea78ffce62735d6b728371fa35a5697f1b1...,33e4fcf3d314,worker,5.0,Terminated,5461078.0,5461299.0,600.0,29.296875,100.0,MISC,fbe13e29cea9d8c18612964c786ab0d7,bert,d5ad8cd6f89dea6a07337dd12b80ca208be4129b24fecf...,ece669376d21a6e1c075d6aaacab2fc815d3a79f2a9b20...,5236502e46b3fe5734587b68
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1615871,0f7e4da20d407697e34506ee,b43221ac6e39770df96b525ebba683a77a1772d564ed47...,61fabd2b354d,ps,10.0,Terminated,6060811.0,6064207.0,600.0,29.296875,,,ff0f23df0fc88304a6138e69723ba320,graphlearn,932aa55a40ebc2cf34857aa6de18e109e065b897e8db06...,3a9dd2fcca7d24b4d34368987c5a4b192027b52cbed738...,f2dbbdd8a06e1a6339d33ca5
1615872,7ff3e872b8be51537beeefde,b99050ffacf3f321400a836d0e9f9183d49fa4b695aada...,a4187a15253d,PyTorchWorker,1.0,Terminated,6419719.0,6421606.0,1800.0,58.593750,100.0,MISC,cd3e061c2953d442747217685b8cd059,bert,ec3a0cf96ee7be24c461b1132a3d1b185c5812ca309fdd...,abe014c4ad9f88eb8370bc67f275ad202183b0cc9e4cb5...,07a757904c2974820f7f9dce
1615873,7ff3e872b8be51537beeefde,b99050ffacf3f321400a836d0e9f9183d49fa4b695aada...,a4187a15253d,PyTorchWorker,1.0,Terminated,6419719.0,6421606.0,1800.0,58.593750,100.0,MISC,cd3e061c2953d442747217685b8cd059,bert,ec3a0cf96ee7be24c461b1132a3d1b185c5812ca309fdd...,444d9c6d3de87668f6f09f49038c3d7fe072cf63e4e7c9...,07a757904c2974820f7f9dce
1615874,7ff3e872b8be51537beeefde,b99050ffacf3f321400a836d0e9f9183d49fa4b695aada...,a4187a15253d,PyTorchWorker,1.0,Terminated,6419719.0,6421606.0,1800.0,58.593750,100.0,MISC,cd3e061c2953d442747217685b8cd059,bert,ec3a0cf96ee7be24c461b1132a3d1b185c5812ca309fdd...,4e0e26f837a6c8882a1e672ffd3c3484d1e2d8a83a816d...,12bcc4fceea93a30d7d0f324


In [26]:
dd_delayed = load_dd_delayed('machine_spec')

In [27]:
machine_spec_df = dd_delayed.compute()

In [28]:
machine_spec_df.columns = ['machine'] + [x+'_machine_spec' for x in machine_spec_df.columns.values[1:]]

In [29]:
machine_spec_df

Unnamed: 0,machine,gpu_type_machine_spec,cap_cpu_machine_spec,cap_mem_machine_spec,cap_gpu_machine_spec
0,pai_machine_spec.csv,CPU,96.0,512.0,0.0
1,75c536d5ba60528b3ef3ae40,CPU,96.0,512.0,0.0
2,6265a99de1a50cc6f1b03602,CPU,96.0,512.0,0.0
3,cb5c703eae0a123ad25e480d,CPU,96.0,512.0,0.0
4,fc6921f2af20c5337c43d4a9,CPU,96.0,512.0,0.0
...,...,...,...,...,...
1893,641527b3be32730e483adb33,V100M32,96.0,384.0,8.0
1894,c0c5d94d80e80974833e9dd1,V100M32,96.0,384.0,8.0
1895,3cc7f50829bef281c20cf1cc,V100M32,96.0,384.0,8.0
1896,a12e33726524543425a94398,V100M32,96.0,384.0,8.0


In [30]:
selected_df = pd.merge(selected_df, machine_spec_df, on='machine', how='inner')

In [31]:
selected_df

Unnamed: 0,job_name,inst_id,user,task_name,inst_num,status,start_time,end_time,plan_cpu,plan_mem,...,gpu_type,group,workload,inst_name,worker_name,machine,gpu_type_machine_spec,cap_cpu_machine_spec,cap_mem_machine_spec,cap_gpu_machine_spec
0,5c69dd716653aa7be6ee77d3,83a28b86769dfb487f5eea1589862ac8269280a11f8191...,33e4fcf3d314,worker,2.0,Terminated,5452836.0,5452939.0,600.0,29.296875,...,V100,fbe13e29cea9d8c18612964c786ab0d7,bert,d6eeb326a6ac4e4cfa2968387009ed6fa7a11b0ea004ae...,e7066f2ecab463fcc249d49eb618b7bda6f09c40c63e09...,d49058228ddcb1df3619cab2,V100,96.0,512.0,8.0
1,5c69dd716653aa7be6ee77d3,83a28b86769dfb487f5eea1589862ac8269280a11f8191...,33e4fcf3d314,worker,2.0,Terminated,5452836.0,5452939.0,600.0,29.296875,...,V100,fbe13e29cea9d8c18612964c786ab0d7,bert,35e48c238a3a44df8402b8051e181efea509fd1ee22e7f...,1bedf5b67dee9db79033a65b3a3e339d85503e4f7c311c...,d49058228ddcb1df3619cab2,V100,96.0,512.0,8.0
2,30927622a31d3e31ca883711,df762cc22acd0887e10484f9e2bd3ece5d944c7072e956...,ba601eb6b9ea,worker,8.0,Terminated,1737922.0,1738406.0,100.0,24.414062,...,V100,64de2da27f2a4530a2c60728b36284dd,bert,9f1c13cbd77c1f1007f489e8acc273ac2779a7593492b4...,ef97c766997bd3665f8f52c35ebb83cf50c84964320867...,d49058228ddcb1df3619cab2,V100,96.0,512.0,8.0
3,30927622a31d3e31ca883711,df762cc22acd0887e10484f9e2bd3ece5d944c7072e956...,ba601eb6b9ea,worker,8.0,Terminated,1737922.0,1738406.0,100.0,24.414062,...,V100,64de2da27f2a4530a2c60728b36284dd,bert,73788643a8455d43ce0a1d65e18147cc182be5e6b04c57...,912838588b64797adbf58419c45a50f41ccb15fadb11a1...,d49058228ddcb1df3619cab2,V100,96.0,512.0,8.0
4,385e98bcaaa59b7c7c6b5010,ab5298932d11de3c43b9d1739ece08960038d05508aa5c...,b5bf56100cf9,tensorflow,1.0,Terminated,4851277.0,4851645.0,100.0,29.296875,...,V100,8b3fef7bbfc89cf7dad58794e7ca3b98,bert,9625d71fced90d0f93a7ca6bedeb9ea1f028a88117d0b8...,f055afbb0447a90c46be21e69e6c3b815655e35a167943...,d49058228ddcb1df3619cab2,V100,96.0,512.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1615742,f174fe170050478be3a32d68,df9d6c96657a3fedbddb6e029d03bfe0b8dea62d9588a9...,26270a7a2e87,worker,1.0,Terminated,2950773.0,3220398.0,1000.0,58.593750,...,V100M32,a63b596d43b0eba7afff11fce7c589a0,nmt,719db15382def73e3a1d5973de5ff23d99169a3b561aeb...,04ee19082ca50b62be75f1e1a9c13bcdb161f5da3b69a3...,fd7232a5ac9d094ead14b852,V100M32,96.0,384.0,8.0
1615743,845082b83968be8a3f563367,a2c1a2af7577f178e7bd97cacd14e39fc2f7d74dc7e5d3...,26270a7a2e87,worker,1.0,Terminated,2950890.0,3222937.0,1000.0,58.593750,...,V100M32,541ae81871cf0c60bee016070b4e6702,nmt,7f3d4860550be89368b67399235956b1a42af805c77391...,4a82bb42283765ff94efc95d54e51ccc503a6aa74e23cf...,c2bf718f298f4173af121fc7,V100M32,96.0,384.0,8.0
1615744,8f90fa02d46a1bf32ded330c,26ede553249031223a7692c9656fb78065cbc3dfb27b0c...,26270a7a2e87,worker,1.0,Terminated,2950952.0,3221426.0,1000.0,58.593750,...,V100M32,3b8d06184fe3f27dc011e40581c4f57f,nmt,153a1515cd37927022cd874cc1827e28fd331098e03b8e...,3cb6759c8bd292986f6a6a5bba6fd2a6b666ccb4a52496...,c2bf718f298f4173af121fc7,V100M32,96.0,384.0,8.0
1615745,a1d4f67253c3cca3af1e361f,6f30d443dcbcdebcf75f48655b230e32247ccb38072211...,5b1345f03aa1,worker,50.0,Terminated,1149563.0,1149794.0,600.0,48.828125,...,MISC,c8c19dd35cdd68439c367a85b2af4388,graphlearn,34949cb76aca439404dd51b54301c59b05e0633112b1ab...,e5e66ccbc626b2e1ed9fe131a2bf49d4a26b376ffe0dd5...,62803827c565a380482650cd,P100,64.0,512.0,2.0


In [32]:
dd_delayed = load_dd_delayed('sensor_table')

In [33]:
sensor_df = dd_delayed.compute()

In [34]:
sensor_df.columns

Index(['job_name', 'task_name', 'worker_name', 'inst_id', 'machine',
       'gpu_name', 'cpu_usage', 'gpu_wrk_util', 'avg_mem', 'max_mem',
       'avg_gpu_wrk_mem', 'max_gpu_wrk_mem', 'read', 'write', 'read_count',
       'write_count'],
      dtype='object')

In [35]:
selected_df = pd.merge(selected_df, sensor_df[['inst_id', 'gpu_name', 'cpu_usage', 'gpu_wrk_util', 'avg_mem', 'max_mem', 'avg_gpu_wrk_mem', 'max_gpu_wrk_mem',
                                              'read', 'write', 'read_count', 'write_count']], on='inst_id', how='inner')

In [36]:
selected_df

Unnamed: 0,job_name,inst_id,user,task_name,inst_num,status,start_time,end_time,plan_cpu,plan_mem,...,cpu_usage,gpu_wrk_util,avg_mem,max_mem,avg_gpu_wrk_mem,max_gpu_wrk_mem,read,write,read_count,write_count
0,5c69dd716653aa7be6ee77d3,83a28b86769dfb487f5eea1589862ac8269280a11f8191...,33e4fcf3d314,worker,2.0,Terminated,5452836.0,5452939.0,600.0,29.296875,...,69.777778,0.000000,1.640234,3.710938,0.921131,4.491211,2.042608e+08,1.574263e+08,12103.818182,6818.863636
1,5c69dd716653aa7be6ee77d3,83a28b86769dfb487f5eea1589862ac8269280a11f8191...,33e4fcf3d314,worker,2.0,Terminated,5452836.0,5452939.0,600.0,29.296875,...,69.777778,0.000000,1.640234,3.710938,0.921131,4.491211,2.042608e+08,1.574263e+08,12103.818182,6818.863636
2,385e98bcaaa59b7c7c6b5010,ab5298932d11de3c43b9d1739ece08960038d05508aa5c...,b5bf56100cf9,tensorflow,1.0,Terminated,4851277.0,4851645.0,100.0,29.296875,...,19.384615,9.823529,1.868179,3.766602,2.284682,4.489258,5.731487e+06,2.826514e+06,3112.318841,3118.188406
3,0813c1ed08f61363964e4da3,d56013a343c89945a03a0e181b38dd943dc10d3b2bd365...,68c76f05c603,tensorflow,1.0,Terminated,5371716.0,5373955.0,100.0,29.296875,...,30.256236,10.455982,2.296227,2.547852,1.425726,1.493164,3.120095e+06,1.854956e+07,1254.844944,852.519101
4,2ddd434fed889e20c8458ce1,0c318c38918d06c7263d6a7c9edaed4d00a09cffb0cd47...,68c76f05c603,tensorflow,1.0,Terminated,5374913.0,5376321.0,100.0,29.296875,...,45.959559,16.591241,2.167719,2.429688,1.401987,1.458008,4.683575e+06,2.988751e+07,1623.239130,1303.775362
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36685590,9ec9d27e7c3c823035624a55,0c020cc89dcf8090c11656f617625092488ad20c4721f1...,26270a7a2e87,worker,1.0,Terminated,2426239.0,2528827.0,1000.0,58.593750,...,16.255374,10.403256,5.128001,6.196289,8.745916,8.752930,2.168539e+07,2.337102e+05,1035.265461,452.112627
36685591,591ba41540fbd238de2b8c99,ceddc43f3d33a36ad9756d9c52057bca3e70d61eadb617...,26270a7a2e87,worker,1.0,Terminated,2951611.0,3077790.0,1000.0,58.593750,...,16.726958,9.873926,17.133849,31.055664,8.742178,8.752930,2.086325e+07,2.354672e+05,968.722475,413.409234
36685592,f174fe170050478be3a32d68,df9d6c96657a3fedbddb6e029d03bfe0b8dea62d9588a9...,26270a7a2e87,worker,1.0,Terminated,2950773.0,3220398.0,1000.0,58.593750,...,9.354421,4.072437,16.894432,30.716797,5.750357,5.752930,7.649719e+06,1.206485e+05,717.546809,221.026928
36685593,845082b83968be8a3f563367,a2c1a2af7577f178e7bd97cacd14e39fc2f7d74dc7e5d3...,26270a7a2e87,worker,1.0,Terminated,2950890.0,3222937.0,1000.0,58.593750,...,9.600373,4.134147,16.949359,30.934570,5.772328,8.752930,7.566471e+06,1.198557e+05,705.133700,219.654390


dd_delayed = load_dd_delayed('machine_metric')

In [37]:
machine_metric_df = dd_delayed.compute()

In [38]:
sampled_workload_data = pd.read_csv('experiments/100_001_sampled_workload_data.csv', index_col=0)
sampled_workload_data.drop('index', axis=1, inplace=True)

In [39]:
sampled_workload_data

Unnamed: 0,job_name,inst_id,user,task_name,inst_num,status,start_time,end_time,plan_cpu,plan_mem,...,gpu_wrk_util,avg_mem,max_mem,avg_gpu_wrk_mem,max_gpu_wrk_mem,read,write,read_count,write_count,duration
0,71712e3c961b5bbdb2ce0440,c5be8813bf90aaf9ca6f02f744ccd95596e9850928a498...,2cbd33b08024,ps,5.0,Terminated,4625696.0,4628399.0,600.0,29.296875,...,7.541667,1.204708,2.723633,0.347158,0.829102,2.068066e+06,3.426616e+05,312.218182,219.376623,2703.0
1,8aa3a3a479479aecb41ea324,3808e75bd3204e01e84e33a9f6f873513e52bcc7afb33f...,2cbd33b08024,ps,5.0,Terminated,4642480.0,4644075.0,600.0,29.296875,...,12.049123,1.505760,2.671875,0.445045,0.827148,1.234028e+06,3.724553e+05,264.849650,173.374126,1595.0
2,b68f25260271d65caccca8e7,5229275f7f92865554f934e97f576ef99db0c079bba59d...,d4d51aca8806,worker,16.0,Terminated,4058070.0,4058321.0,50.0,29.296875,...,23.727273,1.037900,1.853516,0.445890,0.774414,1.010761e+07,3.781242e+06,2401.913043,590.652174,251.0
3,8aa3a3a479479aecb41ea324,3808e75bd3204e01e84e33a9f6f873513e52bcc7afb33f...,2cbd33b08024,ps,5.0,Terminated,4642480.0,4644075.0,600.0,29.296875,...,11.706294,1.481600,2.667969,0.449140,0.829102,2.358233e+06,4.527896e+05,404.843206,271.797909,1595.0
4,446cd4921e0f00ab5ce1c2ea,12b9ec70cc7c63ebc7184b8a0c363513a8c3cb19746c74...,2cbd33b08024,ps,2.0,Terminated,5521030.0,5526746.0,600.0,20.000000,...,13.005300,2.357810,3.056641,0.645605,0.827148,1.037695e+06,1.527555e+05,201.449250,181.084731,5716.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99996,e49cae3b33b5c205ed8542b0,5283a27d4f19d767256f97c18880a70d69f4d97ca655a3...,47baba33cffe,ps,1.0,Terminated,5890601.0,5893675.0,600.0,29.296875,...,81.609635,1.940527,1.995117,1.200031,1.213867,1.941028e+06,1.165604e+06,402.442786,171.301824,3074.0
99997,23f6e20075e637b0976b4ac8,d9a01656ea5d0f9792168159dbb61790d3b0228a633ac8...,47baba33cffe,ps,3.0,Terminated,4297415.0,4306572.0,600.0,29.296875,...,85.691919,1.803602,2.001953,1.975061,2.213867,1.035467e+06,4.269076e+05,259.100393,115.694335,9157.0
99998,bf7ee12301073cc75cba61a2,aa5054c3d153f66bf8e1dba6d33b3bb3a9317f1cc7daaa...,dbb3d4806d21,ps,5.0,Terminated,5448779.0,5450118.0,600.0,29.296875,...,44.271654,1.859159,2.024414,1.172409,1.213867,3.300603e+06,3.495205e+06,568.968627,295.933333,1339.0
99999,bb95f250974bf5a332f3ea8c,e13b3a5539d62d7b08cad9470ab3b3a33eb54640369337...,dbb3d4806d21,ps,5.0,Terminated,5822706.0,5822846.0,600.0,29.296875,...,0.000000,0.341797,0.735352,0.000000,0.000000,5.726675e+07,6.596024e+07,8070.600000,3965.400000,140.0


In [40]:
selected_df

Unnamed: 0,job_name,inst_id,user,task_name,inst_num,status,start_time,end_time,plan_cpu,plan_mem,...,cpu_usage,gpu_wrk_util,avg_mem,max_mem,avg_gpu_wrk_mem,max_gpu_wrk_mem,read,write,read_count,write_count
0,5c69dd716653aa7be6ee77d3,83a28b86769dfb487f5eea1589862ac8269280a11f8191...,33e4fcf3d314,worker,2.0,Terminated,5452836.0,5452939.0,600.0,29.296875,...,69.777778,0.000000,1.640234,3.710938,0.921131,4.491211,2.042608e+08,1.574263e+08,12103.818182,6818.863636
1,5c69dd716653aa7be6ee77d3,83a28b86769dfb487f5eea1589862ac8269280a11f8191...,33e4fcf3d314,worker,2.0,Terminated,5452836.0,5452939.0,600.0,29.296875,...,69.777778,0.000000,1.640234,3.710938,0.921131,4.491211,2.042608e+08,1.574263e+08,12103.818182,6818.863636
2,385e98bcaaa59b7c7c6b5010,ab5298932d11de3c43b9d1739ece08960038d05508aa5c...,b5bf56100cf9,tensorflow,1.0,Terminated,4851277.0,4851645.0,100.0,29.296875,...,19.384615,9.823529,1.868179,3.766602,2.284682,4.489258,5.731487e+06,2.826514e+06,3112.318841,3118.188406
3,0813c1ed08f61363964e4da3,d56013a343c89945a03a0e181b38dd943dc10d3b2bd365...,68c76f05c603,tensorflow,1.0,Terminated,5371716.0,5373955.0,100.0,29.296875,...,30.256236,10.455982,2.296227,2.547852,1.425726,1.493164,3.120095e+06,1.854956e+07,1254.844944,852.519101
4,2ddd434fed889e20c8458ce1,0c318c38918d06c7263d6a7c9edaed4d00a09cffb0cd47...,68c76f05c603,tensorflow,1.0,Terminated,5374913.0,5376321.0,100.0,29.296875,...,45.959559,16.591241,2.167719,2.429688,1.401987,1.458008,4.683575e+06,2.988751e+07,1623.239130,1303.775362
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36685590,9ec9d27e7c3c823035624a55,0c020cc89dcf8090c11656f617625092488ad20c4721f1...,26270a7a2e87,worker,1.0,Terminated,2426239.0,2528827.0,1000.0,58.593750,...,16.255374,10.403256,5.128001,6.196289,8.745916,8.752930,2.168539e+07,2.337102e+05,1035.265461,452.112627
36685591,591ba41540fbd238de2b8c99,ceddc43f3d33a36ad9756d9c52057bca3e70d61eadb617...,26270a7a2e87,worker,1.0,Terminated,2951611.0,3077790.0,1000.0,58.593750,...,16.726958,9.873926,17.133849,31.055664,8.742178,8.752930,2.086325e+07,2.354672e+05,968.722475,413.409234
36685592,f174fe170050478be3a32d68,df9d6c96657a3fedbddb6e029d03bfe0b8dea62d9588a9...,26270a7a2e87,worker,1.0,Terminated,2950773.0,3220398.0,1000.0,58.593750,...,9.354421,4.072437,16.894432,30.716797,5.750357,5.752930,7.649719e+06,1.206485e+05,717.546809,221.026928
36685593,845082b83968be8a3f563367,a2c1a2af7577f178e7bd97cacd14e39fc2f7d74dc7e5d3...,26270a7a2e87,worker,1.0,Terminated,2950890.0,3222937.0,1000.0,58.593750,...,9.600373,4.134147,16.949359,30.934570,5.772328,8.752930,7.566471e+06,1.198557e+05,705.133700,219.654390


In [41]:
stratified_sample_report(selected_df, ['workload'], 1_000)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp['size'] = 1


Unnamed: 0,workload,size,samp_size
0,bert,10938642,298
1,ctr,9128957,249
2,graphlearn,4886295,133
3,inception,10780275,294
4,nmt,13537,0
5,resnet,60863,2
6,rl,849626,23
7,vgg,11768,0
8,xlnet,15632,0


In [42]:
selected_not_already_sampled = selected_df[~selected_df.inst_id.isin(sampled_workload_data.inst_id)]

In [43]:
selected_not_already_sampled

Unnamed: 0,job_name,inst_id,user,task_name,inst_num,status,start_time,end_time,plan_cpu,plan_mem,...,cpu_usage,gpu_wrk_util,avg_mem,max_mem,avg_gpu_wrk_mem,max_gpu_wrk_mem,read,write,read_count,write_count
0,5c69dd716653aa7be6ee77d3,83a28b86769dfb487f5eea1589862ac8269280a11f8191...,33e4fcf3d314,worker,2.0,Terminated,5452836.0,5452939.0,600.0,29.296875,...,69.777778,0.000000,1.640234,3.710938,0.921131,4.491211,2.042608e+08,1.574263e+08,12103.818182,6818.863636
1,5c69dd716653aa7be6ee77d3,83a28b86769dfb487f5eea1589862ac8269280a11f8191...,33e4fcf3d314,worker,2.0,Terminated,5452836.0,5452939.0,600.0,29.296875,...,69.777778,0.000000,1.640234,3.710938,0.921131,4.491211,2.042608e+08,1.574263e+08,12103.818182,6818.863636
2,385e98bcaaa59b7c7c6b5010,ab5298932d11de3c43b9d1739ece08960038d05508aa5c...,b5bf56100cf9,tensorflow,1.0,Terminated,4851277.0,4851645.0,100.0,29.296875,...,19.384615,9.823529,1.868179,3.766602,2.284682,4.489258,5.731487e+06,2.826514e+06,3112.318841,3118.188406
3,0813c1ed08f61363964e4da3,d56013a343c89945a03a0e181b38dd943dc10d3b2bd365...,68c76f05c603,tensorflow,1.0,Terminated,5371716.0,5373955.0,100.0,29.296875,...,30.256236,10.455982,2.296227,2.547852,1.425726,1.493164,3.120095e+06,1.854956e+07,1254.844944,852.519101
4,2ddd434fed889e20c8458ce1,0c318c38918d06c7263d6a7c9edaed4d00a09cffb0cd47...,68c76f05c603,tensorflow,1.0,Terminated,5374913.0,5376321.0,100.0,29.296875,...,45.959559,16.591241,2.167719,2.429688,1.401987,1.458008,4.683575e+06,2.988751e+07,1623.239130,1303.775362
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36685590,9ec9d27e7c3c823035624a55,0c020cc89dcf8090c11656f617625092488ad20c4721f1...,26270a7a2e87,worker,1.0,Terminated,2426239.0,2528827.0,1000.0,58.593750,...,16.255374,10.403256,5.128001,6.196289,8.745916,8.752930,2.168539e+07,2.337102e+05,1035.265461,452.112627
36685591,591ba41540fbd238de2b8c99,ceddc43f3d33a36ad9756d9c52057bca3e70d61eadb617...,26270a7a2e87,worker,1.0,Terminated,2951611.0,3077790.0,1000.0,58.593750,...,16.726958,9.873926,17.133849,31.055664,8.742178,8.752930,2.086325e+07,2.354672e+05,968.722475,413.409234
36685592,f174fe170050478be3a32d68,df9d6c96657a3fedbddb6e029d03bfe0b8dea62d9588a9...,26270a7a2e87,worker,1.0,Terminated,2950773.0,3220398.0,1000.0,58.593750,...,9.354421,4.072437,16.894432,30.716797,5.750357,5.752930,7.649719e+06,1.206485e+05,717.546809,221.026928
36685593,845082b83968be8a3f563367,a2c1a2af7577f178e7bd97cacd14e39fc2f7d74dc7e5d3...,26270a7a2e87,worker,1.0,Terminated,2950890.0,3222937.0,1000.0,58.593750,...,9.600373,4.134147,16.949359,30.934570,5.772328,8.752930,7.566471e+06,1.198557e+05,705.133700,219.654390


In [44]:
sampled_test = stratified_sample(selected_not_already_sampled, ['workload'], size=1_000, seed=42, keep_index= True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp['size'] = 1
  stratified_df = stratified_df.append(tmp_df, ignore_index=True)
  stratified_df = stratified_df.append(tmp_df, ignore_index=True)
  stratified_df = stratified_df.append(tmp_df, ignore_index=True)
  stratified_df = stratified_df.append(tmp_df, ignore_index=True)
  stratified_df = stratified_df.append(tmp_df, ignore_index=True)
  stratified_df = stratified_df.append(tmp_df, ignore_index=True)
  stratified_df = stratified_df.append(tmp_df, ignore_index=True)
  stratified_df = stratified_df.append(tmp_df, ignore_index=True)


In [45]:
len(set(sampled_test.inst_id).intersection(set(sampled_workload_data.inst_id)))

0

In [46]:
selected_df['cpu_usage']

0           69.777778
1           69.777778
2           19.384615
3           30.256236
4           45.959559
              ...    
36685590    16.255374
36685591    16.726958
36685592     9.354421
36685593     9.600373
36685594     9.613537
Name: cpu_usage, Length: 36685595, dtype: float64

In [47]:
sampled_test.to_csv('experiments/1_000_sampled_test_data.csv')


In [48]:
keys = ['inst_name', 'workload', 'job_name', 'worker_name', 'inst_id']

In [49]:
sampled_with_machines = pd.merge(sampled_workload_data, machine_spec_df, on='machine', how='left')

In [50]:
sampled_with_machines

Unnamed: 0,job_name,inst_id,user,task_name,inst_num,status,start_time,end_time,plan_cpu,plan_mem,...,max_gpu_wrk_mem,read,write,read_count,write_count,duration,gpu_type_machine_spec,cap_cpu_machine_spec,cap_mem_machine_spec,cap_gpu_machine_spec
0,71712e3c961b5bbdb2ce0440,c5be8813bf90aaf9ca6f02f744ccd95596e9850928a498...,2cbd33b08024,ps,5.0,Terminated,4625696.0,4628399.0,600.0,29.296875,...,0.829102,2.068066e+06,3.426616e+05,312.218182,219.376623,2703.0,P100,64.0,512.0,2.0
1,8aa3a3a479479aecb41ea324,3808e75bd3204e01e84e33a9f6f873513e52bcc7afb33f...,2cbd33b08024,ps,5.0,Terminated,4642480.0,4644075.0,600.0,29.296875,...,0.827148,1.234028e+06,3.724553e+05,264.849650,173.374126,1595.0,P100,64.0,512.0,2.0
2,b68f25260271d65caccca8e7,5229275f7f92865554f934e97f576ef99db0c079bba59d...,d4d51aca8806,worker,16.0,Terminated,4058070.0,4058321.0,50.0,29.296875,...,0.774414,1.010761e+07,3.781242e+06,2401.913043,590.652174,251.0,T4,96.0,512.0,2.0
3,8aa3a3a479479aecb41ea324,3808e75bd3204e01e84e33a9f6f873513e52bcc7afb33f...,2cbd33b08024,ps,5.0,Terminated,4642480.0,4644075.0,600.0,29.296875,...,0.829102,2.358233e+06,4.527896e+05,404.843206,271.797909,1595.0,P100,64.0,512.0,2.0
4,446cd4921e0f00ab5ce1c2ea,12b9ec70cc7c63ebc7184b8a0c363513a8c3cb19746c74...,2cbd33b08024,ps,2.0,Terminated,5521030.0,5526746.0,600.0,20.000000,...,0.827148,1.037695e+06,1.527555e+05,201.449250,181.084731,5716.0,P100,64.0,512.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99996,e49cae3b33b5c205ed8542b0,5283a27d4f19d767256f97c18880a70d69f4d97ca655a3...,47baba33cffe,ps,1.0,Terminated,5890601.0,5893675.0,600.0,29.296875,...,1.213867,1.941028e+06,1.165604e+06,402.442786,171.301824,3074.0,T4,96.0,512.0,2.0
99997,23f6e20075e637b0976b4ac8,d9a01656ea5d0f9792168159dbb61790d3b0228a633ac8...,47baba33cffe,ps,3.0,Terminated,4297415.0,4306572.0,600.0,29.296875,...,2.213867,1.035467e+06,4.269076e+05,259.100393,115.694335,9157.0,T4,96.0,512.0,2.0
99998,bf7ee12301073cc75cba61a2,aa5054c3d153f66bf8e1dba6d33b3bb3a9317f1cc7daaa...,dbb3d4806d21,ps,5.0,Terminated,5448779.0,5450118.0,600.0,29.296875,...,1.213867,3.300603e+06,3.495205e+06,568.968627,295.933333,1339.0,T4,96.0,512.0,2.0
99999,bb95f250974bf5a332f3ea8c,e13b3a5539d62d7b08cad9470ab3b3a33eb54640369337...,dbb3d4806d21,ps,5.0,Terminated,5822706.0,5822846.0,600.0,29.296875,...,0.000000,5.726675e+07,6.596024e+07,8070.600000,3965.400000,140.0,T4,96.0,512.0,2.0


In [51]:
machine_metric_df = machine_metric_df[['worker_name', 'machine', 'machine_cpu_iowait', 'machine_cpu_kernel', 'machine_cpu_usr', 
                   'machine_gpu', 'machine_load_1', 'machine_net_receive', 'machine_num_worker', 'machine_cpu']]

KeyError: "['machine_cpu_iowait', 'machine_cpu_kernel', 'machine_cpu_usr', 'machine_gpu', 'machine_load_1', 'machine_net_receive', 'machine_num_worker', 'machine_cpu'] not in index"

In [None]:
machine_metric_df.columns = ['worker_name', 'machine'] + [x+'_machine_metric' for x in machine_metric_df.columns[2:]]

In [None]:
sampled_with_machines_final = pd.merge(sampled_with_machines, machine_metric_df, on='worker_name', how='left')

In [None]:
sampled_with_machines_final

In [None]:
sampled_with_machines_final.to_csv('experiments/100_001_sampled_workload_data_full.csv', index=False)