In [1]:
from datetime import time as dt_time
from os import makedirs, walk

import numpy as np
import pandas as pd

In [2]:
# Load every raw trace in RawData/ into one DataFrame per workload.
#
# Each line is cut at a fixed column: the first HEADER_WIDTH characters hold
# whitespace-separated header fields, and the text after the next character
# is a comma-separated list of "key = value" pairs wrapped in braces.
HEADER_WIDTH = 79  # fixed column where the header part of a trace line ends
RAW_COLUMNS = ["workload", "event", "duration", "ino", "index"]

dfs = []
# NOTE: the directory order returned by os.walk is kept as-is on purpose;
# a later cell addresses dfs[0] / dfs[1] by position.
workload_rawdata_filenames = next(walk("RawData"))[2]

for workload_rawdata_filename in workload_rawdata_filenames:
    # The workload name is the file name without its 4-character extension.
    workload = workload_rawdata_filename[:-4]
    data = []

    with open(f'RawData/{workload_rawdata_filename}') as trace_file:
        for line in trace_file:
            if not line.strip():
                continue  # tolerate blank lines instead of failing on words[1]

            # Header fields: words[0] (timestamp) and words[2] (device) are
            # not needed for the features built later, so they are not parsed.
            words = line[:HEADER_WIDTH].split()
            delta = words[1][1:-1]  # strip the surrounding delimiter characters
            # A delta containing "?" has no numeric value; it is recorded as 0.
            duration = 0.0 if "?" in delta else float(delta)
            event = words[3]

            # Payload: drop the braces, then keep only the value of each pair.
            # values[0] (cpu_id) and values[1] (name) are not used.
            payload = line[HEADER_WIDTH + 1:-1].replace("{", "").replace("}", "")
            values = [pair.split("=")[1].strip() for pair in payload.split(",")]
            ino = int(values[2])
            index = int(values[3])

            data.append((workload, event, duration, ino, index))

    dfs.append(pd.DataFrame(data, columns=RAW_COLUMNS))

display(dfs[0].head())

Unnamed: 0,workload,event,duration,ino,index
0,readrandom,writeback_dirty_page,0.0,1342340,0
1,readrandom,writeback_dirty_page,0.034154,2805381,128
2,readrandom,writeback_dirty_page,2.8e-05,2805381,1208
3,readrandom,writeback_dirty_page,2e-06,2805381,966
4,readrandom,writeback_dirty_page,0.662213,2228260,0


In [3]:
# Keep only the first MAX_TRACEPOINTS rows of the first two workloads.
# Positional slicing replaces dropping hard-coded label ranges: it does not
# depend on the exact length of each trace, and re-running the cell is a
# no-op instead of a KeyError on labels that were already removed.
MAX_TRACEPOINTS = 200

for position in (0, 1):
    dfs[position] = dfs[position].iloc[:MAX_TRACEPOINTS]

In [4]:
def cumulative_moving_average(x):
    """Return the running mean of ``x``.

    Element ``k`` of the result is the mean of ``x[0]`` .. ``x[k]``.
    ``x`` must expose ``.shape`` (e.g. a NumPy array or a pandas Series);
    the result has the same length as ``x``.
    """
    # Number of samples seen so far at each position: 1, 2, ..., len(x).
    sample_counts = np.arange(1, x.shape[0] + 1)
    running_total = np.cumsum(x)
    return running_total / sample_counts

def cumulative_moving_standard_deviation(x):
    """Return the running (population) standard deviation of ``x``.

    Element ``k`` of the result is the standard deviation of ``x[0]`` ..
    ``x[k]``, computed as ``sqrt(mean(x**2) - mean(x)**2)``.

    ``x`` must expose ``.shape`` and ``.astype`` (e.g. a NumPy array or a
    pandas Series); the result has the same length and container type.
    """
    # Number of samples seen so far at each position: 1, 2, ..., len(x).
    sample_counts = np.arange(1, x.shape[0] + 1)

    # Work in float64 so squaring large integer offsets cannot wrap around.
    values = x.astype(np.float64)

    running_mean = np.cumsum(values) / sample_counts
    running_mean_of_squares = np.cumsum(values ** 2) / sample_counts
    variance = running_mean_of_squares - running_mean ** 2

    # Rounding can push the variance of (near-)constant data slightly below
    # zero; clamp it so the square root yields 0 instead of NaN.
    return np.sqrt(np.maximum(variance, 0.0))

def difference(x):
    """Return the signed difference between each element and its predecessor.

    The first element is compared against 0, so ``result[0] == x[0]``.
    ``x`` is a pandas Series; the result keeps ``x``'s index and length.
    """
    # shift() moves values by position while keeping x's own index, so the
    # subtraction lines up even when the index is not 0..n-1, and an empty
    # input stays empty. fill_value=0 also preserves an integer dtype.
    return x - x.shift(1, fill_value=0)

def count_last_second(x, window=1):
    """Count, for every element, how many elements fall inside the trailing window.

    ``x[i]`` is treated as the gap between element ``i - 1`` and element
    ``i``. For each ``i`` the function walks backwards, accumulating gaps,
    and counts element ``i`` itself plus every earlier element reached while
    the accumulated gap has not exceeded ``window``.

    Parameters
    ----------
    x : sequence of numbers (pandas Series, NumPy array or list)
        Gaps between consecutive elements, read by position.
    window : number, default 1
        Size of the trailing window, in the same unit as ``x``.

    Returns
    -------
    pandas.Series
        Integer counts with a default 0..n-1 index, one per element of ``x``.
    """
    # Plain Python floats read by position: independent of the Series index
    # and much cheaper than per-element Series lookups inside the loop.
    gaps = np.asarray(x, dtype=float).tolist()
    counts = []

    for position, elapsed in enumerate(gaps):
        count = 1  # the element itself
        cursor = position

        # Stop at the first element or once the accumulated gap exceeds the
        # window ("not >" keeps the original behaviour for NaN gaps).
        while cursor > 0 and not elapsed > window:
            cursor -= 1
            count += 1
            elapsed += gaps[cursor]

        counts.append(count)

    return pd.Series(counts, dtype="int64")
 

In [5]:
# Build one feature DataFrame per workload from the raw tracepoint frames.
new_dfs = []

# NOTE(review): the fifth column is labelled "absolute", but difference()
# returns signed values - confirm which of the two is intended.
columns = [
    "workload",
    "tracepoints were traced last second",
    "cumulative moving mean of page offsets",
    "cumulative moving standard deviation of page offsets",
    "absolute page offset differences for consecutive tracepoints",
    "inode number",
]

for df in dfs:
    page_offsets = df["index"]
    # Computed once and reused; this is the costliest feature to derive.
    count_tracepoints = count_last_second(df["duration"])

    data = zip(
        df["workload"],
        count_tracepoints,
        cumulative_moving_average(page_offsets),
        cumulative_moving_standard_deviation(page_offsets),
        difference(page_offsets),
        df["ino"],
    )

    new_dfs.append(pd.DataFrame(data, columns=columns))

In [6]:
# Write one CSV per workload plus a combined file with every workload.
OUTPUT_DIR = "ProcessedData"
# Create the output folder if it is missing so the cell runs on a fresh checkout.
makedirs(OUTPUT_DIR, exist_ok=True)

# new_dfs was built in the same order as workload_rawdata_filenames, so the
# two can be paired positionally; [:-4] strips the 4-character extension.
for rawdata_filename, processed_df in zip(workload_rawdata_filenames, new_dfs):
    processed_df.to_csv(f"{OUTPUT_DIR}/{rawdata_filename[:-4]}.csv", index=False)

whole_data_df = pd.concat(new_dfs, ignore_index=True)
whole_data_df.to_csv(f"{OUTPUT_DIR}/whole_data.csv", index=False)

display(whole_data_df)

Unnamed: 0,workload,tracepoints were traced last second,cumulative moving mean of page offsets,cumulative moving standard deviation of page offsets,absolute page offset differences for consecutive tracepoints,inode number
0,readrandom,1,0.000000e+00,0.000000e+00,0,1342340
1,readrandom,2,6.400000e+01,6.400000e+01,128,2805381
2,readrandom,3,4.453333e+02,5.418126e+02,1080,2805381
3,readrandom,4,5.755000e+02,5.205773e+02,-242,2805381
4,readrandom,5,4.604000e+02,5.194157e+02,-966,2228260
...,...,...,...,...,...,...
482,readseq,5,2.763820e+06,3.841151e+06,-67,2805381
483,readseq,6,2.691117e+06,3.815985e+06,-101,2805381
484,readseq,7,2.622114e+06,3.790685e+06,-1061,2805378
485,readseq,8,2.556583e+06,3.765307e+06,860,2805381
