In [1]:
import re
import json

# Format selectors. They are plain booleans and are compared with `==`
# by parseTrace, calcData and checkContinuity.
JSON = True
CSV = False

# Format of the intermediate file that parseTrace writes and calcData reads.
ParseType = JSON
# Passed to calcData as outType; when it equals CSV the CSV header row is written.
OutType = CSV

In [2]:
class RAW:
    """One parsed trace record; every field is stored as a string."""
    cpu_no:str
    time:str
    start_done:str
    host_no:str
    tag:str
    cmd_type:str
    lba:str
    txlen:str

    def __init__(self):
        # All fields default to the string '0'. The assignment order below
        # determines the key order of __dict__, which parseTrace relies on
        # when it emits CSV columns.
        for field in ('cpu_no', 'time', 'start_done', 'host_no',
                      'tag', 'cmd_type', 'lba', 'txlen'):
            setattr(self, field, '0')

    def fromJson(self, j):
        """Replace this record's attributes with the object decoded from JSON text `j`.

        The instance dictionary is swapped out wholesale (not merged), so any
        attribute that is missing from `j` is no longer present afterwards.
        """
        decoded = json.loads(j)
        self.__dict__ = decoded

In [3]:
class CMD:
    """A command assembled from a 'start' record and its matching 'done' record.

    Send() copies the fields of the start record, Complete() adds the end
    time and the next expected LBA, and toCSV() renders the finished command
    as one CSV row whose columns match getCsvHeader().
    """
    idx:int
    cpu_no:str
    time_start:str
    time_end:str
    host_no:str
    tag:str
    cmd_type:str
    lba:str
    txlen:str
    queue_cnt:int
    nextlba:str
    continuity:int  # 1: True, 0: False

    # The annotations below use the string form "RAW" so that defining this
    # class does not require RAW to exist yet (notebook cells may be run in
    # any order); only attribute access on the passed object matters.
    def Send(self, idx, raw:"RAW", queueCnt):
        """Initialise the command from a start record.

        Args:
            idx: running index assigned to this command by the caller.
            raw: start record providing cpu_no, time, host_no, tag,
                cmd_type, lba and txlen.
            queueCnt: queue size reported by the caller at issue time.
        """
        self.idx = idx
        self.cpu_no = raw.cpu_no
        self.time_start = raw.time
        self.host_no = raw.host_no
        self.tag = raw.tag
        self.cmd_type = raw.cmd_type
        self.lba = raw.lba
        self.txlen = raw.txlen
        self.queue_cnt = queueCnt
        self.continuity = 0

    def Complete(self, raw:"RAW"):
        """Record the completion time and compute nextlba = lba + txlen.

        Raises:
            ValueError: if lba or txlen is not a decimal integer string.
        """
        self.time_end = raw.time
        self.nextlba = str(int(self.lba) + int(self.txlen))

    @staticmethod
    def getCsvHeader():
        """Return the CSV header line (no trailing newline) for toCSV() rows.

        Declared static so it works both as CMD.getCsvHeader() and on an
        instance; without the decorator an instance call raised TypeError.
        """
        return ('idx,'
                'issue_time,'
                'cmd_type,'
                'lba,'
                'size(KB),'
                'latency(us),'
                'queue_cnt,'
                'cpu_no,'
                'host_no,'
                'tag,'
                'nextlba,'
                'continuity')

    def toCSV(self):
        """Render the completed command as one CSV row (no trailing newline).

        Requires Send() and Complete() to have been called first.
        """
        # txlen is halved to obtain the size(KB) column
        # (presumably txlen counts 512-byte sectors -- confirm against the trace format).
        size_kb = int(self.txlen) // 2
        # round() instead of int(): the float subtraction can land a hair below
        # the true integer microsecond count, and truncation would then
        # under-report the latency by 1 us.
        latency_us = round(float(self.time_end)*1E6 - float(self.time_start)*1E6)
        fields = (
            self.idx,
            self.time_start,
            self.cmd_type,
            self.lba,
            size_kb,
            latency_us,
            self.queue_cnt,
            self.cpu_no,
            self.host_no,
            self.tag,
            self.nextlba,
            self.continuity,
        )
        return ','.join(str(field) for field in fields)


In [4]:
def parseLine(line:str):
    """Parse one scsi_dispatch_cmd trace line into a RAW record.

    The line is split at 'cmnd='; the part before it supplies cpu_no, time,
    start/done, host_no and driver_tag, and the part after it supplies the
    command type plus, for commands whose type starts with READ or WRITE,
    lba and txlen (other commands get '0' for both).

    Args:
        line: one line of the trace text.

    Returns:
        A populated RAW instance, or None when the line has no 'cmnd='
        section or one of the expected fields is missing. In that case the
        fields parsed so far and the offending line are printed.
    """
    sp = line.split('cmnd=')
    if len(sp) < 2:
        # No command payload at all. Previously this raised UnboundLocalError
        # from inside the except handler because `raw` was not yet bound.
        print(line)
        return None
    header = sp[0]
    data = sp[1]

    fields = {}
    try:
        # Header
        fields['cpu_no'] = re.findall(r'\[([^]]+)\]', header)[0]
        fields['time'] = re.findall(r'\s([\d.]+)\:', header)[0]
        fields['start_done'] = re.findall(r'scsi_dispatch_cmd_([\w]+)\:', header)[0]
        fields['host_no'] = re.findall(r'host_no=([\d]+)\s', header)[0]
        fields['tag'] = re.findall(r'driver_tag=([\d]+)\s', header)[0]

        # Data
        fields['cmd_type'] = re.findall(r'\(([\w]+)\s', data)[0]
        if re.match('READ|WRITE', fields['cmd_type']):
            fields['lba'] = re.findall(r'lba=([\d]+)\s', data)[0]
            fields['txlen'] = re.findall(r'txlen=([\d]+)\s', data)[0]
        else:
            fields['lba'] = '0'
            fields['txlen'] = '0'
    except IndexError:
        # findall() returned no match for one of the fields; report and skip
        # the line. Only IndexError is caught so that KeyboardInterrupt and
        # real programming errors are no longer swallowed.
        print(fields)
        print(line)
        return None

    raw = RAW()
    for name, value in fields.items():
        setattr(raw, name, value)
    return raw

In [5]:
def parseTrace(filename, parseType = JSON):
    """Convert a raw trace text file into one parsed record per line.

    Lines containing 'LOST' are printed and skipped; lines containing
    'WRITE_SAME_16' are skipped silently. Every other line is passed to
    parseLine(); lines it cannot parse (None result) are skipped instead of
    crashing on `raw.__dict__`.

    Args:
        filename: path of the trace text file.
        parseType: JSON writes one JSON object per line to `filename + '.json'`;
            anything else writes CSV with an 'idx' column and a header row to
            `filename + '.parse'`.

    Returns:
        The path of the file that was written.
    """
    if parseType == JSON:
        outname = filename + '.json'
    else:
        outname = filename + '.parse'

    idx = 0
    # `with` guarantees both files are closed even if parsing raises.
    with open(filename, 'r') as infile, open(outname, 'w') as outfile:
        for line in infile:
            if 'LOST' in line:
                print(line)
                continue
            if 'WRITE_SAME_16' in line:
                continue

            raw = parseLine(line)
            if raw is None:
                # Unparseable line: already reported by parseLine.
                continue

            record = raw.__dict__
            if parseType == JSON:
                outfile.write(f'{json.dumps(record)}\n')
            else:
                if idx == 0:
                    outfile.write(f'idx,{",".join(record.keys())}\n')
                outfile.write(f'{idx},{",".join(record.values())}\n')
            idx += 1

    return outname

In [6]:
def checkContinuity(filename, outType = CSV):
    """Rewrite the last column of each row with a sequential-access flag.

    For every data row the last column is replaced by 1 when the row's lba
    (column 3) equals the previous row's nextlba (its second-to-last column),
    and by 0 otherwise. The first data row has no predecessor and is written
    with flag 0; it used to be consumed as the initial "previous" row and
    silently dropped from the output.

    Args:
        filename: path of the CSV produced by calcData.
        outType: when equal to CSV, the first line is treated as a header
            and copied through unchanged.

    Returns:
        The path of the written file (`filename + '.csv'`).
    """
    outname = filename + '.csv'

    # `with` guarantees both files are closed even if a row is malformed.
    with open(filename, 'r') as infile, open(outname, 'w') as outfile:
        if outType == CSV:    # copy the header through
            outfile.write(infile.readline())

        prevNextLba = None    # nextlba of the previous data row, if any
        for line in infile:
            curLine = line.split(',')[:-1]   # drop the old continuity column
            if len(curLine) < 4:
                # Blank or truncated line: there is no lba column to compare.
                continue

            isContinuous = prevNextLba is not None and prevNextLba == curLine[3]
            outfile.write(f'{",".join(curLine)},{1 if isContinuous else 0}\n')

            prevNextLba = curLine[-1]

    return outname
        

In [7]:
def calcData(filename, parseType = JSON, outType = CSV):
    """Pair 'start' and 'done' records and write one row per completed command.

    A 'start' record creates a CMD that is stored under its lba; a later
    'done' record with the same lba completes that CMD, writes it with
    CMD.toCSV() and removes it. 'done' records without a pending 'start'
    are ignored.

    Args:
        filename: path of the file produced by parseTrace.
        parseType: format of the input. Only JSON is decoded; for CSV the
            header line is skipped but the rows are not yet parsed (TODO),
            so no commands are produced.
        outType: when equal to CSV, the CMD CSV header is written first.

    Returns:
        The path of the written file (`filename + '.calc'`).
    """
    calcfilename = filename + '.calc'
    # NOTE(review): pending commands are keyed by lba only, so two in-flight
    # commands with the same lba (including every non-READ/WRITE command,
    # which parseLine gives lba '0') overwrite each other -- confirm whether
    # (host_no, tag) would be the better key.
    Queue = dict()
    idx = 0

    # `with` guarantees both files are closed even if decoding a line raises.
    with open(filename, 'r') as infile, open(calcfilename, 'w') as outfile:
        if parseType == CSV:    # to skip header
            infile.readline()
        if outType == CSV:
            outfile.write(f'{CMD.getCsvHeader()}\n')

        for line in infile:
            if not line.strip():
                # A blank line would make json.loads raise; skip it.
                continue

            raw = RAW()
            if parseType == JSON:
                raw.fromJson(line)

            # TODO: need to implement
            # if parseType == CSV:
            #     raw.fromCsv(line)

            if raw.start_done == 'start':
                cmd = CMD()
                Queue[raw.lba] = cmd
                # The queue count is taken after insertion, so it includes this command.
                cmd.Send(idx, raw, len(Queue))
                idx += 1

            elif raw.start_done == 'done':
                cmd = Queue.pop(raw.lba, None)
                if cmd is not None:
                    cmd.Complete(raw)
                    outfile.write(f'{cmd.toCSV()}\n')

    return calcfilename


In [8]:
def getTimestamp(line):
    """Return the second comma-separated field of `line` as a float."""
    # Only the first two separators matter for reaching field index 1.
    columns = line.split(',', 2)
    stamp = columns[1]
    return float(stamp)

def getChunksizekb(line):
    """Return the fifth comma-separated field of `line` as an int."""
    # Splitting past the fifth separator is unnecessary for field index 4.
    columns = line.split(',', 5)
    size_text = columns[4]
    return int(size_text)

In [9]:
def calcThroughput(filename, unitTime_ms = 1000):
    """Aggregate transferred size into fixed time buckets and write them as CSV.

    Rows are read from `filename` (lines containing 'idx' are treated as the
    header and skipped). Starting at the first row's timestamp, sizes are
    summed per bucket of `unitTime_ms` milliseconds; each finished bucket is
    written as `bucket_end_time,total_KB/1024`. Buckets in which no row falls
    are written with 0 so that an idle gap no longer shifts every later
    bucket (the boundary used to advance by only one tick per row).

    The output path is `filename` with '.json.calc.csv' replaced by
    '_throughput.csv'; when that suffix is absent, '_throughput.csv' is
    appended instead so that the input file is never overwritten.

    Args:
        filename: path of the CSV produced by checkContinuity.
        unitTime_ms: bucket width in milliseconds; must be positive.

    Raises:
        ValueError: if unitTime_ms is not positive.
    """
    if unitTime_ms <= 0:
        raise ValueError('unitTime_ms must be positive')

    with open(filename, 'r') as infile:
        # Skip the header line and blank lines once, up front.
        dataLines = [line for line in infile if line.strip() and 'idx' not in line]

    Tick = unitTime_ms / 1000

    dicThroughput = {}
    if dataLines:    # a header-only file used to raise IndexError on Lines[1]
        beginTime = getTimestamp(dataLines[0])
        endTime = getTimestamp(dataLines[-1])

        Timespan = beginTime + Tick
        totalSize_kb = 0
        for line in dataLines:
            timestamp = getTimestamp(line)
            # Close every bucket that ends before this row, not just one.
            while Timespan < timestamp:
                # NOTE(review): this is MB per bucket; it equals MB/s only
                # when unitTime_ms is 1000 -- confirm the intended unit.
                dicThroughput[Timespan] = totalSize_kb/1024
                totalSize_kb = 0
                Timespan += Tick

            totalSize_kb += getChunksizekb(line)

        # Remaining rows in the last, partial bucket.
        if totalSize_kb > 0:
            beforeTime = Timespan - Tick
            # NOTE(review): scaling is kept exactly as in the original
            # (size * elapsed / Tick); verify that this is the intended rate.
            dicThroughput[endTime] = totalSize_kb/1024 * (endTime - beforeTime) / Tick

    outname = str.replace(filename, '.json.calc.csv', '_throughput.csv')
    if outname == filename:
        # Suffix not present: never write over the input file.
        outname = filename + '_throughput.csv'
    with open(outname, 'w') as outfile:
        outfile.write('timeStamp,throughput(MB/s)\n')
        for key in dicThroughput:
            outfile.write(f'{key},{dicThroughput[key]}\n')

In [10]:
# Run the whole pipeline on the captured trace. Each stage returns the path
# of the file it wrote, which is the input of the next stage.
filename = './raw/trace_scsi.txt'
# Stage 1: raw trace text -> one parsed record per line.
parsedFile = parseTrace(filename, ParseType)
# Stage 2: pair start/done records -> one row per completed command.
calcFile = calcData(parsedFile, ParseType, OutType)
# Stage 3: fill in the continuity column.
csvFile = checkContinuity(calcFile)
# Stage 4: write the per-interval throughput file.
calcThroughput(csvFile)

CPU:6 [LOST 478 EVENTS]

CPU:3 [LOST 848 EVENTS]

CPU:4 [LOST 378 EVENTS]

CPU:0 [LOST 1070 EVENTS]

CPU:2 [LOST 35413 EVENTS]

