In [None]:
import pandas as pd
import numpy as np

### MPI Log Parse

This code takes a log file dump from one of our MPI tests and, for rows sharing a common `testName`, averages the `timeDelta` column over each pair of communicating processes (`thisID` / `thatID`). It produces a file of the form

```
testName,timeDelta
```

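Note: the resulting file may contain rows with duplicate `testName` entries, because one row is written for each ordered pair of processes within a test, and the two orderings of a pair, (a, b) and (b, a), yield the same average. In our Baseline test, those duplicates had the exact same value and could be discarded.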

There may be corrupted lines, though they occur irregularly in our test cases. The following regex matches every line that contains fewer than five commas, which helps you find malformed data lines manually in a text editor:

```Regex
^(?!.*,.*,.*,.*,.*,.*).*$
```

In [None]:
# Input log dump to read and the CSV file the averages are written to.
fileName = '20170426-22-25-baseline-1000msg.csv'
outFileName = 'baseline-1000msg-avgs-testNames.csv'

# Column headers used to index the log dump:
#   testNameColumn    - label of the test a row belongs to
#   originColumn      - ID recorded in the 'thisID' column
#   destinationColumn - ID recorded in the 'thatID' column
#   timeColumn        - the measurement that gets averaged
(testNameColumn,
 originColumn,
 destinationColumn,
 timeColumn) = ('testName', 'thisID', 'thatID', 'timeDelta')

In [None]:
# Read the raw log dump; everything after a '#' on a line is ignored by the parser.
# No dtype= is passed to read_csv: rows that are not data are filtered out
# first, and only then is the time column converted to float.
rawData = pd.read_csv(fileName, comment='#')

# Keep only rows whose testName looks like "<digits>-<digits>...".
# na=False makes rows with a missing testName evaluate to False instead of
# NaN; a NaN in the mask would make the boolean indexing below raise.
isDataRow = rawData[testNameColumn].str.contains('^[0-9]+-[0-9]+.*$', na=False)

# .copy() gives an independent frame, so the column assignment below does not
# write into a view of rawData (avoids pandas' SettingWithCopyWarning).
completeData = rawData[isDataRow].copy()

# Deliberately strict: a non-numeric timeDelta raises here rather than being
# silently dropped from the averages.
completeData[timeColumn] = completeData[timeColumn].astype(float)

# uncomment to check your input data
# completeData.head()

In [None]:
# Average timeColumn for every ordered pair of processes within each test.
#
# A sample counts toward the pair (proc, otherProc) regardless of which of the
# two was the origin, so the rows emitted for (a, b) and (b, a) carry the same
# average. Both rows are still written so the layout of the output file stays
# the same as before.
testNamesOutput = []
averages = []

# groupby compares testName for exact equality (a substring/regex match would
# also pull in rows of any test whose name merely contains this one) and, with
# sort=True, visits the names in sorted order.
for testName, data in completeData.groupby(testNameColumn, sort=True):
    # uncomment to print the testName currently being averaged
    # print(testName)
    procs = np.unique(data[originColumn].values)
    for proc in procs:
        # Every other process ID, selected by value. (np.delete would treat
        # `proc` as a position in the array, not as an ID.)
        otherProcs = procs[procs != proc]
        for otherProc in otherProcs:
            forward = ((data[originColumn] == proc) &
                       (data[destinationColumn] == otherProc))
            backward = ((data[originColumn] == otherProc) &
                        (data[destinationColumn] == proc))
            pairTimes = data.loc[forward | backward, timeColumn]

            # Divide by the number of matching rows. DataFrame.size would be
            # rows * columns and shrink the average by the column count.
            # Pairs with no samples are reported as 0.0.
            average = 0.0
            if len(pairTimes) > 0:
                average = float(pairTimes.sum()) / len(pairTimes)

            testNamesOutput.append(testName)
            averages.append(average)

# Fix the column order explicitly instead of relabelling columns afterwards.
outputDF = pd.DataFrame({testNameColumn: testNamesOutput, timeColumn: averages},
                        columns=[testNameColumn, timeColumn])

# Add the '.csv' extension only when the configured name lacks it, so a name
# that already ends in '.csv' is not saved as '*.csv.csv'.
outPath = outFileName if outFileName.lower().endswith('.csv') else outFileName + '.csv'

# The row index is written as an unnamed first column (pandas default).
outputDF.to_csv(outPath)
print(outPath + " saved")