# Bulk processing

Take all the jump files in a data lake and process them as a group.  Produce the mean results for scores, max speed, curves, and the 5-second partial results.

Conforming to Lucyfer's default configuration, the data lake starts at the `./data` directory and includes any and all files below it.

In [None]:
from copy import deepcopy

from ssscoring import convertFlySight2SSScoring
from ssscoring import dropNonSkydiveDataFrom
from ssscoring import getSpeedSkydiveFrom
from ssscoring import isValidJump
from ssscoring import jumpAnalysisTable
from ssscoring.notebook import processJump

import csv
import os
import os.path as path

import pandas as pd

In [None]:
# Top directory of the data lake; every file below it is a candidate.
DATA_LAKE_ROOT = './data' # Lucyfer default

# Column names that must all be present in the first line of a file for it
# to be accepted as a FlySight CSV.
FLYSIGHT_HEADER = {
    'time',
    'lat',
    'lon',
    'hMSL',
    'velN',
    'velE',
    'velD',
    'hAcc',
    'vAcc',
    'sAcc',
    'heading',
    'cAcc',
    'gpsFix',
    'numSV',
}

# Directories whose path contains any of these names are skipped.
IGNORE_LIST = ['.ipynb_checkpoints']

# Smallest file size, in bytes, accepted as a jump file (1 MiB).
MIN_JUMP_FILE_SIZE = 1024 ** 2

## Get a list of all FlySight files in the data lake

This also discards all files that don't reflect a valid jump:

- Check each file's size and discard any file smaller than `MIN_JUMP_FILE_SIZE`
- Verify that each remaining file has the FlySight header on its first line

In [None]:
def validFlySightHeaderIn(fileCSV: str) -> bool:
    """
    Checks if a file is a CSV in FlySight format.

    Arguments
    ---------
        fileCSV
    A file name to verify as a valid FlySight file

    Returns
    -------
    `True` if `fileCSV` is a FlySight CSV file, otherwise `False`.  A file
    whose first line is empty, has no comma delimiter, or can't be decoded as
    text is reported as `False` instead of raising an exception.
    """
    delimiters =  [',', ]
    # newline = '' leaves line ending handling to the csv module, as its
    # documentation recommends.
    with open(fileCSV, 'r', newline = '') as inputFile:
        try:
            # The sniffer raises csv.Error when it can't find one of the
            # allowed delimiters in the sample; that means "not FlySight".
            dialect = csv.Sniffer().sniff(inputFile.readline(), delimiters = delimiters)
            if dialect.delimiter not in delimiters:
                return False
            # Rewind so the CSV reader parses the header line from the start.
            inputFile.seek(0)
            header = next(csv.reader(inputFile), [])
        except (csv.Error, UnicodeDecodeError):
            return False

    return FLYSIGHT_HEADER.issubset(header)

In [None]:
# Collect the path of every file in the data lake that passes the name, size,
# and FlySight header checks.
jumpFiles = []
for root, dirs, files in os.walk(DATA_LAKE_ROOT):
    inIgnoredDirectory = any(ignored in root for ignored in IGNORE_LIST)
    if inIgnoredDirectory:
        continue
    for fileName in files:
        if 'CSV' not in fileName:
            continue
        jumpFileName = path.join(root, fileName)
        # Cheap size test first; the header is only read for large enough files.
        bigEnough = os.stat(jumpFileName).st_size >= MIN_JUMP_FILE_SIZE
        if bigEnough and validFlySightHeaderIn(jumpFileName):
            jumpFiles.append(jumpFileName)

---
## Process all files

In [None]:
# Load, convert, and process each jump file; results keep the order of jumpFiles.
jumpResults = []
for jumpFile in jumpFiles:
    # skiprows drops the line right after the header (presumably the FlySight
    # units row -- confirm against a sample file).
    rawData = pd.read_csv(jumpFile, skiprows = (1, 1))
    scoringData = convertFlySight2SSScoring(rawData)
    jumpResults.append(processJump(scoringData))

In [None]:
# Show the field names of the first processed jump result (`_fields` suggests
# that processJump returns a namedtuple -- confirm against ssscoring.notebook).
jumpResults[0]._fields

In [None]:
# Display the `table` attribute of the first processed jump result.
jumpResults[0].table

In [None]:
# List the column labels of the first result's table (presumably a pandas
# DataFrame -- confirm against processJump).
jumpResults[0].table.columns

In [None]:
# Pivot the first jump's table so that its time values become the columns,
# then display it without the 'altitude (ft)' row.
firstTable = jumpResults[0].table
x = pd.pivot_table(firstTable, columns = firstTable.time)
x.drop('altitude (ft)')