### Set up ###

In [3]:
import numpy as np
import math

# Conversion settings (CSV -> .npy)
# Dataset used here: 712 training CSV files and 417 testing CSV files, with at most 50 CSV files packed into each .npy file
# numRecordsPerFile is the row budget reserved per CSV file when the output arrays are preallocated;
# numFeatures is the number of leading columns kept from each input row
numRecordsPerFile, numFeatures = 10000, 2800
numTrainingFiles, numTestingFiles = 712, 417
filesPerNpy = 50


[  0  90 180 270 360 450 540 630 712]


### Load and save the training data ###

In [7]:
# Path prefixes for the training CSV files. Each file name is built further down
# as prefix + <file number> + ".csv".
# NOTE: both assignments below are placeholders with no right-hand side. Fill in a
# string path prefix for each before running; as written the cell is not valid Python.
trainingInputPrefix = # Path to CSV files for training data (just the prefix)
trainingOutputPrefix = # Path to CSV files for training targets (just the prefix)

# Chunk boundaries: chunk k packs CSV files startPoints[k] .. stopPoints[k]-1 into one pair of .npy files.
# The last stop point is numTrainingFiles, so the final chunk may contain fewer than filesPerNpy files.
stopPoints = np.append(np.arange(filesPerNpy, numTrainingFiles, filesPerNpy), numTrainingFiles)
startPoints = np.arange(0, numTrainingFiles, filesPerNpy)
counter = 0

for stop in stopPoints:
    # Progress marker: exclusive end of the chunk being processed
    print(stop)

    # Preallocate for numRecordsPerFile rows per CSV file; index counts the rows actually written
    index = 0
    X_train = np.zeros((numRecordsPerFile*filesPerNpy, numFeatures))
    Y_train = np.zeros((numRecordsPerFile*filesPerNpy, 1))

    for file in range(startPoints[counter], stop):
        trainingInputFileName = trainingInputPrefix + str(file) + ".csv"
        trainingOutputFileName = trainingOutputPrefix + str(file) + ".csv"

        # Read all lines from both files; the context managers close the handles
        # even if a later step raises (e.g. a malformed number in a row)
        with open(trainingInputFileName) as trainingInputFile, \
                open(trainingOutputFileName) as trainingOutputFile:
            trainingInputFileLines = trainingInputFile.readlines()
            trainingOutputFileLines = trainingOutputFile.readlines()

        # Row i of the input file is paired with row i of the target file
        for i in range(0, len(trainingInputFileLines)):
            featureLine = [float(s) for s in trainingInputFileLines[i].split(',')]
            outputLine = [float(s) for s in trainingOutputFileLines[i].split(',')]
            # A per-row min-max normalisation of the features was sketched here at one point; it is not applied.
            # Keep only rows whose first target value is a number (NaN targets are dropped)
            if not(math.isnan(outputLine[0])):
                X_train[index, :] = featureLine[0:numFeatures]
                Y_train[index, :] = outputLine[0]
                index += 1

    # Trim the unused preallocated rows. The arrays are saved in 2D form
    # (the original note suggests X_train is reshaped again downstream).
    X_train = X_train[0:index, :]
    Y_train = Y_train[0:index]

    # Save the training arrays; np.save appends the ".npy" extension
    saveTrainX = 'X_train_AllN_small' + str(counter)
    saveTrainY = 'Y_train_AllN_small' + str(counter)
    np.save(saveTrainX, X_train)
    np.save(saveTrainY, Y_train)

    print("Checkpoint: files " + str(startPoints[counter]) + " to " + str(stop) + " saved as NPY")

    counter += 1

print("Checkpoint: training data done")

[ 15  30  45  60  75  90 105 120 135 150 165 180 195 210 225 240 255 270
 285 300 315 330 345 360 375 390 405 420 435 450 465 480 495 510 525 540
 555 570 585 600 615 630 645 660 675 690 705 712]
[  0  15  30  45  60  75  90 105 120 135 150 165 180 195 210 225 240 255
 270 285 300 315 330 345 360 375 390 405 420 435 450 465 480 495 510 525
 540 555 570 585 600 615 630 645 660 675 690 705]
15
0
0
[2.000e+00 1.000e+00 0.000e+00 ... 9.999e+03 9.999e+03 9.999e+03]
[26.060226]
Checkpoint: files 0 to 15 saved as NPY
30


OSError: [Errno 22] Invalid argument

### Load and save testing data ###

In [5]:
# Path prefixes for the testing CSV files. Each file name is built further down
# as prefix + <file number> + ".csv".
# NOTE: both assignments below are placeholders with no right-hand side. Fill in a
# string path prefix for each before running; as written the cell is not valid Python.
testingInputPrefix = # Path to CSV files for testing data (just the prefix)
testingOutputPrefix = # Path to CSV files for testing targets (just the prefix)

# Chunk boundaries: chunk k packs CSV files startPoints[k] .. stopPoints[k]-1 into one set of .npy files.
# The last stop point is numTestingFiles, so the final chunk may contain fewer than filesPerNpy files.
stopPoints = np.append(np.arange(filesPerNpy, numTestingFiles, filesPerNpy), numTestingFiles)
print(stopPoints)
startPoints = np.arange(0, numTestingFiles, filesPerNpy)
print(startPoints)
counter = 0

for stop in stopPoints:
    # Progress marker: exclusive end of the chunk being processed
    print(stop)

    # Two parallel sets of arrays are filled:
    #   X_test / Y_test                   - only rows whose first target value is not NaN (counted by index)
    #   X_test_with_nan / Y_test_with_nan - every row, NaN targets included (counted by indexWithNan)
    #   position                          - target columns 1..4 of every row, aligned with the *_with_nan arrays
    index = 0
    indexWithNan = 0
    X_test = np.zeros((numRecordsPerFile*filesPerNpy, numFeatures))
    Y_test = np.zeros((numRecordsPerFile*filesPerNpy, 1))
    X_test_with_nan = np.zeros((numRecordsPerFile*filesPerNpy, numFeatures))
    Y_test_with_nan = np.zeros((numRecordsPerFile*filesPerNpy, 1))
    position = np.zeros((numRecordsPerFile*filesPerNpy, 4))

    # Populate the arrays for testing
    for testingFile in range(startPoints[counter], stop):
        testingInputFileName = testingInputPrefix + str(testingFile) + ".csv"
        testingOutputFileName = testingOutputPrefix + str(testingFile) + ".csv"

        # Read all lines from both files; the context managers close the handles
        # even if a later step raises (e.g. a malformed number in a row)
        with open(testingInputFileName) as testingInputFile, \
                open(testingOutputFileName) as testingOutputFile:
            testingInputFileLines = testingInputFile.readlines()
            testingOutputFileLines = testingOutputFile.readlines()

        # Row i of the input file is paired with row i of the target file
        for i in range(0, len(testingInputFileLines)):
            featureLine = [float(s) for s in testingInputFileLines[i].split(',')]
            outputLine = [float(s) for s in testingOutputFileLines[i].split(',')]

            # Filtered copy: skip rows with a NaN target
            if not(math.isnan(outputLine[0])):
                X_test[index, :] = featureLine[0:numFeatures]
                Y_test[index, :] = outputLine[0]
                index += 1

            # Unfiltered copy: every row is stored, together with its four position columns
            X_test_with_nan[indexWithNan, :] = featureLine[0:numFeatures]
            Y_test_with_nan[indexWithNan, :] = outputLine[0]
            position[indexWithNan, :] = outputLine[1:5]
            indexWithNan += 1

    # Trim the unused preallocated rows
    X_test = X_test[0:index, :]
    Y_test = Y_test[0:index]
    X_test_with_nan = X_test_with_nan[0:indexWithNan, :]
    Y_test_with_nan = Y_test_with_nan[0:indexWithNan]
    position = position[0:indexWithNan, :]

    # Save the files; np.save appends the ".npy" extension
    saveTestX = 'X_test_AllN_small' + str(counter)
    saveTestY = 'Y_test_AllN_small' + str(counter)
    savePosition = 'Position_test_AllN_small' + str(counter)
    saveXwithNan = 'X_test_with_nan_AllN_small' + str(counter)
    saveYwithNan = 'Y_test_with_nan_AllN_small' + str(counter)
    np.save(saveTestX, X_test)
    np.save(saveTestY, Y_test)
    np.save(savePosition, position)
    np.save(saveXwithNan, X_test_with_nan)
    np.save(saveYwithNan, Y_test_with_nan)

    print("Checkpoint: files " + str(startPoints[counter]) + " to " + str(stop) + " saved as NPY")
    counter += 1


print("Checkpoint: Testing data done")

[ 15  30  45  60  75  90 105 120 135 150 165 180 195 210 225 240 255 270
 285 300 315 330 345 360 375 390 405 417]
[  0  15  30  45  60  75  90 105 120 135 150 165 180 195 210 225 240 255
 270 285 300 315 330 345 360 375 390 405]
15
Nan check
0
0
0
Shapes
(149704, 2800)
(149704, 1)
(149999, 2800)
(149999, 1)
(149999, 4)
Values check
[[25.522497]
 [25.573751]
 [25.674286]
 ...
 [25.203331]
 [25.459997]
 [25.389996]]
[[41.  2. 22.  1.]
 [41.  3. 22.  1.]
 [43.  3. 22.  1.]
 ...
 [89. 21. 32. 13.]
 [90. 21. 32. 13.]
 [91. 21. 32. 13.]]
Checkpoint: files 0 to 15 saved as NPY
30
Nan check
0
0
0
Shapes
(149489, 2800)
(149489, 1)
(150000, 2800)
(150000, 1)
(150000, 4)
Values check
[[25.341427]
 [25.401661]
 [25.446001]
 ...
 [25.936249]
 [25.853333]
 [25.826664]]
[[92. 21. 32. 13.]
 [93. 21. 32. 13.]
 [94. 21. 32. 13.]
 ...
 [34. 52. 48. 10.]
 [35. 52. 48. 10.]
 [36. 52. 48. 10.]]
Checkpoint: files 15 to 30 saved as NPY
45
Nan check
0
0
0
Shapes
(150000, 2800)
(150000, 1)
(150000, 2800)
(1500