In [1]:
# Main code
import os
import sys
import numpy as np
from sklearn import preprocessing

import gallodata
import features
import synth

# ---------------------------------------------------------------------------
# Input locations (resolved relative to the directory the notebook runs from)
# ---------------------------------------------------------------------------
DATA_PATH = os.path.join(os.getcwd(), '..', '..', '..', 'data')
NEO_DATA_SRC = '1006/NeoSpectra'
GROUND_TRUTH_FILE = os.path.join(DATA_PATH, '1006/Optimal Solutions Data Set from ASL.xlsx')
GROUND_TRUTH_INOCULATION_FILE = os.path.join(DATA_PATH, '1006/Optimal Solutions Data Set.xlsx')


def _neo_csv(csv_name):
    """Return the path of a NeoSpectra CSV file located under NEO_DATA_SRC."""
    return os.path.join(DATA_PATH, NEO_DATA_SRC, csv_name)


# One NeoSpectra CSV per measurement day.
NEO_DATA_FILENAME1 = _neo_csv('Gallo09262017.csv')
NEO_DATA_FILENAME2 = _neo_csv('Gallo09272017.csv')
NEO_DATA_FILENAME3 = _neo_csv('Gallo09282017.csv')
NEO_DATA_FILENAME4 = _neo_csv('Gallo09292017.csv')
NEO_DATA_FILENAME5 = _neo_csv('Gallo10022017.csv')
NEO_DATA_FILENAME6 = _neo_csv('Gallo10032017.csv')
NEO_DATA_FILENAME7 = _neo_csv('Gallo10042017.csv')
NEO_DATA_FILENAME8 = _neo_csv('Gallo10052017.csv')
NEO_DATA_FILENAME9 = _neo_csv('Gallo10062017.csv')

# ---------------------------------------------------------------------------
# Run mode
# ---------------------------------------------------------------------------
ADD_TANK_LABEL = True       # build the 'Actual Tank' column from the test data
BLIND_TEST_MODE = False     # forwarded to the test-data loader; when True no test targets are read

# ---------------------------------------------------------------------------
# Feature preprocessing switches
# ---------------------------------------------------------------------------
NORMALIZE = True                 # standardize the features with a StandardScaler
USE_EMSC = False                 # extended multiplicative scatter correction
EMSC_ORDER = 2                   # order passed to features.emsc
USE_DERIVATIVE = False           # replace the features by their gradients
DERIVATIVE_ORDER = 2             # order passed to features.getGradients
USE_TRANSMITTANCE_ONLY = False   # use the transmittance features instead of the default ones
USE_TRANSMITTANCE = False        # append the transmittance features to the default ones

# ---------------------------------------------------------------------------
# Train/test composition
# ---------------------------------------------------------------------------
USE_ALL_DAYS = False               # pool all days, shuffle, and re-split
USE_ALL_DAYS_SPLIT = 5.0 / 6.0     # fraction of the pooled rows that goes to training
NORMALIZE_OUTPUT = False           # standardize the regression targets as well
USE_SYNTHETIC_DATA = True          # augment the training data through synth.getNeoSynthData
SYNTH_ADDITION_FACTOR = 1          # second argument of synth.getNeoSynthData

**Generate the ground truth data for all the target variables:**

In [2]:
output_file = 'groundTruthMap.log'
originalStdOut = sys.stdout
# Announce the redirect *before* it happens so the message is visible in the
# notebook output instead of being written into the log file itself.
print('Writing to output file:', output_file)

groundTruthMap = {}
# Everything printed by the gallodata helpers is captured in the log file.
# The `with` block closes (and flushes) the file, and the `finally` clause puts
# stdout back even if one of the helpers raises.
with open(output_file, 'w') as log_file:
    sys.stdout = log_file
    try:
        groundTruthMap = gallodata.generateGroundTruth(GROUND_TRUTH_FILE, groundTruthMap)
        groundTruthMap = gallodata.generateGroundTruthInnoculation(GROUND_TRUTH_INOCULATION_FILE, groundTruthMap)
        groundTruthMap = gallodata.generateGroundTruthGrapeColorMap(GROUND_TRUTH_INOCULATION_FILE, groundTruthMap)
    finally:
        sys.stdout = originalStdOut

**Obtain the train/test data from files:**

In [3]:
'''
NeoSpectra evaluation
'''
# Measurement date -> NeoSpectra CSV used for training.
# NOTE(review): NEO_DATA_FILENAME7 (10/04/2017) appears in neither map — confirm this is intentional.
trainFilesMap = dict([
    ('09/26/2017', NEO_DATA_FILENAME1),
    ('09/27/2017', NEO_DATA_FILENAME2),
    ('09/28/2017', NEO_DATA_FILENAME3),
    ('09/29/2017', NEO_DATA_FILENAME4),
    ('10/03/2017', NEO_DATA_FILENAME6),
    ('10/05/2017', NEO_DATA_FILENAME8),
    ('10/06/2017', NEO_DATA_FILENAME9),
])

# Measurement date -> NeoSpectra CSV held out for testing.
testFilesMap = dict([
    ('10/02/2017', NEO_DATA_FILENAME5),
])

# This function generates the train and test data used originally for each target metric,
# merges them and saves them in NumPy format so the merged data can be used directly in HyperSpace.
def train_test_merge(trainFilesMap, testFilesMap):
    """Load, preprocess, merge and save the train/test data for every target metric.

    For each metric in ``['Sugar', 'Alcohol']`` a file ``data_<metric>.npy`` is
    written to the current working directory. It contains two arrays saved one
    after the other: the merged feature matrix, then the merged target vector.

    The function reads the module-level ``groundTruthMap`` and the configuration
    flags (``USE_SYNTHETIC_DATA``, ``USE_ALL_DAYS``, ``NORMALIZE``, ...), so the
    cells defining them must have been run first.

    Parameters
    ----------
    trainFilesMap : dict
        Maps a measurement date string to the NeoSpectra CSV path used for training.
        It is not modified.
    testFilesMap : dict
        Maps a measurement date string to the NeoSpectra CSV path used for testing.

    Returns
    -------
    None
    """
    extractedDataTrainMap = gallodata.getNeospectraDataFromFiles(groundTruthMap, trainFilesMap)
    print ('extractedDataTrainMap......................',extractedDataTrainMap)
    if USE_SYNTHETIC_DATA:
        extractedDataTrainMap = synth.getNeoSynthData(extractedDataTrainMap, SYNTH_ADDITION_FACTOR)

    # xDataTrain holds the features of the training data,
    # xDataBlind holds the features of the test data.
    xDataTrain = extractedDataTrainMap['features']
    extractedDataTestMap = gallodata.getNeospectraDataFromFiles(groundTruthMap, testFilesMap, BLIND_TEST_MODE)
    xDataBlind = extractedDataTestMap['features']

    # Optional: pool every day, shuffle the rows and re-split into train/test.
    # This replaces the maps built above, including any synthetic augmentation.
    if USE_ALL_DAYS:
        # Work on a copy so the caller's trainFilesMap is not extended with the test files.
        allFilesMap = dict(trainFilesMap)
        allFilesMap.update(testFilesMap)

        totalExtractedDataMap = gallodata.getNeospectraDataFromFiles(groundTruthMap, allFilesMap)

        extractedDataTrainMap = {}
        extractedDataTestMap = {}
        rowCount = totalExtractedDataMap['features'].shape[0]
        # The same permutation is applied to every entry so rows stay aligned across keys.
        shuffledOrder = np.random.permutation(rowCount)
        for key in totalExtractedDataMap:
            data = totalExtractedDataMap[key][shuffledOrder]
            splitIndex = int(USE_ALL_DAYS_SPLIT*data.shape[0])
            extractedDataTrainMap[key],extractedDataTestMap[key] = np.split(data, [splitIndex])
            print(extractedDataTrainMap[key].shape,extractedDataTestMap[key].shape)

        xDataTrain = extractedDataTrainMap['features']
        xDataBlind = extractedDataTestMap['features']

        print('Generated unified randomized data')

    # Optional: use transmittance features instead of, or in addition to, the default ones.
    # NOTE(review): xDataTrainN / xDataBlindN are not defined anywhere in this function;
    # enabling either flag raises NameError unless they exist as globals — confirm their source.
    if USE_TRANSMITTANCE_ONLY:
        xDataTrain = xDataTrainN
        xDataBlind = xDataBlindN
    elif USE_TRANSMITTANCE:
        xDataTrain = np.concatenate((xDataTrain, xDataTrainN),axis=1)
        xDataBlind = np.concatenate((xDataBlind, xDataBlindN),axis=1)

    # Optional: replace the features by their gradients.
    if USE_DERIVATIVE:
        xDataTrain = features.getGradients(xDataTrain, DERIVATIVE_ORDER)
        xDataBlind = features.getGradients(xDataBlind, DERIVATIVE_ORDER)

    # Standardize the features (zero mean, unit variance per column).
    # The scaler is fitted on the training data only and then applied to both sets.
    if NORMALIZE:
        scaler = preprocessing.StandardScaler()
        scaler.fit(xDataTrain)
        xDataTrain = scaler.transform(xDataTrain)
        xDataBlind = scaler.transform(xDataBlind)

    # Optional: extended multiplicative scatter correction.
    if USE_EMSC:
        xDataTrain = features.emsc(xDataTrain, EMSC_ORDER)
        xDataBlind = features.emsc(xDataBlind, EMSC_ORDER)

    TargetFeaturesList = ['Sugar', 'Alcohol']
    NormalizedFeaturesList = []

    # Optional: standardize the targets listed in NormalizedFeaturesList.
    # Each metric gets its own scaler, fitted on the training targets only.
    if NORMALIZE_OUTPUT:
        for targetMetric in NormalizedFeaturesList:
            yTrain = extractedDataTrainMap[targetMetric]
            targetScaler = preprocessing.StandardScaler()
            targetScaler.fit(yTrain.reshape([-1,1]))
            extractedDataTrainMap['normalized_'+targetMetric] = targetScaler.transform(yTrain.reshape([-1,1]))
            # Test targets are only read when not in blind mode (see the loop below).
            if not BLIND_TEST_MODE:
                extractedDataTestMap['normalized_'+targetMetric] = targetScaler.transform(extractedDataTestMap[targetMetric].reshape([-1,1]))

    # Build a one-column string table: an 'Actual Tank' header followed by the
    # tank value of every test row. NOTE(review): excelData is not used further below.
    excelData = None
    if ADD_TANK_LABEL:
        headerData = np.array((['Actual Tank']))
        yTest = extractedDataTestMap['Tank']
        excelDataSub = np.array((yTest)).reshape([-1,1])
        excelDataSub = np.concatenate((headerData.reshape([1,-1]), excelDataSub.astype('str')), axis=0)
        excelData = excelDataSub

    for targetMetric in TargetFeaturesList:
        # Pick the normalized target when one was produced for this metric.
        if NORMALIZE_OUTPUT and targetMetric in NormalizedFeaturesList:
            targetKey = 'normalized_'+targetMetric
        else:
            targetKey = targetMetric

        yTrain = extractedDataTrainMap[targetKey]
        yTest = None if BLIND_TEST_MODE else extractedDataTestMap[targetKey]

        if yTest is None:
            # Blind mode: there are no test targets, so only the labelled training
            # rows are saved. This keeps X_combined and y_combined the same length
            # instead of concatenating the targets with None.
            X_combined = xDataTrain
            y_combined = yTrain
        else:
            X_combined = np.r_[xDataTrain, xDataBlind]
            y_combined = np.r_[yTrain, yTest]

        data_name = 'data_' + targetMetric + '.npy'

        # Both arrays go into the same file; they must be loaded back in the same order.
        with open(data_name, 'wb') as f:
            np.save(f, X_combined)
            np.save(f, y_combined)

        print('Data sets have been merged and saved in current directory under ' + data_name)
        print('To load the merged data sets, load one by one in order for each target metric: X_combined, y_combined in ' + data_name)

**From the train/test split, generate the merged data "X" and the merged regression target "y", necessary to perform hyperparameter optimization:**

In [4]:
output_file = 'train_test_merge.log'
print('Writing to output file:', output_file)

# Capture everything train_test_merge prints in the log file. The `with` block
# closes (and flushes) the file, and the `finally` clause reinstalls whatever
# stream was active before this cell, even if the merge raises.
previousStdOut = sys.stdout
with open(output_file, 'w') as log_file:
    sys.stdout = log_file
    try:
        train_test_merge(trainFilesMap, testFilesMap)
    finally:
        sys.stdout = previousStdOut

In [5]:
# If a log file is still installed as stdout, close it first so its buffered
# output is flushed to disk; then hand stdout back to the notebook.
if sys.stdout is not originalStdOut:
    sys.stdout.close()
sys.stdout = originalStdOut

**After getting to this point and having obtained the necessary merged data, we proceed to do our hyperparameter optimization outside of this notebook. All you need to do is run the following command (in the current directory):**

**for 2 params:**

   **$ mpirun -n 4 python cheers_svr_2params.py --results /path/to/results_dir**
   
**for 3 params, you will possibly need to submit the job to an HPC cluster. If you don't, then just do:**

   **$ mpirun -n 8 python cheers_svr_3params.py --results /path/to/results_dir**
   
**Where n = 2^k, with k = # of hyperparameters and n = # of MPI processes/ranks required**

**Lastly, we can come back to reinstantiate our model with our newly optimized hyperparameters.**