In [1]:
from xgboost import XGBRegressor
import numpy as np

TRAIN_PATH = 'C:\\Users\\sixsa\\code\\UCI\\helix\\tracks_1m_updated_asymmetric_higher.txt'
HELIX_VAL_PATH = 'C:\\Users\\sixsa\\code\\UCI\\helix\\tracks_100k_updated_asymmetric_higher.txt'
NON_HELIX_VAL_PATH = 'C:\\Users\\sixsa\\code\\UCI\\helix\\sintracks_100k_updated_asymmetric_higher.txt'

In [2]:
with open(TRAIN_PATH, 'r') as file:
    content = file.read()
    data_points = content.split('EOT')

    data_points = [dp.strip() for dp in data_points if dp.strip()]
    data_points = [dp.split('\n') for dp in data_points]
    data_points = [[[float(cell) for cell in row.split(', ')] for row in dp] for dp in data_points]
    targets = [dp[0] for dp in data_points]
    PARAMETERS_TRAINING = np.array(targets)
    input_points = [dp[1:] for dp in data_points]
    inputs = []
    for input in input_points:
        combined = []
        for coordinate in input:
            combined += coordinate
        inputs.append(combined)
    POINTS_TRAINING = np.array(inputs)

with open(HELIX_VAL_PATH, 'r') as file:
    content = file.read()
    data_points = content.split('EOT')

    data_points = [dp.strip() for dp in data_points if dp.strip()]
    data_points = [dp.split('\n') for dp in data_points]
    data_points = [[[float(cell) for cell in row.split(', ')] for row in dp] for dp in data_points]
    targets = [dp[0] for dp in data_points]
    PARAMETERS_VALIDATION_HELIX = np.array(targets)
    input_points = [dp[1:] for dp in data_points]
    inputs = []
    for input in input_points:
        combined = []
        for coordinate in input:
            combined += coordinate
        inputs.append(combined)
    POINTS_VALIDATION_HELIX = np.array(inputs)

with open(NON_HELIX_VAL_PATH, 'r') as file:
    content = file.read()
    data_points = content.split('EOT')

    data_points = [dp.strip() for dp in data_points if dp.strip()]
    data_points = [dp.split('\n') for dp in data_points]
    data_points = [[[float(cell) for cell in row.split(', ')] for row in dp] for dp in data_points]
    targets = [dp[0] for dp in data_points]
    PARAMETERS_VALIDATION_NON_HELIX = np.array(targets)
    input_points = [dp[1:] for dp in data_points]
    inputs = []
    for input in input_points:
        combined = []
        for coordinate in input:
            combined += coordinate
        inputs.append(combined)
    POINTS_VALIDATION_NON_HELIX = np.array(inputs)
print(POINTS_VALIDATION_HELIX.shape)
print(PARAMETERS_VALIDATION_HELIX.shape)

(100000, 30)
(100000, 5)


In [3]:
def train():
    encoder = XGBRegressor(n_estimators=1500, device="cuda")
    encoder.fit(POINTS_TRAINING, PARAMETERS_TRAINING) # train "encoder"
    # make encoder predict parameters
    predicted_parameters = encoder.predict(POINTS_TRAINING)
    decoder = XGBRegressor(n_estimators=1500, device="cuda")
    decoder.fit(predicted_parameters, POINTS_TRAINING)

    return encoder, decoder

In [4]:
def calc_parameter_error(encoder):
    min_per_column = np.min(PARAMETERS_VALIDATION_HELIX, axis=0)
    max_per_column = np.max(PARAMETERS_VALIDATION_HELIX, axis=0)
    range_per_column = max_per_column - min_per_column
    print("ranges")
    print(range_per_column)
    predictions = encoder.predict(POINTS_VALIDATION_HELIX)
    print(PARAMETERS_VALIDATION_HELIX)
    print(predictions)
    result_mae = np.abs(PARAMETERS_VALIDATION_HELIX - predictions)
    result_mae = np.mean(result_mae, axis=0)
    print('mae')
    print(result_mae)
    print('mae / range * 100')
    percent_error = (result_mae / range_per_column) * 100
    print(percent_error)

In [12]:
def calc_classification(encoder, decoder, threshold):
    predicted_points_helical = decoder.predict(encoder.predict(POINTS_VALIDATION_HELIX)) # this will be 2d
    reshaped_val_helix_points = POINTS_VALIDATION_HELIX.reshape((POINTS_VALIDATION_HELIX.shape[0], 10, 3))
    reshaped_predicted_points_helical = predicted_points_helical.reshape((POINTS_VALIDATION_HELIX.shape[0], 10, 3))
    distances_helical = np.linalg.norm(reshaped_val_helix_points - reshaped_predicted_points_helical, axis=-1)
    total_distance_per_entry_helical = np.sum(distances_helical, axis=-1)

    predicted_points_helical = decoder.predict(encoder.predict(POINTS_VALIDATION_NON_HELIX)) # this will be 2d
    reshaped_val_non_helix_points = POINTS_VALIDATION_HELIX.reshape((POINTS_VALIDATION_NON_HELIX.shape[0], 10, 3))
    reshaped_predicted_points_non_helical = reshaped_val_non_helix_points.reshape((POINTS_VALIDATION_NON_HELIX.shape[0], 10, 3))
    distances_helical = np.linalg.norm(reshaped_val_non_helix_points - reshaped_predicted_points_non_helical, axis=-1)
    total_distance_per_entry_non_helical = np.sum(distances_helical, axis=-1)

    combined_distances = np.concatenate((total_distance_per_entry_helical, total_distance_per_entry_non_helical), axis=0)

    target = np.concatenate((np.ones(total_distance_per_entry_helical.shape), np.zeros(total_distance_per_entry_non_helical.shape)))
    predictions = 1 * (combined_distances < threshold)
    print(target)
    print(predictions)

    print(np.mean(target == predictions))

In [6]:
encoder, decoder = train()

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [9]:
calc_parameter_error(encoder)

ranges
[4.50e-02 6.28e+00 1.75e+02 9.04e+00 2.67e+00]
[[ 1.8900e-02  1.5200e+00  1.5884e+02  5.4000e-01 -1.0000e-01]
 [ 2.9000e-03  4.7000e+00  1.7385e+02  1.9300e+00 -5.3000e-01]
 [ 3.5000e-03  1.0000e+00  7.0020e+01  8.0000e-01  4.7000e-01]
 ...
 [ 4.9000e-03  3.7000e-01  1.6733e+02  6.9000e-01  2.2000e-01]
 [ 7.6000e-03  6.0900e+00  1.6003e+02 -8.0000e-01  4.7000e-01]
 [ 1.6100e-02  2.4900e+00  1.8783e+02  1.7900e+00  1.9000e-01]]
[[ 7.8018652e-03  1.5185254e+00  1.6725746e+02  5.4300082e-01
  -9.6691042e-02]
 [ 1.1893836e-02  4.6918778e+00  1.4604335e+02  1.9221940e+00
  -5.3957123e-01]
 [ 1.1871860e-02  1.0030295e+00  6.8422546e+01  7.9901224e-01
   4.7254717e-01]
 ...
 [ 9.0857008e-03  3.6029395e-01  1.7869379e+02  6.9477385e-01
   2.1904367e-01]
 [ 7.4387845e-03  6.0914426e+00  1.6330972e+02 -7.8289264e-01
   4.7350496e-01]
 [ 1.4929137e-02  2.5007203e+00  1.7228375e+02  1.7729076e+00
   1.8790159e-01]]
mae
[4.35857459e-03 1.31228874e-02 6.03657407e+00 9.40111038e-03
 3.37082843

In [17]:
calc_classification(encoder, decoder, threshold=10)

[1. 1. 1. ... 0. 0. 0.]
[1 1 1 ... 1 1 1]
0.49738
