In [1]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import norm

In [2]:
# Load the reference table from disk. Later cells read its "Precise Mass" column
# and treat every row as one known species.
data = pd.read_csv(filepath_or_buffer="train_data.csv")

In [3]:
# ----------------------------Generate training data----------------------------
# Number of synthetic (mass, species) samples to generate
training_size = 100
# Every row of the reference table is treated as one known species, so the row
# count is the number of output classes.
output_dim = data.shape[0]
print("Num output classes: ", output_dim)

# Draw `training_size` random row positions (with replacement) in a single
# vectorized call rather than a Python loop. The legacy global seed is kept so
# the sample stays reproducible across Restart & Run All.
np.random.seed(2)
rand_indices = np.random.randint(output_dim, size=training_size)

# Look the masses up by *position*; the random values are positions, not index
# labels, so this stays correct even if `data` does not have a default index.
masses = data["Precise Mass"].to_numpy()[rand_indices]

# Labels are integer-encoded: each species is represented by its row position
# in `data` (this is label encoding, not one-hot encoding). Kept as a plain
# list of Python ints, as downstream cells print it in that form.
species = rand_indices.tolist()

# One standard-normal noise sample per training row, scaled to a relative
# error of 1e-6
noise = 1e-6 * np.random.standard_normal(size=training_size)

# Apply the noise multiplicatively so the perturbation is proportional to each
# mass, giving the simulated measurements (x) and their true classes (y).
x_data = masses + noise * masses
y_data = species

Num output classes:  48


In [4]:
# Lay the known species masses out as a single row (1 x n_species), then stack
# that row once per training sample so the next cell can do element-wise
# array arithmetic against the measured masses.
known_masses_row = np.array(data["Precise Mass"])[np.newaxis, :]
precise_masses = np.repeat(known_masses_row, training_size, axis=0)
print(precise_masses)

[[ 1.0084  2.0147 12.0005 ... 96.9696 42.977  42.9996]
 [ 1.0084  2.0147 12.0005 ... 96.9696 42.977  42.9996]
 [ 1.0084  2.0147 12.0005 ... 96.9696 42.977  42.9996]
 ...
 [ 1.0084  2.0147 12.0005 ... 96.9696 42.977  42.9996]
 [ 1.0084  2.0147 12.0005 ... 96.9696 42.977  42.9996]
 [ 1.0084  2.0147 12.0005 ... 96.9696 42.977  42.9996]]


In [5]:
# Measurement uncertainty: the standard deviation used to convert a mass
# difference into a number of standard deviations.
unc = 0.001
if unc <= 0:
    raise ValueError("unc must be a positive standard deviation")

# Number of standard deviations between every measured mass and every known
# species mass. x_data is reshaped from length n to n x 1 and broadcast against
# precise_masses, giving an (n_samples x n_species) matrix.
z_scores = (np.reshape(x_data, (-1, 1)) - precise_masses) / unc

# Tail probabilities are computed in log space. Evaluating norm.cdf(-|z|)
# directly underflows to exactly 0 for large |z|; with a small `unc` a whole row
# can become 0, the row sum is then 0, and normalising gives 0/0 = NaN, which is
# what drove the accuracy towards 0%. norm.logcdf stays finite for those inputs.
log_prob_matrix = norm.logcdf(-np.abs(z_scores))

# Plain (unnormalised) tail probabilities, kept under the original name for any
# later cell that reads them. Entries may still underflow to 0 here; the
# normalised matrix below does not depend on this array.
prob_matrix = np.exp(log_prob_matrix)

# Scale each row to sum to 1. Subtracting the row maximum before exponentiating
# makes the best candidate's weight exactly exp(0) = 1, so the row sum is always
# at least 1 and the division is well defined.
shifted_log_probs = log_prob_matrix - np.max(log_prob_matrix, axis=1, keepdims=True)
row_weights = np.exp(shifted_log_probs)
rel_prob_matrix = row_weights / np.sum(row_weights, axis=1, keepdims=True)

# Print out a particular row
print(rel_prob_matrix[0, :])

[0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 1.00000000e+00 3.82224017e-70 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]


In [6]:
# Classify each sample as the species with the highest relative probability,
# then score those predictions against the true labels.
y_pred = rel_prob_matrix.argmax(axis=1)
print("Predicted: ", y_pred)
print("Actual: ", y_data)

# Fraction of element-wise matches, expressed as a percentage
is_correct = (y_pred == y_data)
class_acc = 100 * is_correct.mean()
print("Classification Accuracy: ", str(class_acc) + "%")

Predicted:  [40 15 45  8 22 43 18 11 40  7 34 31 11 21 47 31 26 20 37 39  3 38  4 42
 43 39 38 42 33  3  5 24  4 46  6 31 19 31  2 16 46 12  4 26 15 39 46  8
 45 15 41 45  8 17 22  9 41 46 26 19 32 43 32 26  8 12 10 40 34  9 37  6
 22  6 19 18  1  4 40 17  6 37 33 18 20 26 23 22 43 37 10  8 26 35 27 32
 16 21 43 29]
Actual:  [40, 15, 45, 8, 22, 43, 18, 11, 40, 7, 34, 31, 11, 21, 47, 31, 26, 20, 37, 39, 3, 38, 4, 42, 43, 39, 38, 42, 33, 3, 5, 24, 4, 46, 6, 31, 19, 31, 2, 16, 46, 12, 4, 26, 15, 39, 46, 8, 45, 15, 41, 45, 8, 17, 22, 9, 41, 46, 26, 19, 32, 43, 32, 26, 8, 12, 10, 40, 34, 9, 37, 6, 22, 6, 19, 18, 1, 4, 40, 17, 6, 37, 33, 18, 20, 26, 23, 22, 43, 37, 10, 8, 26, 35, 27, 32, 16, 21, 43, 29]
Classification Accuracy:  100.0%
