In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from scipy.special import entr, rel_entr
from tqdm import tqdm
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
# Show every column when a wide DataFrame is displayed.
# The fully qualified key is used on purpose: the abbreviated 'max_columns'
# pattern is ambiguous in recent pandas (it also matches
# 'styler.render.max_columns') and raises OptionError.
pd.set_option('display.max_columns', None)

# Introduction

This notebook investigates the following question: If the generation model for the TPS Feb 2022 data were known completely and perfectly, would the classification task be trivial? For that purpose, let's assume the ideal situation where samples are independently drawn from the reference FBC spectra of the 10 bacteria in question, and no error is introduced. In that situation, can we achieve perfect classification by simply finding the reference FBC spectrum that is closest to the sample?

To make the notebook slightly more interesting, we also investigate the choice of several obvious distance functions.

# The Setup

To begin, let's load the reference spectra from the dataset I prepared earlier.

In [None]:
# Load the reference FBC spectra. Later cells read the `Resistance` column as the
# species label and the columns listed in `boc_cols` as the spectrum itself.
fbc_spectra = pd.read_csv('../input/tps022022-reference-fbc-spectra/train_ref_fbc_spec.csv')

In [None]:
# Display the loaded reference table (bare last expression -> rich DataFrame output).
fbc_spectra

In [None]:
# Column names of the spectrum: every combination of A/T/G/C counts that adds up
# to 10, ordered by A, then T, then G (the C count is whatever is left over).
boc_cols = [
    F'A{aa}T{tt}G{gg}C{cc}'
    for aa in range(11)
    for tt in range(11 - aa)
    for gg in range(11 - aa - tt)
    for cc in (10 - aa - tt - gg,)  # single value: C is fixed by the other three
]
                    
# Distinct species labels in ascending order. np.unique already returns its
# result sorted, so no additional np.sort is needed.
bacteria = np.unique(fbc_spectra.Resistance)

# Creating the datasets

The sampling is done straightforwardly. Unlike the implementation provided along with the paper, which presamples the reference spectrum a certain number of times and then draws with replacement, we choose to do it correctly, i.e., each sample is drawn independently.

In [None]:
def sample_spectrum(num_samples, num_reads, spectrum, rng):
    """Simulate error-free samples by drawing reads from one reference spectrum.

    Each sample is built from ``num_reads`` independent draws of a bin index,
    where bin ``k`` is chosen with probability ``spectrum[k]``; the per-bin
    counts are then divided by ``num_reads``.

    Parameters
    ----------
    num_samples : int
        Number of simulated samples (rows) to produce.
    num_reads : int
        Number of reads drawn for every sample.
    spectrum : 1-D array-like of float
        Bin probabilities, passed to ``rng.choice`` as ``p``.
    rng : numpy.random.Generator
        Random generator used for the draws (e.g. ``np.random.default_rng``).

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(num_samples, len(spectrum))`` holding the
        relative frequency of each bin in each sample.
    """
    # Size everything from the spectrum itself rather than from a module-level
    # column list, so the function works for any spectrum it is handed.
    num_bins = len(spectrum)
    samples = np.empty((num_samples, num_bins))
    # Draw one sample at a time: only `num_reads` indices are alive at once
    # instead of a (num_samples, num_reads) array, which reaches several GB
    # for e.g. 5000 samples of 100000 reads.
    for i in range(num_samples):
        reads = rng.choice(num_bins, num_reads, p=spectrum)
        samples[i] = np.bincount(reads, minlength=num_bins)
    return samples / num_reads

In [None]:
def generate_dataset(num_reads, num_samples_per_species, seed=42):
    """Build a simulated, labelled dataset from the reference spectra.

    For every species in ``bacteria`` the matching reference spectrum is looked
    up in ``fbc_spectra`` and handed to ``sample_spectrum``.

    Parameters
    ----------
    num_reads : int
        Number of reads drawn for each simulated sample.
    num_samples_per_species : int
        Number of simulated samples generated for each species.
    seed : int, optional
        Seed for ``np.random.default_rng``. Defaults to 42, the value that was
        previously hard-coded, so existing calls give the same generator.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The stacked samples (species blocks in the order of ``bacteria``) and
        the array of corresponding species labels.
    """
    rng = np.random.default_rng(seed)
    samples = []
    target = []
    for species in bacteria:
        # NOTE(review): assumes exactly one reference row per species; with
        # several rows .flatten() would concatenate them -- confirm against the CSV.
        is_species = fbc_spectra.Resistance == species
        spectrum = fbc_spectra.loc[is_species, boc_cols].to_numpy().flatten()
        samples.append(sample_spectrum(num_samples_per_species, num_reads, spectrum, rng))
        target.append(np.full((num_samples_per_species,), species))
    return np.concatenate(samples), np.concatenate(target)

Let's visualize these datasets by projecting them onto the first two principal components, using only 100 samples per species for the sake of time.

In [None]:
%%time
# One panel per read depth: simulate a small dataset, project it with PCA and
# scatter the first two components, one colour per species.
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
for ax, num_reads in zip(axes, [100000, 1000, 100]):
    X, y = generate_dataset(num_reads, 100)
    x = PCA().fit_transform(X)
    for i, species in enumerate(bacteria):
        of_species = y == species
        ax.plot(x[of_species, 0], x[of_species, 1], '.', alpha=0.1, color=F'C{i}')
    ax.axis('equal')
    ax.set_title(F'num_reads={num_reads}')
fig.tight_layout()
    

OK, it doesn't look too promising for the low values of `num_reads`. We'll carry on anyway.

# Choice of the distance function

There are several obvious choices for distance in this feature space. Since each row is an empirical probability distribution, the natural choice is the Kullback-Leibler divergence. For comparison, we also consider other convenient (but theoretically incorrect) choices such as \\(\ell^2\\), \\(\ell^1\\) and \\(\ell^\infty\\) norms.

In [None]:
def calculate_distances(X, spectra, p = 0):
    """Distance from every row of ``X`` to every row of ``spectra``.

    Parameters
    ----------
    X : numpy.ndarray, 2-D
        One sample per row.
    spectra : numpy.ndarray, 2-D
        One reference spectrum per row, with the same number of columns as ``X``.
    p : optional
        ``0`` (the default) selects the Kullback-Leibler divergence, computed
        as the row sum of ``rel_entr(X, spectrum)``. Any other value is passed
        to ``np.linalg.norm`` as ``ord``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(X.shape[0], spectra.shape[0])``; entry ``[i, j]`` is
        the distance between sample ``i`` and reference spectrum ``j``.
    """
    def distance_to(reference):
        # `reference` has shape (1, n_columns) and broadcasts against X.
        if p == 0:  # KL divergence
            return rel_entr(X, reference).sum(axis=1)
        return np.linalg.norm(X - reference, ord=p, axis=1)

    per_spectrum = [distance_to(row.reshape((1, -1))) for row in spectra]
    return np.stack(per_spectrum, axis=1)

# The Results

For this experiment, we will be generating datasets for `num_reads` 100000, 1000, and 100, with 5000 samples for each species. We are skipping the case `num_reads=1000000` because the answer to the question posed in the title of this notebook is most likely "yes", and it would take a long time to generate the dataset.

The classification algorithm is simply to take the species whose reference spectrum is closest to the sample in question.

In [None]:
num_samples_per_species = 5000

# Reference spectra and their labels are taken from the same rows of
# `fbc_spectra`, so column j of the distance matrix is always paired with the
# label of reference row j -- independent of the row order in the CSV.
reference_spectra = fbc_spectra[boc_cols].to_numpy()
reference_labels = fbc_spectra.Resistance.to_numpy()

results = []
for num_reads in tqdm([100000,1000,100]):
    row = [num_reads]
    X, y = generate_dataset(num_reads, num_samples_per_species)
    # p = 0 selects the KL divergence; the remaining values are norm orders.
    for p in [0, 2, 1, np.inf]:
        distances = calculate_distances(X, reference_spectra, p)
        # Nearest-reference classification: pick the label of the closest spectrum.
        preds = reference_labels[np.argmin(distances, axis=1)]
        # Accuracy = fraction of samples whose predicted label equals the true one.
        row.append(np.mean(preds == y))
    results.append(row)

The results (accuracy) tabulated:

In [None]:
# One row per read depth; the accuracy columns follow the order in which the
# distance functions were evaluated (p = 0, 2, 1, inf).
pd.DataFrame(results,columns=['num_reads','KL Divergence','l_2 norm', 'l_1 norm', 'max norm'])

# Conclusion

Based on this simulation in the ideal situation (no error reads), a simple distance-based classification does not give acceptable results for `num_reads` 1000 or 100. While efforts to model the error rates or to replicate the data generation model would be interesting from an intellectual perspective, it is unclear that classification would benefit from such knowledge alone, without substantial effort in feature engineering and/or discriminative modeling.
