In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Simple principal component analysis with a neural network for classification.
Don't make things too complicated :)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input
from sklearn.decomposition import PCA, SparsePCA
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Model
import tensorflow as tf
import random

In [None]:
# Read the competition training table into a DataFrame; the cells below use
# its 'target' column as the class label.
train_data = pd.read_csv('/kaggle/input/tabular-playground-series-feb-2022/train.csv')

Let's first take a quick look at our data set and how many times each bacterium was measured. Furthermore, let's look at the emerging 10-mer DNA sequences. 

In [None]:
# Number of training rows per class.
df = train_data['target'].value_counts()
print(df)
# Class names in the order of the count table; the position of a name in this
# list is used as the integer class id in the cells further down.
names = df.index.tolist()

In [None]:
# Column names as a string array with the first and the last column dropped
# (presumably the row id and the target column — verify against the CSV).
cols = np.array((train_data.columns), dtype='str')[1:-1]
cols

Next, we consider the mean measured spectra. For this we use two different approaches. In the first section, we consider the mean with standard deviation in an error bar plot. 

In [None]:
# Per-class mean and standard deviation of every feature column, plus one
# panel per class showing the mean with a +/- one standard deviation band.
# `means` and `stdv` (shape: classes x features) are reused by later cells.
n_classes = len(names)
n_features = len(cols)
means = np.zeros((n_classes, n_features))
stdv = np.zeros((n_classes, n_features))

# Two rows of panels; ceil division so an odd class count still fits.
n_panel_cols = -(-n_classes // 2)
fig, axs = plt.subplots(2, n_panel_cols, figsize=(30, 15), squeeze=False)
panels = list(axs.flat)

x = np.arange(n_features)
tick_positions = x[0::50]   # one labelled tick every 50 features
tick_labels = cols[0::50]

# Panels are filled in reading order (left to right, top to bottom), so the
# panel order follows the order of `names`.
for i in range(n_classes):
    ax = panels[i]
    subset = np.array(train_data[train_data['target'] == names[i]])
    # Drop the first and last column and force a float matrix.
    subset = np.array(subset[:, 1:-1], dtype=np.float64)
    mean = np.mean(subset, 0)
    std = np.std(subset, 0)
    means[i, :] = mean
    stdv[i, :] = std

    ax.plot(x, mean, 'r', lw=2, label='Mean')
    ax.vlines(x=x, ymin=mean-std, ymax=mean+std, color='b', alpha=0.4, label='stdv.')
    ax.set_title(names[i])
    ax.grid()
    ax.set_xlim([-2, n_features])
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(tick_labels, rotation=-45)
    ax.legend(loc='upper left')

# Hide a panel that has no class assigned (only possible for odd class counts).
for ax in panels[n_classes:]:
    ax.set_visible(False)

del subset, mean, std, fig, axs

Next, we break down the distribution of the emerging 10-mer DNA sequences in a heatmap. Here we can see nicely that only certain 10-mer sequences appear and that only some of them are really suitable for classification. To focus on these important 10-mers, we will use principal component analysis (PCA) in the following steps. In addition, the standard deviations of the 10-mer measurements are shown. Here it can be seen that, as expected, the deviations per 10-mer sequence hardly vary across the individual classes, so that a fixed standard deviation per 10-mer can be used for the subsequent data augmentation.

In [None]:
def _plot_class_heatmap(matrix, title, row_labels, col_labels, tick_step=25):
    """Draw a class-by-feature heat map with a colour bar to its right.

    matrix      -- 2-D array, one row per class and one column per feature
    title       -- axes title
    row_labels  -- one label per row of `matrix`
    col_labels  -- one label per column; every `tick_step`-th one is shown
    Returns the created (figure, axes) pair.
    """
    fig, ax = plt.subplots(1, 1, figsize=(30, 10))
    im = ax.matshow(matrix, aspect='auto')
    ax.set_yticks(range(len(row_labels)))
    ax.grid()
    ax.set_yticklabels(row_labels)
    ax.set_xticks(np.arange(0, len(col_labels), tick_step))
    ax.set_xticklabels(col_labels[0::tick_step], rotation=-45)
    ax.set_title(title)
    # matshow puts the x tick labels on top by default; move them to the bottom.
    ax.tick_params(axis="x", bottom=True, top=False, labelbottom=True, labeltop=False)
    # Narrow extra axes just right of the heat map that holds the colour bar.
    pos = ax.get_position()
    cax = fig.add_axes([pos.x1+0.01, pos.y0, 0.02, pos.height])
    fig.colorbar(im, cax=cax)
    return fig, ax


# Heat map of the per-class feature means.
fig, axs = _plot_class_heatmap(means, 'Mean 10-mer spectrum per class', names, cols)

# Heat map of the per-class feature standard deviations.
fig, axs = _plot_class_heatmap(stdv, 'Standard diviation of the messurements', names, cols)

# Per feature: standard deviation averaged over the classes, with the spread
# of that standard deviation across the classes as error bar.
fig, axs = plt.subplots(1,1, figsize=(30, 5))
axs.errorbar(range(len(cols)), np.mean(stdv, 0), yerr=np.std(stdv, 0), fmt='o')
axs.set_xticks(np.arange(0, len(cols), 25))
axs.set_xticklabels(cols[0::25], rotation=-45)
axs.set_title('mean standard diviation of all meassurnents')
axs.grid()
axs.set_xlim([0, len(cols)])
del  means, fig, axs

Let's now create a training and validation dataset that will be used for training the neural network and PCA. For the PCA, the individual measurements are normalized to their mean and standard deviation (z-score).

In [None]:
# Build one feature matrix and one integer-valued label vector, grouped by
# class; the class id is the position of the class name in `names`.
feature_blocks = []
label_blocks = []
for i in range(len(names)):
    subset = np.array(train_data[train_data['target']==names[i]])
    # Drop the first and last column and keep the features as float32.
    feature_blocks.append(np.array(subset[:, 1:-1], dtype=np.float32))
    label_blocks.append(i*np.ones(np.shape(subset)[0]))

sort_dataset = np.concatenate(feature_blocks, axis=0)
label = np.concatenate(label_blocks, axis=0)
del feature_blocks, label_blocks

# Fixed seed so the split (and everything fitted on it) is the same on every
# run; 42 matches the random_state used for the PCA.
train_feat, test_feat, train_label, test_label = train_test_split(
    sort_dataset, label, test_size=0.2, shuffle=True, random_state=42
)

# Z-score every sample (row) with its own mean and standard deviation.
train_feat = ((train_feat.T-np.mean(train_feat.T, 0))/np.std(train_feat.T, 0)).T
test_feat = ((test_feat.T-np.mean(test_feat.T, 0))/np.std(test_feat.T, 0)).T

print('Trainset: %.f, %.f' %(np.shape(train_feat)[0], np.shape(train_feat)[1]))
print('Train Label: %.f ' %np.shape(train_label))
print('Testset: %.f, %.f' %(np.shape(test_feat)[0], np.shape(test_feat)[1]))
print('Test Label: %.f' %np.shape(test_label))


Next, we consider the variance of the data set that can be explained by the individual PCA components. To do this, we look at the explained variance of each component as well as the cumulative explained variance. We can see that the first 100 PCA components already explain more than 90% of the variance in the data. This should be a good starting value for the input dimension of the neural network, which can be varied afterwards.

In [None]:
# Fit a PCA with 286 components on the normalised training samples.
pca = PCA(n_components=286, whiten=True, random_state=42)
pca.fit(train_feat)             # rows are samples, columns are features
pca_vectors = pca.components_   # shape [n_components, n_features]
exp_var = pca.explained_variance_ratio_

component_index = range(len(exp_var))
fig, axs = plt.subplots(1, 3, figsize=(24, 8))
variance_ax, cumulative_ax, loading_ax = axs

# Left: explained variance ratio of every single component.
variance_ax.plot(component_index, exp_var, 'o')
variance_ax.set_title('Explained variance by each principle component')

# Middle: running total of the explained variance ratio.
cumulative_ax.plot(component_index, np.cumsum(exp_var))
cumulative_ax.set_title('Cumulative Sum of pc')

# Right: feature weights of the first ten components.
loading_ax.matshow(pca_vectors[0:10, :].T, aspect='auto')
loading_ax.set_title('')
loading_ax.set_xlabel(' Principle Component ')
loading_ax.set_ylabel('DNA Sequence')

for curve_ax in (variance_ax, cumulative_ax):
    curve_ax.grid()
    curve_ax.set_xlabel('principle component')
    curve_ax.set_ylabel('explained variance')

del fig, axs, variance_ax, cumulative_ax, loading_ax, curve_ax

In [None]:
# Project the normalised training samples with the fitted PCA; the result is
# used for the PC1/PC2 scatter plot in the next cell and deleted there.
train_trans = pca.transform(train_feat)

Let us now take a look at the data in the first two pca components.

In [None]:
# Scatter plot of the training samples in the plane of the first two PCA
# components, one marker series per class.
fig1, axs1 = plt.subplots(1, 1, figsize=(15, 10))
for class_id in range(10):
    in_class = train_label == class_id
    axs1.plot(train_trans[in_class, 0], train_trans[in_class, 1], 'o', alpha=0.6)

# The series were added in class-id order, so `names` lines up with them.
axs1.legend(names, bbox_to_anchor=(1.4, 1), loc='upper right', fontsize=15)
axs1.grid()
# Axis labels carry the explained variance of the component in percent.
pc1_percent = exp_var[0]*100
pc2_percent = exp_var[1]*100
axs1.set_xlabel('PC1 (%.2f)'%(pc1_percent), fontsize=20)
axs1.set_ylabel('PC2 (%.2f)'%(pc2_percent), fontsize=20)

del fig1, axs1, train_trans

Let us now set up the input pipeline and the neural network. The network is a simple stack of dense layers with dropout and batch normalization. Additionally, we have the option to apply L2 regularization. (Warning, spoiler: while searching for the hyperparameters it turned out that L2 regularization is not needed for good performance on the test set.) The input pipeline has the task of adding noise to the training data to account for the errors within the individual measurements and to prevent overfitting. The noise consists of random numbers drawn from a normal distribution with a mean of 0 and a standard deviation adjusted to each 10-mer (the mean standard deviation of the individual 10-mer measurements from above).

In [None]:
def NN_Classifier(input_dim, Nclasses, NLayer=5, lr=1e-3, dp_rate=0.4, l2_rate=1e-4):
    """Build and compile a fully connected softmax classifier.

    The network is one input block followed by `NLayer` further blocks, i.e.
    `NLayer + 1` blocks in total. Every block is Dense(1024, relu, L2 kernel
    regularisation) -> BatchNormalization -> Dropout. A softmax Dense layer
    with `Nclasses` units produces the output.

    Parameters
    ----------
    input_dim : int
        Number of input features per sample.
    Nclasses : int
        Number of output classes.
    NLayer : int
        Number of blocks added after the input block.
    lr : float
        Learning rate of the Adam optimizer.
    dp_rate : float
        Dropout rate used in every block.
    l2_rate : float
        L2 factor of the kernel regulariser of every hidden Dense layer.

    Returns
    -------
    The compiled Keras model (categorical cross-entropy loss, categorical
    accuracy metric). The model summary is printed as a side effect.
    """
    def dense_block(tensor):
        # One Dense -> BatchNorm -> Dropout block.
        tensor = Dense(1024, kernel_regularizer=regularizers.l2(l2_rate), activation='relu')(tensor)
        tensor = BatchNormalization()(tensor)
        return Dropout(dp_rate)(tensor)

    inp = Input(batch_shape=(None, input_dim))
    x = dense_block(inp)
    for _ in range(NLayer):
        x = dense_block(x)

    out = Dense(Nclasses, activation='softmax')(x)
    model = Model(inputs=inp, outputs=out)

    # Pass the optimizer instance to compile(); the string 'Adam' would create
    # a fresh optimizer with the default learning rate and ignore `lr`.
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
    model.compile(optimizer=optimizer, loss=tf.keras.losses.categorical_crossentropy, metrics=[tf.keras.metrics.CategoricalAccuracy()])
    model.summary()
    return model

class augmenation_pipeline():
    """Batch source that normalises samples and projects them onto PCA vectors.

    All settings are plain attributes that the caller assigns after
    construction. The training generator additionally adds Gaussian noise to
    each batch before normalisation.
    """

    def __init__(self):
        self.data = []            # sample matrix, one row per sample
        self.stdv = []            # 2-D table of standard deviations, one column per feature
        self.label = []           # class id per sample
        self.pca_vectors = []     # projection matrix [component, feature]
        self.one_hot_label = []   # filled by generate_label()
        self.batch_size = 512
        self.NClasses = 10
        self.noise_width = 1      # multiplier for the noise standard deviation

    def _iter_batches(self, add_noise):
        """Yield (features, one-hot labels) for every complete batch.

        Samples that do not fill a whole batch at the end are not yielded.
        """
        n_samples, n_features = np.shape(self.data)
        self.generate_label()

        for batch_no in range(n_samples // self.batch_size):
            first = batch_no * self.batch_size
            stop = first + self.batch_size
            batch = self.data[first:stop, :]
            if add_noise:
                batch = batch + self.generate_noise(n_samples, n_features, self.label[first:stop])
            projected = self.pca_dimension_reduction(self.normalization(batch).T)
            yield projected, self.one_hot_label[first:stop, :]

    def train_generator(self):
        """Yield noisy, normalised and projected training batches."""
        yield from self._iter_batches(True)

    def test_generator(self):
        """Yield normalised and projected batches without added noise."""
        yield from self._iter_batches(False)

    def prediction_generator(self, data):
        """Normalise and project `data` in one go (no batching, no noise)."""
        normalised = self.normalization(data)
        return self.pca_dimension_reduction(normalised.T)

    def generate_noise(self, NData, NFeat, label):
        """Return a (batch_size, NFeat) matrix of zero-mean Gaussian noise.

        Column i is drawn with standard deviation
        noise_width * mean(stdv[:, i]). `NData` and `label` are accepted but
        not used.
        """
        noise = np.zeros((self.batch_size, NFeat))
        for column in range(NFeat):
            sigma = self.noise_width * np.mean(self.stdv[:, column])
            noise[:, column] = np.random.normal(0, sigma, size=self.batch_size)
        return noise

    def generate_label(self):
        """Store the one-hot encoding of `label` in `one_hot_label`."""
        n_labels = len(self.label)
        self.one_hot_label = np.zeros((n_labels, self.NClasses))  # dim [sample, Nclasses]
        for row, class_id in enumerate(self.label):
            self.one_hot_label[row, int(class_id)] = 1

    def pca_dimension_reduction(self, data):
        """Project `data` onto the stored PCA vectors.

        Input:  pca_vectors [component, feature], data [feature, sample]
        Output: [sample, component]
        """
        projected = np.matmul(self.pca_vectors, data)
        return projected.T

    def normalization(self, data):
        """Z-score every row of `data` with that row's own mean and std."""
        by_column = data.T
        centred = by_column - np.mean(by_column, 0)
        return (centred / np.std(by_column, 0)).T
        



Now let's build the training setup.

In [None]:
# Settings of the final training run.
Npc = 150       # number of leading PCA vectors used as network input
lr = 1e-3       # learning rate
l2 = 0          # L2 regularisation factor (alternative tried: 1e-10)
dr = 0.4        # dropout rate
bs = 256        # batch size
Nlayer = 10     # NLayer argument of NN_Classifier
nwd = 1         # noise width of the training pipeline

model = NN_Classifier(Npc, 10, Nlayer, lr, dr, l2)

# One pipeline for training batches and one for validation batches; both get
# the same settings and differ only in the samples and labels they serve.
train_generator = augmenation_pipeline()
test_generator = augmenation_pipeline()
for pipeline, pipeline_feat, pipeline_label in (
    (train_generator, train_feat, train_label),
    (test_generator, test_feat, test_label),
):
    pipeline.data = pipeline_feat
    pipeline.stdv = stdv
    pipeline.label = pipeline_label
    pipeline.batch_size = bs
    pipeline.pca_vectors = pca_vectors[0:Npc, :]
train_generator.noise_width = nwd

# Every generated element is one complete batch of features and labels.
feat_size = (bs, Npc,)
label_size = (bs, 10, )
dataset_spec = dict(
    output_types=(tf.float32, tf.float32),
    output_shapes=(feat_size, label_size),
)

train_dataset = tf.data.Dataset.from_generator(generator=train_generator.train_generator, **dataset_spec)
test_dataset = tf.data.Dataset.from_generator(generator=test_generator.test_generator, **dataset_spec)

Let the training begin.

In [None]:
# NOTE(review): clear_session() runs after `model` was already built in the
# previous cell — confirm that resetting Keras' global state here is intended.
tf.keras.backend.clear_session()

# Stop once the validation accuracy has not improved by at least 1e-4 for ten
# epochs in a row and roll the model back to its best weights.
custom_early_stopping = EarlyStopping(
    monitor='val_categorical_accuracy',
    patience=10,
    min_delta=0.0001,
    mode='max',
    restore_best_weights=True
)

# The datasets already yield complete batches of `bs` samples, so no
# batch_size is passed to fit().
hist = model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=200,
    callbacks=[custom_early_stopping],
)

As we can see, the different bacteria can be classified well.

In [None]:
# Training history: loss on the left, accuracy on the right, each with the
# training curve in red and the validation curve in blue.
fig, axs = plt.subplots(1, 2, figsize=(10, 5))
epoch_index = range(len(hist.history['loss']))
history_panels = (
    (axs[0], 'loss', 'val_loss', 'categorical crossentropy'),
    (axs[1], 'categorical_accuracy', 'val_categorical_accuracy', 'categorical accuracy'),
)
for panel, train_key, val_key, y_label in history_panels:
    panel.plot(epoch_index, hist.history[train_key], 'r', label='train')
    panel.plot(epoch_index, hist.history[val_key], 'b', label='validation')
    panel.grid()
    panel.set_xlabel('Epochs')
    panel.legend()
    panel.set_ylabel(y_label)

Let's now add a short analysis of the classifier. The output of the model can be seen as a probability distribution that indicates which bacterium a given input belongs to. First, let's look at the mean probability distribution of each class. In addition, we look at the incorrectly assigned bacteria and see which classes they were confused with.

In [None]:
def label_id_to_label(pred_label_id, label_name):
    """Translate per-sample score rows into class names.

    For every row of `pred_label_id` the column with the largest value is
    looked up in `label_name`; the names are returned as a list with one
    entry per row.
    """
    winning_columns = np.argmax(pred_label_id, 1)
    return [label_name[column] for column in winning_columns]

# Model output for the validation samples and the predicted class per sample.
pred_label_id = model.predict(test_generator.prediction_generator(test_feat))
pred_label = np.argmax(pred_label_id, 1)

# Output rows rescaled so that each row sums to one.
wsk_label = (pred_label_id.T/np.sum(pred_label_id, 1)).T

n_classes = len(names)
# conf_mat[i, j]:  mean output for class j over all samples whose true class is i.
# false_mat[i, j]: share of the misclassified samples of true class i that
#                  were predicted as class j (each non-empty row sums to one).
conf_mat = np.zeros((n_classes, n_classes))
false_mat = np.zeros((n_classes, n_classes))

for i in range(n_classes):
    # flatnonzero always returns a 1-D index array, also for a single match.
    idx = np.flatnonzero(test_label == i)
    if idx.size == 0:
        # No validation sample of this class: leave both rows at zero.
        continue
    conf_mat[i, :] = np.sum(wsk_label[idx], 0)/idx.size

    class_pred = pred_label[idx]
    wrong_pred = class_pred[class_pred != i]
    if wrong_pred.size > 0:
        # Count how often each wrong class was predicted, then normalise.
        false_mat[i, :] = np.bincount(wrong_pred, minlength=n_classes)/wrong_pred.size

In [None]:
# Two annotated heat maps side by side: the mean output distribution per true
# class and the distribution of the wrong predictions per true class.
fig, axs = plt.subplots(1,2, figsize=(28,8))
n_shown = conf_mat.shape[0]
label_ticks = np.arange(n_shown)            # cell centres
grid_ticks = np.arange(n_shown + 1) - 0.5   # cell borders

panels = (
    (conf_mat, 'mean output distribution per class'),
    (false_mat, 'distribution of false predicted inputs'),
)

for ax, (matrix, title) in zip(axs, panels):
    pcol = ax.matshow(matrix, aspect='auto', cmap='Reds', alpha=0.6)
    clb = fig.colorbar(pcol, ax=ax)
    clb.set_label('probability',size=15)

    # Write the numeric value into every cell.
    for i in range(matrix.shape[0]):
        for j in range(matrix.shape[1]):
            ax.text(j-0.4, i, '%.3f'%(matrix[i,j]))

    # Font size is a property of the tick labels, so it is set together with
    # the labels rather than with the tick positions.
    ax.set_xticks(label_ticks)
    ax.set_yticks(label_ticks)
    ax.set_xticklabels(names, rotation=-75, fontsize=12)
    ax.set_yticklabels(names, fontsize=12)

    # Minor ticks on the cell borders carry the grid lines.
    ax.set_xticks(grid_ticks, minor=True)
    ax.set_yticks(grid_ticks, minor=True)
    ax.grid(which='minor')

    ax.tick_params(axis="x", bottom=True, top=False, labelbottom=True, labeltop=False)
    ax.set_xlabel('predited label', fontsize=15)
    ax.set_ylabel('true label', fontsize=15)
    ax.set_title(title, fontsize=20)

Next, we classify the submission data.

In [None]:
# Competition test table and the submission template.
submission_dir = '/kaggle/input/tabular-playground-series-feb-2022/'
sub_data = pd.read_csv(submission_dir + 'test.csv')
subm = pd.read_csv(submission_dir + 'sample_submission.csv')

# Keep everything except the first column as a float32 matrix.
sub_data = sub_data.iloc[:, 1:].to_numpy(dtype=np.float32)

In [None]:
# Normalise and project the submission samples with the validation pipeline,
# then let the model score them.
submission_feat = test_generator.prediction_generator(sub_data)
prediction = model.predict(submission_feat)

# Translate the scores into class names and write the submission file.
pred_label = label_id_to_label(prediction, names)
subm['target'] = pred_label
subm.to_csv('submission.csv', index=False)

Last but not least, some code for the hyperparameter search. It is based on random parameter combinations to get an impression of how the individual parameters behave.

In [None]:
# Random hyper-parameter search; switched off by default because it trains
# NRuns separate models.
hyperfitting = False

if hyperfitting:
    # Release the model and pipelines of the final run before starting.
    del model, train_generator, test_generator
    tf.keras.backend.clear_session()
    NRuns = 50

    custom_early_stopping = EarlyStopping(monitor='val_categorical_accuracy', patience=10, min_delta=0.001, mode='max')

    # Candidate values; every run draws one value per list at random.
    BatchSize = [128, 256, 512, 1024]
    learning_rates = [1e-2, 5e-3, 1e-3, 5e-4, 1e-4, 5e-5]
    dropout_rates = [0.3, 0.4, 0.5, 0.6]
    l2_rates = [5e-3, 1e-3, 5e-4, 1e-4]
    Npca = [2, 50, 100, 150, 200, 268]  # NOTE(review): 268 may be a typo for 286 — confirm
    Nlayer = [5, 10, 20, 50, 100]
    # One row per run: N_PCA, LR, DR, L2, BS, NL, best validation accuracy.
    book = np.zeros((NRuns, 7))

    for i in range(NRuns):
        print('Run %.f/%.f' % (i+1, NRuns))
        # random.choice returns the list entries unchanged (no float32 rounding).
        Npc = random.choice(Npca)
        lr = random.choice(learning_rates)
        dpr = random.choice(dropout_rates)
        l2 = random.choice(l2_rates)
        bs = random.choice(BatchSize)
        nl = random.choice(Nlayer)
        book[i, :6] = (Npc, lr, dpr, l2, bs, nl)

        # Build the model once per run, using the sampled dropout rate `dpr`.
        model = NN_Classifier(Npc, 10, nl, lr, dpr, l2)
        train_generator = augmenation_pipeline()
        test_generator = augmenation_pipeline()

        train_generator.data = train_feat
        train_generator.stdv = stdv
        train_generator.label = train_label
        train_generator.batch_size = bs
        train_generator.pca_vectors = pca_vectors[0:Npc, :]

        test_generator.data = test_feat
        test_generator.stdv = stdv
        test_generator.label = test_label
        test_generator.batch_size = bs
        test_generator.pca_vectors = pca_vectors[0:Npc, :]

        feat_size = (bs, Npc,)
        label_size = (bs, 10, )

        train_dataset = tf.data.Dataset.from_generator(generator=train_generator.train_generator, output_types=(tf.float32, tf.float32), output_shapes=(feat_size, label_size))
        test_dataset = tf.data.Dataset.from_generator(generator=test_generator.test_generator, output_types=(tf.float32, tf.float32), output_shapes=(feat_size, label_size))

        # The datasets already yield complete batches, so fit() gets no batch_size.
        hist = model.fit(train_dataset, validation_data=test_dataset, epochs=200, callbacks=[custom_early_stopping])
        book[i, 6] = np.max(hist.history['val_categorical_accuracy'])

        # Free the run's model before the next one is built.
        del model, train_generator, test_generator
        tf.keras.backend.clear_session()

    frame = pd.DataFrame(book, columns=['N_PCA', 'LR', 'DR', 'L2', 'BS', 'NL', 'ACC'])

In [None]:
if hyperfitting == True:
    # Show the search results with the best validation accuracy on top.
    frame = frame.sort_values(by='ACC', ascending=False)
    print(frame)