# Training of a super simple model for celltype classification

In [1]:
import tensorflow as tf
!which python
!python --version
print(tf.VERSION)
print(tf.keras.__version__)
!pwd #  start jupyter under notebooks/ for correct relative paths

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
import datetime
import inspect
import pandas as pd
import numpy as np
import seaborn as sns
from tensorflow.keras import layers
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from depiction.models.examples.celltype.celltype import one_hot_encoding, one_hot_decoding

## A look at the data
Labels are categories 1-20; here is the associated celltype for each:

In [None]:
# Celltype metadata table; the first CSV column becomes the index.
metadata_path = '../data/single-cell/metadata.csv'
meta_series = pd.read_csv(metadata_path, index_col=0)
meta_series

There are 13 unbalanced classes, and over 80k samples

In [None]:
# Load the measurements, then show how many samples each category has
# (count of non-null 'CD45' entries per 'category' group).
data_df = pd.read_csv('../data/single-cell/data.csv')
data_df.groupby('category')['CD45'].count()

In [None]:
# Peek at ten random rows; the fixed seed keeps the displayed rows identical across re-runs.
data_df.sample(n=10, random_state=42)

In [None]:
# Print the source of the two label helpers
# (from keras, but taking care of 1 indexed classes).
for helper in (one_hot_encoding, one_hot_decoding):
    print(inspect.getsource(helper))

In [None]:
# Targets: one-hot encode the category ids.
classes = data_df['category'].values
labels = one_hot_encoding(classes)

# Features: every column except 'category', rescaled per column to the range [0, 1].
# NOTE(review): the scaler is fit on the full dataset, i.e. before the train/test
# split, so held-out samples influence the scaling — consider fitting on the
# training part only.
min_max_scaler = MinMaxScaler(feature_range=(0, 1), copy=True)
feature_matrix = data_df.drop(columns='category').values
data = min_max_scaler.fit_transform(feature_matrix)
data.shape

In [None]:
# Round-trip check: decode the one-hot matrix again — presumably back to the
# original 1-indexed category ids; verify against the helper source.
one_hot_decoding(labels)

In [None]:
# Hold out 33% of the samples for evaluation; stratifying on the category
# keeps the class proportions the same in both parts.
split = train_test_split(
    data,
    labels,
    test_size=0.33,
    random_state=42,
    stratify=data_df['category'],
)
data_train, data_test, labels_train, labels_test = split

In [None]:
# Display the full one-hot label matrix (all samples, not just the training part).
labels

In [None]:
# Number of samples per batch; used when batching the datasets and when
# computing the number of steps per epoch.
batchsize = 32

In [None]:
# Training pipeline: shuffle -> batch -> repeat.
# The shuffle buffer spans the whole training set. A buffer of only a couple of
# batches merely permutes neighbouring samples, so every epoch would see the
# samples in almost the same order.
dataset = tf.data.Dataset.from_tensor_slices((data_train, labels_train))
dataset = dataset.shuffle(buffer_size=data_train.shape[0]).batch(batchsize)
# Repeat indefinitely; the epoch length is set via `steps_per_epoch` in `model.fit`.
dataset = dataset.repeat()

# Evaluation pipeline: a single finite pass over the held-out data, in fixed order.
testset = tf.data.Dataset.from_tensor_slices((data_test, labels_test))
testset = testset.batch(batchsize)

## I don't know what a simpler network would look like

In [None]:
model = tf.keras.Sequential()
# Add a softmax layer with one output unit per celltype.
# The input width is read from the training matrix instead of
# `tf.data.get_output_shapes`, which is not exposed as `tf.data.*` in
# TensorFlow 2.x; the resulting input shape is the same (None, n_features).
model.add(layers.Dense(
    len(meta_series), activation='softmax',
    input_shape=(data_train.shape[1],)
))

In [None]:
# Print the layer overview of the model defined above.
model.summary()

In [None]:
# Adam with a step size of 0.001; categorical cross-entropy matches the
# one-hot encoded labels, and categorical accuracy is tracked during training.
adam = tf.keras.optimizers.Adam(0.001)
model.compile(
    loss='categorical_crossentropy',
    optimizer=adam,
    metrics=[tf.keras.metrics.categorical_accuracy],
)

In [None]:
# evaluation on testset on every epoch
# log_dir="logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
# `dataset` repeats indefinitely, so the number of batches per epoch has to be
# given explicitly. It must be an integer: np.ceil returns a float, hence int().
model.fit(
    dataset,
    epochs=20, steps_per_epoch=int(np.ceil(data_train.shape[0] / batchsize)),
    validation_data=testset, #  callbacks=[tensorboard_callback]
)

## Is such a simple model interpretable?

In [None]:
# Save the entire model to an HDF5 file in the current working directory.
# NOTE(review): the commented-out load example in this notebook reads from
# '../data/models/celltype_dnn_model.h5', a different directory and filename.
model.save('./celltype_model.h5')

In [None]:
# tensorboard --logdir logs/fit

In [None]:
# To recreate the exact same model, including weights and optimizer.
# model = tf.keras.models.load_model('../data/models/celltype_dnn_model.h5')

# What is the effect of increasing model complexity? 
Play around by adding some layers, then train the model and save it under a name of your choice to use with the other notebook.

![title](https://i.kym-cdn.com/photos/images/newsfeed/000/531/557/a88.jpg)

In [None]:
model = tf.keras.Sequential()
# Adds a densely-connected layer with 64 units to the model.
# The input width is read from the training matrix instead of
# `tf.data.get_output_shapes`, which is not exposed as `tf.data.*` in
# TensorFlow 2.x; the resulting input shape is the same (None, n_features).
model.add(layers.Dense(64, activation='relu', input_shape=(data_train.shape[1],)))
# ...
# do whatever you want
# model.add(layers.Dense(64, activation='relu'))
# model.add(layers.Dropout(0.5))
# ...
# Add a softmax layer with output units per celltype:
model.add(layers.Dense(len(meta_series), activation='softmax'))