In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

This notebook is an extension of the previous notebook. It deals mostly with the distributed training approach and sets up a standard framework that we will reuse in the following notebooks. Any production-level system should be horizontally scalable, which means that training jobs should be distributed across clusters of devices such as CPUs, GPUs and TPUs. Doing this by hand would be a monumental task, so the TensorFlow team at Google wrapped the hard work into the Estimator API.

In [None]:
def train_test_split_write(dataset, train_split=0.8, eval_split=0.1, out_dir='../working'):
    """Randomly split ``dataset`` into train / eval / test parts and write each to CSV.

    A single uniform random number in [0, 1) is drawn per row, and the row is
    assigned to exactly one split depending on where that number falls:

    * ``[0, train_split)``                         -> ``mushrooms_train.csv``
    * ``[train_split, train_split + eval_split)``  -> ``mushrooms_eval.csv``
    * ``[train_split + eval_split, 1)``            -> ``mushrooms_test.csv``

    Drawing once per row (rather than once per split) is what guarantees that
    the three files are disjoint and together contain every row.

    Args:
        dataset: pandas DataFrame to split; it is not modified.
        train_split: fraction of rows (in expectation) sent to the train file.
        eval_split: fraction of rows (in expectation) sent to the eval file.
        out_dir: directory the three CSV files are written to.

    Side effects:
        Reseeds NumPy's global random generator with 0 so the split is
        reproducible, and writes three CSV files (with the DataFrame index as
        the first column) into ``out_dir``.
    """
    np.random.seed(0)
    # One draw per row, shared by all three masks.
    draws = np.random.random(len(dataset))
    eval_upper = train_split + eval_split

    masks = (
        ('train', draws < train_split),
        ('eval', (draws >= train_split) & (draws < eval_upper)),
        ('test', draws >= eval_upper),
    )
    for split_name, mask in masks:
        target = os.path.join(out_dir, 'mushrooms_{}.csv'.format(split_name))
        dataset[mask].to_csv(target)

Here we create our training, evaluation and testing files and write them to disk.

In [None]:
# Placeholder default value ('?') repeated 24 times.
# NOTE(review): 24 defaults vs. 20 names in CSV_COLUMN_NAMES below - confirm the intended length.
CSV_DEFAULTS = ['?'] * 24

# Names of the dataset columns that are used as model input features.
CSV_COLUMN_NAMES = [
    'cap-shape',
    'cap-surface',
    'cap-color',
    'odor',
    'gill-attachment',
    'gill-spacing',
    'gill-size',
    'gill-color',
    'stalk-shape',
    'stalk-root',
    'stalk-surface-above-ring',
    'stalk-surface-below-ring',
    'stalk-color-above-ring',
    'stalk-color-below-ring',
    'veil-type',
    'veil-color',
    'ring-number',
    'ring-type',
    'spore-print-color',
    'habitat',
]
import tensorflow as tf
def generate_feature_columns(dataset):
    """Build one indicator (one-hot) feature column per name in CSV_COLUMN_NAMES.

    For every selected column, the distinct values found in ``dataset`` are
    used as the vocabulary of a categorical feature column, which is then
    wrapped in an indicator column.

    Args:
        dataset: pandas DataFrame containing every column in CSV_COLUMN_NAMES.

    Returns:
        List of indicator feature columns, in CSV_COLUMN_NAMES order.
    """
    # Selecting first keeps the up-front failure when a column is missing.
    selected = dataset[CSV_COLUMN_NAMES]
    return [
        tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                name, dataset[name].unique()
            )
        )
        for name in selected
    ]

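The code above generates the feature columns: one one-hot (indicator) column per categorical input column, using the values observed in the dataset as the vocabulary.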

In [None]:
def format_label(x):
    """Map the class marker 'p' to 1 and every other value to 0."""
    return 1 if x == 'p' else 0

def read_dataset(csv_path):
    """Read a split CSV file and expose it as a ``tf.data.Dataset``.

    The first CSV column is used as the index, the 'class' column is converted
    to 0/1 with ``format_label``, and the result is sliced row by row into
    ``(features, label)`` pairs where ``features`` is a dict keyed by the names
    in CSV_COLUMN_NAMES.

    Args:
        csv_path: path of the CSV file to read.

    Returns:
        An unbatched ``tf.data.Dataset`` of ``(feature_dict, label)`` elements.
    """
    frame = pd.read_csv(csv_path, index_col=[0])
    frame['class'] = frame['class'].map(format_label)
    feature_dict = dict(frame[CSV_COLUMN_NAMES])
    labels = frame['class']
    return tf.data.Dataset.from_tensor_slices((feature_dict, labels))

def train_input_funnction(batch_size,epochs = 10):
    """Input pipeline for training.

    Reads the training CSV, shuffles it with a 6600-element buffer, repeats
    it ``epochs`` times and groups the result into batches.

    Args:
        batch_size: number of examples per batch.
        epochs: number of passes over the training file.

    Returns:
        A batched ``tf.data.Dataset`` of ``(features, label)`` pairs.
    """
    examples = read_dataset('../working/mushrooms_train.csv')
    shuffled = examples.shuffle(buffer_size=6600)
    repeated = shuffled.repeat(count=epochs)
    return repeated.batch(batch_size)
    
def eval_input_funnction(batch_size,epochs = 10):
    """Input pipeline for evaluation.

    Reads the evaluation CSV, repeats it ``epochs`` times and batches it.
    The examples are deliberately NOT shuffled: evaluation does not update
    the model, so shuffling only costs time and makes the batches
    non-deterministic from run to run.

    NOTE(review): the default ``epochs=10`` is kept for interface
    compatibility, but it makes an evaluation with ``steps=None`` read the
    file ten times; callers that want a single pass should pass ``epochs=1``.

    Args:
        batch_size: number of examples per batch.
        epochs: number of passes over the evaluation file.

    Returns:
        A batched ``tf.data.Dataset`` of ``(features, label)`` pairs in file order.
    """
    data = read_dataset('../working/mushrooms_eval.csv')
    return data.repeat(count=epochs).batch(batch_size)

def predict_input_funnction(batch_size,epochs = 10):
    """Input pipeline for prediction.

    Reads the test CSV, repeats it ``epochs`` times and batches it. The
    examples are NOT shuffled, so the predictions come out in the same order
    as the rows of the CSV file and can be matched back to them.

    NOTE(review): the default ``epochs=10`` is kept for interface
    compatibility, but it yields every test row ten times; callers that want
    one prediction per row should pass ``epochs=1``.

    Args:
        batch_size: number of examples per batch.
        epochs: number of passes over the test file.

    Returns:
        A batched ``tf.data.Dataset`` of ``(features, label)`` pairs in file order.
    """
    data = read_dataset('../working/mushrooms_test.csv')
    return data.repeat(count=epochs).batch(batch_size)

Preprocessing functions for data sources

In [None]:
import pandas as pd
import numpy as np
# Load the raw mushroom dataset from the Kaggle input directory.
dataset = pd.read_csv('/kaggle/input/mushroom-classification/mushrooms.csv')
def load_data(dataset):
    """Write the train/eval/test CSV files and return the feature columns.

    Args:
        dataset: pandas DataFrame holding the full dataset.

    Returns:
        The list produced by ``generate_feature_columns(dataset)``.
    """
    # The split files must exist on disk before the input functions are used.
    train_test_split_write(dataset)
    feature_columns = generate_feature_columns(dataset)
    return feature_columns

# Write the split CSV files and keep the feature columns; model_fn reads this global.
feat_cols = load_data(dataset)

At this point most of the preprocessing steps are complete and the appropriate feature columns have been generated. Up to now most of the functions match those of the previous notebook. From here onwards we take the distributed training approach and make the necessary modifications to the code. We will also use a custom estimator, which is not part of the standard premade estimators library.

In [None]:
def model_fn(features, labels, mode):
    """Estimator model function for the binary mushroom classifier.

    Builds a small Keras model on top of the global ``feat_cols`` feature
    columns and returns the ``EstimatorSpec`` matching ``mode``.

    Args:
        features: dict of feature tensors keyed by column name.
        labels: tensor of 0/1 labels; unused in PREDICT mode.
        mode: one of the ``tf.estimator.ModeKeys`` values.

    Returns:
        A ``tf.estimator.EstimatorSpec``:
        * PREDICT - ``predictions={'logits': logits}``
        * EVAL    - scalar ``loss``
        * TRAIN   - scalar ``loss`` and an Adam ``train_op``
    """
    model = tf.keras.Sequential([
      tf.keras.layers.DenseFeatures(feat_cols),
      # NOTE(review): a single-unit hidden layer is a very narrow bottleneck - confirm intended.
      tf.keras.layers.Dense(1,activation = 'relu'),
      # No activation: the layer must emit raw logits because the loss below
      # uses from_logits=True. (A softmax over one unit is constantly 1.0.)
      tf.keras.layers.Dense(1)
    ])

    # Only run layers in training mode while actually training.
    logits = model(features, training=(mode == tf.estimator.ModeKeys.TRAIN))

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {'logits': logits}
        # EstimatorSpec requires `mode` and has no `labels` argument.
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Give the labels the same (batch, 1) shape and dtype as the logits so the
    # loss below is unambiguously one value per example.
    labels = tf.reshape(tf.cast(labels, logits.dtype), [-1, 1])
    per_example_loss = tf.keras.losses.BinaryCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE)(labels, logits)
    # EstimatorSpec needs a scalar loss; this averages over the global batch
    # (per-replica batch size times the number of replicas in sync).
    loss = tf.nn.compute_average_loss(per_example_loss)

    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode = mode, loss=loss)

    optimizer = tf.compat.v1.train.AdamOptimizer()
    return tf.estimator.EstimatorSpec(
          mode=mode,
          loss=loss,
          train_op=optimizer.minimize(
          loss, tf.compat.v1.train.get_or_create_global_step()))
    

The model function returns an estimator spec, which determines how the model will run. We can add many configuration options here: custom loggers, custom metrics, custom loss functions, and so on. We won't worry much about performance for now. We will only structure our code here and create a standardised framework which we will follow in the next notebooks.

The model function defines our model: its loss, optimizer, etc. As in the previous notebooks, we have mixed the best of both worlds: we take the Keras approach to model building and then wrap it with the Estimator API. The end result is an estimator spec, which helps TensorFlow build an estimator and run it in a distributed manner depending on the devices it has access to. This code can easily be run on Cloud Machine Learning Engine, which is Google's home-grown model training and hosting infrastructure.

We won't go into the details of this now, and will focus only on the code structure.

In [None]:
# Create the mirrored distribution strategy and show its repr.
strategy = tf.distribute.MirroredStrategy()
print(strategy)

# Report how many replicas the strategy keeps in sync.
print(f'Number of devices: {strategy.num_replicas_in_sync}')

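This creates a `MirroredStrategy`, which tells TensorFlow to replicate training across the devices available on this machine. The strategy only takes effect when it is handed to the estimator through its `RunConfig`.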

In [None]:
# Hand the MirroredStrategy created above to the estimator; without
# `train_distribute` the strategy object is never used and training runs
# on a single device.
config = tf.estimator.RunConfig(
    train_distribute=strategy,
    save_checkpoints_steps = 100,
    log_step_count_steps=10)

# Custom estimator built from model_fn; checkpoints and logs go to 'model/'.
classifier = tf.estimator.Estimator(
    model_fn=model_fn, model_dir='model/', config=config)


# Train for at most 100 steps with batches of 10, evaluating on the full
# evaluation input (steps=None) whenever a new checkpoint is available.
tf.estimator.train_and_evaluate(
    classifier,
    train_spec=tf.estimator.TrainSpec(input_fn=lambda : train_input_funnction(10),max_steps = 100),
    eval_spec=tf.estimator.EvalSpec(input_fn=lambda : eval_input_funnction(10),
                                   steps = None,start_delay_secs = 1,throttle_secs = 1)
)

The final code block trains the model for the given number of steps and saves checkpoints in a predefined folder.

In [None]:
%%bash
# Remove the checkpoint/log directory so the model can be retrained from scratch.
# -f keeps the cell from failing when the directory does not exist (e.g. on a re-run).
rm -rf model

Code to clean up log files and checkpoints to retrain the model.