## Summary

In this notebook, I use TensorFlow Decision Forests (TF-DF) to build an MNIST digit classifier. The TF-DF documentation is available [here](https://www.tensorflow.org/decision_forests).

## Import necessary libraries

In [None]:
!pip install -q tensorflow_decision_forests

In [None]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
import tensorflow_decision_forests as tfdf
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

## Utilities

In [None]:
def sample_images(images, row_count, column_count):
    """Display images in a ``row_count`` x ``column_count`` grid with axes hidden.

    Args:
        images: Indexable, sized collection of images accepted by
            ``Axes.imshow``. Image ``r * column_count + c`` is drawn in
            row ``r``, column ``c``.
        row_count: Number of rows in the grid.
        column_count: Number of columns in the grid.

    If fewer images than grid cells are supplied, the remaining cells are
    left blank instead of raising ``IndexError``.
    """
    # squeeze=False keeps `axs` two-dimensional even when the grid has a single
    # row or column; by default subplots() collapses those dimensions, which
    # breaks 2-D indexing of the axes.
    _, axs = plt.subplots(row_count, column_count, figsize=(10, 10), squeeze=False)
    image_total = len(images)
    # `axs.flat` walks the grid row by row, matching the r * column_count + c layout.
    for position, ax in enumerate(axs.flat):
        if position < image_total:
            ax.imshow(images[position])
        ax.axis('off')
    plt.show()

## Load data

In [None]:
# Read the digit-recognizer training and test CSV files into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/digit-recognizer/train.csv',
        '/kaggle/input/digit-recognizer/test.csv',
    )
)

In [None]:
# Preview the first five rows of the training frame.
train.head(n=5)

In [None]:
# Preview the first five rows of the test frame.
test.head(n=5)

Let's see what this data looks like.

In [None]:
row_count = 15
col_count = 15
# Draw distinct row positions so the same digit is never shown twice in the grid.
sample_rows = np.random.choice(train.shape[0], row_count * col_count, replace=False)
# Drop the target column by name before reshaping. Slicing the first 784 columns
# positionally would treat "label" as a pixel (and shift every image by one)
# whenever "label" precedes the pixel columns.
images = train.drop(columns="label").iloc[sample_rows].to_numpy().reshape((-1, 28, 28))
sample_images(images, row_count, col_count)

In [None]:
# Convert the test frame to a TensorFlow dataset; no label column is specified here.
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(dataframe=test)

## Model Development

In [None]:
# Train one random forest per stratified fold; the fitted models are collected in `models`.
# A fixed random_state makes the fold assignment identical on every run.
kfold = sklearn.model_selection.StratifiedKFold(5, shuffle=True, random_state=42)
models = []
for fold, (train_indices, val_indices) in enumerate(kfold.split(train, train["label"])):
    print("Training with fold %d"%(fold + 1))
    # No tf.data shuffle on the training set: the rows were already shuffled by the
    # fold split above, and TF-DF guidance is not to shuffle the input dataset.
    train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train.iloc[train_indices], label="label")
    valid_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train.iloc[val_indices], label="label")
    model = tfdf.keras.RandomForestModel(num_trees=30)
    model.compile(metrics=["accuracy"])
    model.fit(train_ds, validation_data=valid_ds)
    models.append(model)

## Submission

In [None]:
# Average the predictions of all fold models element-wise.
result = np.asarray([fold_model.predict(test_ds) for fold_model in models]).mean(axis=0)

In [None]:
# Dimensions of the averaged prediction array.
np.shape(result)

In [None]:
# Pick, for every test row, the index with the highest value along the last axis of `result`.
test_labels = np.argmax(result, axis=-1)
# Image ids are 1-based and follow the order of the prediction rows.
image_ids = np.arange(1, test_labels.shape[0] + 1)
# Build the frame from named columns instead of overwriting `result`, so this cell
# can be re-run without feeding the submission table back into argmax.
df = pd.DataFrame({"ImageId": image_ids, "Label": test_labels}, dtype='int')
df.to_csv("submission.csv", index=False)