This notebook uses the fastai2 `DataLoaders` API to manage the datasets, and a ResNet-based CNN to create a classification model.

To use TPU resources, the notebook uses the XLA extension from https://github.com/butchland/fastai_xla_extensions.

The steps for installing the XLA extension are adapted from https://www.kaggle.com/johnyquest/tpu-fastai-notebook.

If you have any questions, ask away!

# Preparation

## Environment Check

### Check datasets

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, name) for name in filenames):
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Install Packages

***The order of installation is CRUCIAL!***

In [None]:
# %%capture
# Upgrade fastcore first, then fastai (the installation order matters for this
# notebook); -qq keeps pip's output quiet.
!pip install -Uqq fastcore --upgrade
!pip install -Uqq fastai --upgrade

Install the TPU client

In [None]:
# Install the Cloud TPU client (pinned to 0.10) together with the torch_xla 1.7
# wheel built for CPython 3.7 on linux x86_64.
!pip install -Uqq cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.7-cp37-cp37m-linux_x86_64.whl

Install fastai_xla_extensions

In [None]:
# Install fastai_xla_extensions directly from its GitHub repository.
!pip install -Uqq git+https://github.com/butchland/fastai_xla_extensions.git

In [None]:
import fastai_xla_extensions.core

### Check fastai version

In [None]:
import fastai
# Display the installed fastai version as the cell output.
fastai.__version__

## Import library

In [None]:
%%capture
from fastai.vision.all import *
import fastai_xla_extensions.core

In [None]:
import tensorflow as tf

# Datasets Handling

## Transform TFRecord Format to Image files

In [None]:
# Scratch directory that receives the JPEG files extracted from the TFRecords.
# Per the Kaggle note at the top of the notebook, /kaggle/temp is not saved
# outside of the current session.
path_tmp = Path("/kaggle/temp")

### tfrecord to image

Start with one file.

In [None]:
# Inspect one validation TFRecord file: every serialized record is parsed into
# a tf.train.Example and printed. After the loop, `example` stays bound to the
# last record parsed, which the following cells pick apart.
filenames = ['/kaggle/input/tpu-getting-started/tfrecords-jpeg-224x224/val/09-224x224-232.tfrec']
raw_dataset = tf.data.TFRecordDataset(filenames)
raw_dataset

# take(-1) keeps every record of the dataset.
for raw_record in raw_dataset.take(-1):
    example = tf.train.Example.FromString(raw_record.numpy())
    print(example)

In [None]:
# Read the "id" feature of the last example parsed above: take the first bytes
# value and decode it as UTF-8.
image_id = example.features.feature["id"].bytes_list.value[0].decode("utf-8")
image_id

In [None]:
# Read the first int64 value of the "class" feature and keep it as a string.
image_class = str(example.features.feature["class"].int64_list.value[0])
image_class

In [None]:
# First bytes value of the "image" feature, i.e. the still-encoded image data.
image = example.features.feature["image"].bytes_list.value[0]
image

In [None]:
# Open the encoded bytes through an in-memory buffer; `image` is rebound from
# the raw bytes to the opened image object.
# `Image` and `io` are presumably provided by the fastai star import — confirm.
image = Image.open(io.BytesIO(image))
image

Batch processing...

In [None]:
# Collect the TFRecord paths of each split, keyed by the name of the directory
# that directly contains them ('train', 'val' or 'test').
fns = {split: [] for split in ('train', 'val', 'test')}
for dirname, _, filenames in os.walk('/kaggle/input/tpu-getting-started/tfrecords-jpeg-224x224'):
    if not filenames:
        # Directories without files contribute nothing (and are never looked up).
        continue
    data_type = os.path.basename(dirname)
    fns[data_type].extend(os.path.join(dirname, fn) for fn in filenames)
fns

In [None]:
# Convert every TFRecord of every split into individual JPEG files under path_tmp:
#   labelled splits -> <path_tmp>/<split>/<class>/<id>.jpg
#   test split      -> <path_tmp>/test/<id>.jpg  (no "class" feature is read)
for data_type, path_list in fns.items():
    raw_dataset = tf.data.TFRecordDataset(path_list)

    # take(-1) keeps every record of the dataset.
    for raw_record in raw_dataset.take(-1):
        example = tf.train.Example()
        example.ParseFromString(raw_record.numpy())

        image_id = example.features.feature["id"].bytes_list.value[0].decode("utf-8")
        # `image` starts as the encoded bytes and is rebound to the opened image.
        image = example.features.feature["image"].bytes_list.value[0]
        image = Image.open(io.BytesIO(image))

        if data_type != 'test':
            image_class = str(example.features.feature["class"].int64_list.value[0])
            foldername = f'{path_tmp}/{data_type}/{image_class}'
        else:
            foldername = f'{path_tmp}/{data_type}'

        # exist_ok=True tolerates a folder created by an earlier record while
        # still surfacing real failures (e.g. permission errors), which the
        # previous bare `except: pass` silently swallowed.
        os.makedirs(foldername, exist_ok=True)
        image.save(f'{foldername}/{image_id}.jpg')

In [None]:
# Collect the image files found under path_tmp (all splits written above).
imgs = get_image_files(path_tmp)

In [None]:
# Number of image files collected.
len(imgs)

In [None]:
# Open the second collected file as a quick visual sanity check.
Image.open(imgs[1])

## Create DataLoaders

In [None]:
# DataBlock recipe for the extracted JPEG folders:
#   - inputs are images, targets are categories;
#   - items are gathered with get_image_files;
#   - the label comes from the parent folder (the class folder written above);
#   - train/validation membership comes from the grandparent folder
#     ('train' vs 'val');
#   - batch-level augmentations are built by aug_transforms(size=224, min_scale=0.75).
db = DataBlock(
    blocks=(ImageBlock, CategoryBlock),
    get_items=get_image_files,
    get_y=parent_label,
    splitter=GrandparentSplitter(train_name='train', valid_name='val'),
    batch_tfms=aug_transforms(size=224, min_scale=0.75),
)

In [None]:
# Build the DataLoaders from the files under path_tmp with a batch size of 64,
# then display a sample batch.
dls = db.dataloaders(path_tmp, bs=64)
dls.show_batch()

In [None]:
# Fetch a single batch and show the shapes of its inputs and targets.
xb,yb = dls.one_batch()
xb.shape,yb.shape

# Create and Train Model

## Use fp16 and ResNet50 Model

In [None]:
# Presumably meant to lower the batch size from 64 to 32 before training.
# NOTE(review): confirm that assigning `bs` on the DataLoaders object actually
# changes the batch size of the underlying train/valid loaders — TODO verify.
dls.bs = 32

In [None]:
from fastai.callback.fp16 import *
# Create a CNN learner on top of resnet50 that reports accuracy, then switch it
# to fp16 training via to_fp16().
# NOTE(review): confirm that fp16 training is supported on the XLA device used below.
learn = cnn_learner(dls, resnet50, metrics=accuracy).to_fp16()

### Transfer the model to TPU and check that it succeeded

In [None]:
# Move the learner to the TPU; to_xla is presumably added to the learner by
# fastai_xla_extensions — confirm.
learn.to_xla()

In [None]:
# Sanity check: a parameter of the model must report an 'xla' device type.
assert one_param(learn.model).device.type == 'xla'

In [None]:
# Fine-tune the pretrained model: per the fastai docs, freeze_epochs=5 epochs
# are run with the body frozen, followed by 10 epochs with it unfrozen.
learn.fine_tune(10, freeze_epochs=5)

In [None]:
# Plot the losses recorded during training.
learn.recorder.plot_loss()

# Validate

In [None]:
# Display a sample of predictions for visual inspection.
learn.show_results()

In [None]:
# Evaluate the model; presumably on the validation set, reporting the loss and
# the accuracy metric configured on the learner — confirm.
learn.validate()

# Test Dataset

In [None]:
# Build a test DataLoader from the image files under <path_tmp>/test and get the
# model's predictions for it. The second return value (`y`) is not used
# afterwards; the test images were written without class labels.
test_dl = dls.test_dl(get_image_files(f'{path_tmp}/test')) 
class_score, y = learn.get_preds(dl=test_dl) 

In [None]:
# Reduce the per-class scores to the index of the best-scoring class for each
# image; `class_score` is rebound from scores to indices.
class_score = np.argmax(class_score, axis=1)

In [None]:
# Map each predicted index back to its label through the DataLoaders vocab
# (labels are the class folder names, since get_y=parent_label), then preview
# the first ten.
predicted_classes = [dls.vocab[i] for i in class_score]
predicted_classes[:10]

In [None]:
# Derive the ids from the very items the test DataLoader predicted on, so that
# each id lines up with its prediction. The previous os.listdir() call relied on
# a second directory listing matching in order and content, and on every file
# name ending in a 4-character extension.
image_id_list = [Path(item).stem for item in test_dl.items]
image_id_list

In [None]:
# Assemble the submission table (one row per test image: its id and predicted
# label), write it to submission.csv without the index column, and preview it.
output = pd.DataFrame({'id': image_id_list, 'label': predicted_classes})
output.to_csv('submission.csv', index=False)
output.head()