In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Download the env-setup.py script from the pytorch/xla GitHub repository and save it
# locally as pytorch-xla-env-setup.py.
# NOTE(review): the URL points at the `master` branch, so the script content is not pinned
# and may change between runs.
!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
# Run the downloaded script, passing libomp5 and libopenblas-dev through --apt-packages.
# Presumably this installs the torch_xla package imported further down — TODO confirm.
!python pytorch-xla-env-setup.py --apt-packages libomp5 libopenblas-dev



import csv
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torchvision.transforms as transforms
import time

import torch_xla
import torch_xla.distributed.parallel_loader as pl
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp

import warnings
warnings.filterwarnings("ignore")

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Hello! I made this notebook to learn how to use TPUs.
I am just a beginner, so if you notice any mistakes, or anything I could do better or more simply, please let me know.

Preparing data

In [None]:
# Read the training table; dtype=np.float32 loads every column (the label included) as float32.
# NOTE(review): the path repeats "train.csv" as both a directory and a file name —
# confirm it against the file listing printed from /kaggle/input.
train = pd.read_csv(
    "/kaggle/input/digit-recognizer/train.csv/train.csv",
    dtype=np.float32,
)
print(train.shape)

# Targets come from the "label" column; every other column is a feature.
labels_numpy = train["label"].values
is_feature_column = train.columns != "label"
features_numpy = train.loc[:, is_feature_column].values / 255  # normalization: divide by 255

# Hold out 20% of the rows for validation; random_state=1 makes the split repeatable.
features_train, features_valid, labels_train, labels_valid = train_test_split(
    features_numpy,
    labels_numpy,
    test_size=0.2,
    random_state=1,
)

# Wrap the NumPy arrays as tensors; labels are cast from float32 to 64-bit integers.
features_train_tensor = torch.from_numpy(features_train)
features_valid_tensor = torch.from_numpy(features_valid)

labels_train_tensor = torch.from_numpy(labels_train).long()
labels_valid_tensor = torch.from_numpy(labels_valid).long()

In [None]:
# defining the model
class NeuralNet(nn.Module):
    """Fully connected network with one hidden layer: Linear -> ReLU -> Linear.

    Args:
        input_size: number of features in each input row.
        hidden_size: number of units in the hidden layer.
        output_size: number of values produced per input row.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Attribute names are part of the state_dict keys, so they are kept as l1 / relu / l2.
        self.l1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.relu = nn.ReLU()
        self.l2 = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Return the raw (un-normalised) outputs of the second linear layer for ``x``."""
        hidden = self.relu(self.l1(x))
        return self.l2(hidden)
    
# Network dimensions.
input_size = 28 * 28  # the size of an image (28 x 28 values, flattened)
hidden_size = 100     # number of neurons in the hidden layer
output_size = 10      # number of label classes, from 0 to 9

# Instantiate the network with the dimensions above.
model = NeuralNet(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
)

In [None]:
def _run(model):
    """Train ``model`` on this process's share of the data, then validate it once.

    Intended to be called from every process started by ``xmp.spawn``. It reads the
    module-level tensors ``features_train_tensor``, ``labels_train_tensor``,
    ``features_valid_tensor`` and ``labels_valid_tensor``.

    Args:
        model: the ``nn.Module`` to train; it is moved to the device returned by
            ``xm.xla_device()`` before training starts.

    Returns:
        None. Progress, timings and the accuracy are reported via ``xm.master_print``.
    """

    def train_model(train_dataloader, device, optimizer, criterion):
        """Run one optimisation step for every batch yielded by ``train_dataloader``."""
        model.train()

        for images, labels in train_dataloader:
            # flatten every sample to a vector of 28*28 = 784 values
            images = images.reshape(-1, 28*28)
            images = images.to(device)
            labels = labels.to(device)

            # forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # backward pass
            optimizer.zero_grad()  # clear the gradients from the previous iteration
            loss.backward()  # calculates gradients
            xm.optimizer_step(optimizer)  # updates parameters

    def valid_model(valid_dataloader, device):
        """Print the accuracy (in percent) of ``model`` over ``valid_dataloader``.

        ``device`` is accepted for symmetry with ``train_model`` but is not used:
        batches are passed to the model exactly as the loader yields them.
        """
        correct = 0
        total = 0

        model.eval()

        for images, labels in valid_dataloader:
            images = images.reshape(-1, 28 * 28)

            outputs = model(images)
            # index of the largest output per row is the predicted class
            _, predicted = torch.max(outputs, 1)

            # total number of labels
            total += len(labels)
            # total number of correct predictions
            correct += (predicted == labels).sum()

        accuracy = 100 * correct / float(total)
        xm.master_print(f'Accuracy: {accuracy}')

    # batch size and epoch
    batch_size = 64
    num_epochs = 20

    train_dataset = torch.utils.data.TensorDataset(features_train_tensor, labels_train_tensor)
    valid_dataset = torch.utils.data.TensorDataset(features_valid_tensor, labels_valid_tensor)

    # Each process gets its own slice of the data, selected by its ordinal.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
          train_dataset,
          num_replicas=xm.xrt_world_size(),
          rank=xm.get_ordinal(),
          shuffle=True)
    # NOTE(review): validation is sharded too and nothing below combines the per-process
    # counts, so the accuracy that gets printed covers a single shard only — presumably
    # the master's, given xm.master_print. Confirm / aggregate if a global figure is wanted.
    valid_sampler = torch.utils.data.distributed.DistributedSampler(
          valid_dataset,
          num_replicas=xm.xrt_world_size(),
          rank=xm.get_ordinal(),
          shuffle=False)

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler, num_workers=1)
    valid_dataloader = DataLoader(valid_dataset, batch_size=32, sampler=valid_sampler, num_workers=1)

    device = xm.xla_device()
    model = model.to(device)

    # loss and optimizer; the base learning rate is scaled by the number of processes
    learning_rate = 1e-3 * xm.xrt_world_size()
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    train_begin = time.time()
    for epoch in range(num_epochs):

        # Re-seed the sampler's shuffle for this epoch. Without this call a
        # DistributedSampler yields the same ordering in every epoch. It has to happen
        # before the loader below starts iterating over train_dataloader.
        train_sampler.set_epoch(epoch)

        para_loader = pl.ParallelLoader(train_dataloader, [device])

        start = time.time()
        xm.master_print('*'*15)
        xm.master_print(f'EPOCH: {epoch+1}')
        xm.master_print('*'*15)

        xm.master_print('Training.....')

        train_model(train_dataloader=para_loader.per_device_loader(device),
                   device=device,
                   optimizer=optimizer,
                   criterion=criterion)
        xm.master_print(f'Epoch completed in {(time.time() - start)/60} minutes')

    # Validation runs once, after the last epoch, with gradient tracking disabled.
    with torch.no_grad():
        para_loader = pl.ParallelLoader(valid_dataloader, [device])

        xm.master_print('Validating...')
        valid_model(valid_dataloader=para_loader.per_device_loader(device), device=device)

    # The reported duration is measured from before the first epoch, so it includes validation.
    xm.master_print(f'Training completed in {(time.time() - train_begin)/60} minutes')

In [None]:
# Start training processes
def _mp_fn(rank, flags):
    """Per-process entry point handed to ``xmp.spawn``.

    Args:
        rank: index of this process, supplied by the spawner (not used here).
        flags: the extra argument forwarded from ``xmp.spawn`` (not used here).
    """
    torch.set_default_tensor_type('torch.FloatTensor')
    # _run returns nothing, so its result is not kept.
    _run(model)

# No per-process options are needed, so an empty dict is forwarded to every worker.
FLAGS = {}
# Fork 8 worker processes; each one calls _mp_fn(rank, FLAGS).
xmp.spawn(
    _mp_fn,
    args=(FLAGS,),
    nprocs=8,
    start_method='fork',
)

References:
1. https://www.kaggle.com/abhiswain/pytorch-tpu-efficientnet-b5-tutorial-reference