# Benchmarking for MPI

This notebook benchmarks code for the MPI data-parallel implementation.

# Setup

This code sets up the necessary libraries and downloads the training data.

In [None]:
# Install the Python packages this notebook relies on via shell escapes.
# mpi4py is presumably what apps/simple_ml_mpi.py needs for the mpirun cells
# further down -- TODO confirm. requests is installed here although the
# download code in this cell uses urllib.request.
# NOTE(review): `!pip` runs whichever pip is first on the shell PATH, which is
# not guaranteed to be the kernel's environment -- confirm both the kernel and
# the `python` launched by mpirun see these packages.
!pip install numpy
!pip install tqdm
!pip install mpi4py
!pip install requests

# Download the necessary datasets

import urllib.request
import os

# Download Penn Treebank dataset
# Each split lives at <ptb_data><split> upstream and is stored locally as
# ./data/ptb/<split>. Files that already exist are not downloaded again.
ptb_data = "https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb."
ptb_dir = './data/ptb'
# urlretrieve does not create missing parent directories, so make sure the
# target directory exists before the first download on a fresh checkout.
os.makedirs(ptb_dir, exist_ok=True)
for f in ['train.txt', 'test.txt', 'valid.txt']:
    ptb_path = os.path.join(ptb_dir, f)
    if not os.path.exists(ptb_path):
        urllib.request.urlretrieve(ptb_data + f, ptb_path)

# Download CIFAR-10 dataset
if not os.path.isdir("./data/cifar-10-batches-py"):
    urllib.request.urlretrieve("https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz", "./data/cifar-10-python.tar.gz")
    !tar -xvzf './data/cifar-10-python.tar.gz' -C './data'

## Single core w/ Numpy

We first train on a single core with NumPy.

In [None]:
import sys
sys.path.append('./python')
sys.path.append('./apps')
import needle as ndl
from models import ResNet9
from simple_ml import train_cifar10, evaluate_cifar10

# Single-process CIFAR-10 run: train ResNet9 for two epochs on the
# cpu_numpy device, then evaluate on the test split.
CIFAR_DIR = "data/cifar-10-batches-py"
BATCH_SIZE = 128

device = ndl.cpu_numpy()

# Training split, served in shuffled batches.
dataset = ndl.data.CIFAR10Dataset(CIFAR_DIR, train=True)
dataloader = ndl.data.DataLoader(dataset=dataset, batch_size=BATCH_SIZE, shuffle=True)

model = ResNet9(device=device, dtype="float32")
train_cifar10(
    model,
    dataloader,
    n_epochs=2,
    optimizer=ndl.optim.Adam,
    lr=0.001,
    weight_decay=0.001,
    device=device,
)

# Test split; the loader is configured the same way as the training one.
print("Evaluating on test dataset...")
test_dataset = ndl.data.CIFAR10Dataset(CIFAR_DIR, train=False)
test_dataloader = ndl.data.DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=True)
evaluate_cifar10(model, test_dataloader, device=device)

## Multicore w/ Numpy, with Data Parallel

We now try training with 2, 4, and 8 ranks.

In [None]:
# Launch apps/simple_ml_mpi.py on 2 MPI ranks (-np 2).
# Script arguments are `2 64 numpy` -- assumed to be (epochs, per-rank batch
# size, backend); TODO confirm against apps/simple_ml_mpi.py.
!mpirun --allow-run-as-root -np 2 python apps/simple_ml_mpi.py 2 64 numpy

In [None]:
# Launch apps/simple_ml_mpi.py on 4 MPI ranks (-np 4) with script arguments
# `5 32 numpy`.
# NOTE(review): the first script argument is 5 here but 2 in the 2-rank run;
# confirm the difference is intended before comparing timings.
!mpirun --allow-run-as-root -np 4 python apps/simple_ml_mpi.py 5 32 numpy

In [None]:
# Launch apps/simple_ml_mpi.py on 8 MPI ranks (-np 8) with script arguments
# `5 16 numpy`.
# NOTE(review): the first script argument is 5 here but 2 in the 2-rank run;
# confirm the difference is intended before comparing timings.
!mpirun --allow-run-as-root -np 8 python apps/simple_ml_mpi.py 5 16 numpy

# Language Model Training

In [None]:
import sys
sys.path.append('./python')
sys.path.append('./apps')
import needle as ndl
from models import LanguageModel
from simple_ml import train_ptb, evaluate_ptb

# Device used for the language model and for the train/eval calls below.
device = ndl.msl()

# Read the corpus from data/ptb and batchify its training split (batch size 16).
# NOTE(review): batchify is handed ndl.cpu() while the model and the
# train/eval calls use `device` (ndl.msl()) -- confirm this is intentional.
corpus = ndl.data.Corpus("data/ptb")
train_data = ndl.data.batchify(
    corpus.train,
    batch_size=16,
    device=ndl.cpu(),
    dtype="float32",
)

# The second positional argument of LanguageModel is the dictionary size.
vocab_size = len(corpus.dictionary)
model = LanguageModel(
    30,
    vocab_size,
    hidden_size=10,
    num_layers=2,
    seq_model='rnn',
    device=device,
)

train_ptb(model, train_data, seq_len=1, n_epochs=1, device=device)
# NOTE(review): evaluation is run on train_data, not on a held-out split.
evaluate_ptb(model, train_data, seq_len=40, device=device)