# Test to run the experiments in the cloud

In [17]:
import mechanize
import numpy
import os
import queue
import random
import shutil
import socket
import string
import threading
import time
import urllib.request
from abc import ABC, abstractmethod
import pandas as pd
import libcloud
import paramiko
from dataclasses import dataclass
from libcloud.compute.providers import get_driver
from libcloud.compute.types import Provider
from paramiko.buffered_pipe import PipeTimeout

### Configure the SSH user (Massoud, because that is the user whose home contains the bd folder)

You need to create a service account (`GCLOUD_ACCOUNT`) with an extra key, as explained here: https://libcloud.readthedocs.io/en/stable/compute/drivers/gce.html
and set `GCLOUD_KEY_PATH` to the location of that key so the driver can use it. The private RSA key referenced by `PKEY` can be found in your .ssh folder,
normally with the name gcloud-compute-platform

In [2]:
# Remote user whose home directory holds the bd/ folder; used to build the
# /home/<SSH_USER>/bd/... paths of the Spark scripts run on the nodes.
SSH_USER = 'am72ghiassi'
# Service account identifier handed to the libcloud GCE driver
GCLOUD_ACCOUNT = 'libcloud@qpe-big-dl.iam.gserviceaccount.com'
GCLOUD_KEY_PATH = './qpe.json'  # The path to the Service Account Key (a JSON file)
GCLOUD_PROJECT = 'qpe-big-dl'  # GCloud project id
# Private RSA key file that the nodes load to authenticate their SSH sessions
PKEY = './google_compute_engine'
DESIGN_CSV = ''  # The CSV with the experiment design

In [3]:
# Look up the libcloud driver class for Google Compute Engine
ComputeEngine = get_driver(Provider.GCE)

# Driver instance bound to the service account, its key file and the project;
# the Node classes use it to list the instances and find their IPs.
driver = ComputeEngine(GCLOUD_ACCOUNT, GCLOUD_KEY_PATH, project=GCLOUD_PROJECT)


### Create the Node definitions

In [32]:

class Node(ABC):
    """Base class for a cloud instance that is driven over SSH.

    On construction the instance is looked up by name through the libcloud
    driver, an SSH session is opened (with retries) and the subclass hook
    ``start_type`` is called to start the role-specific daemon.
    """

    def __init__(self, driver, name, master=False, masterNode=None,
                 retries=5, retry_delay=5):
        """Locate the instance, connect to it and start its daemon.

        Args:
            driver: libcloud compute driver used to list the instances.
            name: name of the instance to attach to.
            master: True when this node is the master itself.
            masterNode: master this node belongs to; required unless
                ``master`` is True.
            retries: maximum number of SSH connection attempts.
            retry_delay: seconds to wait between two failed attempts.

        Raises:
            ValueError: a non-master node was created without a master.
            RuntimeError: the instance was not found, or no SSH connection
                could be established within ``retries`` attempts.
        """
        print(f'Starting node with name {name}')
        self.driver = driver
        self.name = name
        if not master and masterNode is None:
            raise ValueError("Slave nodes need a master")
        self.master = masterNode
        self.connected = False

        found = False
        for n in self.driver.list_nodes():
            if n.name == self.name:
                print(f'Found node {n} with name {n.name} and IPs {n.public_ips}, {n.private_ips}')
                self.public_ip = n.public_ips[0]
                self.private_ip = n.private_ips[0]
                found = True
        if not found:
            # Fail early with a clear message instead of a confusing
            # AttributeError on public_ip during the SSH attempts.
            raise RuntimeError(f"No node named {self.name} was found")

        last_error = None
        for attempt in range(1, retries + 1):
            try:
                self.open_ssh()
                break
            except Exception as e:
                # Best-effort retry: report the failure and try again
                last_error = e
                print(e)
                if attempt < retries:
                    time.sleep(retry_delay)
        # Only give up once every attempt has been used
        if not self.connected:
            raise RuntimeError(f"Can't connect to node {self.name}") from last_error
        self.start_type()

    def open_ssh(self):
        """Open the SSH session to the node's public IP using the PKEY key."""
        self.ssh = paramiko.SSHClient()
        self.ssh.load_system_host_keys()
        self.ssh.set_missing_host_key_policy(paramiko.WarningPolicy())
        self.k = paramiko.RSAKey.from_private_key_file(PKEY)
        # NOTE(review): the login user is hard-coded here and differs from
        # SSH_USER, which is only used to build remote paths - confirm.
        self.ssh.connect(self.public_ip, username='diego', pkey=self.k)
        self.connected = True
        print(f'Node {self.name} connected via ssh')

    def __del__(self):
        self.close_ssh()

    def close_ssh(self):
        """Close the SSH session, if one was ever opened."""
        self.connected = False
        # The client does not exist when construction failed before open_ssh
        ssh = getattr(self, 'ssh', None)
        if ssh is not None:
            ssh.close()

    @abstractmethod
    def start_type(self):
        """Start the daemon that corresponds to the node's role."""
        pass

@dataclass
class JobOptions:
    """Training parameters of a single experiment run."""
    # Forwarded to the training script as --batchSize
    batch_size: int
    # Forwarded to the training script as --endTriggerNum
    max_epochs: int

class MasterNode(Node):
    """Node that runs the Spark master and submits or cancels jobs on it."""

    def start_type(self):
        """Start the Spark master daemon through start-master.sh."""
        _, stdout, stderr = self.ssh.exec_command(
            f'/home/{SSH_USER}/bd/spark/sbin/start-master.sh')
        # A stream can only be consumed once: keep the bytes so the text
        # that is printed is the same text that was tested.
        err = stderr.read()
        if err:
            print(stdout.read())
            print(err)

    def submit(self, options: JobOptions, filename, timeout=None, blocking=False):
        """Submit the lenet5 training job to the Spark master.

        Args:
            options: batch size and number of epochs of the run.
            filename: prefix of the remote log file; the command output is
                redirected to ``{filename}.log``.
            timeout: accepted for interface compatibility; it is not applied
                to the remote command.
                NOTE(review): decide whether it should become the channel
                timeout of exec_command.
            blocking: when True, wait for the command to finish and print
                its output if anything was written to stderr.
        """
        command = f"""/home/{SSH_USER}/bd/spark/bin/spark-submit --master spark://{self.private_ip}:7077 --driver-cores 1 \
                    --driver-memory 1G --total-executor-cores 2 --executor-cores 1 --executor-memory 1G \
                    --py-files /home/{SSH_USER}/bd/spark/lib/bigdl-0.11.0-python-api.zip,/home/{SSH_USER}/bd/mnist/lenet5.py \
                    --properties-file /home/{SSH_USER}/bd/spark/conf/spark-bigdl.conf \
                    --jars /home/{SSH_USER}/bd/spark/lib/bigdl-SPARK_2.3-0.11.0-jar-with-dependencies.jar \
                    --conf spark.driver.extraClassPath=/home/{SSH_USER}/bd/spark/lib/bigdl-SPARK_2.3-0.11.0-jar-with-dependencies.jar \
                    --conf spark.executor.extraClassPath=bigdl-SPARK_2.3-0.11.0-jar-with-dependencies.jar /home/{SSH_USER}/bd/mnist/lenet5.py \
                    --action train --dataPath /tmp/mnist --batchSize {options.batch_size} --endTriggerNum {options.max_epochs} > {filename}.log"""

        print(command)

        if not blocking:
            # Fire and forget: the job keeps running on the remote side
            self.ssh.exec_command(command)
            return

        _, stdout, stderr = self.ssh.exec_command(command)

        # Reading until EOF is what makes this call wait for the job.
        # stderr is drained first, and only once, so it can be printed.
        err = stderr.read()
        if err:
            print(f'There were some errors running the experiment {filename}')
            print(stdout.read())
            print(err)

    def cancel(self):
        """Submit the 'app/kill/' form of the master web UI on port 8080."""
        br = mechanize.Browser()
        br.open(f"http://{self.public_ip}:8080")

        def select_form(form):
            return form.attrs.get('action', None) == 'app/kill/'

        try:
            br.select_form(predicate=select_form)
        except mechanize._mechanize.FormNotFoundError:
            # No kill form on the page, so there is nothing to submit
            print("FormNotFoundError")
            return
        except Exception as e:
            print("An error occurred during cancelloing.")
            print(e)
            return
        # Only reached when the kill form was actually selected
        br.submit()


class SlaveNode(Node):
    """Node that runs a Spark worker attached to its master node."""

    def start_type(self):
        """Start the worker daemon and register it with the master on port 7077."""
        _, stdout, stderr = self.ssh.exec_command(f'/home/{SSH_USER}/bd/spark/sbin/start-slave.sh spark://{self.master.private_ip}:7077')
        # A stream can only be consumed once: keep the bytes so the text
        # that is printed is the same text that was tested.
        err = stderr.read()
        if err:
            print(stdout.read())
            print(err)

#### Create the nodes (master and slaves)

You only have to provide the name of the node: it is looked up automatically and all the
necessary daemons are started on it.

In [34]:
# Try to connect to the master node: the instance named 'bigdl' is looked up,
# an SSH session is opened and the Spark master daemon is started on it.
master = MasterNode(driver, 'bigdl', master=True)

Starting node with name bigdl
Found node <Node: uuid=cca95b2c0671f4fddfc8dd6f0e127b94cd440f20, name=bigdl, state=RUNNING, public_ips=['34.68.44.195'], private_ips=['10.128.0.4'], provider=Google Compute Engine ...> with name bigdl and IPs ['34.68.44.195'], ['10.128.0.4']
Node bigdl connected via ssh


  key.get_name(), hostname, hexlify(key.get_fingerprint())


In [35]:

# Try to create the other slaves: each one connects to its own instance and
# starts a Spark worker that registers with the master created above.
s1 = SlaveNode(driver, 'instance-1', master=False, masterNode=master)
s2 = SlaveNode(driver, 'instance-2', master=False, masterNode=master)


Starting node with name instance-1
Found node <Node: uuid=2f0b3b48e49b0ef04c01e71aaa9ef20d8388451b, name=instance-1, state=RUNNING, public_ips=['34.123.237.102'], private_ips=['10.128.0.5'], provider=Google Compute Engine ...> with name instance-1 and IPs ['34.123.237.102'], ['10.128.0.5']
Node instance-1 connected via ssh
Starting node with name instance-2
Found node <Node: uuid=15d8728539302907d0a183542c07f7f925cbbb5f, name=instance-2, state=RUNNING, public_ips=['35.238.147.71'], private_ips=['10.128.0.6'], provider=Google Compute Engine ...> with name instance-2 and IPs ['35.238.147.71'], ['10.128.0.6']
Node instance-2 connected via ssh


  key.get_name(), hostname, hexlify(key.get_fingerprint())
  key.get_name(), hostname, hexlify(key.get_fingerprint())


In [36]:
# Keep every node in a single list so a command can be run on all of them
from typing import List

nodes: List[Node] = [master, s1, s2]

#### Read the csv with the design and run all the experiments in a loop

We need to iterate over the experiments, waiting for each one to complete before starting the next.

The format of the CSV with the experiments is shown below (the loop reads the `batch` and `epochs` columns by name):

exp-number, batch-size, executor-cores, epochs

In [38]:
# Declare the parameters used to save the data and the logs on the nodes
user = 'diego'
exp_folder = f'/home/{user}/experiments/'
script_path = f'/home/{user}/cpu_io_stats.py'


# Runtime of the CPU and IO capturing script in seconds
exp_runtime = 30

# Use the configured design CSV when one is set, else the default file
exp = pd.read_csv(DESIGN_CSV or 'experiments.csv')
# Name the first column 'Index' through the public API instead of mutating
# the column index in place. It stays a regular column because the loop
# reads it as row.Index.
exp = exp.rename(columns={exp.columns[0]: 'Index'})


start = time.time()

for idx, row in exp.iterrows():

    # iterrows() does not preserve dtypes across a row, so cast explicitly
    # to keep '32.0' out of the file name and the command line.
    batch = int(row.batch)
    epochs = int(row.epochs)

    # Build the JobOptions
    job = JobOptions(batch, epochs)
    filename = f'{int(row.Index)}-batch{batch}-epochs{epochs}'
    print(filename)

    # Start the CPU and io_wait capturing script on every node
    for n in nodes:
        print(f'Executing command in {n.name}...')
        n.ssh.exec_command(
            f'python3 {script_path} -o {exp_folder}{filename} '
            f'-m {exp_runtime} > {exp_folder}{filename}-data.out')

    # Submit the job and wait for it, so the experiments run one at a time
    print(f'Submitting job to the master with batch {batch} and {epochs} epochs')
    try:
        master.submit(job, filename=filename, timeout=200, blocking=True)
    except Exception as e:
        # Keep going with the remaining experiments if one of them fails
        print('Error in the command:', e)


print(f'Experiments finished after {(time.time()-start)/60} minutes')


0-batch32-epochs1
Executing command in bigdl...
Executing command in instance-1...
Executing command in instance-2...
Submitting job to the master with batch 32 and 1 epochs
/home/am72ghiassi/bd/spark/bin/spark-submit --master spark://10.128.0.4:7077 --driver-cores 1                     --driver-memory 1G --total-executor-cores 2 --executor-cores 1 --executor-memory 1G                     --py-files /home/am72ghiassi/bd/spark/lib/bigdl-0.11.0-python-api.zip,/home/am72ghiassi/bd/mnist/lenet5.py                     --properties-file /home/am72ghiassi/bd/spark/conf/spark-bigdl.conf                     --jars /home/am72ghiassi/bd/spark/lib/bigdl-SPARK_2.3-0.11.0-jar-with-dependencies.jar                     --conf spark.driver.extraClassPath=/home/am72ghiassi/bd/spark/lib/bigdl-SPARK_2.3-0.11.0-jar-with-dependencies.jar                     --conf spark.executer.extraClassPath=bigdl-SPARK_2.3-0.11.0-jar-with-dependencies.jar /home/am72ghiassi/bd/mnist/lenet5.py                     --action 

#### Run the experiment and the script

You need to set the user account and the location of the cpu-io script so that it can run.
You must also define how many seconds the script should run and your experiment folder.

The script is run with ```python3 script -o [output_file] -m [max_seconds] -i [interval]```

- output file is a pickle file in the experiments folder
- max seconds is the run_time of the script. Should be less than the training time of the experiment
- interval at which the measurements are taken.

In [None]:

user = 'diego'
exp_folder = f'/home/{user}/experiments/'
script_path = f'/home/{user}/cpu_io_stats.py'
exp_name = 'example'

# Runtime of the CPU and IO capturing script in seconds
exp_runtime = 100

# Submit a job with batch size 256 and 1 epoch for testing
opt = JobOptions(256, 1)

# Save the output under the experiment folder (submit() appends '.log' to
# this name). The call is non-blocking, so the capture below starts while
# the job is running.
master.submit(opt, f'{exp_folder}test.out', 200)

# Start the script to gather the cpu and io_wait usage
for n in nodes:
    # execute the command to get the cpu and io_wait stats
    print(f'Executing command in {n.name}...')
    n.ssh.exec_command(f"""python3 {script_path} -o {exp_folder}{exp_name} -m {exp_runtime} > {exp_folder}{exp_name}-data.out""")
