# Test to run the experiments in the cloud

In [1]:
import mechanize
import numpy
import os
import queue
import random

import shutil
import socket
import string
import threading
import time
import urllib.request
from abc import ABC, abstractmethod
import pandas as pd
import libcloud
import paramiko
from dataclasses import dataclass
from libcloud.compute.providers import get_driver
from libcloud.compute.types import Provider
from paramiko.buffered_pipe import PipeTimeout

### Configure the SSH user (Massoud's account, because that is the user whose home directory contains the bd folder)

You need to create a Google Cloud service account (`GCLOUD_ACCOUNT`) with an extra key, as explained here: https://libcloud.readthedocs.io/en/stable/compute/drivers/gce.html,
and point `GCLOUD_KEY_PATH` to that key file so the driver can use it. `PKEY` is the path to your private RSA key, which you will find in your .ssh folder,
normally under the name google_compute_engine.

In [60]:
# Account whose home directory holds the bd folder; used to build the
# /home/{SSH_USER}/bd/... paths of the Spark scripts and jars
SSH_USER = 'am72ghiassi'
# Service account passed as the first argument to the GCE driver
GCLOUD_ACCOUNT = 'libcloud@qpe-big-dl.iam.gserviceaccount.com'
GCLOUD_KEY_PATH = './qpe.json'  # The path to the Service Account Key (a JSON file)
GCLOUD_PROJECT = 'qpe-big-dl'  # GCloud project id
# Private RSA key file loaded by paramiko to authenticate the SSH sessions
PKEY = './google_compute_engine'
DESIGN_CSV = ''  # The CSV with the experiment design

# Our own user: the SSH login name and the owner of the experiments folder
USER ='diego'

In [38]:
# Resolve libcloud's Google Compute Engine driver class
ComputeEngine = get_driver(Provider.GCE)

# Instantiate the driver with the service account, its key file and the project id
driver = ComputeEngine(GCLOUD_ACCOUNT, GCLOUD_KEY_PATH, project=GCLOUD_PROJECT)

In [39]:
# List the nodes visible to the driver (displayed as the cell output)
driver.list_nodes()

[<Node: uuid=cca95b2c0671f4fddfc8dd6f0e127b94cd440f20, name=bigdl, state=STOPPED, public_ips=[None], private_ips=['10.128.0.4'], provider=Google Compute Engine ...>,
 <Node: uuid=3124f0c0c1eaf12f9aeef33c7d532c7952535b20, name=bigdl-master-1, state=RUNNING, public_ips=['34.67.204.85'], private_ips=['10.128.0.11'], provider=Google Compute Engine ...>,
 <Node: uuid=68c98a4af0f1a83c44031a02bdad5503b94447b3, name=slave-1, state=RUNNING, public_ips=['35.239.159.190'], private_ips=['10.128.0.15'], provider=Google Compute Engine ...>,
 <Node: uuid=412cc4e5e482b0e7ff44f0c276b07adec8c6aa20, name=slave-2, state=RUNNING, public_ips=['34.123.251.206'], private_ips=['10.128.0.16'], provider=Google Compute Engine ...>,
 <Node: uuid=587a4159388cbe73941e162834b68aae06c9af12, name=slave-3, state=RUNNING, public_ips=['35.224.46.25'], private_ips=['10.128.0.17'], provider=Google Compute Engine ...>]

### Create the Node definitions

In [92]:


class Node(ABC):
    """Base class for a cluster VM that is controlled over SSH.

    On construction the node is looked up by name in the driver's node list,
    an SSH session is opened (with retries) and the subclass hook
    ``start_type`` is invoked.
    """

    def __init__(self, driver, name, master=False, masterNode=None,
                 retries=5, retry_delay=5):
        """Look the node up, connect to it over SSH and start its daemon.

        Args:
            driver: object exposing ``list_nodes()``; every returned item
                needs ``name``, ``public_ips`` and ``private_ips``.
            name: name of the node to look for in ``driver.list_nodes()``.
            master: True when this node is the master itself.
            masterNode: the master this node belongs to; mandatory unless
                ``master`` is True.
            retries: maximum number of SSH connection attempts.
            retry_delay: seconds to wait between two attempts.

        Raises:
            ValueError: a non-master node was created without ``masterNode``.
            RuntimeError: the node name is unknown to the driver, or every
                SSH connection attempt failed.
        """
        print(f'Starting node with name {name}')
        self.driver = driver
        self.name = name
        if not master and masterNode is None:
            raise ValueError("Slave nodes need a master")
        self.master = masterNode
        self.connected = False
        self.running = False
        self.public_ip = None
        self.private_ip = None

        found = False
        for n in self.driver.list_nodes():
            if n.name == self.name:
                print(f'Found node {n} with name {n.name} and IPs {n.public_ips}, {n.private_ips}')
                self.public_ip = n.public_ips[0]
                self.private_ip = n.private_ips[0]
                found = True
        if not found:
            # Fail early with a clear message instead of an AttributeError
            # on the missing IP further down.
            raise RuntimeError(f"Node {self.name} not found by the driver")

        last_error = None
        for attempt in range(1, retries + 1):
            try:
                self.open_ssh()
                break
            except Exception as e:
                last_error = e
                print(e)
                if attempt < retries:
                    time.sleep(retry_delay)
        else:
            # Reached only when no attempt succeeded (the loop never hit `break`).
            raise RuntimeError(f"Can't connect to node {self.name}") from last_error
        self.start_type()

    def open_ssh(self):
        """Open a new SSH session to the node's public IP using the PKEY key file."""
        self.ssh = paramiko.SSHClient()
        self.ssh.load_system_host_keys()
        self.ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        self.k = paramiko.RSAKey.from_private_key_file(PKEY)
        self.ssh.connect(self.public_ip, username=USER, pkey=self.k)
        self.connected = True
        print(f'Node {self.name} connected via ssh')

    def reconnect_ssh(self):
        """ After a long time maybe the ssh closes,
        we need to restart the connection """
        self.ssh.connect(self.public_ip, username=USER, pkey=self.k)
        self.connected = True

    def __del__(self):
        self.close_ssh()

    def close_ssh(self):
        """Close the SSH session, if one was ever opened."""
        self.connected = False
        # `ssh` does not exist when __init__ failed before open_ssh() ran.
        ssh = getattr(self, 'ssh', None)
        if ssh is not None:
            ssh.close()

    @abstractmethod
    def start_type(self):
        """Start the daemon that corresponds to the concrete node type."""
        pass

@dataclass
class JobOptions:
    """Tunable parameters of one training job submitted with spark-submit."""
    # Value given to --total-executor-cores
    core_number: int
    # Value given to the training script's --batchSize
    batch_size: int
    # Value given to the training script's --endTriggerNum
    max_epochs: int

class MasterNode(Node):
    """Node that runs the Spark master and through which jobs are submitted."""

    def start_type(self):
        """Launch the Spark master daemon with start-master.sh."""
        stdin, stdout, stderr = self.ssh.exec_command(
            f'/home/{SSH_USER}/bd/spark/sbin/start-master.sh')
        # A channel stream can only be consumed once: keep the bytes so the
        # same content can be tested and printed.
        err = stderr.read()
        if len(err) > 0:
            print(stdout.read())
            print(err)

    def submit(self, options: JobOptions, filename, save_path:str, timeout=None, blocking=False):
        """Submit the lenet5 training job to the Spark master.

        The job's stdout is redirected on the remote machine to
        ``{save_path}{filename}.log``.

        Args:
            options: cores, batch size and number of epochs of the job.
            filename: base name (without extension) of the remote log file.
            save_path: remote folder, including the trailing slash.
            timeout: accepted for backward compatibility; currently unused.
            blocking: when True, wait for the command to finish and print
                its output if anything was written to stderr.
        """
        command = f"""/home/{SSH_USER}/bd/spark/bin/spark-submit --master spark://{self.private_ip}:7077 --driver-cores 4 \
                    --driver-memory 6G --total-executor-cores {options.core_number} --executor-cores 1 --executor-memory 3G \
                    --py-files /home/{SSH_USER}/bd/spark/lib/bigdl-0.11.0-python-api.zip,/home/{SSH_USER}/bd/mnist/lenet5.py \
                    --properties-file /home/{SSH_USER}/bd/spark/conf/spark-bigdl.conf \
                    --jars /home/{SSH_USER}/bd/spark/lib/bigdl-SPARK_2.3-0.11.0-jar-with-dependencies.jar \
                    --conf spark.driver.extraClassPath=/home/{SSH_USER}/bd/spark/lib/bigdl-SPARK_2.3-0.11.0-jar-with-dependencies.jar \
                    --conf spark.executor.extraClassPath=bigdl-SPARK_2.3-0.11.0-jar-with-dependencies.jar /home/{SSH_USER}/bd/mnist/lenet5.py \
                    --action train --dataPath /tmp/mnist --batchSize {options.batch_size} --endTriggerNum {options.max_epochs} > {save_path}{filename}.log"""

        print(command)

        # Thread to keep the ssh alive. The bound method itself is the target
        # (calling it here would run it in this thread and hand Thread a None).
        stop_keep_alive = threading.Event()
        self.running = True
        keep_alive_thread = threading.Thread(
            target=self._keep_ssh_alive, args=(stop_keep_alive,), daemon=True)
        keep_alive_thread.start()

        try:
            # Get the stdout and err out in case we want the command to run blocking
            if not blocking:
                self.ssh.exec_command(command)
            else:
                stdin, stdout, stderr = self.ssh.exec_command(command)

                # NOTE(review): anything on stderr is reported as an error;
                # confirm that successful runs keep stderr empty.
                err = stderr.read()
                if len(err) > 0:
                    print(f'There were some errors running the experiment {filename}')
                    print(stdout.read())
                    print(err)
        finally:
            # Always stop the keep-alive loop, even if the command raised.
            self.running = False
            stop_keep_alive.set()

    def cancel(self):
        """Submit the 'app/kill/' form found on the master's web page (port 8080)."""
        br = mechanize.Browser()
        br.open(f"http://{self.public_ip}:8080")

        def select_form(form):
            return form.attrs.get('action', None) == 'app/kill/'
        try:
            br.select_form(predicate=select_form)
        except mechanize._mechanize.FormNotFoundError:
            print("FormNotFoundError")
            return
        except Exception as e:
            print("An error occurred during cancelloing.")
            print(e)
            return
        # Only reached when a kill form was actually selected.
        br.submit()

    def _keep_ssh_alive(self, stop_event=None, interval=60):
        """Send a trivial command every `interval` seconds while a job is running.

        The loop ends when ``self.running`` becomes False or ``stop_event``
        is set; waiting on the event lets ``submit`` end it without delay.
        """
        if stop_event is None:
            stop_event = threading.Event()
        while self.running and not stop_event.wait(interval):
            print('Sending command to the server...')
            try:
                self.ssh.exec_command('ls')
            except Exception as e:
                # Nothing more can be done from this helper thread.
                print(e)
                return



class SlaveNode(Node):
    """Node that runs a Spark worker attached to its master node."""

    def start_type(self):
        """Launch the worker with start-slave.sh, pointing it at the master's private IP."""
        stdin, stdout, stderr = self.ssh.exec_command(f'/home/{SSH_USER}/bd/spark/sbin/start-slave.sh spark://{self.master.private_ip}:7077')
        # The stream can only be read once, so keep the bytes for printing.
        err = stderr.read()
        if len(err) > 0:
            print(stdout.read())
            print(err)



#### Create the nodes (master and slaves)

You only have to provide the name of the node: it is looked up automatically and all the
necessary daemons are started.

In [94]:
# Try to connect to the master node: constructing MasterNode opens the SSH
# session and runs its start_type(), which launches start-master.sh
master = MasterNode(driver, 'bigdl-master-1', master=True)

Starting node with name bigdl-master-1
Found node <Node: uuid=3124f0c0c1eaf12f9aeef33c7d532c7952535b20, name=bigdl-master-1, state=RUNNING, public_ips=['34.67.204.85'], private_ips=['10.128.0.11'], provider=Google Compute Engine ...> with name bigdl-master-1 and IPs ['34.67.204.85'], ['10.128.0.11']
Node bigdl-master-1 connected via ssh


In [95]:

# Bring up the three workers, in order, each one attached to the master
s1, s2, s3 = [
    SlaveNode(driver, worker_name, master=False, masterNode=master)
    for worker_name in ('slave-1', 'slave-2', 'slave-3')
]

Starting node with name slave-1
Found node <Node: uuid=68c98a4af0f1a83c44031a02bdad5503b94447b3, name=slave-1, state=RUNNING, public_ips=['35.239.159.190'], private_ips=['10.128.0.15'], provider=Google Compute Engine ...> with name slave-1 and IPs ['35.239.159.190'], ['10.128.0.15']
Node slave-1 connected via ssh
Starting node with name slave-2
Found node <Node: uuid=412cc4e5e482b0e7ff44f0c276b07adec8c6aa20, name=slave-2, state=RUNNING, public_ips=['34.123.251.206'], private_ips=['10.128.0.16'], provider=Google Compute Engine ...> with name slave-2 and IPs ['34.123.251.206'], ['10.128.0.16']
Node slave-2 connected via ssh
Starting node with name slave-3
Found node <Node: uuid=587a4159388cbe73941e162834b68aae06c9af12, name=slave-3, state=RUNNING, public_ips=['35.224.46.25'], private_ips=['10.128.0.17'], provider=Google Compute Engine ...> with name slave-3 and IPs ['35.224.46.25'], ['10.128.0.17']
Node slave-3 connected via ssh


In [96]:
# Gather every node in one list so commands can be broadcast with a simple loop
from typing import List
nodes: List[Node] = [master, s1, s2, s3]

#### Read the csv with the design and run all the experiments in a loop

We need to iterate over the experiments and wait for the previous one to complete before starting the next.
The format of the CSV with the experiments is:

exp-number, cpus, batch-size, njobs

In [97]:
# Remote locations of the experiment output folder and of the CPU/IO capture script
exp_folder = f'/home/{USER}/experiments/'
script_path = f'/home/{USER}/cpu_io_stats.py'

# experiment designs
factorial_2k = 'experiment_designs/2k_design.csv'
full_fact = 'experiment_designs/fullfact.csv'


# Runtime of the CPU and IO capturing script in seconds
EXP_RUNTIME = 5 * 60  # 5 minutes runtime by default
EPOCHS = 15  # number of training epochs of every job


# Read the design we're interested in and name its first column 'Index'.
# 'Index' stays a regular column because the experiment loop reads row.Index.
exp = pd.read_csv(factorial_2k, dtype=int)
exp = exp.rename(columns={exp.columns[0]: 'Index'})

exp

Unnamed: 0,Index,cpu,batch,njobs
0,0,1,64,1
1,1,8,64,1
2,2,1,1024,1
3,3,8,1024,1
4,4,1,64,5
5,5,8,64,5
6,6,1,1024,5
7,7,8,1024,5


### Run the experiments in a loop

In [98]:
# Remove previous experiments to make room for the new (with a dialog for safety)
response = str(input('Removing previous experiments from the servers, continue? (y/N)'))
if response.lower() != 'y':
    print('stopping...')
    # quit the execution "nicely" without stopping the kernel
    raise KeyboardInterrupt


def _run_checked(node, command, error_message):
    """Run `command` on `node` over SSH and raise if it wrote anything to stderr."""
    _, _, stderr = node.ssh.exec_command(command)
    # The stream can only be consumed once: keep the bytes for the report.
    err = stderr.read()
    if len(err) != 0:
        print(error_message, err)
        # Raise instead of exit(): this stops the cell without killing the kernel.
        raise RuntimeError(f'{error_message} on node {node.name}: {err!r}')


# Clean the experiments folder on every node
for n in nodes:
    print('Removing previous experiments...')
    _run_checked(n, f'rm -rf {exp_folder}*', 'Error removing previous experiments')


start = time.time()

for idx, row in exp.iterrows():

    # build the JobOptions with EPOCHS epochs
    job = JobOptions(core_number=row.cpu, batch_size=row.batch, max_epochs=EPOCHS)
    filename = f'{int(row.Index)}-cpu{row.cpu}-batch{row.batch}-njobs{row.njobs}'
    print('\n',filename)

    for n in nodes:
        # Try random command and reopen the ssh shell if needed
        try:
            n.ssh.exec_command('ls')
        except Exception as e:
            print('Issue while connecting to node, reconnecting...', e)
            n.reconnect_ssh()

        # -p also creates the experiments folder itself when it is missing
        _run_checked(n, f'mkdir -p {exp_folder}{filename}', 'Error creating folder')

        # execute the command to get the cpu and io_wait stats (not awaited)
        n.ssh.exec_command(f"""python3 {script_path} \
        -o {exp_folder}{filename}/{filename} -m {EXP_RUNTIME} >{exp_folder}{filename}/{filename}-data.out 2>&1""")


    njobs = int(row.njobs)
    # submit job to the master
    print(f'Submitting {njobs} job(s) to the master with batch {row.batch} and {row.cpu} cores')

    try:
        for i in range(njobs):
            # Only block after submitting all of the tasks
            blocking = i == njobs - 1
            print(f'Task number {i}, blocking = {blocking}')
            master.submit(job, save_path = f'{exp_folder}{filename}/', filename= filename, timeout=200, blocking=blocking)
    except Exception as e:
        print('Error in the command:', e)
        master.cancel()


print(f'\nExperiments finished after {(time.time()-start)/60} minutes')


Removing previous experiments...
Removing previous experiments...
Removing previous experiments...
Removing previous experiments...

 0-cpu1-batch64-njobs1
Submitting 1 job(s) to the master with batch 64 and 1 cores
Task number 0, blocking = True
/home/am72ghiassi/bd/spark/bin/spark-submit --master spark://10.128.0.11:7077 --driver-cores 4                     --driver-memory 6G --total-executor-cores 1 --executor-cores 1 --executor-memory 3G                     --py-files /home/am72ghiassi/bd/spark/lib/bigdl-0.11.0-python-api.zip,/home/am72ghiassi/bd/mnist/lenet5.py                     --properties-file /home/am72ghiassi/bd/spark/conf/spark-bigdl.conf                     --jars /home/am72ghiassi/bd/spark/lib/bigdl-SPARK_2.3-0.11.0-jar-with-dependencies.jar                     --conf spark.driver.extraClassPath=/home/am72ghiassi/bd/spark/lib/bigdl-SPARK_2.3-0.11.0-jar-with-dependencies.jar                     --conf spark.executer.extraClassPath=bigdl-SPARK_2.3-0.11.0-jar-with-dependenc

KeyboardInterrupt: 

#### Run the experiment and the script

You need to set the account and the location of the cpu-io script so it runs.
You also must define the number of seconds of runtime of the script and your experiment folder.

The script is run with ```python3 script -o [output_file] -m [max_seconds] -i [interval]```

- `output_file` is a pickle file in the experiments folder.
- `max_seconds` is the runtime of the script; it should be less than the training time of the experiment.
- `interval` is the interval at which the measurements are taken.


In [None]:

exp_folder = f'/home/{USER}/experiments/'
script_path = f'/home/{USER}/cpu_io_stats.py'
exp_name = 'example'

# Runtime of the CPU and IO capturing script in seconds
exp_runtime = 100

# Submit a job with batch size 256 and 1 epoch for testing.
# NOTE(review): JobOptions requires core_number; 1 core is an assumption
# made for this smoke test - adjust as needed.
opt = JobOptions(core_number=1, batch_size=256, max_epochs=1)

# submit() appends ".log" to save_path + filename, so the output is saved
# to {exp_folder}test.log on the master
master.submit(opt, filename='test', save_path=exp_folder, timeout=200)

# Start the script to gather the cpu and io_wait usage
for n in nodes:
    # execute the command to get the cpu and io_wait stats
    print(f'Executing command in {n.name}...')
    n.ssh.exec_command(f"""python3 {script_path} -o {exp_folder}{exp_name} -m {exp_runtime} > {exp_folder}{exp_name}-data.out""")
