# Test to run the experiments in the cloud

In [None]:
import mechanize
import time
from abc import ABC, abstractmethod
import pandas as pd
import paramiko
from dataclasses import dataclass
from libcloud.compute.providers import get_driver
from libcloud.compute.types import Provider
from config import config

### Configure the SSH user (e.g. Massoud, because that is the user whose home directory contains the bd folder)

You need to create a GCLOUD-ACCOUNT with an extra key, as explained here: https://libcloud.readthedocs.io/en/stable/compute/drivers/gce.html,
and set GCLOUD-KEY-PATH to the location of that key so the driver can use it. PKEY is the private RSA key, which you will find in your .ssh folder,
normally under the name gcloud-compute-platform.

In [None]:
# All settings below are read from the project-local ``config`` mapping.
SSH_USER = config['SSH_USER']  # Username that you use to connect to the server through SSH
BIGDL_USER = config["BIGDL_USER"]  # Username under which BigDL is installed on the server (its home holds the bd folder)
GCLOUD_ACCOUNT = config["GCLOUD_ACCOUNT"]  # Account identifier passed as first argument to the libcloud GCE driver
GCLOUD_KEY_PATH = config["GCLOUD_KEY_PATH"]  # The path to the Service Account Key (a JSON file)
GCLOUD_PROJECT = config["GCLOUD_PROJECT"]  # GCloud project id
PKEY = config["PKEY"]  # Path to the private RSA key file loaded for the SSH connections
DESIGN_CSV = config["DESIGN_CSV"]  # The CSV with the experiment design

In [None]:
# Look up the libcloud driver class for Google Compute Engine.
ComputeEngine = get_driver(Provider.GCE)

# Instantiate the driver with the account, key path and project read from the config.
driver = ComputeEngine(GCLOUD_ACCOUNT, GCLOUD_KEY_PATH, project=GCLOUD_PROJECT)

In [None]:
# Ask the driver for its nodes; as the last expression of the cell the result is displayed.
driver.list_nodes()

### Create the Node definitions

In [None]:


class Node(ABC):
    """Base class for a cloud node that is controlled through SSH.

    On construction the node is looked up by name through the driver, an SSH
    session is opened (with retries) and the subclass hook ``start_type`` is
    called to start whatever the concrete node type needs.
    """

    def __init__(self, driver, name, master=False, masterNode=None,
                 retries=5, retry_delay=5):
        """Look up the node, connect to it and start it.

        Args:
            driver: Object whose ``list_nodes()`` returns items exposing
                ``name``, ``public_ips`` and ``private_ips``.
            name: Name of the node to look for.
            master: True when this node is the master itself.
            masterNode: The master this node belongs to; required unless
                ``master`` is True.
            retries: How many times the SSH connection is attempted.
            retry_delay: Seconds to wait between two attempts.

        Raises:
            ValueError: A non-master node was created without a master.
            RuntimeError: The node was not found, has no IP address, or the
                SSH connection could not be opened.
        """
        print(f'Starting node with name {name}')
        self.driver = driver
        self.name = name
        if not master and masterNode is None:
            raise ValueError("Slave nodes need a master")
        self.master = masterNode
        self.connected = False
        self.running = False

        found = False
        for n in self.driver.list_nodes():
            if n.name != self.name:
                continue
            print(f'Found node {n} with name {n.name} and IPs {n.public_ips}, {n.private_ips}')
            if not n.public_ips or not n.private_ips:
                raise RuntimeError(f"Node {self.name} has no IP address assigned")
            self.public_ip = n.public_ips[0]
            self.private_ip = n.private_ips[0]
            found = True
        if not found:
            # Fail early with a clear message instead of an AttributeError on
            # the missing public_ip further down.
            raise RuntimeError(f"Node {self.name} was not found by the driver")

        for attempt in range(1, retries + 1):
            try:
                self.open_ssh()
                break
            except Exception as e:
                print(e)
                # No point in sleeping after the last attempt
                if attempt < retries:
                    time.sleep(retry_delay)
        # Only give up once every attempt has been used
        if not self.connected:
            raise RuntimeError(f"Can't connect to node {self.name}")
        self.start_type()

    def open_ssh(self):
        """Create a new SSH client and connect it to the node's public IP."""
        self.ssh = paramiko.SSHClient()
        self.ssh.load_system_host_keys()
        self.ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        self.k = paramiko.RSAKey.from_private_key_file(PKEY)
        self.ssh.connect(self.public_ip, username=SSH_USER, pkey=self.k)
        self.connected = True
        print(f'Node {self.name} connected via ssh')

    def reconnect_ssh(self):
        """ After a long time maybe the ssh closes,
        we need to restart the connection """
        self.ssh.connect(self.public_ip, username=SSH_USER, pkey=self.k)
        self.connected = True

    def __del__(self):
        self.close_ssh()

    def close_ssh(self):
        """Close the SSH session if one was ever opened."""
        self.connected = False
        # __init__ may have raised before open_ssh created the client
        ssh = getattr(self, 'ssh', None)
        if ssh is not None:
            ssh.close()

    @abstractmethod
    def start_type(self):
        """Start whatever the concrete node type needs; called once connected."""
        pass

@dataclass
class JobOptions:
    """Parameters of one training job submitted through ``MasterNode.submit``."""
    core_number: int  # passed to spark-submit as --total-executor-cores
    batch_size: int  # passed to the training script as --batchSize
    max_epochs: int  # passed to the training script as --endTriggerNum

class MasterNode(Node):
    """Node that runs the Spark master and receives the submitted jobs."""

    def start_type(self):
        """Run the Spark start-master script and print its output on error."""
        stdin, stdout, stderr = self.ssh.exec_command(
            f'/home/{BIGDL_USER}/bd/spark/sbin/start-master.sh')
        # The stream can only be consumed once, so keep the bytes around
        err = stderr.read()
        if len(err) > 0:
            print(stdout.read())
            print(err)

    def submit(self, options: JobOptions, filename, save_path: str, timeout=None, blocking=False):
        """Launch the lenet5 training job with spark-submit on this node.

        The remote stdout is redirected to ``{save_path}{filename}.log``.

        Args:
            options: Core count, batch size and epoch limit of the job.
            filename: Base name (without extension) of the remote log file.
            save_path: Remote folder prefix, including the trailing slash.
            timeout: Accepted for compatibility; currently unused.
            blocking: When True, poll every 10 seconds until no spark-submit
                process is listed any more; when False, return right after
                launching the command.
        """
        command = f"""/home/{BIGDL_USER}/bd/spark/bin/spark-submit --master spark://{self.private_ip}:7077 --driver-cores 4 \
                    --driver-memory 6G --total-executor-cores {options.core_number} --executor-cores 1 --executor-memory 3G \
                    --py-files /home/{BIGDL_USER}/bd/spark/lib/bigdl-0.11.0-python-api.zip,/home/{BIGDL_USER}/bd/mnist/lenet5.py \
                    --properties-file /home/{BIGDL_USER}/bd/spark/conf/spark-bigdl.conf \
                    --jars /home/{BIGDL_USER}/bd/spark/lib/bigdl-SPARK_2.3-0.11.0-jar-with-dependencies.jar \
                    --conf spark.driver.extraClassPath=/home/{BIGDL_USER}/bd/spark/lib/bigdl-SPARK_2.3-0.11.0-jar-with-dependencies.jar \
                    --conf spark.executor.extraClassPath=bigdl-SPARK_2.3-0.11.0-jar-with-dependencies.jar /home/{BIGDL_USER}/bd/mnist/lenet5.py \
                    --action train --dataPath /tmp/mnist --batchSize {options.batch_size} --endTriggerNum {options.max_epochs} > {save_path}{filename}.log"""

        print(command)

        self.ssh.exec_command(command)
        if not blocking:
            return

        # Check periodically whether the task is still running; the polling
        # command also keeps the ssh client alive.
        while True:
            time.sleep(10)
            _, out, _ = self.ssh.exec_command('ps aux | grep spark-submit')
            listing = out.read().decode('utf-8')
            num_lines = len(listing.split('\n'))

            # With no job running the listing splits into 3 entries:
            # - A line with the bash command
            # - A line with the grep
            # - A blank line
            #
            # If we get more than 3 we know that spark-submit is running
            if num_lines < 4:
                print('Process is finished, exiting...')
                break

        print('Master exiting...')

    def cancel(self):
        """Submit the 'app/kill/' form of the web page served on port 8080.

        When the form cannot be selected the problem is printed and nothing
        is submitted.
        """
        br = mechanize.Browser()
        br.open(f"http://{self.public_ip}:8080")

        def select_form(form):
            return form.attrs.get('action', None) == 'app/kill/'

        try:
            br.select_form(predicate=select_form)
        except mechanize._mechanize.FormNotFoundError:
            print("FormNotFoundError")
            return
        except Exception as e:
            print("An error occurred during cancelloing.")
            print(e)
            return
        # Only reached when a form is actually selected
        br.submit()

    def _keep_ssh_alive(self):
        """Send a command every 60 seconds while ``self.running`` is set.

        Meant to keep the ssh connection from expiring; nothing in this class
        calls it at the moment.
        """
        while self.running:
            print('Sending command to the server...')
            self.ssh.exec_command('ls')
            time.sleep(60)



class SlaveNode(Node):
    """Node that runs a Spark worker attached to its master node."""

    def start_type(self):
        """Run the Spark start-slave script against the master's private IP."""
        stdin, stdout, stderr = self.ssh.exec_command(f'/home/{BIGDL_USER}/bd/spark/sbin/start-slave.sh spark://{self.master.private_ip}:7077')
        # The stream can only be consumed once, so keep the bytes around
        err = stderr.read()
        if len(err) > 0:
            print(stdout.read())
            print(err)



#### Create the nodes (master and slaves)

You only have to provide the name of the node: it is looked up automatically and all the
necessary daemons are started.

In [None]:
# Connect to the master node: constructing it looks the node up by name,
# opens the SSH session and runs the Spark start-master script.

master = MasterNode(driver, 'bigdl-master-1', master=True)

In [None]:

# Connect to the slave nodes: each one opens its own SSH session and runs the
# Spark start-slave script pointing at the master's private IP.
s1 = SlaveNode(driver, 'slave-1', master=False, masterNode=master)
s2 = SlaveNode(driver, 'slave-2', master=False, masterNode=master)
s3 = SlaveNode(driver, 'slave-3', master=False, masterNode=master)

In [None]:
# Gather every node (master first) in one list so a command can be run on all of them
from typing import List

nodes: List[Node] = [master, s1, s2, s3]

#### Read the csv with the design and run all the experiments in a loop

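We iterate over the experiments, waiting for each one to complete before starting the next.
The CSV with the experiments has one row per experiment with the columns

exp-number, cpu, batch, njobs (the code reads the columns by the names `cpu`, `batch` and `njobs`, and renames the first column to `Index`)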

In [None]:
# Where the data and the logs are saved on the servers
exp_folder = f'/home/{SSH_USER}/experiments/'
script_path = f'/home/{SSH_USER}/cpu_io_stats.py'

# Available experiment designs
factorial_2k = 'experiment_designs/2k_design.csv'
full_fact = 'experiment_designs/fullfact.csv'


# Runtime of the CPU and IO capturing script in seconds
EXP_RUNTIME = 300  # 5 minutes
# Passed as max_epochs to every submitted job
EPOCHS = 15


# Read the design we're interested in; every column is parsed as an integer
exp = pd.read_csv(factorial_2k, dtype=int)
# Name the first column 'Index' through the public API instead of writing into
# the array behind ``columns``. It is kept as a regular column on purpose:
# the run loop reads it as ``row.Index``.
exp = exp.rename(columns={exp.columns[0]: 'Index'})

# Run the experiments with the largest batch first
exp = exp.sort_values(by='batch', ascending=False)
exp

### Run the experiments in a loop

In [None]:
def _run_checked(node, command, failure_message):
    """Run `command` on `node` over SSH and raise if it wrote anything to stderr."""
    _, _, stderr = node.ssh.exec_command(command)
    # The stream can only be consumed once, so keep the bytes for the message
    err = stderr.read()
    if err:
        # Raising (instead of exit) stops the cell without killing the kernel
        raise RuntimeError(
            f'{failure_message} on {node.name}: {err.decode("utf-8", errors="replace")}')


# Remove previous experiments to make room for the new (with a dialog for safety)
response = input('Removing previous experiments from the servers, continue? (y/N)')
if response.strip().lower() != 'y':
    print('stopping...')
    # quit the execution "nicely" without stopping the kernel
    raise KeyboardInterrupt

# Clean the experiments folder in all the nodes
for n in nodes:
    print('Removing previous experiments...')
    _run_checked(n, f'rm -rf {exp_folder}*', 'Error removing previous experiments')


start = time.time()

for _, row in exp.iterrows():

    # build the JobOptions with the parameters from the dataframe line
    job = JobOptions(core_number=row.cpu, batch_size=row.batch, max_epochs=EPOCHS)
    filename = f'{int(row.Index)}-cpu{row.cpu}-batch{row.batch}-njobs{row.njobs}'
    print('\n',filename)

    for n in nodes:
        print('Creating folder in', n.name)

        # Try random command and reopen the ssh shell if needed
        try:
            n.ssh.exec_command('ls')
        except Exception:
            print('Issue while connecting to node, reconnecting...')
            n.reconnect_ssh()

        # -p: also create the parent folder and tolerate an existing folder
        _run_checked(n, f'mkdir -p {exp_folder}{filename}', 'Error creating folder')

        # execute the command to get the cpu and io_wait stats
        print(f'Executing command in {n.name}...')
        n.ssh.exec_command(f"""python3 {script_path} \
        -o {exp_folder}{filename}/{filename} -m {EXP_RUNTIME} >{exp_folder}{filename}/{filename}-data.out 2>&1""")

    njobs = row.njobs
    # submit job to the master
    print(f'Submitting {njobs} job(s) to the master with batch {row.batch} and {row.cpu} cores')

    try:
        for i in range(njobs):
            # Only block after submitting all of the tasks
            blocking = i >= (njobs - 1)
            print(f'Task number {i}, blocking = {blocking}')
            master.submit(job, save_path=f'{exp_folder}{filename}/', filename=filename, timeout=200, blocking=blocking)
    except Exception as e:
        print('Error in the command:', e)
        master.cancel()


print(f'\nExperiments finished after {(time.time()-start)/60} minutes')


#### Run the experiment and the script

You need to set the account and the location of the cpu-io script so it runs.
You also must define the number of seconds of runtime of the script and your experiment folder.

The script is run with ```python3 script -o [output_file] -m [max_seconds] -i [interval]```

- `output_file` is a pickle file in the experiments folder.
- `max_seconds` is the runtime of the script; it should be less than the training time of the experiment.
- `interval` is the interval at which the measurements are taken.


In [None]:
exp_folder = f'/home/{SSH_USER}/experiments/'
script_path = f'/home/{SSH_USER}/cpu_io_stats.py'
exp_name = 'example'

# Runtime of the CPU and IO capturing script in seconds
exp_runtime = 100

# Submit a job with batch size 256 and 1 epoch for testing.
# NOTE(review): JobOptions also requires core_number, which was missing here;
# 4 is a placeholder value - confirm the intended core count.
opt = JobOptions(core_number=4, batch_size=256, max_epochs=1)

# Save the output under the experiments folder; submit() appends ".log" to the
# file name, so the log ends up in <exp_folder>test.out.log
master.submit(opt, filename='test.out', save_path=exp_folder, timeout=200)

# Start the script to gather the cpu and io_wait usage
for n in nodes:
    # execute the command to get the cpu and io_wait stats
    print(f'Executing command in {n.name}...')
    n.ssh.exec_command(f"""python3 {script_path} -o {exp_folder}{exp_name} -m {exp_runtime} > {exp_folder}{exp_name}-data.out""")
