In [1]:
#sf api setup
import time
import asyncio
import logging
from IPython.display import clear_output

from os.path import join

from sfapi_client         import Client, AsyncClient
from sfapi_client.compute import Machine
from sfapi_client.jobs    import JobState

from sfapi_connector import KeyManager, OsSFAPI, OsWrapper, LOGGER
import json
import re
import subprocess
import webbrowser



In [4]:
# sf api send jobscript



target = "./sfapi_test"


job_script = f"""#!/bin/bash
#SBATCH -q debug
#SBATCH -A m669
#SBATCH -N 2
#SBATCH -n 64              # Number of tasks (64 tasks, 32 per node)
#SBATCH -C cpu
#SBATCH -t 00:30:00
#SBATCH -J sfapi-demo
#SBATCH --exclusive
#SBATCH --output={target}/sfapi-demo-%j.out
#SBATCH --error={target}/sfapi-demo-%j.error

# Print each command for debugging
set -x

# Load necessary modules, for example Python and Dask
module load python dask

echo "Starting scheduler..."

scheduler_file=$SCRATCH/scheduler_file.json
rm -f $scheduler_file

module load python

# Start scheduler
DASK_DISTRIBUTED__COMM__TIMEOUTS__CONNECT=3600s \
DASK_DISTRIBUTED__COMM__TIMEOUTS__TCP=3600s \
dask-scheduler \
    --interface hsn0 \
    --scheduler-file $scheduler_file &

dask_pid=$!

# Wait for the scheduler to start
sleep 5
until [ -f $scheduler_file ]
do
     echo "Waiting for scheduler to start..."
     sleep 5
done
echo "Scheduler started"

echo "Starting workers..."

# Start workers and redirect their output to a log file for better visibility
DASK_DISTRIBUTED__COMM__TIMEOUTS__CONNECT=3600s \
DASK_DISTRIBUTED__COMM__TIMEOUTS__TCP=3600s \
srun dask worker \
    --scheduler-file $scheduler_file \
    --interface hsn0 \
    --nworkers 1 > $SCRATCH/worker_log.out 2>&1 &

echo "Workers started. Check $SCRATCH/worker_log.out for details."

# Wait a bit to ensure workers are started
echo "Sleeping..."
sleep 10

# Check number of workers
echo "Verifying number of Dask workers..."
python -c "from dask.distributed import Client; client = Client(scheduler_file='$scheduler_file'); print('Number of workers:', len(client.scheduler_info()['workers']))"


echo "hostname: $(hostname -f)"


echo "waiting for client connection..."
wait



"""



km = KeyManager()

with Client(key=km.key) as client:
    perlmutter = client.compute(Machine.perlmutter)


    job = perlmutter.submit_job(job_script)
    job_global = job
    print(f"Submitted_job: {job.jobid}")
    job_id = job.jobid

    while True:
        job.update()
        clear_output(wait=True)

        print(f"The job state is: {job.state} ({type(job.state)})")
        if job.state not in [JobState.PENDING, JobState.RUNNING, JobState.COMPLETING]:
            if job.state == JobState.FAILED:
                print("Job failed")
            elif job.state == JobState.COMPLETED:
                print("Job completed")
                
            elif job.state == JobState.TIMEOUT:
                print("Job timeout")
                  
            break
        try: 
            output_file = perlmutter.ls(f"/global/homes/s/sanjeevc/sfapi_test/sfapi-demo-{job_id}.out") #todo change to relative paths
            output_file = output_file[0]
            with output_file.open("r") as f:
                file_content = f.read()
                print(file_content)

            # Parsing the hostname using a regex pattern #todo remove hostname search as hostname is not used
            hostname_match = re.search(r"hostname: (.+)", file_content)
            if hostname_match:
                hostname = hostname_match.group(1)
                print(f"Hostname parsed: {hostname}")
                print(f"---------------------------------")
                #find the dask ip for the ssh tunnel. This reads it from the scheduler file that dask makes
                print(f"Searching for Dask IP address...")
                output_file = perlmutter.ls(f"/pscratch/sd/s/sanjeevc/scheduler_file.json")
                output_file = output_file[0]
                with output_file.open("r") as f:
                    file_content = f.read()
                    print(file_content)
                
                                # Parse the JSON content to extract the Dask IP
                scheduler_info = json.loads(file_content)
                dask_address = scheduler_info.get("address", "")
                daskip = re.search(r"tcp://([\d.]+):", dask_address)

                # Extract and print the IP address
                if daskip:
                    daskip = daskip.group(1)
                    print(f"Dask IP address: {daskip}")
                    
                    
                    # Open the SSH tunnel to perlmutter
                    # Example usage
                    command = [
                        "ssh",
                        "-o", "ServerAliveInterval=30", 
                        "-l", "sanjeevc",
                        "-i", "~/.ssh/nersc",
                        "-L", f"8786:{daskip}:8786",
                        "-L", f"8787:{daskip}:8787",
                        "sanjeevc@perlmutter.nersc.gov"
                    ]
                    subprocess.Popen(command)
                    time.sleep(3)
                    print("SSH tunnel opened")
                    webbrowser.open('http://localhost:8787/status')

                   
                   
                   
                   
                    #keep tunnel open till user wants to close it and cancel the job
                    input("Cancel job?")
                    job.cancel()
                    print("Job cancelled")
                    
                    
                    
                else:
                    print("Dask IP address not found.")
                
                
                
                break
            else:
                print("Hostname not found in the file.")
        except Exception as e:
            print(f"error: {e}")
        time.sleep(10)
        
        


The job state is: JobState.RUNNING (<enum 'JobState'>)
Starting scheduler...
Waiting for scheduler to start...
Waiting for scheduler to start...
Scheduler started
Starting workers...
Workers started. Check /pscratch/sd/s/sanjeevc/worker_log.out for details.
Sleeping...
Verifying number of Dask workers...
Number of workers: 32
hostname: x1304c6s5b1n1h0.chn.perlmutter.nersc.gov
waiting for client connection...

Hostname parsed: x1304c6s5b1n1h0.chn.perlmutter.nersc.gov
---------------------------------
Searching for Dask IP address...
b'{\n  "type": "Scheduler",\n  "id": "Scheduler-aac0fca5-920a-42ca-a626-7ff6ddcdb501",\n  "address": "tcp://10.249.18.159:8786",\n  "services": {\n    "dashboard": 8787\n  },\n  "started": 1731456777.3343115,\n  "workers": {}\n}'
Dask IP address: 10.249.18.159
SSH tunnel opened


Pseudo-terminal will not be allocated because stdin is not a terminal.
***************************************************************************
                          NOTICE TO USERS

Lawrence Berkeley National Laboratory operates this computer system under 
contract to the U.S. Department of Energy.  This computer system is the 
property of the United States Government and is for authorized use only.
Users (authorized or unauthorized) have no explicit or implicit 
expectation of privacy.

Any or all uses of this system and all files on this system may be
intercepted, monitored, recorded, copied, audited, inspected, and disclosed
to authorized site, Department of Energy, and law enforcement personnel,
as well as authorized officials of other agencies, both domestic and foreign.
By using this system, the user consents to such interception, monitoring,
recording, copying, auditing, inspection, and disclosure at the discretion
of authorized site or Department of Energy personnel.

U

Job cancelled


channel 5: open failed: connect failed: Connection refused
channel 7: open failed: connect failed: Connection refused
channel 15: open failed: connect failed: Connection refused
channel 14: open failed: connect failed: Connection refused
channel 5: open failed: connect failed: Connection refused
channel 7: open failed: connect failed: Connection refused
channel 8: open failed: connect failed: Connection refused
channel 9: open failed: connect failed: Connection refused
channel 5: open failed: connect failed: Connection refused
channel 7: open failed: connect failed: Connection refused
channel 8: open failed: connect failed: Connection refused
channel 5: open failed: connect failed: Connection refused
channel 7: open failed: connect failed: Connection refused
channel 8: open failed: connect failed: Connection refused
channel 9: open failed: connect failed: Connection refused
channel 5: open failed: connect failed: Connection refused
channel 7: open failed: connect failed: Connection ref