In [7]:
#sf api setup
import time
import asyncio
import logging
from IPython.display import clear_output

from os.path import join

from sfapi_client         import Client, AsyncClient
from sfapi_client.compute import Machine
from sfapi_client.jobs    import JobState

from sfapi_connector import KeyManager, OsSFAPI, OsWrapper, LOGGER
import json
import re
import subprocess
import webbrowser
from io import BytesIO



In [8]:
# sf api send jobscript



# Directory (relative to the job's working directory) that holds the demo files.
# NOTE(review): not referenced elsewhere in this cell; kept in case other cells use it.
target = "./sfapi_test"
# Variables
# Name of the Conda environment the batch script creates (if missing) and activates.
env_name="sfapi_dask_env"
# Path, as seen by the batch job, of the pip requirements file to install.
requirements_file="./sfapi_test/requirements.txt"

# Slurm batch script submitted through the Superfacility API. In order, it:
#   1. creates (if needed) and activates the Conda environment `env_name`,
#   2. pip-installs `requirements_file` when that file exists,
#   3. starts `dask-scheduler` in the background, writing its connection info
#      to $SCRATCH/scheduler_file.json,
#   4. waits for that file to appear (aborting if the scheduler process died),
#   5. starts the Dask workers with `srun`, prints the worker count and the
#      node hostname, then blocks on `wait` until the background processes end.
#
# Only {env_name} and {requirements_file} are interpolated by Python; every
# `$...` expression is left for bash to expand at run time.
# NOTE: this is a non-raw f-string, so each backslash-newline pair below is
# removed by Python. The continued lines arrive in the script already joined
# into one line, which bash treats the same way.
job_script = f"""#!/bin/bash
#SBATCH -q debug
#SBATCH -A m669
#SBATCH -N 1
#SBATCH -n 5              # Number of tasks (one Dask worker per task)
#SBATCH -C cpu
#SBATCH -t 00:30:00
#SBATCH -J sfapi-demo
#SBATCH --exclusive
#SBATCH --output=./sfapi_test/sfapi-demo-%j.out
#SBATCH --error=./sfapi_test/sfapi-demo-%j.error

# Print each command for debugging
set -x


# Load necessary modules
module load conda
#module load python dask

# Ensure Conda is initialized
source $(conda info --base)/etc/profile.d/conda.sh

#test
echo "requirements.txt are at {requirements_file}"

# Check if the Conda environment exists; create or update if necessary
if ! conda info --envs | grep -q "^{env_name} "; then
    echo "Creating Conda environment: {env_name}"
    conda create -y -n "{env_name}" python=3.9
fi

echo "Activating Conda environment: {env_name}"
conda activate "{env_name}"

if [ -f "{requirements_file}" ]; then
    echo "Installing dependencies from {requirements_file}"
    pip install -r "{requirements_file}"
else
    echo "No requirements.txt found at {requirements_file}. ERROR: Skipping dependency installation."
fi
sleep 5


# Start Dask Scheduler
echo "Starting scheduler..."
scheduler_file=$SCRATCH/scheduler_file.json
rm -f $scheduler_file

DASK_DISTRIBUTED__COMM__TIMEOUTS__CONNECT=3600s \
DASK_DISTRIBUTED__COMM__TIMEOUTS__TCP=3600s \
dask-scheduler \
    --interface hsn0 \
    --scheduler-file $scheduler_file &

dask_pid=$!

# Wait for the scheduler to start
sleep 5
until [ -f $scheduler_file ]; do
    # Abort instead of spinning until the walltime limit if the scheduler died
    if ! kill -0 $dask_pid 2>/dev/null; then
        echo "ERROR: dask-scheduler exited before writing $scheduler_file"
        exit 1
    fi
    echo "Waiting for scheduler to start..."
    sleep 5
done
echo "Scheduler started"

# Start Dask Workers
echo "Starting workers..."
DASK_DISTRIBUTED__COMM__TIMEOUTS__CONNECT=3600s \
DASK_DISTRIBUTED__COMM__TIMEOUTS__TCP=3600s \
srun dask worker \
    --scheduler-file $scheduler_file \
    --interface hsn0 \
    --nworkers 1 > $SCRATCH/worker_log.out 2>&1 &

echo "Workers started. Check $SCRATCH/worker_log.out for details."

# Wait a bit to ensure workers are started
echo "Sleeping..."
sleep 10

# Check number of workers
echo "Verifying number of Dask workers..."
python -c "
from dask.distributed import Client
client = Client(scheduler_file='$scheduler_file')
print('Number of workers:', len(client.scheduler_info()['workers']))
"

# Print hostname
echo "hostname: $(hostname -f)"

# Wait for client connection
echo "waiting for client connection..."
wait


"""



# Upload requirements.txt, submit the Dask batch job through the Superfacility
# API, poll until the scheduler is up, then open an SSH tunnel to the scheduler
# and dashboard ports and keep it open until the user chooses to cancel the job.

# Account-specific settings, gathered here so they are changed in one place.
NERSC_USER = "sanjeevc"
SSH_KEY_PATH = "~/.ssh/nersc"  # passed to `ssh -i` unchanged
# Remote directory holding requirements.txt and the job's .out file; it must
# match the `--output` path used inside `job_script`.
REMOTE_JOB_DIR = f"/global/homes/{NERSC_USER[0]}/{NERSC_USER}/sfapi_test"
# Scheduler file written by the batch job to $SCRATCH/scheduler_file.json.
# NOTE(review): assumes $SCRATCH is /pscratch/sd/<initial>/<user> -- confirm.
REMOTE_SCHEDULER_FILE = f"/pscratch/sd/{NERSC_USER[0]}/{NERSC_USER}/scheduler_file.json"
POLL_INTERVAL_S = 10    # seconds between job-state polls
TUNNEL_STARTUP_S = 3    # seconds given to ssh before checking it is still alive

km = KeyManager()

with Client(key=km.key) as client:
    perlmutter = client.compute(Machine.perlmutter)

    # Exactly one entry is expected back; the single-element unpacking raises
    # ValueError otherwise.
    [path] = perlmutter.ls(f"{REMOTE_JOB_DIR}/", directory=True)

    # Read the local requirements file into an in-memory buffer for upload.
    with open('./requirements.txt', 'rb') as f:
        file_requirements = BytesIO(f.read())
    file_requirements.filename = 'requirements.txt'  # Add the required filename attribute

    path.upload(file_requirements)
    print(f"Uploaded requirements.txt to {path}")

    job = perlmutter.submit_job(job_script)
    job_global = job  # second handle on the job, kept under its original name
    print(f"Submitted_job: {job.jobid}")
    job_id = job.jobid

    while True:
        job.update()
        clear_output(wait=True)

        print(f"The job state is: {job.state} ({type(job.state)})")
        # Any state other than pending/running/completing ends the polling loop.
        if job.state not in [JobState.PENDING, JobState.RUNNING, JobState.COMPLETING]:
            if job.state == JobState.FAILED:
                print("Job failed")
            elif job.state == JobState.COMPLETED:
                print("Job completed")
            elif job.state == JobState.TIMEOUT:
                print("Job timeout")
            break
        try:
            # The .out file does not exist until the job starts writing; the
            # resulting error is caught below and the poll is simply retried.
            output_file = perlmutter.ls(f"{REMOTE_JOB_DIR}/sfapi-demo-{job_id}.out") #todo change to relative paths
            output_file = output_file[0]
            with output_file.open("r") as f:
                file_content = f.read()
                print(file_content)

            # The batch script echoes "hostname: ..." only after the scheduler
            # and workers were started, so this line doubles as a readiness
            # signal even though the hostname value itself is not used further.
            hostname_match = re.search(r"hostname: (.+)", file_content)
            if hostname_match:
                hostname = hostname_match.group(1)
                print(f"Hostname parsed: {hostname}")
                print("---------------------------------")
                #find the dask ip for the ssh tunnel. This reads it from the scheduler file that dask makes
                print("Searching for Dask IP address...")
                output_file = perlmutter.ls(REMOTE_SCHEDULER_FILE)
                output_file = output_file[0]
                with output_file.open("r") as f:
                    file_content = f.read()
                    print(file_content)

                # Parse the JSON content to extract the Dask IP
                scheduler_info = json.loads(file_content)
                dask_address = scheduler_info.get("address", "")
                daskip_match = re.search(r"tcp://([\d.]+):", dask_address)
                daskip = daskip_match.group(1) if daskip_match else None

                if daskip:
                    print(f"Dask IP address: {daskip}")

                    # Forward the scheduler (8786) and dashboard (8787) ports
                    # through the Perlmutter login node.
                    command = [
                        "ssh",
                        "-N",  # forwarding only: do not start a remote shell
                        "-o", "ServerAliveInterval=30",
                        "-l", NERSC_USER,
                        "-i", SSH_KEY_PATH,
                        "-L", f"8786:{daskip}:8786",
                        "-L", f"8787:{daskip}:8787",
                        f"{NERSC_USER}@perlmutter.nersc.gov"
                    ]
                    tunnel = subprocess.Popen(command)
                    try:
                        time.sleep(TUNNEL_STARTUP_S)
                        if tunnel.poll() is None:
                            print("SSH tunnel opened")
                            webbrowser.open('http://localhost:8787/status')
                        else:
                            # ssh already exited (e.g. authentication failure);
                            # skip the browser but still offer to cancel the job.
                            print(f"SSH tunnel failed to start (exit code {tunnel.returncode})")

                        #keep tunnel open till user wants to close it and cancel the job
                        input("Cancel job?")
                        job.cancel()
                        print("Job cancelled")
                    finally:
                        # Always tear the tunnel down (also on Ctrl-C or an API
                        # error) so no ssh process is left holding the ports.
                        if tunnel.poll() is None:
                            tunnel.terminate()
                            try:
                                tunnel.wait(timeout=5)
                            except subprocess.TimeoutExpired:
                                tunnel.kill()
                else:
                    print("Dask IP address not found.")

                # Stop polling once the readiness line was seen, whether or not
                # the tunnel could be set up.
                break
            else:
                print("Hostname not found in the file.")
        except Exception as e:
            # Best-effort polling: report the problem and try again next round.
            print(f"error: {e}")
        time.sleep(POLL_INTERVAL_S)
        
        


The job state is: JobState.PENDING (<enum 'JobState'>)
error: ls: cannot access '/global/homes/s/sanjeevc/sfapi_test/sfapi-demo-33084637.out': No such file or directory

