# Dataproc Spark Job
- Create a Dataproc cluster
- Submit a Spark job that uses BigQuery (BQ) data
- Delete the Dataproc cluster

API Reference: https://googleapis.dev/python/dataproc/0.7.0/gapic/v1/api.html

## Setup

inputs:

In [29]:
# Google Cloud region and project used for every Dataproc call below.
REGION = "us-central1"
PROJECT_ID = "statmike-mlops"

# Labels reused for the cluster name, the Cloud Storage path and the local temp folder.
DATANAME = "fraud"
NOTEBOOK = "dataproc"

# Cluster sizing: a single machine type shared by the main and worker groups.
DATAPROC_COMPUTE = "n1-standard-4"
DATAPROC_MAIN_INSTANCES = 1
DATAPROC_WORK_INSTANCES = 4

packages:

In [30]:
from google.cloud import dataproc_v1
from datetime import datetime

clients:

In [31]:
# Registry for the API clients, filled in by the next cell.
clients = dict()
# Point the clients at the Dataproc endpoint for REGION.
client_options = dict(api_endpoint=f"{REGION}-dataproc.googleapis.com:443")

In [41]:
# One client per controller used below: cluster lifecycle and job submission.
# Both are built with the same endpoint options.
clients["cluster"] = dataproc_v1.ClusterControllerClient(client_options=client_options)
clients["job"] = dataproc_v1.JobControllerClient(client_options=client_options)

parameters:

In [33]:
# Second-resolution timestamp (YYYYMMDDHHMMSS) used in the upload path of the job script.
TIMESTAMP = f"{datetime.now():%Y%m%d%H%M%S}"
# Local scratch directory for files generated by this notebook.
DIR = f"temp/{NOTEBOOK}"
# The Cloud Storage bucket shares its name with the project.
BUCKET = PROJECT_ID
URI = f"gs://{BUCKET}/{DATANAME}/models/{NOTEBOOK}"

environment:

In [34]:
# Start from an empty local working directory: delete any previous copy of DIR, then recreate it.
!rm -rf {DIR}
!mkdir -p {DIR}

E1004 08:58:28.045879212     133 backup_poller.cc:133]       Run client channel backup poller: {"created":"@1633337908.045714837","description":"pollset_work","file":"src/core/lib/iomgr/ev_epollex_linux.cc","file_line":321,"referenced_errors":[{"created":"@1633337908.045707029","description":"Bad file descriptor","errno":9,"file":"src/core/lib/iomgr/ev_epollex_linux.cc","file_line":957,"os_error":"Bad file descriptor","syscall":"epoll_wait"}]}


## Create Cluster
https://cloud.google.com/dataproc/docs/guides/create-cluster

In [36]:
# Cluster definition: named after DATANAME, with a main group and a worker
# group that both use the DATAPROC_COMPUTE machine type.
cluster_specs = dict(
    project_id=PROJECT_ID,
    cluster_name=DATANAME,
    config=dict(
        master_config=dict(
            num_instances=DATAPROC_MAIN_INSTANCES,
            machine_type_uri=DATAPROC_COMPUTE,
        ),
        worker_config=dict(
            num_instances=DATAPROC_WORK_INSTANCES,
            machine_type_uri=DATAPROC_COMPUTE,
        ),
    ),
)

In [37]:
# Send the create-cluster request; the returned object is resolved with
# .result() in the following cell.
cluster = clients["cluster"].create_cluster(
    request=dict(
        project_id=PROJECT_ID,
        region=REGION,
        cluster=cluster_specs,
    ),
)

In [38]:
# Resolve the create request and display the resulting cluster's name.
# NOTE(review): .result() presumably blocks until creation finishes - confirm against the client docs.
cluster.result().cluster_name

'fraud'

## Define Job

In [None]:
%%writefile {DIR}/train.py
# TODO(review): placeholder - the job script has no code yet; this comment is written into train.py.

In [None]:
# Copy the local script into a TIMESTAMP-named folder under URI in Cloud Storage.
!gsutil cp {DIR}/train.py {URI}/{TIMESTAMP}/train.py

## Submit Job

In [None]:
# Job definition for the cluster created above.
# The notebook uploads a Python script (train.py), so the job is declared as a
# PySpark job whose entry point is that uploaded file. The previous `spark_job`
# with an empty main class and no jar had nothing to run.
job_specs = {
    "placement": {"cluster_name": DATANAME},
    "pyspark_job": {
        # Same Cloud Storage path the upload cell copies train.py to.
        "main_python_file_uri": f"{URI}/{TIMESTAMP}/train.py",
        # TODO(review): add connector jars here if train.py needs them
        # (e.g. for reading BigQuery) - confirm what the cluster image provides.
        "jar_file_uris": [],
        "args": [],
    },
}

In [None]:
# Submit the job to the cluster. The request must carry the project, the region
# and the job definition; it uses the same request-dict style as the
# create_cluster call earlier in this notebook.
job = clients['job'].submit_job(
    request={
        "project_id": PROJECT_ID,
        "region": REGION,
        "job": job_specs,
    }
)

In [None]:
# Display the id from the submitted job's reference.
job.reference.job_id

## Wait On Job

In [None]:
import time

# Poll the job until it reaches a terminal state.
# Each iteration re-fetches the job so the status actually changes between
# checks, and sleeps so the API is not hammered in a tight loop.
while True:
    job = clients['job'].get_job(
        request={
            "project_id": PROJECT_ID,
            "region": REGION,
            "job_id": job.reference.job_id,
        }
    )
    state = job.status.state.name
    if state in ("ERROR", "CANCELLED"):
        # CANCELLED is terminal too; without it the loop would never exit.
        raise Exception(job.status.details or f"Job ended in state {state}")
    elif state == "DONE":
        print("Finished")
        break
    time.sleep(10)

## Delete Cluster
https://cloud.google.com/dataproc/docs/guides/manage-cluster#delete_a_cluster

In [12]:
# Request deletion of the cluster, identified by the name returned from the
# create call.
delCluster = clients["cluster"].delete_cluster(
    request=dict(
        project_id=PROJECT_ID,
        region=REGION,
        cluster_name=cluster.result().cluster_name,
    ),
)