# Batch Spark Job on Synapse Compute

## Prepare your AML workspace

In [None]:
from azureml.core import Workspace

# Build the workspace handle via Workspace.from_config(); the later cells use
# `ws` to look up datasets and datastores and to create the experiment.
ws = Workspace.from_config()
# Bare expression as the last line of the cell so the notebook displays it.
ws

## Input data

In [None]:
from azureml.core import Dataset
from azureml.data.dataset_factory import DataType

dataset_name = "blob_ds"
try:
    # Reuse the dataset when it is already registered in the workspace.
    dataset = Dataset.get_by_name(workspace=ws, name=dataset_name)
except Exception:
    # `except Exception` instead of a bare `except:` so that KeyboardInterrupt
    # and SystemExit still propagate (e.g. when the kernel is interrupted).
    # Any lookup failure falls back to creating and registering the dataset.
    #
    # Create a TabularDataset from a delimited file behind a public web url
    # and convert column "Survived" to boolean.
    web_path = 'https://dprepdata.blob.core.windows.net/demo/Titanic.csv'
    titanic_ds = Dataset.Tabular.from_delimited_files(
        path=web_path,
        set_column_types={'Survived': DataType.to_bool()},
    )
    titanic_ds.register(ws, name=dataset_name)
    # Fetch the registered copy so both branches leave `dataset` in the same state.
    dataset = Dataset.get_by_name(workspace=ws, name=dataset_name)

# NOTE: `input` shadows the Python builtin; the name is kept because the
# submission cell passes `input` in the script arguments.
input = dataset.as_named_input("synapse_input")

## Output config

In [None]:
from azureml.data import HDFSOutputDatasetConfig

# Destination is the pair (datastore, path): the workspace's
# 'workspaceblobstore' datastore and the "test" path on it.
output_destination = (ws.datastores['workspaceblobstore'], "test")
output = HDFSOutputDatasetConfig("synapse_output", output_destination)

## Data prep script

In [None]:
import os

# Make sure the ./code folder exists; the script written by the next cell
# lives there. exist_ok=True keeps re-running this cell harmless.
os.makedirs(name="code", exist_ok=True)

In [None]:
%%writefile code/test.py
import os
import azureml.core
from azureml.core import Workspace, Run

print(azureml.core.VERSION)
print(os.environ['synapse_input'])
print(os.environ['synapse_output'])

run_context = Run.get_context()
dataset = run_context.input_datasets['synapse_input']
sdf = dataset.to_spark_dataframe()
sdf.show()

sdf.coalesce(1).write\
.option("header", "true")\
.csv(os.environ['synapse_output'],mode='overwrite')

## Submit an experiment

In [None]:
from azureml.core import Experiment, RunConfiguration, ScriptRunConfig

# PySpark run configuration aimed at the compute target named 'synapsecompute'.
run_config = RunConfiguration(framework="pyspark")
run_config.target = 'synapsecompute'

# Spark driver/executor sizing, applied key by key in the order listed.
spark_settings = {
    "spark.driver.memory": "1g",
    "spark.driver.cores": 2,
    "spark.executor.memory": "1g",
    "spark.executor.cores": 1,
    "spark.executor.instances": 1,
}
for setting_name, setting_value in spark_settings.items():
    run_config.spark.configuration[setting_name] = setting_value

# Run code/test.py with the named input and the output config as its arguments.
script_run_config = ScriptRunConfig(
    source_directory='./code',
    script='test.py',
    arguments=[input, output],
    run_config=run_config,
)

# Submit under the "synapse-spark" experiment; the trailing bare `run`
# makes the notebook display the submitted run.
exp = Experiment(workspace=ws, name="synapse-spark")
run = exp.submit(config=script_run_config)
run