# Batch Spark Job on Synapse Compute

## Prepare your AML workspace

In [None]:
from azureml.core import Workspace

# Build the workspace handle from the saved workspace configuration
# (Workspace.from_config); every later cell reaches Azure ML through `ws`.
ws = Workspace.from_config()
# Bare expression so the notebook shows the workspace as the cell output.
ws

## Input data

In [None]:
# Resolve the workspace's default datastore and work out where the sample CSV
# comes from and where it should be copied to.
datastore = ws.get_default_datastore()
file_name = 'Titanic.csv'

# Source blob that holds the sample CSV.
source = f'https://dprepdata.blob.core.windows.net/demo/{file_name}'

# Destination blob URL inside the datastore's container, laid out as
#   <protocol>://<account>.blob.<endpoint>/<container>/<file>
dest = (
    f'{datastore.protocol}://'
    f'{datastore.account_name}.blob.{datastore.endpoint}/'
    f'{datastore.container_name}/{file_name}'
)

# Account key of the destination storage account (consumed by the azcopy cell).
dest_key = datastore.account_key

In [None]:
# Copy the sample CSV from `source` to `dest` in the datastore's container,
# authenticating against the destination with `dest_key`.
# NOTE(review): the key is expanded into the command line, and the
# --source/--destination flags look like the legacy AzCopy syntax - confirm
# which AzCopy version is installed before running.
!azcopy --source $source --destination $dest --dest-key $dest_key --quiet

In [None]:
# `Dataset` is not imported by any other cell of this notebook; without this
# import the cell fails with NameError.
from azureml.core import Dataset

# Point a file dataset at the CSV that was copied into the default datastore.
titanic_dataset = Dataset.File.from_files(path=[(datastore, file_name)])

# HDFS-style consumption config handed to the Spark job as its --input_dir.
# NOTE: `input` shadows the builtin of the same name; the name is kept because
# the submission cell passes it in the script arguments.
input = titanic_dataset.as_hdfs()

## Output Config

In [None]:
from azureml.data import HDFSOutputDatasetConfig

# Job output goes to the "test" path on the datastore ...
output = HDFSOutputDatasetConfig(destination=(datastore, "test"))
# ... and is registered under the name "registered_dataset" on completion.
output = output.register_on_complete(name="registered_dataset")

## Data preparation script

In [None]:
import os

# Folder that holds the job script (used as the run's source directory);
# exist_ok makes re-running this cell harmless.
os.makedirs(name="code", exist_ok=True)

In [None]:
%%writefile code/test.py
import os
import sys
import azureml.core
from pyspark.sql import SparkSession
from azureml.core import Run

print(azureml.core.VERSION)
print(os.environ)

import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--input_dir")
parser.add_argument("--output_dir")
args = parser.parse_args()

spark= SparkSession.builder.getOrCreate()
sdf = spark.read.option("header", "true").csv(args.input_dir)
sdf.show()

sdf.coalesce(1).write\
.option("header", "true")\
.mode("append")\
.csv(args.output_dir)

## Submit an Experiment 

In [None]:
from azureml.core.environment import CondaDependencies
# Conda dependency set for the run environment, with azureml-core pinned to
# 1.20.0 as a pip package. The submission cell attaches `conda_dep` to the
# run configuration.
conda_dep = CondaDependencies()
conda_dep.add_pip_package("azureml-core==1.20.0")

In [None]:
from azureml.core import Experiment, RunConfiguration, ScriptRunConfig

# PySpark run configuration targeting the compute named 'synapsecompute',
# using the conda dependencies prepared in the previous cell.
run_config = RunConfiguration(framework="pyspark")
run_config.target = 'synapsecompute'
run_config.environment.python.conda_dependencies = conda_dep

# Spark sizing for the job: driver and executor resources.
run_config.spark.configuration.update({
    "spark.driver.memory": "1g",
    "spark.driver.cores": 2,
    "spark.executor.memory": "1g",
    "spark.executor.cores": 1,
    "spark.executor.instances": 1,
})

# Run code/test.py with the dataset input and output configs as its arguments.
script_run_config = ScriptRunConfig(
    source_directory='./code',
    script='test.py',
    arguments=["--input_dir", input, "--output_dir", output],
    run_config=run_config,
)

# Submit under the "synapse-spark" experiment; the trailing bare expression
# makes the notebook display the run.
exp = Experiment(workspace=ws, name="synapse-spark")
run = exp.submit(config=script_run_config)
run