# Train Recommender with Apache Spark ML
Next, use the Amazon SageMaker Python SDK to submit a processing job that runs our Spark script (`./src/train_spark_als.py`) in the SageMaker Spark container.

In [2]:
import sagemaker
import boto3

# SageMaker session shared by the later cells in this notebook.
sagemaker_session = sagemaker.Session()
# Execution role that is passed to the Spark processor below.
role = sagemaker.get_execution_role()
# Default bucket of the session; the S3 input and output URIs are built from it.
bucket = sagemaker_session.default_bucket()
# Region of the default boto3 session; used to build the AWS console links.
region = boto3.Session().region_name

In [3]:
# S3 URI of the MovieLens sample ratings inside the session's default bucket.
s3_input_data = f's3://{bucket}/spark_datasets/train/movielens/sample_movielens_ratings.txt'

print(s3_input_data)

s3://sagemaker-us-east-1-835319576252/spark_datasets/train/movielens/sample_movielens_ratings.txt


In [4]:
# List the input object in S3 to confirm that it exists.
!aws s3 ls $s3_input_data

2020-11-02 20:04:44      32363 sample_movielens_ratings.txt


In [18]:
# Download the ratings file into ./data/ so it can be inspected locally.
!aws s3 cp $s3_input_data ./data/

download: s3://sagemaker-us-east-1-835319576252/spark_datasets/train/movielens/sample_movielens_ratings.txt to data/sample_movielens_ratings.txt


In [19]:
import pandas as pd

# Preview the downloaded ratings. The '::' separator is longer than one
# character, so the Python parsing engine is requested explicitly.
pd.read_csv(
    './data/sample_movielens_ratings.txt',
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
    header=None,
    sep='::',
    engine='python',
)

Unnamed: 0,user_id,movie_id,rating,timestamp
0,0,2,3,1424380312
1,0,3,1,1424380312
2,0,5,2,1424380312
3,0,9,4,1424380312
4,0,11,1,1424380312
...,...,...,...,...
1496,29,90,4,1424380312
1497,29,93,1,1424380312
1498,29,94,4,1424380312
1499,29,97,1,1424380312


## Setup Output Data

In [4]:
from time import gmtime, strftime

# UTC timestamp (gmtime) that makes the output prefix unique for each run.
timestamp_prefix = strftime("%Y-%m-%d-%H-%M-%S", gmtime())

# S3 key prefix from which the job's output location is built.
output_prefix = 'spark-als-{}'.format(timestamp_prefix)
# Provisional job label with the same value. The processor is created with
# base_job_name only, and this variable is reassigned later from the
# description of the submitted job.
processing_job_name = 'spark-als-{}'.format(timestamp_prefix)

print('Processing job name:  {}'.format(processing_job_name))

Processing job name:  spark-als-2020-11-30-22-20-34


In [5]:
# S3 URI that is handed to the Spark script as its output location.
s3_output_data = f's3://{bucket}/{output_prefix}/output'

print(s3_output_data)

s3://sagemaker-us-east-1-835319576252/spark-als-2020-11-30-22-20-34/output


# Review the Spark Training Script

In [6]:
# Print the Spark script with syntax highlighting.
!pygmentize ./src/train_spark_als.py

[34mfrom[39;49;00m [04m[36m__future__[39;49;00m [34mimport[39;49;00m print_function
[34mfrom[39;49;00m [04m[36m__future__[39;49;00m [34mimport[39;49;00m unicode_literals

[34mimport[39;49;00m [04m[36mtime[39;49;00m
[34mimport[39;49;00m [04m[36msys[39;49;00m
[34mimport[39;49;00m [04m[36mos[39;49;00m
[34mimport[39;49;00m [04m[36mshutil[39;49;00m
[34mimport[39;49;00m [04m[36mcsv[39;49;00m

[34mimport[39;49;00m [04m[36mpyspark[39;49;00m
[34mfrom[39;49;00m [04m[36mpyspark[39;49;00m[04m[36m.[39;49;00m[04m[36msql[39;49;00m [34mimport[39;49;00m SparkSession
[34mfrom[39;49;00m [04m[36mpyspark[39;49;00m[04m[36m.[39;49;00m[04m[36msql[39;49;00m[04m[36m.[39;49;00m[04m[36mfunctions[39;49;00m [34mimport[39;49;00m *
[34mfrom[39;49;00m [04m[36mpyspark[39;49;00m[04m[36m.[39;49;00m[04m[36mml[39;49;00m[04m[36m.[39;49;00m[04m[36mevaluation[39;49;00m [34mimport[39;49;00m RegressionEvaluator
[34mfrom[39;49;00m [

In [7]:
from sagemaker.spark.processing import PySparkProcessor

# A single ml.r5.2xlarge instance, limited to 1200 seconds (20 minutes) of runtime.
processor = PySparkProcessor(
    role=role,
    base_job_name='spark-als',
    instance_type='ml.r5.2xlarge',
    instance_count=1,
    max_runtime_in_seconds=1200,
)

## Start the Spark Processing Job

_Notes on not using ProcessingInput and Output:_
* Since Spark natively reads/writes from/to S3 using s3a://, we can avoid the copy required by ProcessingInput and ProcessingOutput (FullyReplicated or ShardedByS3Key) and just specify the S3 input and output buckets/prefixes.
* See https://github.com/awslabs/amazon-sagemaker-examples/issues/994 for issues related to using /opt/ml/processing/input/ and output/
* If we use ProcessingInput, the data will be copied to each node (which we don't want in this case since Spark already handles this)

In [8]:
from sagemaker.processing import ProcessingOutput

# Submit the Spark application. The S3 locations are passed to the script as
# plain name/value argument pairs; no ProcessingInput/ProcessingOutput is used.
processor.run(submit_app='./src/train_spark_als.py',
              arguments=['s3_input_data', s3_input_data,
                         's3_output_data', s3_output_data,
              ],
              logs=True,
              # Do not block here; the job is waited on in a later cell.
              wait=False
)


Job Name:  spark-als-2020-11-30-22-20-35-496
Inputs:  [{'InputName': 'code', 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-835319576252/spark-als-2020-11-30-22-20-35-496/input/code/train_spark_als.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  []


In [9]:
# `display` and `HTML` are public API of IPython.display; importing them from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

# Name of the most recently submitted job, read back from its description.
processing_job_name = processor.jobs[-1].describe()['ProcessingJobName']

# target="_blank" (with the underscore) opens the link in a new tab; the plain
# name "blank" would make every link reuse one window called "blank".
display(HTML('<b>Review <a target="_blank" href="https://console.aws.amazon.com/sagemaker/home?region={}#/processing-jobs/{}">Processing Job</a></b>'.format(region, processing_job_name)))


In [10]:
# `display` and `HTML` are public API of IPython.display; importing them from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

# Name of the most recently submitted job; used as the log-stream prefix.
processing_job_name = processor.jobs[-1].describe()['ProcessingJobName']

# target="_blank" (with the underscore) opens the link in a new tab.
display(HTML('<b>Review <a target="_blank" href="https://console.aws.amazon.com/cloudwatch/home?region={}#logStream:group=/aws/sagemaker/ProcessingJobs;prefix={};streamFilter=typeLogStreamPrefix">CloudWatch Logs</a> After a Few Minutes</b>'.format(region, processing_job_name)))


In [11]:
# `display` and `HTML` are public API of IPython.display; importing them from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

# The job writes under the timestamped output prefix, not under the job name.
s3_job_output_prefix = output_prefix

# target="_blank" (with the underscore) opens the link in a new tab.
display(HTML('<b>Review <a target="_blank" href="https://s3.console.aws.amazon.com/s3/buckets/{}/{}/?region={}&tab=overview">S3 Output Data</a> After The Spark Job Has Completed</b>'.format(bucket, s3_job_output_prefix, region)))


# Monitor the Processing Job

In [12]:
# Re-attach to the submitted job by name so that it can be monitored.
running_processor = sagemaker.processing.ProcessingJob.from_processing_name(
    sagemaker_session=sagemaker_session,
    processing_job_name=processing_job_name,
)

# Description of the job as returned by SageMaker.
processing_job_description = running_processor.describe()

print(processing_job_description)

{'ProcessingInputs': [{'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-835319576252/spark-als-2020-11-30-22-20-35-496/input/code/train_spark_als.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}], 'ProcessingJobName': 'spark-als-2020-11-30-22-20-35-496', 'ProcessingResources': {'ClusterConfig': {'InstanceCount': 1, 'InstanceType': 'ml.r5.2xlarge', 'VolumeSizeInGB': 30}}, 'StoppingCondition': {'MaxRuntimeInSeconds': 1200}, 'AppSpecification': {'ImageUri': '173754725891.dkr.ecr.us-east-1.amazonaws.com/sagemaker-spark-processing:2.4-cpu', 'ContainerEntrypoint': ['smspark-submit', '/opt/ml/processing/input/code/train_spark_als.py'], 'ContainerArguments': ['s3_input_data', 's3://sagemaker-us-east-1-835319576252/spark_datasets/train/movielens/sample_movielens_ratings.txt', 's3_output_data', 's3://sagemaker-us-east-1-835319576252

In [13]:
# Wait for the processing job to finish; the job's log output appears below.
running_processor.wait()

[34m11-30 22:24 smspark.cli  INFO     Parsing arguments. argv: ['/usr/local/bin/smspark-submit', '/opt/ml/processing/input/code/train_spark_als.py', 's3_input_data', 's3://sagemaker-us-east-1-835319576252/spark_datasets/train/movielens/sample_movielens_ratings.txt', 's3_output_data', 's3://sagemaker-us-east-1-835319576252/spark-als-2020-11-30-22-20-34/output'][0m
[34m11-30 22:24 smspark.cli  INFO     Raw spark options before processing: {'class_': None, 'jars': None, 'py_files': None, 'files': None, 'verbose': False}[0m
[34m11-30 22:24 smspark.cli  INFO     App and app arguments: ['/opt/ml/processing/input/code/train_spark_als.py', 's3_input_data', 's3://sagemaker-us-east-1-835319576252/spark_datasets/train/movielens/sample_movielens_ratings.txt', 's3_output_data', 's3://sagemaker-us-east-1-835319576252/spark-als-2020-11-30-22-20-34/output'][0m
[34m11-30 22:24 smspark.cli  INFO     Rendered spark options: {'class_': None, 'jars': None, 'py_files': None, 'files': None, 'verbose':

# _Please Wait Until the ^^ Processing Job ^^ Completes Above._

# Inspect the Output

In [14]:
# Recursively list everything under the job's S3 output location.
!aws s3 ls --recursive $s3_output_data/

2020-11-30 22:25:31          0 spark-als-2020-11-30-22-20-34/output/all-recommendations/_SUCCESS
2020-11-30 22:25:30      11073 spark-als-2020-11-30-22-20-34/output/all-recommendations/part-00000-c5dfdafd-ddcb-434a-81eb-1435fb48a69e-c000.json
2020-11-30 22:25:37          0 spark-als-2020-11-30-22-20-34/output/top-10-recommendations/_SUCCESS
2020-11-30 22:25:36      35887 spark-als-2020-11-30-22-20-34/output/top-10-recommendations/part-00000-0622b7b4-fe10-4521-9b06-945defc287d8-c000.json


## Copy the Output from S3 to Local

In [15]:
# Download only the JSON part files: exclude everything, then re-include *.json.
!aws s3 cp --recursive $s3_output_data ./spark-als-output/ --exclude="*" --include="*.json"

download: s3://sagemaker-us-east-1-835319576252/spark-als-2020-11-30-22-20-34/output/all-recommendations/part-00000-c5dfdafd-ddcb-434a-81eb-1435fb48a69e-c000.json to spark-als-output/all-recommendations/part-00000-c5dfdafd-ddcb-434a-81eb-1435fb48a69e-c000.json
download: s3://sagemaker-us-east-1-835319576252/spark-als-2020-11-30-22-20-34/output/top-10-recommendations/part-00000-0622b7b4-fe10-4521-9b06-945defc287d8-c000.json to spark-als-output/top-10-recommendations/part-00000-0622b7b4-fe10-4521-9b06-945defc287d8-c000.json


## Review Recommendations

In [16]:
import glob
import pandas as pd
import os

def load_dataset_json(path):
    """Load every JSON-lines file in a directory into a single DataFrame.

    Args:
        path: Directory that contains ``*.json`` files with one JSON record
            per line. A trailing slash is accepted.

    Returns:
        pandas.DataFrame holding the rows of every file, concatenated in
        sorted file-name order under a fresh 0..n-1 index.

    Raises:
        ValueError: If ``path`` contains no ``*.json`` files.
    """
    # glob returns matches in arbitrary order; sort them so that the row order
    # of the result is reproducible between runs.
    json_files = sorted(glob.glob('{}/*.json'.format(path)))

    if not json_files:
        # pd.concat([]) would fail with an opaque "No objects to concatenate";
        # keep the exception type but say which directory was empty.
        raise ValueError('No JSON files found in {!r}'.format(path))

    frames = [pd.read_json(json_file, lines=True) for json_file in json_files]
    data = pd.concat(frames, ignore_index=True)

    return data

In [17]:
# Load the downloaded "all-recommendations" part files and display the frame.
df_all_recommendations = load_dataset_json('./spark-als-output/all-recommendations/')
df_all_recommendations

Unnamed: 0,userId,recommendations
0,12,"[{'movieId': 46, 'rating': 5.175531}, {'movieI..."
1,1,"[{'movieId': 74, 'rating': 5.29319}, {'movieId..."
2,6,"[{'movieId': 29, 'rating': 4.942736}, {'movieI..."
3,3,"[{'movieId': 30, 'rating': 5.5714946}, {'movie..."
4,4,"[{'movieId': 52, 'rating': 4.0926905}, {'movie..."
5,8,"[{'movieId': 29, 'rating': 5.360059}, {'movieI..."
6,11,"[{'movieId': 32, 'rating': 5.1472187}, {'movie..."
7,19,"[{'movieId': 90, 'rating': 4.2163053}, {'movie..."
8,23,"[{'movieId': 55, 'rating': 5.44681}, {'movieId..."
9,21,"[{'movieId': 29, 'rating': 4.413492}, {'movieI..."


In [18]:
# Load the downloaded "top-10-recommendations" part files and display the frame.
df_top_10_recommendations = load_dataset_json('./spark-als-output/top-10-recommendations/')
df_top_10_recommendations

Unnamed: 0,movieId,recommendations
0,81,"[{'userId': 28, 'rating': 5.0103116}, {'userId..."
1,65,"[{'userId': 23, 'rating': 4.773918}, {'userId'..."
2,76,"[{'userId': 14, 'rating': 4.8065825}, {'userId..."
3,1,"[{'userId': 15, 'rating': 3.9443717}, {'userId..."
4,12,"[{'userId': 28, 'rating': 4.9454618}, {'userId..."
...,...,...
95,77,"[{'userId': 7, 'rating': 4.033238}, {'userId':..."
96,62,"[{'userId': 26, 'rating': 4.650314}, {'userId'..."
97,32,"[{'userId': 3, 'rating': 5.390518}, {'userId':..."
98,73,"[{'userId': 26, 'rating': 3.8126805}, {'userId..."
