# Dimensionality Reduction: Principal Component Analysis
## 3. PCA using SageMaker

In [1]:
import numpy as np
import pandas as pd

import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

In [2]:
# S3 layout: everything for this experiment lives under s3://<bucket>/<prefix>/
bucket_name = 'sagemaker-tutorial-rnd'
prefix = "PCA"

# Destination for the model artifact produced by the training job
model_output = f"s3://{bucket_name}/{prefix}/saved_model"

# Training channel definition.
# NOTE(review): nothing in this cell uploads data; the standardized training
# CSV presumably already sits at the S3 key below - confirm.
# NOTE(review): "label_size=0" looks like the marker for a CSV without a label
# column (unsupervised training) - confirm against the SageMaker data-format docs.
train_input = sagemaker.TrainingInput(
    s3_data=f"s3://{bucket_name}/{prefix}/train/environment_train.csv",
    content_type="text/csv;label_size=0",
)

# Show which AWS region the default session resolves to
print(sagemaker.Session().boto_region_name)

ap-southeast-1


## Training and Deploying

In [3]:
# Create estimator
# A single Session is shared by the image lookup and the estimator, instead of
# constructing a separate Session object for each use.
sagemaker_session = sagemaker.Session()

# Training image for the built-in "pca" algorithm in the session's region
container = sagemaker.image_uris.retrieve(
    "pca", sagemaker_session.boto_region_name)

# Prefix for the training job name; also reused later as the endpoint name
base_job_name = "pca-environment"

pca_estimator = sagemaker.estimator.Estimator(
    container,
    role=get_execution_role(),
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path=model_output,  # model artifact is written under this S3 prefix
    sagemaker_session=sagemaker_session,
    # Spot training is disabled, so the spot-related settings
    # (max_wait, checkpoint_s3_uri) are intentionally not passed.
    use_spot_instances=False,
    max_run=3600,
    base_job_name=base_job_name,
)

pca_estimator.set_hyperparameters(
    feature_dim=7,  # number of input features per row
    num_components=3,  # 3 components explain 99% of the variance
    subtract_mean=False,  # the data is already standardized
    algorithm_mode='regular',
    mini_batch_size=1000
)

In [4]:
# Launch the training job, feeding train_input through the 'train' channel
pca_estimator.fit(inputs={'train': train_input})

2022-02-05 11:17:28 Starting - Starting the training job...
2022-02-05 11:17:52 Starting - Preparing the instances for trainingProfilerReport-1644059848: InProgress
......
2022-02-05 11:18:59 Downloading - Downloading input data...
2022-02-05 11:19:24 Training - Downloading the training image...
2022-02-05 11:19:55 Uploading - Uploading generated training model[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[02/05/2022 11:19:45 INFO 139795070814016 integration.py:636] worker started[0m
[34m[02/05/2022 11:19:45 INFO 139795070814016] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-conf.json: {'algorithm_mode': 'regular', 'subtract_mean': 'true', 'extra_components': '-1', 'force_dense': 'true', 'epochs': 1, '_log_level': 'info', '_kvstore': 'dist_sync', '_num_kv_servers': 'auto', '_num_gpus': 'auto'}[0m
[34m[02/05/2022 11:19:45 INFO 139795070814016] Merging w

In [5]:
# Deploy the trained model to an endpoint that reuses base_job_name as its name
pca_predictor = pca_estimator.deploy(
    endpoint_name=base_job_name,
    instance_type='ml.m5.xlarge',
    initial_instance_count=1,
)

------!

## Prediction

In [6]:
# Request/response formats for the PCA predictor:
# responses are decoded from JSON, requests are encoded as CSV
pca_predictor.deserializer = JSONDeserializer()
pca_predictor.serializer = CSVSerializer()

In [7]:
# Project a single 7-feature observation through the endpoint
sample_row = [
    -1.45264386, 0.867438826, -0.631348872, -1.450186585,
    -0.042464642, -1.450766812, -1.185413776,
]
# The payload is a list of rows, so the lone sample is wrapped in an outer list
single_result = pca_predictor.predict([sample_row])
single_result

{'projections': [{'projection': [-0.0005539982230402529,
    0.7090167999267578,
    2.8915176391601562]}]}

In [8]:
# Read the test set and keep only its values as a NumPy array
# NOTE(review): read_csv treats the first line as a header row; if
# environment_test.csv is headerless, the first sample is silently consumed
# as column names - confirm the file layout.
test = pd.read_csv("environment_test.csv").values

In [9]:
# Send the whole test array to the endpoint in one request
pred = pca_predictor.predict(test)

# Flatten the JSON response: one DataFrame row per returned 'projection' list
pred_result = pd.DataFrame(
    [entry['projection'] for entry in pred['projections']]
)
pred_result.head()

Unnamed: 0,0,1,2
0,-0.03015,-2.222405,-0.103516
1,-0.028445,-2.308333,-0.294343
2,0.115493,1.043182,-1.634197
3,-0.010507,0.67719,3.277793
4,-0.031363,-2.449602,-0.362065


In [10]:
# Save the projections as a bare CSV: no header row and no index column
pred_result.to_csv("environment_test_pca.csv", header=False, index=False)

In [None]:
# Delete the endpoint once it is no longer needed.
# The deployed endpoint stays provisioned until it is explicitly deleted, so
# tear it down after the projections have been saved.
pca_predictor.delete_endpoint()