# Test the SageMaker Processor object locally

In [1]:
from sagemaker.local import LocalSession
from sagemaker.processing import Processor, ProcessingInput, ProcessingOutput
# LocalSession (from sagemaker.local) shared by the rest of the notebook: it is
# passed to the Processor and later used to look up the default bucket and to
# create the boto S3 client.
sagemaker_session = LocalSession()

In [2]:
# IAM role ARN handed to the Processor as `role`.
# NOTE(review): this looks like a placeholder value — confirm it is replaced
# with a real role before running on anything other than instance_type="local".
role = "arn:aws:iam::12345678910:role/test_role"

In [3]:
def load_env_variables(*env_files):
    """Read ``KEY=VALUE`` pairs from one or more env files into a single dict.

    Files are processed in the order given, so a key defined in a later file
    overrides the same key from an earlier one.

    Parsing rules:
      * blank lines and lines starting with ``#`` are skipped;
      * only the first ``=`` separates key from value, so values that contain
        ``=`` themselves (URLs with query strings, base64 padding, ...) are
        kept intact;
      * surrounding whitespace is stripped from both key and value.

    Args:
        *env_files: Paths of the env files to read.

    Returns:
        dict: Mapping of variable name to its (string) value.

    Raises:
        OSError: If one of the files cannot be opened.
        ValueError: If a non-blank, non-comment line contains no ``=``.
    """
    env_vars = {}
    for env_file in env_files:
        with open(env_file, "r") as file:
            for line_number, raw_line in enumerate(file, start=1):
                line = raw_line.strip()
                if not line or line.startswith("#"):
                    continue
                # partition splits on the FIRST "=" only, unlike split("=")[1],
                # which silently dropped everything after a second "=".
                key, separator, value = line.partition("=")
                if not separator:
                    raise ValueError(
                        "{}:{}: expected KEY=VALUE, got {!r}".format(
                            env_file, line_number, line
                        )
                    )
                env_vars[key.strip()] = value.strip()
    return env_vars

## Initialize processor job

In [4]:
# Processor that runs the blog-extraction image with `python run.py` as its
# entrypoint; environment variables come from the two env files.
#
# Tags use the {"Key": ..., "Value": ...} shape of the SageMaker API. The
# description avoids commas because the API's tag value pattern only permits
# letters, digits, whitespace and `_ . : / = + - @`.
process_job = Processor(
    image_uri="koombea_blogs_extraction_koombea_blogs_extraction_component",
    role=role,
    instance_type="local",
    instance_count=1,
    entrypoint=["python", "run.py"],
    base_job_name="etl-koombea-blogs-job",
    sagemaker_session=sagemaker_session,
    env=load_env_variables("../vars.env", "../vars.prod.env"),
    tags=[
        {
            "Key": "description",
            "Value": (
                "This job is for testing purposes -"
                " it will automatically extract the data from koombea db and process it -"
                " to then update the data and train data to s3"
                " and automatically make some insights on then using unsupervised learning techniques"
            ),
        },
        {"Key": "maintainer", "Value": "emanuel.afanador@koombea.com"},
    ],
)

In [5]:
# Destination for the job's results: the session's default bucket plus a fixed
# key prefix (the trailing "/" is part of the prefix).
bucket_name = sagemaker_session.default_bucket()
key_prefix = "/".join(("koombea_website_ml", "koombea_blogs_information")) + "/"
s3_bucket_name = "s3://" + bucket_name + "/" + key_prefix

## Run processor job

In [6]:
# Directory inside the container that run.py receives via --output-path; the
# same path is declared as the ProcessingOutput source so its contents are
# sent to the S3 destination.
source_output = "/opt/ml/processing/processed_data"
processed_data_output = ProcessingOutput(
    source=source_output,
    destination=s3_bucket_name,
)
process_job.run(
    outputs=[processed_data_output],
    arguments=["--output-path", source_output],
)

INFO:sagemaker:Creating processing-job with name etl-koombea-blogs-job-2023-02-08-16-04-03-421
INFO:sagemaker.local.local_session:Starting processing job
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:sagemaker.local.image:No AWS credentials found in session but credentials from EC2 Metadata Service are available.
INFO:sagemaker.local.image:docker compose file: 
networks:
  sagemaker-local:
    name: sagemaker-local
services:
  algo-1-4jkap:
    container_name: we70axfdbq-algo-1-4jkap
    entrypoint:
    - python
    - run.py
    - --output-path
    - /opt/ml/processing/processed_data
    environment:
    - '[Masked]'
    - '[Masked]'
    - '[Masked]'
    - '[Masked]'
    - '[Masked]'
    - '[Masked]'
    - '[Masked]'
    - '[Masked]'
    - '[Masked]'
    - '[Masked]'
    - '[Masked]'
    - '[Masked]'
    - '[Masked]'
    image: koombea_blogs_extraction_koombea_blogs_extraction_component
    networks:
      sagemaker-local:
        a


Job Name:  etl-koombea-blogs-job-2023-02-08-16-04-03-421
Inputs:  []
Outputs:  [{'OutputName': 'output-1', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-west-2-256305374409/koombea_website_ml/koombea_blogs_information/', 'LocalPath': '/opt/ml/processing/processed_data', 'S3UploadMode': 'EndOfJob'}}]
Creating we70axfdbq-algo-1-4jkap ... 
Creating we70axfdbq-algo-1-4jkap ... done
Attaching to we70axfdbq-algo-1-4jkap
we70axfdbq-algo-1-4jkap | 2023-02-08 16:04:09,441 - INFO - generated new fontManager
we70axfdbq-algo-1-4jkap | 2023-02-08 16:04:09,812 - INFO - /opt/ml/koombea_blogs/connection/dataBaseKey.pem
we70axfdbq-algo-1-4jkap | 2023-02-08 16:04:10,031 - INFO - Connected (version 2.0, client Go)
we70axfdbq-algo-1-4jkap | 2023-02-08 16:04:10,558 - INFO - Authentication (publickey) successful!
we70axfdbq-algo-1-4jkap | 2023-02-08 16:04:10,559 - INFO - Connect to the following sqlalchemy url: mysql+pymysql://koombea20:[REDACTED]

## Show S3 files

In [7]:
# List what the processing job uploaded under the key prefix.
# NOTE: list_objects_v2 returns at most 1,000 keys per call; switch to a
# paginator if this prefix ever grows beyond that.
s3_client = sagemaker_session.boto_session.client("s3")
listing = s3_client.list_objects_v2(
    Bucket=bucket_name,
    Prefix=key_prefix
)
# "Contents" is missing from the response when nothing matches the prefix, so
# fall back to an empty list instead of raising KeyError.
contents = listing.get("Contents", [])
for content in contents:
    # Skip "folder" placeholder keys (keys ending in "/") explicitly rather
    # than dropping the first entry by position, which would hide a real file
    # whenever no placeholder object exists.
    if content["Key"].endswith("/"):
        continue
    print("filename : s3://" + bucket_name + "/" + content["Key"])

filename : s3://sagemaker-us-west-2-256305374409/koombea_website_ml/koombea_blogs_information/blogs_df_wp_koombea20.csv
filename : s3://sagemaker-us-west-2-256305374409/koombea_website_ml/koombea_blogs_information/blogs_df_wp_koombea20stg.csv
filename : s3://sagemaker-us-west-2-256305374409/koombea_website_ml/koombea_blogs_information/en_data_wp_koombea20.json
filename : s3://sagemaker-us-west-2-256305374409/koombea_website_ml/koombea_blogs_information/en_data_wp_koombea20stg.json
filename : s3://sagemaker-us-west-2-256305374409/koombea_website_ml/koombea_blogs_information/es_data_wp_koombea20.json
filename : s3://sagemaker-us-west-2-256305374409/koombea_website_ml/koombea_blogs_information/es_data_wp_koombea20stg.json
