# start 

In [2]:
import os

import sagemaker
from sagemaker import ScriptProcessor, ModelMetrics, MetricsSource, TrainingInput
from sagemaker.estimator import Estimator
from sagemaker.network import NetworkConfig
from sagemaker.processing import ProcessingInput, ProcessingOutput, ScriptProcessor
from sagemaker.sklearn import SKLearnProcessor, SKLearn
from sagemaker.model import Model
from sagemaker.workflow.condition_step import JsonGet, ConditionStep
from sagemaker.workflow.conditions import ConditionLessThanOrEqualTo
from sagemaker.workflow.conditions import ConditionGreaterThanOrEqualTo
from sagemaker.workflow.parameters import ParameterInteger, ParameterString, ParameterFloat
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.properties import PropertyFile
from sagemaker.workflow.step_collections import RegisterModel
from sagemaker.workflow.steps import ProcessingStep, TrainingStep
from sagemaker.dataset_definition.inputs import (
    AthenaDatasetDefinition,
    DatasetDefinition,
)
from datetime import datetime
import time
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.pipeline_experiment_config import PipelineExperimentConfig
from sagemaker.workflow.execution_variables import ExecutionVariables
from sagemaker.workflow.functions import Join
from sagemaker.transformer import Transformer
from sagemaker.inputs import TransformInput
from sagemaker.workflow.steps import TransformStep

import json
import logging
import pathlib
import pickle
import tarfile

import numpy as np
import pandas as pd
# import xgboost

from sklearn.metrics import mean_squared_error

logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler())

In [3]:
!aws codeartifact login --tool pip --domain cirrus-ml-ds-domain --domain-owner 813736554012 --repository cirrus-ml-ds-shared-repo
# !pip install awswrangler --quiet
!pip install lightgbm --quiet
# !pip install category_encoders --quiet
# !pip install imbalanced-learn --quiet

Successfully configured pip to use AWS CodeArtifact repository https://cirrus-ml-ds-domain-813736554012.d.codeartifact.eu-north-1.amazonaws.com/pypi/cirrus-ml-ds-shared-repo/ 
Login expires in 12 hours at 2022-10-06 19:25:54+00:00
[0m

In [4]:
import boto3
import sagemaker


def get_environment(project_name, ssm_params):
    sm = boto3.client("sagemaker")
    ssm = boto3.client("ssm")

    r = sm.describe_domain(
            DomainId=sm.describe_project(
                ProjectName=project_name
                )["CreatedBy"]["DomainId"]
        )
    del r["ResponseMetadata"]
    del r["CreationTime"]
    del r["LastModifiedTime"]
    r = {**r, **r["DefaultUserSettings"]}
    del r["DefaultUserSettings"]

    i = {
        **r,
        **{t["Key"]:t["Value"]
            for t in sm.list_tags(ResourceArn=r["DomainArn"])["Tags"]
            if t["Key"] in ["EnvironmentName", "EnvironmentType"]}
    }

    for p in ssm_params:
        try:
            i[p["VariableName"]] = ssm.get_parameter(Name=f"{i['EnvironmentName']}-{i['EnvironmentType']}-{p['ParameterName']}")["Parameter"]["Value"]
        except:
            i[p["VariableName"]] = ""

    return i


def get_session(region, default_bucket):
    """Gets the sagemaker session based on the region.

    Args:
        region: the aws region to start the session
        default_bucket: the bucket to use for storing the artifacts

    Returns:
        sagemaker.session.Session instance
    """

    boto_session = boto3.Session(region_name=region)

    sagemaker_client = boto_session.client("sagemaker")
    runtime_client = boto_session.client("sagemaker-runtime")
    print(f"Sarah: ml_pipelines > utiles > environments.py > sagemaker.__version__ is {sagemaker.__version__}")
    return sagemaker.session.Session(
        boto_session=boto_session,
        sagemaker_client=sagemaker_client,
        sagemaker_runtime_client=runtime_client,
        default_bucket=default_bucket,
    ), sagemaker_client


def environment_data(project_name):
    # Dynamically load environmental SSM parameters - provide the list of the variables to load from SSM parameter store
    ssm_parameters = [
        {"VariableName": "DataBucketName", "ParameterName": "data-bucket-name"},
        {"VariableName": "ModelBucketName", "ParameterName": "model-bucket-name"},
        {"VariableName": "S3KmsKeyId", "ParameterName": "kms-s3-key-arn"},
        {"VariableName": "EbsKmsKeyArn", "ParameterName": "kms-ebs-key-arn"},
        {"VariableName": "TrustedDefaultKinesisAccount", "ParameterName": "TrustedDefaultKinesisAccount"},
    ]
    env_data = get_environment(project_name=project_name, ssm_params=ssm_parameters)
    env_data["ProcessingRole"] = env_data["ExecutionRole"]
    env_data["TrainingRole"] = env_data["ExecutionRole"]
    
    return env_data


# get_latest_model_metadata

In [5]:
def get_latest_model_metadata(sm_client, model_package_group_name):
    print("SARAH: Inference > get_latest_model_metadata()")
    approved_packages = []
    for p in sm_client.get_paginator('list_model_packages').paginate(
            ModelPackageGroupName=model_package_group_name,
            ModelApprovalStatus='PendingManualApproval',
            SortBy="CreationTime",
            SortOrder="Descending",
    ):
        approved_packages.extend(p["ModelPackageSummaryList"])
    model_metadata = sm_client.describe_model_package(ModelPackageName =approved_packages[0]["ModelPackageArn"])
    print(model_metadata)
    print("SARAH: Inference > get_latest_model_metadata() END")
    return model_metadata


In [6]:
import os
import json

def get_pipeline(
        region,
        project_name=None,
        source_scripts_path="./",
        model_package_group_name="AbalonePackageGroup",
        pipeline_name="AbalonePipeline",
        base_job_prefix="Abalone",
        revision="no-revision-provided",):
    """Gets a SageMaker ML Pipeline instance working with on abalone data.

    Args:
        region: AWS region to create and run the pipeline.
        @todo arg. definitions

    Returns:
        an instance of a pipeline
    """

    # get env data
    env_data = environment_data(project_name)
    print(f"Environment data:\n{json.dumps(env_data, indent=2)}")
    print(f"SARAH: get_pipeline() > source_scripts_path={source_scripts_path}")

    sagemaker_session, sagemaker_client = get_session(region, env_data["DataBucketName"])
    default_bucket = sagemaker_session.default_bucket()
    base_dir = os.getcwd()
    print(f"Creating the pipeline '{pipeline_name}':")
    print(f"Parameters:{region}\n{env_data['SecurityGroups']}\n{env_data['SubnetIds']}\n{env_data['ProcessingRole']}\n\
    {env_data['TrainingRole']}\n{env_data['DataBucketName']}\n{env_data['ModelBucketName']}\n{model_package_group_name}\n\
    {pipeline_name}\n{base_job_prefix}\n{env_data['TrustedDefaultKinesisAccount']}")
    model_metadata = get_latest_model_metadata(sagemaker_client, model_package_group_name)


    return model_metadata


In [7]:
model_metadata=get_pipeline(
        region='eu-north-1',
        project_name='customerone-inf',
        source_scripts_path="/root/sagemaker-customerone2-p-vrs1c6dm1yir-model-build-train/source_scripts/",
        model_package_group_name="customerone-inf-p-0mogq7hgpkye",
        pipeline_name="debugging-inf-cv2",
        base_job_prefix="CVM",
        revision="no-revision-provided",)

Environment data:
{
  "DomainArn": "arn:aws:sagemaker:eu-north-1:370702650160:domain/d-tdizim9qnor9",
  "DomainId": "d-tdizim9qnor9",
  "DomainName": "mlops-dev-eu-north-1-sagemaker-domain",
  "HomeEfsFileSystemId": "fs-03fc3d37f8623fea2",
  "Status": "InService",
  "AuthMode": "IAM",
  "AppNetworkAccessType": "VpcOnly",
  "SubnetIds": [
    "subnet-0724be5e7071e7070",
    "subnet-01def51ffe7467c71"
  ],
  "Url": "https://d-tdizim9qnor9.studio.eu-north-1.sagemaker.aws",
  "VpcId": "vpc-0459a28f3637e285c",
  "KmsKeyId": "f4664542-0f2e-42ca-b51f-2bec0ad62278",
  "ExecutionRole": "arn:aws:iam::370702650160:role/sm-mlops-env-EnvironmentIAM-SageMakerExecutionRole-14AU65MVMBUGO",
  "SecurityGroups": [
    "sg-041054ee4500f96f6"
  ],
  "JupyterServerAppSettings": {
    "DefaultResourceSpec": {
      "SageMakerImageArn": "arn:aws:sagemaker:eu-north-1:243637512696:image/jupyter-server-3",
      "InstanceType": "system",
      "LifecycleConfigArn": "arn:aws:sagemaker:eu-north-1:370702650160:stud

In [8]:
model_metadata

{'ModelPackageGroupName': 'customerone-inf-p-0mogq7hgpkye',
 'ModelPackageVersion': 6,
 'ModelPackageArn': 'arn:aws:sagemaker:eu-north-1:370702650160:model-package/customerone-inf-p-0mogq7hgpkye/6',
 'CreationTime': datetime.datetime(2022, 10, 5, 16, 3, 18, 819000, tzinfo=tzlocal()),
 'InferenceSpecification': {'Containers': [{'Image': '662702820516.dkr.ecr.eu-north-1.amazonaws.com/sagemaker-scikit-learn:0.23-1-cpu-py3',
    'ImageDigest': 'sha256:4b41b03ae858fca42720730b06e2803b3b046c676c143526cba9ed5a3483e4ad',
    'ModelDataUrl': 's3://mlops-dev-370702650160-eu-north-1-data/lifecycle/60d/customerone-inf/b4caacc/2022_10_05_15_30_26/p1033/output/training/output/pipelines-cqd2rjsw3lpv-ModelSelectionStep-O6kzGlCDY1/output/model.tar.gz'}],
  'SupportedTransformInstanceTypes': ['ml.m5.xlarge'],
  'SupportedRealtimeInferenceInstanceTypes': ['ml.t2.medium', 'ml.m5.xlarge'],
  'SupportedContentTypes': ['text/csv'],
  'SupportedResponseMIMETypes': ['text/csv']},
 'ModelPackageStatus': 'Comple

In [9]:
model_s3_path = model_metadata['InferenceSpecification']['Containers'][0]['ModelDataUrl']
model_s3_path

's3://mlops-dev-370702650160-eu-north-1-data/lifecycle/60d/customerone-inf/b4caacc/2022_10_05_15_30_26/p1033/output/training/output/pipelines-cqd2rjsw3lpv-ModelSelectionStep-O6kzGlCDY1/output/model.tar.gz'

# boto3

In [10]:
"""Evaluation script for measuring mean squared error."""
import json
import logging
import pathlib
import pickle
import tarfile

import numpy as np
import pandas as pd
from sklearn.externals import joblib

import boto3



In [11]:
s3=boto3.client('s3')

# Get the trained model

In [12]:
s3.download_file('mlops-dev-370702650160-eu-north-1-data', 
                 'lifecycle/60d/customerone-inf/601f446/2022_10_03_13_36_50/p1033/output/training/output/pipelines-u5lr8glo2o8k-ModelSelectionStep-MgOYTUvvLF/output/model.tar.gz', 
                 'model.tar.gz')

In [13]:
model_path = "model.tar.gz"
with tarfile.open(model_path) as tar:
    tar.extractall(path="./test")

# logger.info("Loading xgboost model.")
# model = pickle.load(open("./test/model.joblib", "rb"))

In [14]:
# import lightgbm as lgb

# lgb.Booster(model_file='./test/model.joblib')

In [15]:
from sklearn.externals import joblib
# save model
# joblib.dump(lgbmodel, 'lgb.pkl')
# load model
model_lgbm = joblib.load('./test/model.joblib')



# Get the data from S3

In [16]:
inference_data_path = 's3://mlops-dev-370702650160-eu-north-1-data/lifecycle/60d/customerone-inf/customerone-inf-p-0mogq7hgpkye-inference/1664892089/p1033/model-input/inference-data.csv'

inference_data_df = pd.read_csv(inference_data_path)
print(inference_data_df.shape) # (5895, 67)
inference_data_df.head()

(5895, 67)


Unnamed: 0,2021-09-04,100215,45,Unnamed: 3,Unnamed: 4,0.0,Unnamed: 6,0,0.1,Unnamed: 9,...,0.0.7,0.17,Unnamed: 59,0.18,0.19,1.3,Unnamed: 63,0.20,Unnamed: 65,Unnamed: 66
0,2021-07-22,100375,35,0.0,0.0,0.0,0.0,1,0,,...,0.0,0,,2,0,3,0.0,1,0.0,0.0
1,2021-08-16,100488,81,0.0,0.0,0.0,0.0,1,0,1.0,...,0.0,0,,2,0,0,0.0,1,0.0,0.0
2,2021-06-13,100736,97,,,0.0,0.645161,0,0,1.033333,...,0.0,0,,5,0,2,0.0,0,,
3,2021-06-13,100736,97,,,0.0,0.645161,0,0,1.033333,...,0.0,0,,5,0,2,0.0,0,,
4,2021-08-22,100874,80,0.0,0.0,0.0,0.0,2,0,0.0,...,0.0,0,1.0,2,0,4,0.0,0,,


In [17]:
inference_data_df = inference_data_df.drop(inference_data_df.columns[[0,1]], axis=1)

In [18]:
results = model_lgbm.predict(inference_data_df)

In [19]:
inference_output_path = 's3://mlops-dev-370702650160-eu-north-1-data/lifecycle/60d/customerone-inf/customerone-inf-p-0mogq7hgpkye-inference/1664856691/p1033/'
pd.DataFrame(results).to_csv(inference_output_path+'inference_result.csv')

In [20]:
# Another input data 

In [36]:
inf_data = 's3://mlops-dev-370702650160-eu-north-1-data/lifecycle/60d/customerone-inf/customerone-inf-p-0mogq7hgpkye-inference/1664986449/p1033/model-input/inference-data.csv'
df2 = pd.read_csv(inf_data)
print(df2.shape) # (5895, 67)
df2 = df2.drop(df2.columns[[0, 1]], axis=1)
df2.head()

(5896, 67)


Unnamed: 0,rev_m_bill_shock_eom_total_bill_amt_1m_to_avg_12m_flg,dmgrphc_b_zip_code_cd,rev_m_bill_shock_eom_total_bill_amt_0m_to_avg_01m_flg,cust_prd_hld_d_tvchannelpackage_active_30_to_60_days_avg_val,ci_d_sum_total_resolved_cmpl_last_0_to_30_days_val,ci_d_sum_total_compl_last_0_to_30_days_cnt,cust_prd_hld_d_vas_closed_val,cust_prd_hld_d_vas_closed_30_to_60_days_avg_val,rev_m_eom_total_bill_amt_m0_val,cust_prd_hld_d_fixedbroadband_active_val,...,cust_prd_hld_d_tvchannelpackage_closed_0_to_30_days_avg_val,cust_prd_hld_d_tvchannelpackage_closed_30_to_60_days_avg_val,cust_prd_hld_d_postpaid_tvchannelpackage_cnt,ci_d_sum_total_compl_last_0_to_60_days_cnt,cust_prd_hld_d_fixedbroadband_closed_30_to_60_days_avg_val,ci_d_total_num_of_interactions_weekday_last_0_to_30_cnt,cust_prd_hld_d_mobilevoicesubscription_closed_val,cust_prd_hld_d_mobilevoicesubscription_active_val,cust_prd_hld_d_fixedbroadband_active_0_to_30_days_avg_val,cust_prd_hld_d_vas_active_val
0,0.0,28,0.0,1.0,0,0,0,0.0,379.333872,1,...,0.483871,0.0,2,0,0.0,0,0,0,1.0,1
1,,34,,0.0,0,0,0,0.0,,1,...,0.0,0.0,1,0,0.0,0,0,1,1.0,0
2,,34,,0.0,0,0,0,0.0,,1,...,0.0,0.0,1,0,0.0,0,0,1,1.0,0
3,,34,,0.0,0,0,0,0.0,,1,...,0.0,0.0,1,0,0.0,0,0,1,1.0,0
4,0.0,99,0.0,1.4,0,0,0,0.0,175.992847,1,...,0.0,0.0,6,0,0.0,1,0,0,0.032258,1


In [37]:
type(df2)

pandas.core.frame.DataFrame

In [39]:
df2.values

array([[ 0.        , 28.        ,  0.        , ...,  0.        ,
         1.        ,  1.        ],
       [        nan, 34.        ,         nan, ...,  1.        ,
         1.        ,  0.        ],
       [        nan, 34.        ,         nan, ...,  1.        ,
         1.        ,  0.        ],
       ...,
       [        nan,  5.        ,         nan, ...,  1.        ,
         0.        ,  2.        ],
       [        nan, 45.        ,         nan, ...,  1.        ,
         2.74193548,  1.        ],
       [ 0.        , 74.        ,  0.        , ...,  1.        ,
         1.12903226,  2.        ]])

In [38]:
model_lgbm.predict(df2.values)

array([0, 0, 0, ..., 0, 0, 0])