### 비지도 학습: K-means 군집화
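- 아래와 같이 두 개의 S3 bucket을 직접 만들어 사용한다 (bucket 이름 뒤의 숫자는 각자 고유한 값으로 바꿔서 사용한다).  
`awsml-sagemaker-test-source-05-15` : 입력 데이터(`iris_train.csv`, `iris_test.csv`)를 올려 두는 bucket  
`awsml-sagemaker-test-result-05-15` : 학습 결과가 저장되는 bucket (`output_path`)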

In [1]:
import sagemaker
from sagemaker import KMeans

# Get a SageMaker-compatible role used by this notebook instance.
role = sagemaker.get_execution_role()

# Session object used to manage the interaction with the SageMaker API.
sagemaker_session = sagemaker.Session()

# S3 locations. The training object is stored under the key 'iris_train.csv'
# (underscore), which is the key the data-loading cell reads from this bucket.
input_location = 's3://awsml-sagemaker-test-source-05-15/iris_train.csv'
output_location = 's3://awsml-sagemaker-test-result-05-15'

# Estimator for Amazon SageMaker's own implementation of the k-means
# algorithm, with hyperparameter k = 3.
# The instance settings use the `instance_count` / `instance_type` argument
# names (not the older `train_instance_count` / `train_instance_type`).
kmeans_estimator = KMeans(role=role,
                instance_count=1,
                instance_type='ml.m5.xlarge',
                output_path=output_location,
                k=3)

In [2]:
import io

import boto3
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the training and test datasets from Amazon S3.
s3_client = boto3.client('s3')
s3_bucket_name = 'awsml-sagemaker-test-source-05-15'


def _read_s3_csv(key, **read_csv_kwargs):
    """Download `key` from `s3_bucket_name` and parse it as a CSV with a header row.

    Args:
        key: Object key inside the source bucket.
        **read_csv_kwargs: Extra keyword arguments forwarded to `pd.read_csv`.

    Returns:
        The parsed `pandas.DataFrame`.
    """
    body = s3_client.get_object(Bucket=s3_bucket_name, Key=key)["Body"].read()
    return pd.read_csv(io.BytesIO(body), header=0, delimiter=",",
                       low_memory=False, **read_csv_kwargs)


df_iris_train = _read_s3_csv('iris_train.csv')
df_iris_test = _read_s3_csv('iris_test.csv', index_col=False)

# Convert the target variable 'species' from strings into integers.
# The encoder is fitted ONCE on the labels of both splits: calling fit() a
# second time discards the classes learned by the first call, so fitting on
# train and then on test would leave only the test labels known.
labelEncoder = LabelEncoder()
labelEncoder.fit(pd.concat([df_iris_train['species'], df_iris_test['species']]))
df_iris_train['species'] = labelEncoder.transform(df_iris_train['species'])
df_iris_test['species'] = labelEncoder.transform(df_iris_test['species'])

# Separate each dataset into features and target. The target is selected by
# its column name ('species') so the split does not depend on where that
# column sits in the CSV files.
#
# Since training a k-means model does not require labelled training data,
# df_iris_target_train is not used for training.
df_iris_features_train = df_iris_train.drop(columns=['species'])
df_iris_target_train = df_iris_train['species']

df_iris_features_test = df_iris_test.drop(columns=['species'])
df_iris_target_test = df_iris_test['species']

# Create a training job: convert the float32 feature matrix into a record set
# for the estimator, then fit.
train_data = df_iris_features_train.values.astype('float32')
record_set = kmeans_estimator.record_set(train_data)

# Takes about 3 minutes.
kmeans_estimator.fit(record_set)

INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: kmeans-2023-05-20-14-39-43-132


2023-05-20 14:39:45 Starting - Starting the training job...
2023-05-20 14:40:01 Starting - Preparing the instances for training......
2023-05-20 14:41:04 Downloading - Downloading input data...
2023-05-20 14:41:30 Training - Downloading the training image.....[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[05/20/2023 14:42:32 INFO 140499369477952] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'init_method': 'random', 'mini_batch_size': '5000', 'epochs': '1', 'extra_center_factor': 'auto', 'local_lloyd_max_iter': '300', 'local_lloyd_tol': '0.0001', 'local_lloyd_init_method': 'kmeans++', 'local_lloyd_num_trials': 'auto', 'half_life_time_size': '0', 'eval_metrics': '["msd"]', 'force_dense': 'true', '_disable_wait_to_read': 'false', '_enable_profiler': 'false', '_kvstore': 'auto', '_log_level': 'info', '_num_gpus': 'auto', '_num_kv_servers': '1', '

In [3]:
# Deploy the model to a prediction instance and create a prediction endpoint.
# Takes about 3 minutes.
predictor = kmeans_estimator.deploy(
    instance_type="ml.m5.xlarge",
    initial_instance_count=1,
)

INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating model with name: kmeans-2023-05-20-14-43-07-139
INFO:sagemaker:Creating endpoint-config with name kmeans-2023-05-20-14-43-07-139
INFO:sagemaker:Creating endpoint with name kmeans-2023-05-20-14-43-07-139


------!

In [4]:
# Send the test features to the endpoint as a float32 matrix.
test_data = df_iris_features_test.to_numpy().astype('float32')
predictions = predictor.predict(test_data)

# Each prediction holds the closest cluster and the distance to that
# cluster's centre.
print(predictions)

[label {
  key: "closest_cluster"
  value {
    float32_tensor {
      values: 1.0
    }
  }
}
label {
  key: "distance_to_cluster"
  value {
    float32_tensor {
      values: 0.695756733417511
    }
  }
}
, label {
  key: "closest_cluster"
  value {
    float32_tensor {
      values: 0.0
    }
  }
}
label {
  key: "distance_to_cluster"
  value {
    float32_tensor {
      values: 0.16961853206157684
    }
  }
}
, label {
  key: "closest_cluster"
  value {
    float32_tensor {
      values: 2.0
    }
  }
}
label {
  key: "distance_to_cluster"
  value {
    float32_tensor {
      values: 0.6240471005439758
    }
  }
}
, label {
  key: "closest_cluster"
  value {
    float32_tensor {
      values: 0.0
    }
  }
}
label {
  key: "distance_to_cluster"
  value {
    float32_tensor {
      values: 0.46935996413230896
    }
  }
}
, label {
  key: "closest_cluster"
  value {
    float32_tensor {
      values: 0.0
    }
  }
}
label {
  key: "distance_to_cluster"
  value {
    float32_tensor {


In [33]:
# Print the assigned cluster and the distance to its centre for every record.
#
# The fields are read straight from each record's `label` map rather than by
# regex-matching str(predictions): a digits-and-dots pattern truncates values
# printed in exponent notation (e.g. 5e-05) and skips inf/nan, which would
# shift every following cluster/distance pair.
print('[cluster]       [distance]')
for record in predictions:
    cluster = record.label['closest_cluster'].float32_tensor.values[0]
    distance = record.label['distance_to_cluster'].float32_tensor.values[0]
    print(' ', cluster, '  :   ', distance)

[cluster]       [distance]
  1.0   :    0.695756733417511
  0.0   :    0.16961853206157684
  2.0   :    0.6240471005439758
  0.0   :    0.46935996413230896
  0.0   :    0.5570276975631714
  0.0   :    0.648684561252594
  0.0   :    0.6549350023269653
  2.0   :    1.5151643753051758
  0.0   :    0.9439111948013306
  0.0   :    0.8950684070587158
  2.0   :    0.5623982548713684
  1.0   :    0.33278292417526245
  0.0   :    0.34041574597358704
  1.0   :    0.2464577555656433
  2.0   :    0.7677396535873413
  1.0   :    0.43911445140838623
  1.0   :    0.7276668548583984
  2.0   :    0.49540120363235474
  2.0   :    0.8049861192703247
  2.0   :    0.5299631357192993
  0.0   :    0.7368848323822021
  1.0   :    0.21965493261814117
  0.0   :    0.8057723641395569
  0.0   :    0.7890915274620056
  0.0   :    0.4777616858482361
  0.0   :    0.6650727391242981
  0.0   :    0.8163267970085144
  0.0   :    0.37343013286590576
  1.0   :    0.18005862832069397
  0.0   :    0.553978681564331
  1.0  

In [5]:
# Terminate the prediction instance and its associated HTTPS endpoint,
# together with the endpoint configuration.
predictor.delete_endpoint(delete_endpoint_config=True)

INFO:sagemaker:Deleting endpoint configuration with name: kmeans-2023-05-20-14-43-07-139
INFO:sagemaker:Deleting endpoint with name: kmeans-2023-05-20-14-43-07-139
