## Sagemaker - Sleep Quality

In [1]:
import sklearn # Check Sklearn version
sklearn.__version__

'1.4.1.post1'

## 1. Initialize Boto3 SDK and create S3 bucket. 

In [2]:
import numpy as np
from sagemaker import get_execution_role
import sagemaker
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
import datetime
import time
import tarfile
import boto3
import pandas as pd

sm_boto3 = boto3.client("sagemaker")
sess = sagemaker.Session()
region = sess.boto_session.region_name
bucket = 'ml-data-repository-pp2959' # Mention the created S3 bucket name here
print("Using bucket " + bucket)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
Using bucket ml-data-repository-pp2959


## 3. Data Exploration and Understanding.

In [3]:
df = pd.read_csv("sleep_data.csv")

In [4]:
df.head()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea


In [5]:
df.shape

(374, 13)

In [6]:
df.columns

Index(['Person ID', 'Gender', 'Age', 'Occupation', 'Sleep Duration',
       'Quality of Sleep', 'Physical Activity Level', 'Stress Level',
       'BMI Category', 'Blood Pressure', 'Heart Rate', 'Daily Steps',
       'Sleep Disorder'],
      dtype='object')

In [7]:
# ['Low_Risk','High_Risk'],[0,1]
df = df.drop(['Stress Level', 'Blood Pressure', 'Person ID', 'Occupation'], axis=1)
df = pd.get_dummies(df)

In [8]:
df.columns

Index(['Age', 'Sleep Duration', 'Quality of Sleep', 'Physical Activity Level',
       'Heart Rate', 'Daily Steps', 'Gender_Female', 'Gender_Male',
       'BMI Category_Normal', 'BMI Category_Normal Weight',
       'BMI Category_Obese', 'BMI Category_Overweight',
       'Sleep Disorder_Insomnia', 'Sleep Disorder_Sleep Apnea'],
      dtype='object')

In [9]:
df.shape

(374, 14)

In [10]:
# Find the Percentage of Values are missing
df.isnull().mean() * 100

Age                           0.0
Sleep Duration                0.0
Quality of Sleep              0.0
Physical Activity Level       0.0
Heart Rate                    0.0
Daily Steps                   0.0
Gender_Female                 0.0
Gender_Male                   0.0
BMI Category_Normal           0.0
BMI Category_Normal Weight    0.0
BMI Category_Obese            0.0
BMI Category_Overweight       0.0
Sleep Disorder_Insomnia       0.0
Sleep Disorder_Sleep Apnea    0.0
dtype: float64

In [11]:
features = list(df.columns)
features

['Age',
 'Sleep Duration',
 'Quality of Sleep',
 'Physical Activity Level',
 'Heart Rate',
 'Daily Steps',
 'Gender_Female',
 'Gender_Male',
 'BMI Category_Normal',
 'BMI Category_Normal Weight',
 'BMI Category_Obese',
 'BMI Category_Overweight',
 'Sleep Disorder_Insomnia',
 'Sleep Disorder_Sleep Apnea']

In [12]:
label = 'Quality of Sleep'
features.remove(label)
label

'Quality of Sleep'

In [13]:
x = df[features]
y = df[label]

In [14]:
x.head()

Unnamed: 0,Age,Sleep Duration,Physical Activity Level,Heart Rate,Daily Steps,Gender_Female,Gender_Male,BMI Category_Normal,BMI Category_Normal Weight,BMI Category_Obese,BMI Category_Overweight,Sleep Disorder_Insomnia,Sleep Disorder_Sleep Apnea
0,27,6.1,42,77,4200,False,True,False,False,False,True,False,False
1,28,6.2,60,75,10000,False,True,True,False,False,False,False,False
2,28,6.2,60,75,10000,False,True,True,False,False,False,False,False
3,28,5.9,30,85,3000,False,True,False,False,True,False,False,True
4,28,5.9,30,85,3000,False,True,False,False,True,False,False,True


In [15]:
# {0: 'Low_Risk',1: 'High_Risk'}
y.head()

0    6
1    6
2    6
3    4
4    4
Name: Quality of Sleep, dtype: int64

In [16]:
x.shape

(374, 13)

In [17]:
y.value_counts()

Quality of Sleep
8    109
6    105
7     77
9     71
5      7
4      5
Name: count, dtype: int64

In [18]:
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.25, random_state=42, shuffle = True)

In [19]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(280, 13)
(94, 13)
(280,)
(94,)


## 4. Split the data into Train/Test CSV File. 

In [20]:
trainX = pd.DataFrame(X_train)
trainX[label] = y_train

testX = pd.DataFrame(X_test)
testX[label] = y_test

In [21]:
print(trainX.shape)
print(testX.shape)

(280, 14)
(94, 14)


In [22]:
trainX.head()

Unnamed: 0,Age,Sleep Duration,Physical Activity Level,Heart Rate,Daily Steps,Gender_Female,Gender_Male,BMI Category_Normal,BMI Category_Normal Weight,BMI Category_Obese,BMI Category_Overweight,Sleep Disorder_Insomnia,Sleep Disorder_Sleep Apnea,Quality of Sleep
222,44,6.3,45,72,6000,False,True,False,False,False,True,True,False,6
227,44,6.3,45,72,6000,False,True,False,False,False,True,True,False,6
141,38,7.1,60,68,8000,False,True,True,False,False,False,False,False,8
17,29,6.0,30,70,8000,False,True,True,False,False,False,False,True,6
246,44,6.3,45,72,6000,False,True,False,False,False,True,True,False,6


In [23]:
trainX.isnull().sum()

Age                           0
Sleep Duration                0
Physical Activity Level       0
Heart Rate                    0
Daily Steps                   0
Gender_Female                 0
Gender_Male                   0
BMI Category_Normal           0
BMI Category_Normal Weight    0
BMI Category_Obese            0
BMI Category_Overweight       0
Sleep Disorder_Insomnia       0
Sleep Disorder_Sleep Apnea    0
Quality of Sleep              0
dtype: int64

In [24]:
testX.isnull().sum()

Age                           0
Sleep Duration                0
Physical Activity Level       0
Heart Rate                    0
Daily Steps                   0
Gender_Female                 0
Gender_Male                   0
BMI Category_Normal           0
BMI Category_Normal Weight    0
BMI Category_Obese            0
BMI Category_Overweight       0
Sleep Disorder_Insomnia       0
Sleep Disorder_Sleep Apnea    0
Quality of Sleep              0
dtype: int64

## 5. Upload data into the S3 Bucket.

In [25]:
trainX.to_csv("train-V-1.csv",index = False)
testX.to_csv("test-V-1.csv", index = False)

In [26]:
# send data to S3. SageMaker will take training data from s3
sk_prefix = "sagemaker/health-data/sklearncontainer"
trainpath = sess.upload_data(
    path="train-V-1.csv", bucket=bucket, key_prefix=sk_prefix
)

testpath = sess.upload_data(
    path="test-V-1.csv", bucket=bucket, key_prefix=sk_prefix
)

In [27]:
testpath

's3://ml-data-repository-pp2959/sagemaker/health-data/sklearncontainer/test-V-1.csv'

In [28]:
trainpath

's3://ml-data-repository-pp2959/sagemaker/health-data/sklearncontainer/train-V-1.csv'

In [None]:
sleep_models_dir = 's3://ml-data-repository-pp2959/sagemaker/sleep-models'

## 6. Create Training Script

In [63]:
%%writefile sleep_script.py


from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import validation_curve, train_test_split

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_curve, auc
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO 
import argparse
import joblib
import os
import numpy as np
import pandas as pd
import boto3

s3 = boto3.client("s3")


# inference functions ---------------

def input_fn(request_body, request_content_type):
    print(request_body)
    print(request_content_type)
    if request_content_type == "text/csv":
        request_body = request_body.strip()
        try:
            df = pd.read_csv(StringIO(request_body), header=None)
            return df
        
        except Exception as e:
            print(e)
    else:
        return """Please use Content-Type = 'text/csv' and, send the request!!""" 
 
    
def model_fn(model_dir):
    clf = joblib.load(os.path.join(model_dir, "model.joblib"))
    return clf

def predict_fn(input_data, model):
    if type(input_data) != str:
        prediction = model.predict(input_data)
        print(prediction)
        return prediction
    else:
        return input_data
        
    
if __name__ == "__main__":

    print("[INFO] Extracting arguments")
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--random_state", type=int, default=0)

    # Data, model, and output directories
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="train-V-1.csv")
    parser.add_argument("--test-file", type=str, default="test-V-1.csv")
    parser.add_argument("--s3-bucket", type=str, default="ml-data-repository-pp2959")
    parser.add_argument("--s3-data-key", type=str, default="sagemaker/health-data/sklearncontainer/")
    parser.add_argument("--s3-model-key", type=str, default="sagemaker/sleep-models/")

    args, _ = parser.parse_known_args()
    
    print("SKLearn Version: ", sklearn.__version__)
    print("Joblib Version: ", joblib.__version__)

    print("[INFO] Reading data")
    print()
    
    train_path = os.path.join(args.train, args.train_file)
    test_path = os.path.join(args.test, args.test_file)
    s3.download_file(args.s3_bucket, args.s3_data_key + args.train_file, train_path)
    s3.download_file(args.s3_bucket, args.s3_data_key + args.test_file, test_path)
    
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    
    print(train_df.head())
    print(test_df.head())
    
    features = list(train_df.columns)
    label = features.pop(-1)
    
    print("Building training and testing datasets")
    print()
    X_train = train_df[features]
    X_test = test_df[features]
    y_train = train_df[label]
    y_test = test_df[label]

    print('Column order: ')
    print(features)
    print()
    
    print("Label column is: ",label)
    print()
    
    print("Data Shape: ")
    print()
    print("---- SHAPE OF TRAINING DATA (85%) ----")
    print(X_train.shape)
    print(y_train.shape)
    print()
    print("---- SHAPE OF TESTING DATA (15%) ----")
    print(X_test.shape)
    print(y_test.shape)
    print()
    
    print("Training RandomForest Model.....")
    print()
    
    model = KNeighborsClassifier(n_neighbors = 3)
    model.fit(X_train, y_train)
    print()
    

    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model,model_path)
    s3.upload_file(model_path, args.s3_bucket, args.s3_model_key + "model.joblib")
    print("Model persisted at " + model_path)
    print()

    
    y_pred_test = model.predict(X_test)
    
    
    test_acc = accuracy_score(y_test,y_pred_test)
    test_rep = classification_report(y_test,y_pred_test)

    print()
    print("---- METRICS RESULTS FOR TESTING DATA ----")
    print()
    print("Total Rows are: ", X_test.shape[0])
    print('[TESTING] Model Accuracy is: ', test_acc)
    print('[TESTING] Testing Report: ')
    print(test_rep)

Overwriting sleep_script.py


In [64]:
! python sleep_script.py --n_estimators 100 \
                   --random_state 0 \
                   --model-dir ./ \
                   --train ./ \
                   --test ./ \

[INFO] Extracting arguments
SKLearn Version:  1.4.1.post1
Joblib Version:  1.3.2
[INFO] Reading data

   Age  Sleep Duration  ...  Sleep Disorder_Sleep Apnea  Quality of Sleep
0   44             6.3  ...                       False                 6
1   44             6.3  ...                       False                 6
2   38             7.1  ...                       False                 8
3   29             6.0  ...                        True                 6
4   44             6.3  ...                       False                 6

[5 rows x 14 columns]
   Age  Sleep Duration  ...  Sleep Disorder_Sleep Apnea  Quality of Sleep
0   53             8.5  ...                       False                 9
1   31             6.1  ...                       False                 6
2   29             6.0  ...                       False                 6
3   53             8.5  ...                       False                 9
4   32             6.0  ...                       False      

## 7. Train script inside Sagemaker container.

In [66]:
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "0.23-1"

sklearn_estimator = SKLearn(
    entry_point="sleep_script.py",
    role=get_execution_role(),
    instance_count=1,
    instance_type="ml.m5.large",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="RF-custom-sklearn",
    hyperparameters={
        "n_estimators": 100,
        "random_state": 0,
    },
    use_spot_instances = True,
    max_wait = 7200,
    max_run = 3600
)

In [67]:
# launch training job, with asynchronous call
sklearn_estimator.fit({"train": trainpath, "test": testpath}, wait=True)
# sklearn_estimator.fit({"train": datapath}, wait=True)

INFO:sagemaker:Creating training-job with name: RF-custom-sklearn-2024-05-12-20-59-14-479


2024-05-12 20:59:14 Starting - Starting the training job...
2024-05-12 20:59:29 Starting - Preparing the instances for training...
2024-05-12 20:59:57 Downloading - Downloading input data...
2024-05-12 21:00:28 Downloading - Downloading the training image...
2024-05-12 21:01:13 Training - Training image download completed. Training in progress.
2024-05-12 21:01:13 Uploading - Uploading generated training model[34m2024-05-12 21:01:07,232 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2024-05-12 21:01:07,236 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-05-12 21:01:07,284 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2024-05-12 21:01:07,462 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-05-12 21:01:07,475 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-

## 8. Store Model Artifacts(model.tar.gz) into the S3 Bucket. 

In [68]:
sklearn_estimator.latest_training_job.wait(logs="None")
artifact = sm_boto3.describe_training_job(
    TrainingJobName=sklearn_estimator.latest_training_job.name
)["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifact persisted at " + artifact)


2024-05-12 21:01:29 Starting - Preparing the instances for training
2024-05-12 21:01:29 Downloading - Downloading the training image
2024-05-12 21:01:29 Training - Training image download completed. Training in progress.
2024-05-12 21:01:29 Uploading - Uploading generated training model
2024-05-12 21:01:29 Completed - Training job completed
Model artifact persisted at s3://sagemaker-us-east-1-891376963426/RF-custom-sklearn-2024-05-12-20-59-14-479/output/model.tar.gz


## 9. Deploy Sagemaker Endpoint(API) for trained model, and test it. 

In [69]:
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

model_name = "Custom-sklearn-model-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print(artifact)
model = SKLearnModel(
    name =  model_name,
    model_data=artifact,
    role=get_execution_role(),
    entry_point="sleep_script.py",
    framework_version=FRAMEWORK_VERSION,
)

s3://sagemaker-us-east-1-891376963426/RF-custom-sklearn-2024-05-12-20-59-14-479/output/model.tar.gz


In [81]:
import boto3

# Initialize SageMaker client
sagemaker_client = boto3.client('sagemaker')

# Specify the S3 location of the new model artifact
new_model_url = 's3://sagemaker-us-east-1-891376963426/RF-custom-sklearn-2024-05-12-20-59-14-479/output/model.tar.gz'

# Specify the name of the existing SageMaker endpoint configuration
endpoint_config_name = 'Sleep-Quality-Inference-Model-1'

# Specify the name of the existing SageMaker endpoint
endpoint_name = 'Sleep-Quality-Inference-Model'

response = sagemaker_client.create_endpoint_config(
    EndpointConfigName = endpoint_config_name,
    ProductionVariants=[{
        'VariantName': 'variant-1',
        'ModelName': model_name,
        'InitialInstanceCount': 1,
        'InstanceType': 'ml.m5.large',
        'InitialVariantWeight': 1
    }]
)

print(response)



ClientError: An error occurred (ValidationException) when calling the CreateEndpointConfig operation: Cannot create already existing endpoint configuration "arn:aws:sagemaker:us-east-1:891376963426:endpoint-config/Sleep-Quality-Inference-Model".

In [80]:
# Update the SageMaker endpoint configuration to use the updated model
response = sagemaker_client.update_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
    RetainAllVariantProperties=True,
)

# Print the response
print(response)


{'EndpointArn': 'arn:aws:sagemaker:us-east-1:891376963426:endpoint/Sleep-Quality-Inference-Model', 'ResponseMetadata': {'RequestId': 'b60e7484-50e8-4686-ae7d-949f930ce813', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'b60e7484-50e8-4686-ae7d-949f930ce813', 'content-type': 'application/x-amz-json-1.1', 'content-length': '97', 'date': 'Sun, 12 May 2024 21:27:02 GMT'}, 'RetryAttempts': 0}}


In [82]:
endpoint_name = "Sleep-Quality-Inference-Model"
print("EndpointName={}".format(endpoint_name))

predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    endpoint_name=endpoint_name,
    update_endpoint = True
)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker:Creating model with name: Custom-sklearn-model-2024-05-12-21-02-23


EndpointName=Sleep-Quality-Inference-Model


INFO:sagemaker:Creating endpoint-config with name Sleep-Quality-Inference-Model


ClientError: An error occurred (ValidationException) when calling the CreateEndpointConfig operation: Cannot create already existing endpoint configuration "arn:aws:sagemaker:us-east-1:891376963426:endpoint-config/Sleep-Quality-Inference-Model".

In [None]:
testX.head()

In [None]:
test_features = np.array(testX[features][0:2].values.tolist()[:-1])
test_features

In [None]:
predictor.predict([[0]*13])

In [None]:
print(predictor.predict(testX[features][0:2].values.tolist()))

## Don't forget to delete the endpoint !

In [None]:
sm_boto3.delete_endpoint(EndpointName=endpoint_name)