In [2]:
import pandas as pd
import numpy as np

<h2> <font color='aquamarine'>Cleaning the Dataset </font></h2>

In [3]:
ds_houses = pd.read_csv('house_prices.csv')

In [4]:
ds_houses.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [5]:
ds_houses.drop(['id', 'date', 'sqft_living15', 'sqft_lot15'], axis = 1, inplace=True)

In [6]:
ds_train = ds_houses.iloc[0:15129,:]
ds_test = ds_houses.iloc[15129:,:]

In [7]:
ds_test.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long
15129,937750.0,4,2.75,2580,3560,1.5,0,0,5,7,1710,870,1917,0,98115,47.6753,-122.304
15130,725126.0,4,2.5,3200,12369,2.0,0,0,3,10,3200,0,1998,0,98059,47.5273,-122.143
15131,135000.0,3,1.0,840,3000,1.0,0,0,3,5,840,0,1943,0,98178,47.494,-122.275
15132,635000.0,4,2.5,2970,7961,1.0,0,0,3,8,2020,950,1969,0,98125,47.7118,-122.29
15133,245500.0,2,1.0,790,7500,1.0,0,0,3,6,790,0,1950,0,98011,47.7644,-122.198


<p><font color='#50C878' size=5><b>Train set has to be a pandas ds (csv file), the test data set has a numpy type</b></font></p>

In [23]:
x_test = ds_test.iloc[:,1:17].values
y_test = ds_test.iloc[:,0].values

In [24]:
ds_train.to_csv('ds_train_xboost.csv', header = False, index = False)
ds_test.to_csv('ds_test_xboost.csv', header = False, index = False)

<h2> <font color='aquamarine'>Sagemaker setup </font></h2>

In [26]:
import sagemaker
import boto3

In [39]:
session = sagemaker.Session()
bucket = 'rogerio-curso-aws'
sub_folder_models = 'models/house_prices/xgboost'
sub_folder_dataset = 'datasets/house_prices'
key_train = 'house_train_xgboost'
key_test = 'house_test_xgboost'
role = sagemaker.get_execution_role()
s3_train = f's3://{bucket}/{sub_folder_dataset}/train/{key_train}'
s3_test = f's3://{bucket}/{sub_folder_dataset}/test/{key_test}'
output_path = f's3://{bucket}/{sub_folder_models}/output'

In [28]:
import os

In [40]:
with open('ds_train_xboost.csv', 'rb') as f:
    boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(sub_folder_dataset,'train',key_train)).upload_fileobj(f)

In [41]:
with open('ds_test_xboost.csv', 'rb') as f:
    boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(sub_folder_dataset,'test',key_test)).upload_fileobj(f)

<h2> <font color='aquamarine'>XGBoost training </font></h2>

In [47]:
container = sagemaker.image_uris.retrieve(framework='xgboost', region = boto3.Session().region_name, version = 'latest')
xgboost = sagemaker.estimator.Estimator(image_uri = container,
                                        role = role,
                                        instance_count=1,
                                        instance_type='ml.m5.xlarge',
                                        output_path=output_path,
                                        sagemaker_session=session
                                       )

In [48]:
xgboost.set_hyperparameters(num_round = 100)

In [51]:
train_input = sagemaker.inputs.TrainingInput(s3_data = s3_train, content_type='csv', s3_data_type='S3Prefix')
test_input = sagemaker.inputs.TrainingInput(s3_data = s3_test, content_type='csv', s3_data_type='S3Prefix')
data_channels = {'train': train_input,'validation': test_input}

In [52]:
xgboost.fit(data_channels)

2022-09-29 18:31:08 Starting - Starting the training job...ProfilerReport-1664476268: InProgress
...
2022-09-29 18:32:02 Starting - Preparing the instances for training.........
2022-09-29 18:33:33 Downloading - Downloading input data......
2022-09-29 18:34:33 Training - Downloading the training image..[34mArguments: train[0m
[34m[2022-09-29:18:34:49:INFO] Running standalone xgboost training.[0m
[34m[2022-09-29:18:34:49:INFO] File size need to be processed in the node: 1.55mb. Available memory size in the node: 8448.76mb[0m
[34m[2022-09-29:18:34:49:INFO] Determined delimiter of CSV input is ','[0m
[34m[18:34:49] S3DistributionType set as FullyReplicated[0m
[34m[18:34:49] 15129x16 matrix with 242064 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2022-09-29:18:34:49:INFO] Determined delimiter of CSV input is ','[0m
[34m[18:34:49] S3DistributionType set as FullyReplicated[0m
[34m[18:34:49] 6484x16 matrix with 103744 entries load

<h1><font color='aquamarine' size = 5>Deploy, preditions, evaluating</font></h1>

In [53]:
xbooster_regressor = xgboost.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')

------!

In [54]:
from sagemaker.serializers import CSVSerializer
xbooster_regressor.serializer = CSVSerializer()

In [61]:
pred = xbooster_regressor.predict(x_test).decode('utf-8').split(',')
pred = np.array(pred).astype(np.float32)

In [62]:
from sklearn.metrics import mean_absolute_error, mean_squared_error 

In [63]:
mae = mean_absolute_error(y_test, pred)
msae = mean_absolute_error(y_test, pred)
rmse = np.sqrt(mean_absolute_error(y_test, pred))

In [67]:
print(f' MAE: {mae.round(2)}\n MSAE: {msae.round(2)}\n RMSE: {rmse.round(2)}')

 MAE: 69311.55
 MSAE: 69311.55
 RMSE: 263.27


<h1><font color='aquamarine' size = 5> Tuning </font></h1>

In [118]:
tuning_job_config = {
     "ParameterRanges": {
      "CategoricalParameterRanges": [],
      "ContinuousParameterRanges": [
        {
          "MaxValue": "1",
          "MinValue": "0",
          "Name": "eta"
        },
        {
          "MaxValue": "2",
          "MinValue": "0",
          "Name": "alpha"
        },
        {
          "MaxValue": "10",
          "MinValue": "1",
          "Name": "min_child_weight"
        }
      ],
      "IntegerParameterRanges": [
        {
          "MaxValue": "10",
          "MinValue": "1",
          "Name": "max_depth"
        }
      ],
      "IntegerParameterRanges": [
        {
          "MaxValue": "300",
          "MinValue": "50",
          "Name": "num_round"
        }
      ]
    },
    "ResourceLimits": {
      "MaxNumberOfTrainingJobs": 5,
      "MaxParallelTrainingJobs": 3
    },
    "Strategy": "Bayesian",
    "HyperParameterTuningJobObjective": {
      "MetricName": "validation:rmse",
      "Type": "Minimize"
    }
  }

In [121]:
training_job_definition = {
    "AlgorithmSpecification": {
      "TrainingImage": container,
      "TrainingInputMode": "File"
    },
    "InputDataConfig": [
      {
        "ChannelName": "train",
        "CompressionType": "None",
        "ContentType": "csv",
        "DataSource": {
          "S3DataSource": {
            "S3DataDistributionType": "FullyReplicated",
            "S3DataType": "S3Prefix",
            "S3Uri": s3_train
          }
        }
      },
      {
        "ChannelName": "validation",
        "CompressionType": "None",
        "ContentType": "csv",
        "DataSource": {
          "S3DataSource": {
            "S3DataDistributionType": "FullyReplicated",
            "S3DataType": "S3Prefix",
            "S3Uri": s3_test
          }
        }
      }
    ],
    "OutputDataConfig": {
      "S3OutputPath": "s3://{}/{}/output".format(bucket,sub_folder_models)
    },
    "ResourceConfig": {
      "InstanceCount": 2,
      "InstanceType": "ml.m5.xlarge",
      "VolumeSizeInGB": 10
    },
    "RoleArn": role,
    "StaticHyperParameters": {
      "eval_metric": "rmse",
      "objective": "reg:linear",
      "rate_drop": "0.3",
      "tweedie_variance_power": "1.4"
    },
   
    "StoppingCondition": {
      "MaxRuntimeInSeconds": 43200
    }
}

In [122]:
smclient = boto3.client('sagemaker')
smclient.create_hyper_parameter_tuning_job(HyperParameterTuningJobName = 'xgboosttuninghouses',
                                            HyperParameterTuningJobConfig = tuning_job_config,
                                            TrainingJobDefinition = training_job_definition
                                          )

{'HyperParameterTuningJobArn': 'arn:aws:sagemaker:sa-east-1:890895794940:hyper-parameter-tuning-job/xgboosttuninghouses',
 'ResponseMetadata': {'RequestId': '2442d862-27d5-4e7c-84fa-185bb00bcfe1',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '2442d862-27d5-4e7c-84fa-185bb00bcfe1',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '120',
   'date': 'Thu, 29 Sep 2022 19:58:26 GMT'},
  'RetryAttempts': 0}}

<h3><font color='aquamarine' style ='bold'> Hyperparameters results</font></h3>

<p>Key	Value<br>
_tuning_objective_metric	validation:rmse<br>
alpha	1.7114218618998898<br>
eta	0.09874686146818112<br>
eval_metric	rmse<br>
min_child_weight	1.8351150086549781<br>
num_round	204<br>
objective	reg:linear<br>
rate_drop	0.3<br>
tweedie_variance_power	1.4</p>