In [76]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer

In [77]:
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from scipy.stats import uniform, randint
from sklearn.model_selection import train_test_split
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.debugger import Rule, ProfilerRule, rule_configs
from sagemaker.session import TrainingInput

In [78]:
# Importing the dataset
train_file_path = "train.csv"
houseprice_df = pd.read_csv(train_file_path)
X = houseprice_df.drop(['Id','SalePrice'], axis=1)
y = houseprice_df['SalePrice']

# Data Preprocessing

In [81]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Select numerical features
numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Select categorical features
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()

# Numerical pipeline
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical pipeline
categorical_pipeline = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'most_frequent', fill_value = 'missing')),
    ('onehot', OneHotEncoder(handle_unknown = 'ignore'))  # Set handle_unknown to 'ignore'
])

# perform preprocessing to the data
preprocessor = ColumnTransformer([
    ('numerical', numerical_pipeline, numerical_features),
    ('categorical', categorical_pipeline, categorical_features)
])

# deploy the preprocessing pipeline
X_train = preprocessor.fit_transform(X_train) # fit_transform on the training set
X_val = preprocessor.transform(X_val) # fit_transform on the validation set
X_test = preprocessor.transform(X_test) # fit_transform on the test set


In [82]:
# Convert sparse matrix to DataFrame or Series
X_train_df = pd.DataFrame(X_train.toarray(), columns=preprocessor.get_feature_names_out())
X_val_df = pd.DataFrame(X_val.toarray(), columns=preprocessor.get_feature_names_out())
X_test_df = pd.DataFrame(X_test.toarray(), columns=preprocessor.get_feature_names_out())

# Concatenate with y_train and y_test
train = pd.concat([X_train_df, y_train.reset_index(drop=True)], axis=1)
val = pd.concat([X_val_df, y_val.reset_index(drop=True)], axis=1)
test = pd.concat([X_test_df, y_test.reset_index(drop=True)], axis=1)

In [83]:
train

Unnamed: 0,numerical__MSSubClass,numerical__LotFrontage,numerical__LotArea,numerical__OverallQual,numerical__OverallCond,numerical__YearBuilt,numerical__YearRemodAdd,numerical__MasVnrArea,numerical__BsmtFinSF1,numerical__BsmtFinSF2,...,categorical__SaleType_New,categorical__SaleType_Oth,categorical__SaleType_WD,categorical__SaleCondition_Abnorml,categorical__SaleCondition_AdjLand,categorical__SaleCondition_Alloca,categorical__SaleCondition_Family,categorical__SaleCondition_Normal,categorical__SaleCondition_Partial,SalePrice
0,0.077639,-0.015857,-0.247949,-0.084531,-0.496207,0.946221,0.730560,-0.583411,-0.961118,-0.277712,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,172400
1,-0.868615,1.188298,0.070742,1.367415,-0.496207,1.141048,1.018965,0.866809,-0.961118,-0.277712,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,250000
2,0.550766,0.328187,0.134538,-0.084531,1.263620,0.426684,0.874762,-0.150684,-0.961118,-0.277712,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,178000
3,1.497020,-1.263017,-0.516914,1.367415,-0.496207,0.946221,0.730560,-0.583411,1.636770,-0.277712,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,251000
4,0.077639,-0.015857,4.087289,-0.084531,-0.496207,0.686453,0.346020,-0.583411,1.178447,-0.277712,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,240000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,0.077639,1.274309,0.222233,-0.084531,0.383707,-0.092854,-0.807600,2.784840,0.012004,-0.277712,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,193500
872,0.077639,2.048408,2.415582,2.819361,-0.496207,0.783866,0.538290,7.474662,2.051650,-0.277712,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,625000
873,-0.868615,0.285182,-0.065909,-0.810504,-0.496207,0.101973,-0.471127,-0.583411,1.365251,0.520977,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,170000
874,0.077639,0.156165,0.022461,1.367415,-0.496207,1.108577,0.970897,1.053934,0.224874,-0.277712,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,275000


In [84]:
val

Unnamed: 0,numerical__MSSubClass,numerical__LotFrontage,numerical__LotArea,numerical__OverallQual,numerical__OverallCond,numerical__YearBuilt,numerical__YearRemodAdd,numerical__MasVnrArea,numerical__BsmtFinSF1,numerical__BsmtFinSF2,...,categorical__SaleType_New,categorical__SaleType_Oth,categorical__SaleType_WD,categorical__SaleCondition_Abnorml,categorical__SaleCondition_AdjLand,categorical__SaleCondition_Alloca,categorical__SaleCondition_Family,categorical__SaleCondition_Normal,categorical__SaleCondition_Partial,SalePrice
0,-0.868615,0.887259,0.332575,0.641442,1.263620,0.426684,0.634425,-0.583411,-0.498451,-0.277712,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,202500
1,-0.868615,0.414198,-0.105420,-0.084531,-0.496207,-0.579921,-1.528612,0.808332,1.830091,-0.277712,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,167000
2,-0.158924,-0.015857,0.053588,-1.536477,-0.496207,-0.450036,0.201818,-0.583411,-0.961118,-0.277712,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,161500
3,-0.632051,-0.445912,-0.435001,-0.084531,-0.496207,-1.456640,-1.672814,-0.583411,-0.961118,-0.277712,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,102000
4,1.497020,-0.015857,-0.470272,1.367415,-0.496207,0.426684,-0.038520,-0.583411,1.289226,-0.277712,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,187500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214,-0.868615,0.414198,-0.139149,-0.810504,1.263620,-0.547450,0.970897,-0.583411,0.081513,2.761203,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,139400
215,2.443275,-1.736078,-0.724107,0.641442,-0.496207,1.108577,0.970897,-0.583411,-0.961118,-0.277712,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,144152
216,-0.868615,0.285182,-0.227326,-0.810504,-0.496207,-0.547450,-1.480544,-0.583411,-0.961118,-0.277712,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,93000
217,-0.868615,0.156165,-0.208342,-0.810504,0.383707,-0.547450,0.249885,3.217567,1.673696,-0.277712,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,153000


In [85]:
test

Unnamed: 0,numerical__MSSubClass,numerical__LotFrontage,numerical__LotArea,numerical__OverallQual,numerical__OverallCond,numerical__YearBuilt,numerical__YearRemodAdd,numerical__MasVnrArea,numerical__BsmtFinSF1,numerical__BsmtFinSF2,...,categorical__SaleType_New,categorical__SaleType_Oth,categorical__SaleType_WD,categorical__SaleCondition_Abnorml,categorical__SaleCondition_AdjLand,categorical__SaleCondition_Alloca,categorical__SaleCondition_Family,categorical__SaleCondition_Normal,categorical__SaleCondition_Partial,SalePrice
0,-0.868615,-0.015857,-0.219713,-0.084531,2.143534,-0.255210,0.874762,-0.583411,0.479015,-0.277712,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,154500
1,0.077639,1.188298,0.150535,1.367415,-0.496207,0.751395,0.490223,1.533442,1.280538,-0.277712,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,325000
2,-0.632051,-0.617934,-0.167096,-0.810504,0.383707,-1.424169,-1.672814,-0.583411,-0.961118,-0.277712,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,115000
3,-0.158924,-0.875968,-0.548716,-0.084531,1.263620,-0.774747,-1.672814,-0.583411,-0.094431,-0.277712,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,159000
4,-0.868615,0.801248,0.212403,2.093388,-0.496207,1.173519,1.115100,-0.174075,1.258816,-0.277712,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,315500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360,0.077639,-0.015857,0.130297,-0.084531,0.383707,0.166915,-0.423060,1.159191,-0.622263,-0.277712,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,195000
361,2.443275,0.199171,0.006560,-0.084531,0.383707,0.296799,-0.230790,-0.583411,-0.961118,-0.277712,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,120000
362,0.077639,-0.015857,-0.127103,1.367415,-0.496207,1.011164,0.826695,0.287890,-0.961118,-0.277712,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,228500
363,0.077639,-0.015857,1.856548,0.641442,0.383707,0.751395,0.442155,-0.583411,0.331309,-0.277712,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,248000


In [86]:
print(train.shape)
print(val.shape)
print(test.shape)

(876, 282)
(219, 282)
(365, 282)


# Bring in SageMaker

### Print out the session and the role

In [3]:
import boto3
import sagemaker
from sagemaker.xgboost.estimator import XGBoost
from sagemaker.session import Session
from sagemaker.inputs import TrainingInput

### Set an output path in s3 bucket where the trained model is saved

In [4]:
# set an output path where the trained model will be saved
bucket = sagemaker.Session().default_bucket()
prefix = 'sagemaker-xgboost-house-price-prediction'
output_path = 's3://{}/{}/{}/output'.format(bucket, prefix, 'xgboost_model')
print(bucket)
print(prefix)
print(output_path)

sagemaker-eu-central-1-590183757961
sagemaker-xgboost-house-price-prediction
s3://sagemaker-eu-central-1-590183757961/sagemaker-xgboost-house-price-prediction/xgboost_model/output


### Print out the sagemaker session, region, role and container

In [5]:
sagemaker_session = sagemaker.Session()
print("Sagemaker Session: {}".format(sagemaker_session))

region = sagemaker.Session().boto_region_name
print("AWS Region: {}".format(region))

role = sagemaker.get_execution_role()
print("RoleArn: {}".format(role))

container = sagemaker.image_uris.retrieve("xgboost", region, "1.2-1")
print("Container: {}".format(container))

Sagemaker Session: <sagemaker.session.Session object at 0x7fc3d390c3a0>
AWS Region: eu-central-1
RoleArn: arn:aws:iam::590183757961:role/aws_final_project
Container: 492215442770.dkr.ecr.eu-central-1.amazonaws.com/sagemaker-xgboost:1.2-1


### Convert train and test datasets to CSV and upload them to S3 buckets

In [90]:
train.to_csv('train_dataset.csv', index=False, header=True)
val.to_csv('val_dataset.csv', index=False, header=True)
test.to_csv('test_dataset.csv', index=False, header=True)

In [91]:
import sagemaker, boto3, os
boto3.Session().resource('s3').Bucket(bucket).Object(
    os.path.join(prefix, 'data/train_dataset.csv')).upload_file('train_dataset.csv')
boto3.Session().resource('s3').Bucket(bucket).Object(
    os.path.join(prefix, 'data/val_dataset.csv')).upload_file('val_dataset.csv')
boto3.Session().resource('s3').Bucket(bucket).Object(
    os.path.join(prefix, 'data/test_dataset.csv')).upload_file('test_dataset.csv')

### Retrieve training dataset from the s3 bucket

In [92]:
train_input = TrainingInput(
    "s3://{}/{}/{}".format(bucket, prefix, "data/train_dataset.csv"), content_type="csv"
)
val_input = TrainingInput(
    "s3://{}/{}/{}".format(bucket, prefix, "data/val_dataset.csv"), content_type="csv"
)

In [93]:
# Set the source directory to the current directory
# source_dir = "."

In [94]:
pwd

'/home/ec2-user/SageMaker'

In [95]:
data_path = 's3://sagemaker-eu-central-1-590183757961/sagemaker-xgboost-house-price-prediction/data'

In [96]:
from sagemaker.inputs import TrainingInput
from sagemaker.estimator import Estimator
from sagemaker.tuner import HyperparameterTuner, IntegerParameter, ContinuousParameter
from sagemaker.amazon.amazon_estimator import get_image_uri

# Define hyperparameter search space
hyperparameter_ranges = {
    'max_depth': IntegerParameter(3, 10),
    'eta': ContinuousParameter(0.01, 0.2),
    'min_child_weight': IntegerParameter(1, 10),
    'subsample': ContinuousParameter(0.5, 1),
    'gamma': ContinuousParameter(0, 10),
    'colsample_bytree': ContinuousParameter(0.5, 1),
    'alpha': ContinuousParameter(0, 2),
    'num_round': IntegerParameter(1, 4000),
}

# Define the estimator with default hyperparameters
xgb_estimator = Estimator(
    image_uri=container,
    # image_uri=get_image_uri(boto3.Session().region_name, 'xgboost'),
    role=role,
    instance_count=1,
    instance_type='ml.m5.2xlarge',
    volume_size=5,
    output_path=output_path,
    sagemaker_session=sagemaker.Session(),
    # entry_point='training_script.py',  # Specify your training script filename
)

# Set up the hyperparameter tuner
tuner = HyperparameterTuner(
    estimator = xgb_estimator,
    objective_metric_name = 'validation:rmse',  # Adjust the metric name as per your requirement
    hyperparameter_ranges = hyperparameter_ranges,
    objective_type='Minimize',  # Adjust if you are maximizing a metric
    max_jobs = 100,  # Adjust as per your requirement
    max_parallel_jobs = 5,  # Adjust as per your resource availability
    strategy='Random',  # Or 'Bayesian', 'Random', etc.
    base_tuning_job_name='xgb-tuning'
)

# Train and tune the model on training set

In [98]:
# Launch hyperparameter tuning job
tuner.fit(inputs={'train': train_input, "validation": val_input}, wait=True)

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


..............................................................................................................................................................................................!


## Retrieve the best hyperparameters

In [117]:
# Retrieve the best hyperparameters
best_hyperparameters = tuner.best_estimator().hyperparameters()

# Create a final SageMaker training job with the best hyperparameters
final_estimator = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m5.2xlarge',
    volume_size=5,
    output_path=output_path,
    sagemaker_session=sagemaker.Session(),
    hyperparameters=best_hyperparameters
)


2024-03-31 09:29:08 Starting - Found matching resource for reuse
2024-03-31 09:29:08 Downloading - Downloading the training image
2024-03-31 09:29:08 Training - Training image download completed. Training in progress.
2024-03-31 09:29:08 Uploading - Uploading generated training model
2024-03-31 09:29:08 Completed - Resource released due to keep alive period expiry


In [118]:
final_estimator

<sagemaker.estimator.Estimator at 0x7f586c2dcee0>

In [119]:
best_hyperparameters

{'_tuning_objective_metric': 'validation:rmse',
 'alpha': '0.02502686804987686',
 'colsample_bytree': '0.8972246811424174',
 'eta': '0.19422206037057102',
 'gamma': '0.6519058275543321',
 'max_depth': '3',
 'min_child_weight': '5',
 'num_round': '128',
 'subsample': '0.7349182502047757'}

In [104]:
# Train the final model with the best hyperparameters
final_estimator.fit(inputs={'train': train_input, "validation": val_input}, wait=True)

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-03-31-09-35-05-104


2024-03-31 09:35:05 Starting - Starting the training job...
2024-03-31 09:35:20 Starting - Preparing the instances for training......
2024-03-31 09:36:34 Downloading - Downloading the training image...
2024-03-31 09:37:00 Training - Training image download completed. Training in progress....
2024-03-31 09:37:36 Uploading - Uploading generated training model
2024-03-31 09:37:36 Completed - Training job completed
[34m[2024-03-31 09:37:20.708 ip-10-0-245-149.eu-central-1.compute.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter _tuning_objective_metric value validation:rmse to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determin

# Deploy the model on test set

In [136]:
predictor = tuner.best_estimator()


2024-03-31 09:29:08 Starting - Found matching resource for reuse
2024-03-31 09:29:08 Downloading - Downloading the training image
2024-03-31 09:29:08 Training - Training image download completed. Training in progress.
2024-03-31 09:29:08 Uploading - Uploading generated training model
2024-03-31 09:29:08 Completed - Resource released due to keep alive period expiry


In [152]:
predictor.model_data

's3://sagemaker-eu-central-1-590183757961/sagemaker-xgboost-house-price-prediction/xgboost_model/output/xgb-tuning-2024-03-31-10-12-02-906/output/model.tar.gz'

In [151]:
predictor

<sagemaker.estimator.Estimator at 0x7f586cf25cc0>

In [1]:
import sagemaker
from sagemaker.serializers import CSVSerializer
# Assuming 'final_estimator' has been used for training and the training job has completed successfully

# Retrieve model artifacts from S3
model_uri = "s3://sagemaker-eu-central-1-590183757961/sagemaker-xgboost-house-price-prediction/xgboost_model/output/xgb-tuning-240331-0912-100-e255e67a/output/model.tar.gz"
model_uri

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


's3://sagemaker-eu-central-1-590183757961/sagemaker-xgboost-house-price-prediction/xgboost_model/output/xgb-tuning-240331-0912-100-e255e67a/output/model.tar.gz'

In [6]:
# Create a new Estimator object with the model_uri
deploy_estimator = sagemaker.model.Model(
    model_data=model_uri,
    image_uri=container,  # Assuming 'container' is defined elsewhere
    role=role  # Assuming 'role' is defined elsewhere
)

In [19]:
deploy_estimator.model_data

's3://sagemaker-eu-central-1-590183757961/sagemaker-xgboost-house-price-prediction/xgboost_model/output/xgb-tuning-240331-0912-100-e255e67a/output/model.tar.gz'

In [20]:
test_input = TrainingInput(
    "s3://{}/{}/{}".format(bucket, prefix, "data/test_dataset.csv"), content_type="csv"
)

In [34]:
# Deploy the final estimator with the best hyperparameters on test set
predictions = deploy_estimator.predict(test_input)

AttributeError: 'Model' object has no attribute 'predict'

In [127]:
import boto3

# Assume 'session', 'predictor', 'payload', and 'endpoint_name' are defined

# Create a SageMaker runtime client
runtime_client = boto3.client('sagemaker-runtime')

# Invoke the endpoint to get predictions
response = runtime_client.invoke_endpoint(
    EndpointName=predictor.endpoint_name,
    ContentType="text/csv",  # Adjust content type to CSV
    Body=payload
)

# Read and decode the response
result = response["Body"].read().decode("ascii")

# Process the result
print("Predicted values are {}.".format(result))

AttributeError: 'NoneType' object has no attribute 'endpoint_name'

In [31]:
# retrieve the name of the endpoint
print(deploy_estimator.endpoint_name)

sagemaker-xgboost-2024-03-31-10-56-01-624


In [145]:
predictor.delete_model()
predictor.delete_endpoint()

NameError: name 'final_estimator' is not defined