In [60]:

import os

import boto3
import numpy as np
import sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.estimator import Estimator
from sagemaker.image_uris import retrieve
from sagemaker.inputs import TrainingInput
from sagemaker.session import s3_input, Session

In [44]:
# S3 bucket used throughout this notebook.
# Change this to a unique name for your own bucket before running.
bucket_name = 'ssbankapplication'

# Region reported by the current boto3 session; shown so the reader can confirm it.
my_region = boto3.session.Session().region_name
print(my_region)

us-east-1


In [45]:

# Create the S3 bucket that holds the training data and model artifacts.
s3 = boto3.resource('s3')
try:
    if my_region == 'us-east-1':
        # us-east-1 is the default location and must be created without a LocationConstraint.
        s3.create_bucket(Bucket=bucket_name)
    else:
        # Every other region has to be named explicitly, otherwise the request is rejected.
        s3.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={'LocationConstraint': my_region},
        )
    # Reached only when one of the create_bucket calls above returned without raising.
    print('S3 bucket created successfully')
except Exception as e:
    # Deliberately best-effort: report the problem (e.g. on a re-run) and let the notebook continue.
    print('S3 error: ',e)
     

S3 bucket created successfully


In [80]:

# Key prefix under which everything produced by this notebook is stored in the bucket.
prefix = 'xgboost-as-a-built-in-algo'

# S3 location where the trained model will be saved.
output_path = f's3://{bucket_name}/{prefix}/output'
print(output_path)

s3://ssbankapplication/xgboost-as-a-built-in-algo/output


In [102]:
!pip install gdown


Collecting gdown
  Downloading gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Downloading gdown-5.2.0-py3-none-any.whl (18 kB)
Installing collected packages: gdown
Successfully installed gdown-5.2.0


In [110]:
import pandas as pd
import gdown

# Download the dataset from Google Drive
url = 'https://drive.google.com/uc?id=1vDlolkCu1_gKYreG-J9uC6DSo5J7xrT2'
output = 'Housing.csv'
try:
    downloaded = gdown.download(url, output, quiet=False)
    # NOTE(review): some gdown releases appear to return None on failure instead of
    # raising — treat that as an error too so we never report a false success.
    if downloaded is None:
        raise RuntimeError('gdown.download returned no file')
    print('Success: Downloaded Housing.csv from Google Drive.')
except Exception as e:
    # Best-effort: a copy from an earlier run may still be on disk, so keep going
    # and let the load step below decide whether the notebook can continue.
    print('Data download error: ', e)

# Load the dataset into a DataFrame
try:
    model_data = pd.read_csv(output)
    print('Success: Data loaded into DataFrame.')
except Exception as e:
    print('Data load error: ', e)
    # Without model_data every later cell fails with a confusing NameError,
    # so stop here with the real cause instead of swallowing it.
    raise






Downloading...
From: https://drive.google.com/uc?id=1vDlolkCu1_gKYreG-J9uC6DSo5J7xrT2
To: /home/ec2-user/SageMaker/Housing.csv
100%|██████████| 30.0k/30.0k [00:00<00:00, 15.5MB/s]

Success: Downloaded Housing.csv from Google Drive.
Success: Data loaded into DataFrame.





In [111]:
# Preview the first five rows of the loaded dataset.
model_data.head(n=5)


Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [113]:
# Show the column labels of the loaded dataset.
model_data.columns

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'furnishingstatus'],
      dtype='object')

In [114]:
# Split the data into train (70%) and test (30%) sets.
# Uses a seeded pandas shuffle so the cell needs nothing beyond pandas and is reproducible.
shuffled_data = model_data.sample(frac=1, random_state=1729)
n_train_rows = int(0.7 * len(shuffled_data))
train_data = shuffled_data.iloc[:n_train_rows]
test_data = shuffled_data.iloc[n_train_rows:]

print(train_data.shape, test_data.shape)


(381, 13) (164, 13)


In [118]:
# Split the data into train and test sets.
# Shuffle once with a fixed seed, then cut at the 70% mark. Positional slicing
# selects the same rows np.split would, without routing a DataFrame through NumPy.
shuffled_data = model_data.sample(frac=1, random_state=1729)
split_at = int(0.7 * len(model_data))
train_data, test_data = shuffled_data.iloc[:split_at], shuffled_data.iloc[split_at:]
print(train_data.shape, test_data.shape)

(381, 13) (164, 13)


  return bound(*args, **kwds)


In [120]:
# Write the training split to a local CSV file: the 'price' column first, followed by
# the remaining columns in their existing order, with no header row and no index column.
train_output_path = 'train.csv'
train_data[['price', *train_data.columns.drop('price')]].to_csv(train_output_path, header=False, index=False)


In [121]:
# Handle on the notebook's bucket, shared by both uploads below.
s3_bucket = boto3.Session().resource('s3').Bucket(bucket_name)

# Upload the training CSV and describe it as the 'train' input channel.
# S3 keys always use '/' as separator, so the key is built directly rather than with os.path.join.
s3_bucket.Object(f'{prefix}/train/train.csv').upload_file(train_output_path)
s3_input_train = TrainingInput(s3_data='s3://{}/{}/train'.format(bucket_name, prefix), content_type='csv')

# Saving Test Data into a Bucket
# Same layout as the training file: 'price' first, no header row, no index column.
test_output_path = 'test.csv'
pd.concat([test_data['price'], test_data.drop('price', axis=1)], axis=1).to_csv(test_output_path, index=False, header=False)

s3_bucket.Object(f'{prefix}/test/test.csv').upload_file(test_output_path)
s3_input_test = TrainingInput(s3_data='s3://{}/{}/test'.format(bucket_name, prefix), content_type='csv')

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


In [122]:
# Look up the XGBoost container image URI (version 1.0-1) for the current session's region.
container = retrieve(
    framework='xgboost',
    region=boto3.Session().region_name,
    version="1.0-1",
)


INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker.image_uris:Defaulting to only supported image scope: cpu.


In [123]:
# Hyperparameters handed to the XGBoost training job; every value is passed as a string.
hyperparameters = dict(
    max_depth="5",
    eta="0.2",
    gamma="4",
    min_child_weight="6",
    subsample="0.7",
    objective="reg:squarederror",  # regression objective for house price prediction
    num_round="50",
)

In [124]:
# Configure the training job: one ml.m5.2xlarge spot instance running the container
# selected earlier, with the model artifacts written under the notebook's S3 prefix.
estimator = Estimator(
    image_uri=container,
    role=sagemaker.get_execution_role(),
    hyperparameters=hyperparameters,
    output_path=f's3://{bucket_name}/{prefix}/output',
    instance_type='ml.m5.2xlarge',
    instance_count=1,
    volume_size=5,
    # Spot settings: max_run and max_wait are passed exactly as configured here.
    use_spot_instances=True,
    max_run=300,
    max_wait=600,
)

In [125]:
# Launch the training job with the uploaded train split and the test split as the validation channel.
estimator.fit(
    inputs={
        'train': s3_input_train,
        'validation': s3_input_test,
    }
)

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-05-14-08-11-07-023


2024-05-14 08:11:07 Starting - Starting the training job...
2024-05-14 08:11:21 Starting - Preparing the instances for training...
2024-05-14 08:11:48 Downloading - Downloading input data...
2024-05-14 08:12:29 Training - Training image download completed. Training in progress....
2024-05-14 08:13:07 Uploading - Uploading generated training model
2024-05-14 08:13:07 Completed - Training job completed
[34m[2024-05-14 08:12:50.840 ip-10-0-188-254.ec2.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value reg:squarederror to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34m

In [None]:

# Step 6: Train the Model
# The training-data location is derived from the bucket and prefix configured earlier,
# so this cell does not depend on a variable that is defined nowhere in the notebook.
# NOTE(review): this starts a training job with only the 'train' channel (no validation data).
s3_train_path = f's3://{bucket_name}/{prefix}/train'
s3_input_train = TrainingInput(s3_data=s3_train_path, content_type='csv')
estimator.fit({'train': s3_input_train})