# Pune House Price Prediction 📶💸💸🏚️🏗️🏠💸💸📶

## Model Deployment with SageMaker

### Model Deployment Purpose: 
- Build the model at scale with Amazon SageMaker and an S3 bucket.
- Deploy the training and inference pipeline on SageMaker (SM), along with data ingestion.
- Create a reusable endpoint for model access.

In [4]:
import sagemaker
import boto3
import os
import numpy as np
import pandas as pd
import pickle
import json
import math
import re

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import TargetEncoder
from sklearn.linear_model import LinearRegression

random_seed = 42

In [2]:
sm_boto3 = boto3.client("sagemaker")
sess = sagemaker.Session()
region = sess.boto_session.region_name
bucket = "housepricebucketsagemaker"
print("Using bucket: ", bucket)

sagemaker.config INFO - Not applying SDK defaults from location: C:\ProgramData\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\Users\shrin\AppData\Local\sagemaker\sagemaker\config.yaml
Using bucket:  housepricebucketsagemaker


In [3]:
raw_data = pd.read_excel("./Pune_Real_Estate_Data.xlsx")
raw_data.head()

Unnamed: 0,Sr. No.,Location,Sub-Area,Propert Type,Property Area in Sq. Ft.,Price in lakhs,Price in Millions,Company Name,TownShip Name/ Society Name,Total TownShip Area in Acres,ClubHouse,School / University in Township,Hospital in TownShip,Mall in TownShip,Park / Jogging track,Swimming Pool,Gym,Description
0,1,"Pune, Maharashtra, India",Bavdhan,1 BHK,492,39,3.9,Shapoorji Paloonji,Vanaha,1000.0,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Shapoorji Paloonji comunity located in the sub...
1,2,"Pune, Maharashtra, India",Bavdhan,2 BHK,774,65,6.5,Shapoorji Paloonji,Vanaha,1000.0,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Vanaha Township located near the lonavala hill...
2,3,"Pune, Maharashtra, India",Bavdhan,3 BHK,889,74,7.4,Shapoorji Paloonji,Vanaha,1000.0,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Vanaha Society is suitable for all aged group ...
3,4,"Pune, Maharashtra, India",Bavdhan,3 BHK Grand,1018,89,8.9,Shapoorji Paloonji,Vanaha,1000.0,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Vanaha township are offering 3BHK grand prpoer...
4,5,"Pune, Maharashtra, India",Mahalunge,2BHK,743,74,7.4,Godrej Properties,Godrej Hills retreat,100.0,Yes,Yes,Yes,Yes,Yes,Yes,Yes,The area is a hub of prestigious schools like ...


In [5]:
# Drop the serial number, the constant city-level "Location" and the free-text
# "Description"; none of them is used as a feature further down.
raw_data = raw_data.drop(["Sr. No.", "Location", "Description"], axis = 1)
# Shorten the verbose spreadsheet headers. "Sub-Area" takes over the "Location"
# name freed by the drop above.
# NOTE(review): the "School / University in Township " key ends with a trailing
# space. The later hasEduFacility_yes column shows the rename matched, so the raw
# header presumably carries that space too — confirm against the spreadsheet.
raw_data = raw_data.rename(columns={"Sub-Area": "Location", "Propert Type" : "Type", "Property Area in Sq. Ft." : "Area_sqft", "Price in lakhs" : "Price_Lakhs", "Price in Millions" : "Price_Mil",
                                    "Company Name" : "Developer", "TownShip Name/ Society Name" : "Name", "Total TownShip Area in Acres" : "Area_township", "ClubHouse" : "hasClubHouse",
                                    "School / University in Township ": "hasEduFacility", "Hospital in TownShip": "hasHospital", "Mall in TownShip" : "hasMall", "Park / Jogging track" : "hasParkOrJogTrack",
                                    "Swimming Pool" : "hasPool", "Gym" : "hasGym" })

In [6]:
# Work on a copy so the renamed raw frame stays untouched by the cleaning steps.
clean_df = raw_data.copy()
# Coerce Price_Mil to float (rounded to 4 decimals). Anything that is not a plain
# non-negative decimal string becomes NaN. `np.nan` is used because the `np.NAN`
# alias was removed in NumPy 2.0.
clean_df["Price_Mil"] = clean_df["Price_Mil"].apply(lambda x:round(float(x), 4) if str(x).replace(".", "", 1).isdigit() else np.nan).astype("float64")
# Prices that could not be parsed are filled with a fixed value of 9.5.
# NOTE(review): 9.5 is a hand-picked constant; its origin is not shown here — confirm.
clean_df["Price_Mil"] = clean_df["Price_Mil"].fillna(9.5)
# NOTE(review): looks like a misplaced-decimal correction for two specific entries — confirm.
clean_df["Price_Mil"] = clean_df["Price_Mil"].replace([92.300, 93.000], [9.230, 9.300])

In [7]:
#Consider mean area in case of a range
def get_area(area):
    """Convert a raw property-area entry into a single float.

    Parameters
    ----------
    area : Any
        A number, a numeric string, or free text containing one or more
        numbers (for example a range such as "1272 to 2138" or "1000-1200").

    Returns
    -------
    float
        The value itself when the entry is a plain integer string, the mean of
        all numbers found otherwise, or NaN when no number is present.
    """
    text = str(area)
    if text.isdigit():
        return float(text)
    # Match unsigned numbers only: areas are never negative, and a signed
    # pattern would read the hyphen in "1000-1200" as a minus sign and
    # average 1000 with -1200.
    numbers = [float(tok) for tok in re.findall(r'\d+\.?\d*', text)]
    if not numbers:
        # np.nan rather than np.NAN: the upper-case alias was removed in NumPy 2.0.
        return np.nan
    return np.mean(numbers)

def get_bedroom_size(prop_type):
    """Extract the bedroom count from a property-type label such as "2bhk".

    Parameters
    ----------
    prop_type : Any
        Property-type entry; non-string values (e.g. NaN) are tolerated.

    Returns
    -------
    float
        Sum of the numbers found in the label when it mentions "bhk"
        (case-insensitive), otherwise NaN. NaN is also returned when the label
        mentions "bhk" but contains no number.
    """
    # Normalise once so the check and the regex see the same text, and so a
    # non-string input can never reach re.findall.
    text = str(prop_type).lower()
    if "bhk" not in text:
        # np.nan rather than np.NAN: the upper-case alias was removed in NumPy 2.0.
        return np.nan
    # Unsigned pattern: a bedroom count is never negative, so a hyphen next to
    # a digit must not be read as a minus sign.
    counts = [float(tok) for tok in re.findall(r'\d+\.?\d*', text)]
    if not counts:
        return np.nan
    return sum(counts)

def get_township_size(ts_area):
    """Bucket a township area into "small", "medium", "large" or "unknown".

    The area is truncated to an integer before comparison. Missing values,
    non-positive areas and areas of 25000 or more map to "unknown".
    """
    if np.isnan(ts_area):
        return "unknown"

    acres = int(ts_area)
    # (label, exclusive lower bound, inclusive upper bound) on the truncated area.
    buckets = (
        ("small", 0, 25),
        ("medium", 25, 250),
        ("large", 250, 24999),
    )
    for size_label, lower, upper in buckets:
        if lower < acres <= upper:
            return size_label
    return "unknown"
    
# Ordinal code for each label that get_township_size can return.
ts_size_map = {
    "unknown" : 0,
    "small" : 1,
    "medium":2,
    "large":3
}

# Numeric value per locality, keyed by the lower-cased sub-area name; it is mapped
# onto the "Location" column to build the "Loc_trend" feature.
# NOTE(review): presumably a price-trend figure per locality — the unit and the
# source of these numbers are not shown in this notebook; confirm.
loc_trend_map = {"bavdhan" : 6.5 , "mahalunge" : 15.2 , "balewadi" :  8.3 , "ravet" : 3.6  , "baner" : 14.7  , "kharadi" :  11.5 , "koregaon park" :  13.7 , "keshav nagar" :  -2.4 , 
        "kirkatwadi sinhagad road" :  2.4 , "akurdi" :  -0.7 , "tathawade" : 4.0  , "hadapsar" :  25.2 , "kiwale" : 5.7  , "kalyani nagar" : 24.5  , "pisoli" :  -4.0 , "manjri" : 0.0  ,
        "handewadi" : 2.1  , "mundhwa" : 0.0  , "nibm" : 1.0  , "bt kawade rd" : 3.6  , "undri" : 2.5  , "karvenagar" : 2.0  , "magarpatta" : 12.1  , "hinjewadi" : 8.0  , "vimannagar" : 17.0  , 
        "wadgaon sheri" : 38.4  , "susgaon" : 3.1  , "mohammadwadi" : 4.3  , "dhanori" : 4.3  , "lonavala" : 0  , "talegoan" : 0 }

# Ordinal tier index -> human-readable tier tag.
val_tag_map = {
    0: "unknown",
    1: "affordable",
    2: "midrange",
    3: "premium",
}

# Lower-cased locality name -> ordinal tier index (keys of val_tag_map).
area_idx_dict = {
    "bavdhan": 3,
    "mahalunge": 3,
    "balewadi": 3,
    "ravet": 1,
    "baner": 3,
    "kharadi": 3,
    "koregaon park": 3,
    "keshav nagar": 2,
    "kirkatwadi sinhagad road": 1,
    "akurdi": 2,
    "tathawade": 2,
    "hadapsar": 3,
    "kiwale": 1,
    "kalyani nagar": 3,
    "pisoli": 1,
    "manjri": 2,
    "handewadi": 1,
    "mundhwa": 2,
    "nibm": 3,
    "bt kawade rd": 2,
    "undri": 2,
    "karvenagar": 3,
    "magarpatta": 3,
    "hinjewadi": 2,
    "vimannagar": 3,
    "wadgaon sheri": 3,
    "susgaon": 2,
    "mohammadwadi": 3,
    "dhanori": 2,
    "lonavala": 0,
    "talegoan": 0,
}

# Locality name -> tier tag, derived by chaining the two lookups above.
loc_tag_map = {locality: val_tag_map[tier] for locality, tier in area_idx_dict.items()}

In [8]:
# Parse the raw area entries (single values or ranges) into floats and impute the
# ones that could not be parsed with the column mean. The result is assigned back
# instead of calling `fillna(..., inplace=True)` on a column selection: that
# chained in-place form is deprecated in pandas 2.x and silently does nothing
# under Copy-on-Write.
clean_df["Area_sqft"] = clean_df["Area_sqft"].apply(get_area)
clean_df["Area_sqft"] = clean_df["Area_sqft"].fillna(clean_df["Area_sqft"].mean())

# Lower-case and strip every object (text) column so that the dictionary lookups
# below match their lower-case keys.
clean_df = clean_df.apply(lambda x:x.str.lower() if x.dtypes == "O" else x)
clean_df = clean_df.apply(lambda x:x.str.strip() if x.dtypes == "O" else x)

# Bedroom count parsed from the property type; labels without one get the mean.
clean_df["No_of_Bedroom"] = clean_df["Type"].apply(get_bedroom_size)
clean_df["No_of_Bedroom"] = clean_df["No_of_Bedroom"].fillna(clean_df["No_of_Bedroom"].mean())

# Township area -> size label -> ordinal code (0 when unknown).
clean_df["Township_Size_Ordinal"] = clean_df["Area_township"].apply(get_township_size).map(ts_size_map).fillna(0)

# Locality-derived features; localities missing from the lookup tables fall back
# to 0 / "unknown".
clean_df["Loc_trend"] = clean_df.Location.map(loc_trend_map).fillna(0)
clean_df["Loc_tag"] = clean_df.Location.map(loc_tag_map).fillna("unknown")
clean_df["Loc_tag_ordinal"] = clean_df.Location.map(area_idx_dict).fillna(0)

In [9]:
dummy_cat = pd.get_dummies(clean_df[["hasClubHouse","hasEduFacility",	"hasHospital",	"hasMall",	"hasParkOrJogTrack",	"hasPool",	"hasGym"]], drop_first=True, dtype=int)
clean_df = pd.concat([clean_df, dummy_cat], axis=1)

In [10]:
clean_df = clean_df.drop(["Area_township","Type", "Price_Lakhs", "Loc_tag", "hasClubHouse","hasEduFacility",	"hasHospital",	"hasMall",	"hasParkOrJogTrack",	"hasPool",	"hasGym"], axis=1)
clean_df.head()

Unnamed: 0,Location,Area_sqft,Price_Mil,Developer,Name,No_of_Bedroom,Township_Size_Ordinal,Loc_trend,Loc_tag_ordinal,hasClubHouse_yes,hasEduFacility_yes,hasHospital_yes,hasMall_yes,hasParkOrJogTrack_yes,hasPool_yes,hasGym_yes
0,bavdhan,492.0,3.9,shapoorji paloonji,vanaha,1.0,3,6.5,3.0,1,1,1,1,1,1,1
1,bavdhan,774.0,6.5,shapoorji paloonji,vanaha,2.0,3,6.5,3.0,1,1,1,1,1,1,1
2,bavdhan,889.0,7.4,shapoorji paloonji,vanaha,3.0,3,6.5,3.0,1,1,1,1,1,1,1
3,bavdhan,1018.0,8.9,shapoorji paloonji,vanaha,3.0,3,6.5,3.0,1,1,1,1,1,1,1
4,mahalunge,743.0,7.4,godrej properties,godrej hills retreat,2.0,2,15.2,3.0,1,1,1,1,1,1,1


In [11]:
y = clean_df["Price_Mil"]
X = clean_df.drop(["Price_Mil"], axis=1)

test_ratio = 0.2
random_seed = 42

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=test_ratio, random_state=random_seed)

In [12]:
full_train_df = pd.concat([X_train, y_train], axis=1)
full_test_df = pd.concat([X_test, y_test], axis=1)
print("Train data shape: ", full_train_df.shape)
print("Test data shape: ", full_test_df.shape)

Train data shape:  (160, 16)
Test data shape:  (40, 16)


In [14]:
# Target-encode the high-cardinality categorical features on the training split.
cat_feat = ["Location", "Developer", "Name"]
# TargetEncoder.fit_transform uses internal cross-fitting with shuffled folds.
# Without a fixed random_state the encoded training values (and therefore the
# uploaded train-v1.csv and the trained model) change from run to run, so the
# notebook's seed is passed through.
tgt_encoder = TargetEncoder(target_type="continuous", random_state=random_seed)
full_train_df[cat_feat] = tgt_encoder.fit_transform(full_train_df[cat_feat], full_train_df["Price_Mil"])

In [16]:
num_feat = cat_feat + ["Area_sqft","Loc_trend"]
print(num_feat)

['Location', 'Developer', 'Name', 'Area_sqft', 'Loc_trend']


In [17]:
# Scaling Numerical features
train_scalar = StandardScaler()
full_train_df[num_feat] = train_scalar.fit_transform(full_train_df[num_feat])
full_train_df.describe()

Unnamed: 0,Location,Area_sqft,Developer,Name,No_of_Bedroom,Township_Size_Ordinal,Loc_trend,Loc_tag_ordinal,hasClubHouse_yes,hasEduFacility_yes,hasHospital_yes,hasMall_yes,hasParkOrJogTrack_yes,hasPool_yes,hasGym_yes,Price_Mil
count,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0
mean,4.218847e-16,1.1102230000000002e-17,-1.1102230000000002e-17,-2.275957e-16,2.414965,0.35,-3.3306690000000003e-17,2.0875,0.725,0.0375,0.0375,0.0375,0.9875,0.6125,0.80625,8.539006
std,1.00314,1.00314,1.00314,1.00314,0.834041,0.646228,1.00314,1.005566,0.447916,0.19058,0.19058,0.19058,0.111451,0.488709,0.396476,5.182403
min,-2.045433,-2.083066,-2.317809,-1.551382,1.0,0.0,-1.256738,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.319
25%,-0.6164377,-0.5919538,-0.6898351,-0.6069461,2.0,0.0,-0.7193835,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,5.175
50%,-0.2462911,-0.2340069,-0.221519,-0.2292525,2.0,0.0,-0.3808501,2.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,7.125
75%,0.5886112,0.3919002,0.5059715,0.2956686,3.0,1.0,0.4090612,3.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,9.74675
max,3.874727,4.140011,2.769423,7.218445,6.0,3.0,3.300029,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,42.0


In [18]:
full_test_df[cat_feat] = tgt_encoder.transform(full_test_df[cat_feat])
full_test_df[num_feat] = train_scalar.transform(full_test_df[num_feat])
full_test_df.describe()

Unnamed: 0,Location,Area_sqft,Developer,Name,No_of_Bedroom,Township_Size_Ordinal,Loc_trend,Loc_tag_ordinal,hasClubHouse_yes,hasEduFacility_yes,hasHospital_yes,hasMall_yes,hasParkOrJogTrack_yes,hasPool_yes,hasGym_yes,Price_Mil
count,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0
mean,-0.007442,-0.002741,0.031529,0.008108,2.3125,0.2,0.029958,1.9,0.8,0.0,0.0,0.0,0.95,0.725,0.825,9.303225
std,0.788387,1.068877,0.961375,0.848736,0.882069,0.516398,1.069873,1.057331,0.405096,0.0,0.0,0.0,0.220721,0.452203,0.384808,7.679335
min,-1.128898,-1.296516,-1.221559,-0.849684,1.0,0.0,-1.256738,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.35
25%,-0.544481,-0.601286,-0.692285,-0.598344,2.0,0.0,-0.719384,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,5.24925
50%,-0.13625,-0.275334,-0.20634,-0.298241,2.0,0.0,-0.418465,2.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,7.0705
75%,0.357787,0.404565,0.225791,0.334134,3.0,0.0,0.495038,3.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,8.975
max,1.680334,3.660082,1.86255,2.291417,5.0,2.0,3.300029,3.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,46.0


In [35]:
full_train_df.to_csv("train-v1.csv", index=False)
full_test_df.to_csv("test-v1.csv", index=False)

In [21]:
features = list(full_train_df.columns)
print(features)
label = features.pop(-1)
print(features, label)


['Location', 'Area_sqft', 'Developer', 'Name', 'No_of_Bedroom', 'Township_Size_Ordinal', 'Loc_trend', 'Loc_tag_ordinal', 'hasClubHouse_yes', 'hasEduFacility_yes', 'hasHospital_yes', 'hasMall_yes', 'hasParkOrJogTrack_yes', 'hasPool_yes', 'hasGym_yes', 'Price_Mil']
['Location', 'Area_sqft', 'Developer', 'Name', 'No_of_Bedroom', 'Township_Size_Ordinal', 'Loc_trend', 'Loc_tag_ordinal', 'hasClubHouse_yes', 'hasEduFacility_yes', 'hasHospital_yes', 'hasMall_yes', 'hasParkOrJogTrack_yes', 'hasPool_yes', 'hasGym_yes'] Price_Mil


In [36]:
# Send data to S3. SageMaker will be using this data to train the model further.
sk_prefix = "sagemaker/pune_house_price_prediction/sklearncontainer"
trainpath = sess.upload_data(
    path="train-v1.csv", bucket=bucket, key_prefix=sk_prefix
)
testpath = sess.upload_data(
    path="test-v1.csv", bucket=bucket, key_prefix=sk_prefix
)
print("Train data path : ", trainpath)
print("Test data path : ", testpath)

Train data path :  s3://housepricebucketsagemaker/sagemaker/pune_house_price_prediction/sklearncontainer/train-v1.csv
Test data path :  s3://housepricebucketsagemaker/sagemaker/pune_house_price_prediction/sklearncontainer/test-v1.csv


#### Data Ingestion is done

## SageMaker scripting and modeling

In [40]:
%%writefile script.py

import os
import sklearn
import pandas as pd
import pathlib
import argparse
import numpy as np
import joblib
import boto3

from sklearn.linear_model import LinearRegression

def print_results(y_pred, y_act):
    """Print the mean squared error and its square root for the given predictions.

    Both lines are labelled "Test" regardless of which split is passed in.
    Nothing is returned.
    """
    squared_errors = (y_pred - y_act)**2
    mse = np.mean(squared_errors)
    print("[INFO] Test MSE: ", mse)

    # RMSE is derived from the MSE computed above.
    rmse = np.sqrt(mse)
    print("[INFO] Test RMSE: ", rmse)

def model_fn(model_dir):
    """Load the trained regressor for the SageMaker inference container.

    Parameters
    ----------
    model_dir : str
        Directory that contains the "model.joblib" artifact written by the
        training entry point of this script.

    Returns
    -------
    object
        The deserialised model. It must be returned: the serving container uses
        the return value of model_fn as the model for prediction, so a missing
        return would hand it None.
    """
    lin_reg_model = joblib.load(os.path.join(model_dir, "model.joblib"))
    return lin_reg_model



# Training entry point: parse arguments, read the train/test CSVs, fit a linear
# regression, persist it as model.joblib and print train/test metrics.
if __name__ == "__main__":

    print("[INFO] Checking args")
    parser = argparse.ArgumentParser()

    # NOTE(review): --random_state is accepted but never read in this block.
    parser.add_argument("--random_state", type=int, default=42)

    # Data, model, and output directories
    # Defaults come from the SM_* environment variables (presumably set by the
    # SageMaker training container — confirm); they are None when unset.

    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="train-v1.csv")
    parser.add_argument("--test-file", type=str, default="test-v1.csv")

    # parse_known_args ignores any extra arguments instead of failing on them.
    args, _ = parser.parse_known_args()

    print("Scikit-Learn Version: ", sklearn.__version__)
    print("Joblib Version: ", joblib.__version__)

    print("[INFO] Reading Data")
    print(args.train, args.train_file)
    print(args.test, args.test_file)
    train_path = os.path.join(args.train, args.train_file)
    train_df = pd.read_csv(train_path)
    test_path = os.path.join(args.test, args.test_file)
    test_df = pd.read_csv(test_path)

    # The label is taken to be the LAST column of the training CSV; every other
    # column is a feature. The test CSV is indexed with the same column names.
    features = list(train_df.columns)
    label = features.pop(-1)

    print("[INFO] Building training and testing datasets \n")

    X_train =train_df[features]
    X_test =test_df[features]

    y_train = train_df[label]
    y_test = test_df[label]

    print("Column orrder : \n", features)
    print("Label column is : \n", label)
    print("Data Shape: \n")
    print("Train features: ", X_train.shape, ", Label: ", y_train.shape)
    print("Test features: ", X_test.shape, ", Label: ", y_test.shape)

    print("[INFO] Training simple regressor:\n")

    lin_reg_model = LinearRegression()
    lin_reg_model.fit(X_train, y_train)
    print()

    # Persist under the file name that model_fn loads at inference time.
    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(lin_reg_model, model_path)

    print("Model stored at: ", model_path)
    print()
    
    y_pred_train = lin_reg_model.predict(X_train)
    y_pred_test = lin_reg_model.predict(X_test)

    # print_results labels its output "Test" for both calls; the first pair of
    # lines below is the TRAINING error.
    print_results(y_pred_train, y_train)
    print("Training R2: ", lin_reg_model.score(X_train, y_train))

    print_results(y_pred_test, y_test)
    print("Testing R2: ", lin_reg_model.score(X_test, y_test))    

Overwriting script.py


In [41]:
from sagemaker.sklearn.estimator import SKLearn

# scikit-learn container version, shared by the estimator here and the
# SKLearnModel created later in the notebook.
FRAMEWORK_VERSION = "0.23-1"

# Training-job definition: runs script.py on a single ml.m5.large instance.
# NOTE(review): the IAM role ARN is hard-coded here and repeated where the
# SKLearnModel is created; consider a single configuration constant.
# NOTE(review): max_wait / max_run are presumably in seconds, with max_wait
# bounding the spot-instance wait plus run time — confirm against the SDK docs.
sklearn_estimator = SKLearn(
    entry_point="script.py",
    role="arn:aws:iam::178244213000:role/service-role/SageMaker-SageMaker_prediction_role",
    instance_count=1,
    instance_type="ml.m5.large",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="LR-custom-sklearn",
    hyperparameters={
        "random_state" : 42,
    },
    use_spot_instances=True,
    max_wait=7200,
    max_run=3600
)

sagemaker.config INFO - Not applying SDK defaults from location: C:\ProgramData\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\Users\shrin\AppData\Local\sagemaker\sagemaker\config.yaml


In [42]:
sklearn_estimator.fit({"train":trainpath, "test":testpath}, wait=True)

Using provided s3_resource


INFO:sagemaker:Creating training-job with name: LR-custom-sklearn-2023-12-13-04-49-27-792


2023-12-13 04:49:32 Starting - Starting the training job...
2023-12-13 04:49:46 Starting - Preparing the instances for training......
2023-12-13 04:51:05 Downloading - Downloading input data......
2023-12-13 04:52:11 Training - Training image download completed. Training in progress..2023-12-13 04:52:16,496 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2023-12-13 04:52:16,499 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2023-12-13 04:52:16,535 sagemaker_sklearn_container.training INFO     Invoking user training script.
2023-12-13 04:52:16,687 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2023-12-13 04:52:16,699 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2023-12-13 04:52:16,711 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2023-12-13 04:52:16,720 sagemaker-training-toolkit INFO     Invoking user sc

In [43]:
sklearn_estimator.latest_training_job.wait(logs="None")
artifact = sm_boto3.describe_training_job(
    TrainingJobName=sklearn_estimator.latest_training_job.name
)["ModelArtifacts"]["S3ModelArtifacts"]

print("Model Artifacts stored at : ", artifact)


2023-12-13 04:52:37 Starting - Preparing the instances for training
2023-12-13 04:52:37 Downloading - Downloading the training image
2023-12-13 04:52:37 Training - Training image download completed. Training in progress.
2023-12-13 04:52:37 Uploading - Uploading generated training model
2023-12-13 04:52:37 Completed - Training job completed
Model Artifacts stored at :  s3://sagemaker-us-east-1-178244213000/LR-custom-sklearn-2023-12-13-04-49-27-792/output/model.tar.gz


In [44]:
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

# Timestamped name so repeated runs do not collide on the model name.
model_name = "Custom-sklearn-model-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
# Wrap the trained artifact (S3 path read from the training-job description) as
# a deployable model. script.py is reused as the entry point because it defines
# model_fn, which loads model.joblib.
# NOTE(review): the IAM role ARN duplicates the one passed to the estimator.
model = SKLearnModel(
    name= model_name,
    model_data= artifact,
    role= "arn:aws:iam::178244213000:role/service-role/SageMaker-SageMaker_prediction_role",
    entry_point="script.py",
    framework_version= FRAMEWORK_VERSION
)

sagemaker.config INFO - Not applying SDK defaults from location: C:\ProgramData\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\Users\shrin\AppData\Local\sagemaker\sagemaker\config.yaml


In [45]:
model

<sagemaker.sklearn.model.SKLearnModel at 0x17cf2e5be20>

In [56]:
# Deploy as an endpoint

endpoint_name = "Custom-sklearn-model-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print("EndpointName={}".format(endpoint_name))

predictor = model.deploy(
    initial_instance_count = 1,
    instance_type="ml.m4.xlarge",
    endpoint_name=endpoint_name,
)

EndpointName=Custom-sklearn-model-2023-12-13-06-41-37


INFO:sagemaker:Creating model with name: Custom-sklearn-model-2023-12-13-06-18-00
INFO:sagemaker:Creating endpoint-config with name Custom-sklearn-model-2023-12-13-06-41-37
INFO:sagemaker:Creating endpoint with name Custom-sklearn-model-2023-12-13-06-41-37


-----!

In [57]:
predictor.predict

<bound method Predictor.predict of <sagemaker.sklearn.model.SKLearnPredictor object at 0x0000017CF4604280>>