Below are some packages that must be installed if they are not already installed.

In [1]:

#!pip install category_encoders
#!pip install pandas
#!pip install scikit-learn
#!unzip arquive.zip (optional)

In [2]:
#Importing the required libraries
import pandas as pd
import numpy as np
import pickle #Pickling” is the process whereby a Python object hierarchy is converted into a byte stream
#import warnings

#warnings.simplefilter("ignore")

from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_percentage_error,
    mean_absolute_error)

from category_encoders import TargetEncoder
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV


In [3]:
sklearn.__version__  #to use in requeriments.txt

'1.2.1'

I preferred to turn the categorical variables into dummies and created two more columns after noticing a high correlation of bathrooms and room numbers.
Additionally, I wanted to see if the room rate per house was highly correlated with price.

In [20]:
#Defining a function to load the training and test data into pandas DataFrames
def load_data(train_path:str, test_path:str) -> (pd.DataFrame, pd.DataFrame):
    """
    Loads the train and test data into pandas DataFrames
    """

    train = pd.read_csv(train_path)
    
    #adding new features to the model
    train = train.join(pd.get_dummies(train.sector)).drop(['sector'],axis=1) #placing the "type" category in the training set
    train = train.join(pd.get_dummies(train.type)).drop(['type'],axis=1)
    train['bathroom_ratio'] = train['n_bathroom']/train['n_rooms']
    train['household_rooms']=train['n_rooms']/(train['casa']+train['departamento'])
    train = train.dropna()
    
    # Mapping dictionary to rename the columns
    column_mapping = {
        "net_usable_area": "net_usable_area",
        "net_area": "net_area",
        "n_rooms": "n_rooms",
        "n_bathroom": "n_bathroom",
        "latitude": "latitude",
        "longitude": "longitude",
        "price": "price",
        "la reina": "la_reina",
        "las condes": "las_condes",
        "lo barnechea": "lo_barnechea",
        "nunoa": "nunoa",
        "providencia": "providencia",
        "vitacura": "vitacura",
        "casa": "casa",
        "departamento": "departamento",
        "bathroom_ratio": "bathroom_ratio",
        "household_rooms": "household_rooms"
    }

    # Renaming the columns
    train = train.rename(columns=column_mapping)

    #feature engineering

    test = pd.read_csv(test_path)
    
    #adding new features to the model
    test=test.join(pd.get_dummies(test.sector)).drop(['sector'],axis=1)
    test = test.join(pd.get_dummies(test.type)).drop(['type'],axis=1)
    test['bathroom_ratio'] = test['n_bathroom']/test['n_rooms']
    test['household_rooms']=test['n_rooms']/(test['casa']+test['departamento'])
    test = test.dropna()

      # Mapping dictionary to rename the columns
    column_mapping = {
        "net_usable_area": "net_usable_area",
        "net_area": "net_area",
        "n_rooms": "n_rooms",
        "n_bathroom": "n_bathroom",
        "latitude": "latitude",
        "longitude": "longitude",
        "price": "price",
        "la reina": "la_reina",
        "las condes": "las_condes",
        "lo barnechea": "lo_barnechea",
        "nunoa": "nunoa",
        "providencia": "providencia",
        "vitacura": "vitacura",
        "casa": "casa",
        "departamento": "departamento",
        "bathroom_ratio": "bathroom_ratio",
        "household_rooms": "household_rooms"
    }

    # Renaming the columns
    test = test.rename(columns=column_mapping)
    
    return train, test
#Load the training and test data
train, test = load_data('./learningFiles/train.csv', './learningFiles/test.csv')


   net_usable_area  net_area  n_rooms  n_bathroom  latitude  longitude  price  \
0            152.0     257.0      3.0         3.0 -33.37940  -70.54470  18500   
1            140.0     165.0      4.0         4.0 -33.41135  -70.56977  14500   
2            101.0     101.0      4.0         3.0 -33.44154  -70.55704   6522   
3             80.0     112.0      1.0         2.0 -33.42486  -70.60868   6100   
4            200.0     200.0      3.0         4.0 -33.40490  -70.59450  19000   

   la_reina  las_condes  lo_barnechea  nunoa  providencia  vitacura   casa  \
0     False       False         False  False        False      True   True   
1     False        True         False  False        False     False  False   
2      True       False         False  False        False     False  False   
3     False       False         False  False         True     False  False   
4     False       False         False  False        False      True  False   

   departamento  bathroom_ratio  household_r

In [5]:
#Specifing the columns to use for training, the categorical columns, and the target variable:
train_cols = [
    col for col in train.columns if col not in ['id', 'target']
    ]

#|categorical_cols = ["type", "sector"]
#categorical_cols = ["type"]
target           = "price"

In [6]:
#Defining the categorical transformer using TargetEncoder
categorical_transformer = TargetEncoder()

#Create a ColumnTransformer to apply the categorical transformer to the categorical columns
#preprocessor = ColumnTransformer(
##    transformers=[
#        ('categorical',
#          categorical_transformer,
#          categorical_cols)
#    ])

#Defining the steps of the pipeline, including the preprocessor and the model (GradientBoostingRegressor)
steps = [
 #   ('preprocessor', preprocessor),
    ('model', GradientBoostingRegressor(**{
        "learning_rate":0.03, #alpha
        "n_estimators":2500,   #m
        "max_depth":6,
        "loss":"absolute_error"
    }))
]
#Creating the pipeline using the defined steps
pipeline = Pipeline(steps)


In [7]:
#Fitting the pipeline on the training data:
pipeline.fit(train[train_cols], train[target])

In [8]:
#Predicting property valuations for the test data using the trained model:
test_predictions = pipeline.predict(test[train_cols])
test_target = test[target].values

In [9]:
type(test_predictions), type(test_target)

(numpy.ndarray, numpy.ndarray)

In [10]:
#Model Evaluation
#defining a function to print evaluation metrics:
def print_metrics(predictions, target):
    print("RMSE: ", np.sqrt(mean_squared_error(predictions, target)))
    print("MAPE: ", mean_absolute_percentage_error(predictions, target))
    print("MAE : ", mean_absolute_error(predictions, target))

In [11]:
print_metrics(test_predictions, test_target)

RMSE:  236.50304609785385
MAPE:  0.0007902052430558677
MAE :  12.244290911099439


Exporting the model

In [12]:
with open('trained_pipeline-0.1.0.pkl','wb') as b:
    pickle.dump(pipeline, b)

In [13]:
!zip -r ./trained_pipeline_model-0.1.0.pkl.zip ./trained_pipeline-0.1.0.pkl

'zip' n�o � reconhecido como um comando interno
ou externo, um programa oper�vel ou um arquivo em lotes.


Testing model idempotency with several different calls

In [14]:
pipeline.predict(train.loc[[0]])

array([11899.99938052])

In [15]:
print(type(train[0:1]))
pipeline.predict(train[0:1])

<class 'pandas.core.frame.DataFrame'>


array([11899.99938052])

In [16]:
# net_usable_area  net_area  n_rooms  n_bathroom  latitude  longitude  price  \
#la reina  las condes  lo barnechea  nunoa  providencia  vitacura   casa  \
#departamento  bathroom_ratio  household_rooms  
apicsv = pd.read_csv("./learningFiles/api.csv",sep=",")
print(apicsv)
pipeline.predict(apicsv.loc[[0]])

   net_usable_area  net_area  n_rooms  n_bathroom  latitude  longitude  price  \
0            140.0     170.0      4.0         4.0 -33.40123  -70.58056  11900   
1            140.0     170.0      4.0         4.0 -33.40123  -70.58056  11900   
2            140.0     170.0      4.0         4.0 -33.40123  -70.58056  11900   
3            140.0     170.0      4.0         4.0 -33.40123  -70.58056  11900   
4            140.0     170.0      4.0         4.0 -33.40123  -70.58056  11900   
5            140.0     170.0      4.0         4.0 -33.40123  -70.58056  11900   
6            140.0     170.0      4.0         4.0 -33.40123  -70.58056  11900   
7            140.0     170.0      4.0         4.0 -33.40123  -70.58056  11900   
8            140.0     170.0      4.0         4.0 -33.40123  -70.58056  11900   
9            140.0     170.0      4.0         4.0 -33.40123  -70.58056  11900   

   la reina  las condes  lo barnechea  nunoa  providencia  vitacura  casa  \
0         0           0        

array([11899.99938052])

In [17]:
with open(f"trained_pipeline-0.1.0.pkl","rb") as f:
    model = pickle.load(f) #load the model
model.predict(apicsv.loc[[0]])

array([11899.99938052])