<center><h1 style="color:#CC0099">House-prices-modeling</h1></center>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import sys
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
import joblib
import os
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from typing import List, Tuple

<h1>Model Building: <span style="color:#6666CC">Data Preparation</span></h1>

### 1. Dataset loading and splitting into train and test

In [2]:
# Column holding the value the model is trained to predict.
label_col = 'SalePrice'

# Input columns, grouped by the transformation applied to them later on:
# categorical columns are one-hot encoded, continuous columns are scaled.
categorical_columns = ['Foundation', 'KitchenQual']
continuous_columns = ['TotRmsAbvGrd', 'WoodDeckSF', 'YrSold', '1stFlrSF']

# All input columns: categorical ones first, continuous ones second.
useful_features = [*categorical_columns, *continuous_columns]

In [3]:
def reading_csv(file_path: str) -> pd.DataFrame:
    """Load a CSV file into a DataFrame.

    Args:
        file_path: Location of the CSV file; handed unchanged to ``pd.read_csv``.

    Returns:
        The file contents parsed with pandas' default options.
    """
    return pd.read_csv(file_path)

# Labelled data set; the relative path is resolved against the kernel's working directory.
train_csv = reading_csv(file_path='../data/train.csv')

In [4]:
def splitting_df(initial_df: pd.DataFrame, test_size: float = 0.33,
                 random_state: int = 42) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Split a DataFrame into a training part and a held-out test part.

    Args:
        initial_df: Frame to split row-wise.
        test_size: Share of the rows assigned to the test part. Defaults to
            0.33, the value this notebook has always used.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible between runs. Defaults to 42.

    Returns:
        ``(train_df, test_df)`` as produced by ``train_test_split``.
    """
    train_df, test_df = train_test_split(initial_df, test_size=test_size, random_state=random_state)
    return train_df, test_df

# Hold out part of the labelled rows so the model can be evaluated on data it was not fitted on.
train_df, test_df = splitting_df(initial_df=train_csv)

In [5]:
def useful_df(data_df: pd.DataFrame, useful_features: List[str], label_col: str) -> pd.DataFrame:
    """Keep only the model features and the label column.

    Args:
        data_df: Frame containing at least the requested columns.
        useful_features: Names of the feature columns to keep, in output order.
        label_col: Name of the label column; placed last in the result.

    Returns:
        A new, independent DataFrame holding the feature columns followed by
        the label column.

    Raises:
        KeyError: If any requested column is absent from ``data_df``.
    """
    selected_columns = list(useful_features) + [label_col]
    # Return an explicit copy: a bare column subset stays flagged as derived
    # from ``data_df``, so assigning into it later raises pandas'
    # SettingWithCopyWarning.
    return data_df[selected_columns].copy()

# Trim both splits down to the selected features plus the label.
train_df, test_df = (
    useful_df(split, useful_features, label_col) for split in (train_df, test_df)
)

### 2. Preprocessing and feature engineering

In [6]:
def preprocessing(data: pd.DataFrame, categorical_columns: List[str], continuous_columns: List[str]) -> pd.DataFrame:
    """Impute missing values, drop duplicated rows and reset the index.

    Missing categorical values are replaced by each column's mode and missing
    continuous values by each column's mean, both computed on ``data`` itself.

    Args:
        data: Frame to clean. It is left untouched; the work happens on a copy.
        categorical_columns: Columns imputed with their most frequent value.
        continuous_columns: Columns imputed with their mean.

    Returns:
        A new DataFrame with imputed values, duplicated rows removed (first
        occurrence kept) and a fresh 0..n-1 index.
    """
    # Work on a copy so the caller's frame is never modified in place (this
    # also avoids SettingWithCopyWarning when ``data`` is a column subset).
    cleaned = data.copy()

    if len(categorical_columns) > 0:
        modes = cleaned[categorical_columns].mode()
        # ``mode()`` has no rows when the columns contain no values at all;
        # there is nothing to impute with in that case, so skip the fill
        # instead of failing on ``iloc[0]``.
        if not modes.empty:
            cleaned[categorical_columns] = cleaned[categorical_columns].fillna(modes.iloc[0])

    if len(continuous_columns) > 0:
        means = cleaned[continuous_columns].mean()
        cleaned[continuous_columns] = cleaned[continuous_columns].fillna(means)

    # Deduplicate after imputation so rows that became identical once filled
    # are removed as well, then renumber the remaining rows.
    return cleaned.drop_duplicates(keep='first').reset_index(drop=True)

# Each split is cleaned on its own, so the test split is imputed with its own
# mode/mean rather than with statistics taken from the training split.
train_df = preprocessing(data=train_df, categorical_columns=categorical_columns,
                         continuous_columns=continuous_columns)
test_df = preprocessing(data=test_df, categorical_columns=categorical_columns,
                        continuous_columns=continuous_columns)

<h4 style="color:#084A68">Saving the scaler and encoder</h4>

In [7]:
def fit_scaler_encoder(df_train: pd.DataFrame, categorical_columns: List[str], 
                       continuous_columns: List[str]) -> Tuple[StandardScaler, OneHotEncoder]:
    """Fit the feature transformers on the training data.

    Args:
        df_train: Training frame containing the listed columns.
        categorical_columns: Columns the one-hot encoder is fitted on.
        continuous_columns: Columns the standard scaler is fitted on.

    Returns:
        ``(scaler, encoder)``, both fitted on ``df_train``.
    """
    # Fitting the scaler on the train_df
    scaler = StandardScaler()
    scaler.fit(df_train[continuous_columns])

    # Fitting the encoder on the train_df.
    # handle_unknown='ignore': a category that is absent from the training
    # split but shows up at transform time is encoded as all zeros instead of
    # raising, so a rare category cannot break evaluation or inference.
    encoder = OneHotEncoder(handle_unknown='ignore')
    encoder.fit(df_train[categorical_columns])

    return scaler, encoder
            
scaler, encoder = fit_scaler_encoder(train_df, categorical_columns, continuous_columns)

# Saving the scaler and the encoder to use them after for transforming
models_folder = '../models'
# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing (no-op when it is already there).
os.makedirs(models_folder, exist_ok=True)
joblib.dump(scaler, os.path.join(models_folder, 'scaler.joblib'))
joblib.dump(encoder, os.path.join(models_folder, 'encoder.joblib'))

['../models/encoder.joblib']

In [8]:
def transform_scaler_encoder(data_df: pd.DataFrame, categorical_columns: List[str], 
                             continuous_columns: List[str], models_folder: str) -> pd.DataFrame:
    """Scale and one-hot encode a frame with the transformers saved on disk.

    Args:
        data_df: Frame containing the listed columns.
        categorical_columns: Columns passed to the saved encoder.
        continuous_columns: Columns passed to the saved scaler.
        models_folder: Folder holding ``scaler.joblib`` and ``encoder.joblib``.

    Returns:
        A frame with the scaled continuous columns followed by the one-hot
        columns, indexed 0..n-1.
    """
    # Reload the fitted transformers from disk rather than relying on notebook state.
    scaler, encoder = (
        joblib.load(os.path.join(models_folder, file_name))
        for file_name in ('scaler.joblib', 'encoder.joblib')
    )

    scaled_df = pd.DataFrame(
        data=scaler.transform(data_df[continuous_columns]),
        columns=continuous_columns,
    )

    # The encoder output is converted with ``toarray()`` before building the frame.
    one_hot = encoder.transform(data_df[categorical_columns])
    encoded_df = pd.DataFrame(
        data=one_hot.toarray(),
        columns=encoder.get_feature_names_out(categorical_columns),
    )

    # Both parts carry a default index, so they line up row by row.
    return pd.concat([scaled_df, encoded_df], axis=1)
 
# Feature matrices are built with the persisted transformers; targets are the label column as-is.
X_train, X_test = (
    transform_scaler_encoder(split, categorical_columns, continuous_columns, models_folder)
    for split in (train_df, test_df)
)
y_train, y_test = train_df[label_col], test_df[label_col]

<h1>Model Building: <span style="color:#6666CC">Model Training and Evaluation</span></h1>


In [9]:
def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    """Return the root mean squared logarithmic error, rounded.

    Args:
        y_test: Ground-truth target values.
        y_pred: Predicted target values.
        precision: Number of decimals kept in the result. Defaults to 2.

    Returns:
        Square root of ``mean_squared_log_error(y_test, y_pred)``, rounded to
        ``precision`` decimals.
    """
    msle = mean_squared_log_error(y_test, y_pred)
    return round(np.sqrt(msle), precision)

def build_model(X_train: pd.DataFrame, y_train: pd.DataFrame, X_test: pd.DataFrame, y_test: pd.DataFrame) -> dict:
    """Fit a linear regression and score it on the held-out data.

    Args:
        X_train: Training features.
        y_train: Training targets.
        X_test: Features used for evaluation.
        y_test: Targets used for evaluation.

    Returns:
        A dict with the fitted estimator under ``'model'`` and the metrics
        ``'mse'``, ``'rmse'``, ``'mae'``, ``'r2'`` and ``'rmsle'`` computed on
        the evaluation data. Only ``'rmsle'`` is rounded (by ``compute_rmsle``).
    """
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # MSE is computed once and reused for its square root.
    mse = mean_squared_error(y_test, y_pred)

    return {
        'model': model,
        'mse': mse,
        'rmse': np.sqrt(mse),
        'mae': mean_absolute_error(y_test, y_pred),
        'r2': r2_score(y_test, y_pred),
        'rmsle': compute_rmsle(y_test, y_pred),
    }

result = build_model(X_train, y_train, X_test, y_test)

# Persist the fitted estimator as model.joblib inside the models folder
model = result['model']
joblib.dump(value=model, filename=os.path.join(models_folder, 'model.joblib'))

['../models/model.joblib']

<h1 style="color:#6666CC">Model inference</h1>

In [10]:
# Push the inference data through the same cleaning and transformation steps
# that were applied to the training data.
house_test = preprocessing(reading_csv('../data/test.csv'), categorical_columns, continuous_columns)
test_dataset = transform_scaler_encoder(
    house_test, categorical_columns, continuous_columns, models_folder
)

def make_predictions(test_dataset: pd.DataFrame, models_folder: str) -> np.ndarray:
    """Predict with the model stored in ``models_folder``.

    Args:
        test_dataset: Already transformed features to predict on.
        models_folder: Folder containing ``model.joblib``.

    Returns:
        The output of the loaded model's ``predict`` for ``test_dataset``.
    """
    # The estimator is read back from disk instead of being taken from notebook state.
    model_path = os.path.join(models_folder, 'model.joblib')
    return joblib.load(model_path).predict(test_dataset)

# Last expression of the cell, so the predicted values are displayed as its output.
make_predictions(test_dataset=test_dataset, models_folder=models_folder)

array([120547.89852744, 206552.7998175 , 165279.21985763, ...,
       181477.99967176, 158810.53764499, 204258.36547822])