# Model Building

## Model Training

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
import numpy as np
import joblib

# ************Features Selection************

# Continuous features
continuous_features = ['LotArea', 'YearBuilt', '1stFlrSF', 'GrLivArea']

# Categorical features
kitchen_quality_column = 'KitchenQual'
categorical_features = ['Neighborhood', 'HouseStyle', 'OverallQual', 'OverallCond', kitchen_quality_column]
kitchen_quality_dict = {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po': 0}

# Target value
targeted = 'SalePrice'

# ************Preprocessing*****************


def preprocess_data(X, fit=False):

    # Initialize the scalers and encoders
    scaler = StandardScaler()
    one_hot_encoder = OneHotEncoder(drop='first', sparse_output=False)
    
    # Initialize the imputers
    numeric_imputer = SimpleImputer(strategy='median')
    categorical_imputer = SimpleImputer(strategy='most_frequent')

    if(fit):
        # Fit and transform continuous features for training set
        X_train_continuous = X[continuous_features]
        numeric_imputer.fit(X_train_continuous)
        X_train_continuous = numeric_imputer.transform(X_train_continuous)
        scaler.fit(X_train_continuous)
        X_train_continuous = scaler.transform(X_train_continuous)
        
        # Fit and transform categorical features for training set
        X_train_categorical = X[categorical_features].copy()
        X_train_categorical[kitchen_quality_column] = X_train_categorical[kitchen_quality_column].map(kitchen_quality_dict)
        categorical_imputer.fit(X_train_categorical)
        X_train_categorical = categorical_imputer.transform(X_train_categorical)
        one_hot_encoder.fit(X_train_categorical[:, :-1])
        X_train_categorical_encoded = one_hot_encoder.transform(X_train_categorical[:, :-1])
        X_train_kitchen_quality = X_train_categorical[:, -1].reshape(-1, 1)
        # Combine preprocessed features for training set
        X_train_processed = np.hstack((X_train_continuous, X_train_categorical_encoded, X_train_kitchen_quality))
        # Save the model, encoders, and scalers
        joblib.dump(scaler, "../models/scaler.joblib")
        joblib.dump(one_hot_encoder, "../models/one_hot_encoder.joblib")
        joblib.dump(numeric_imputer, "../models/numeric_imputer.joblib")
        joblib.dump(categorical_imputer, "../models/categorical_imputer.joblib")
    else:
        # Load preprocessors
        scaler = joblib.load("../models/scaler.joblib")
        one_hot_encoder = joblib.load("../models/one_hot_encoder.joblib")
        numeric_imputer = joblib.load("../models/numeric_imputer.joblib")
        categorical_imputer = joblib.load("../models/categorical_imputer.joblib")
        
        # Transform continuous features
        X_continuous = X[continuous_features]
        X_continuous = numeric_imputer.transform(X_continuous)
        X_continuous = scaler.transform(X_continuous)
        
        # Transform categorical features
        X_categorical = X[categorical_features].copy()
        X_categorical[kitchen_quality_column] = X_categorical[kitchen_quality_column].map(kitchen_quality_dict)
        X_categorical = categorical_imputer.transform(X_categorical)
        X_categorical_encoded = one_hot_encoder.transform(X_categorical[:, :-1])
        X_kitchen_quality = X_categorical[:, -1].reshape(-1, 1)
        # Combine preprocessed features
        X_train_processed = np.hstack((X_continuous, X_categorical_encoded, X_kitchen_quality))
    
    print(X_train_processed)
    return  X_train_processed


## Model evaluation

In [4]:

def build_model(data: pd.DataFrame) -> dict[str, str]:
    # Split the data into train and test sets
    train_set, test_set = train_test_split(dataset, test_size=0.2, random_state=42)
    
    # Split features and target from the train set
    X_train = train_set.drop(columns=[targeted])
    y_train = train_set[targeted]
        
    # Preprocess training data
    X_train_processed = preprocess_data(X_train, fit=True)

    # Initialize and train the model
    model = LinearRegression()
    model.fit(X_train_processed, y_train)

    # Save the model
    joblib.dump(model, "../models/model.joblib")

    # Split features and target from the test set
    X_test = test_set.drop(columns=[targeted])
    y_test = test_set[targeted]

    # Model evaluation
    X_test = test_set.drop(columns=[targeted])
    y_test = test_set[targeted]
    X_test_processed = preprocess_data(X_test, fit=False)
    
    def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
        rmsle = np.sqrt(mean_squared_error(y_test, y_pred))
        return round(rmsle, precision)
    
    # Make predictions and evaluate the model
    y_pred_test = model.predict(X_test_processed)
    y_pred_train = model.predict(X_train_processed)
    
    rmsle_test = compute_rmsle(y_test, y_pred_test)
    rmsle_train = compute_rmsle(y_train, y_pred_train)
    
    print(f'Training RMSLE: {rmsle_train}')
    print(f'Testing RMSLE: {rmsle_test}')

    return


# Model inference

In [15]:
def make_predictions(input_data: pd.DataFrame) -> np.ndarray:
    # Load the model and preprocessors
    model = joblib.load("../models/model.joblib")
    scaler = joblib.load("../models/scaler.joblib")
    one_hot_encoder = joblib.load("../models/one_hot_encoder.joblib")
    numeric_imputer = joblib.load("../models/numeric_imputer.joblib")
    categorical_imputer = joblib.load("../models/categorical_imputer.joblib")
    
    # Preprocess inference data
    X_inference_processed = preprocess_data(input_data, fit=False)
    
    # Make predictions
    predictions = model.predict(X_inference_processed)
    print(predictions)
    return predictions

In [20]:
if __name__ == "__main__":
    # Load dataset
    FILE_PATH = "../data/train.csv"
    dataset = pd.read_csv(FILE_PATH)
    
    # Build model and evaluate
    model_performance = build_model(dataset)
    
    # Make predictions on new data
    inference_file_path = "../data/test.csv"
    inference_data = pd.read_csv(inference_file_path)
    predictions = make_predictions(inference_data)
    predictions

[[-0.21289571021689285 -0.4554689634533773 0.3742352301895073 ... 0.0 0.0
  2]
 [-0.26524462694629186 0.7186089531287432 -0.9582022093686505 ... 0.0 0.0
  2]
 [-0.17784146224711767 -1.9882929101022566 -0.9659639808612224 ... 0.0
  0.0 2]
 ...
 [-0.2340956267609479 -0.5206955143746061 -0.7900304936962618 ... 0.0 0.0
  2]
 [-0.2833761345168653 -1.7273867064173412 -0.665842149815113 ... 1.0 0.0
  3]
 [-0.6513992471544521 1.1751948095773455 0.9977642067594413 ... 0.0 0.0 3]]
[[-0.2115939609554158 -0.2597893106896905 -0.26223003220137975 ... 1.0
  0.0 2]
 [0.14564322922993247 0.7512222285893576 0.8554650627289585 ... 0.0 0.0 3]
 [-0.16082573975781034 -1.433867227271811 -0.3657203187690037 ... 0.0 0.0
  2]
 ...
 [-0.23158511032809925 1.1099682586561166 -1.141897468026183 ... 0.0 0.0
  3]
 [-0.14929596058472777 -1.009894646283823 -1.0720415245930368 ... 0.0 0.0
  2]
 [-0.2389306954464341 -0.031496382465389355 -0.7900304936962618 ... 0.0
  0.0 2]]
Training RMSLE: 30824.63
Testing RMSLE: 33015.