<h1 style="color: red;">House Prices Modeling</h1>

#### Data Science in Production - Practical Work 1

# Goal

To predict the sales price for each house. For each Id in the test set, there must be a prediction of value SalePrice. 

# 1.Model Building
<br>

In [2]:
# Import the necessary libraries

import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,mean_squared_log_error,r2_score

In [84]:
train_data = pd.read_csv('../data/house-prices-advanced-regression-techniques/train.csv')
#data splitting
X_train, X_test, y_train, y_test = data_splitting(train_data)

def model_analysis(data):
    data.head()
    print(f"\nshape of data {data.shape}")
    print(f"\ndata coloumns : {data.columns}")
    print(f"\nnull values in each features : {data.isnull().sum()}")
    print(f"\ndata types of each features : {data.dtypes}")    
    return
    
def data_splitting(train_data):
    '''
    now let's split the train_data set into train and test so that we can analyse 
    the performance of the model in an unbiased way
    '''

    # taking features into one dataframe and target into another dataframe

    X = train_data.drop('SalePrice', axis=1)
    y = train_data['SalePrice']

    # let's split the data - 80 % for training and 20% for testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    print(f"shape after splitting : \n train data {X_train.shape} {y_train.shape} \n test data {X_test.shape} {y_test.shape}")
    
    return X_train, X_test, y_train, y_test

def feature_selection(data):
    # let's check which features are categorical and which are continuous
    print("continuous features :\n", data.select_dtypes(include=['int64']).dtypes.head())
    print("categorical features :\n", data.select_dtypes(include=['object']).dtypes.head())
    
    cate_feat = ['HouseStyle', 'Neighborhood', 'BldgType', 'KitchenQual', 'ExterQual'] 
    cont_feat = ['OverallQual', 'YearBuilt', 'GrLivArea', 'TotalBsmtSF', 'GarageArea']
    
    print("The following features are picked")
    print("\n categorical features")
    for feat in cate_feat:
        print(feat)
    print("\n continuous features")
    for feat in cont_feat:
        print(feat)   
    
    print(type(cate_feat))
    
    return cate_feat,cont_feat

def checkformissingval_in_data(feat,curr_data):
    # handling missing values
        
    for val in feat:
        count = curr_data[val].isnull().sum()
        if(count != 0):
            print(f"Null values in feature {val} is {count}")
        else:
            print(f"No null value in feature {val}")

def scale_data(data_to_scale,cont_feat):
    #Scaling is a technique used to normalize the range of independent variables or features of data.
    # seperate the dataframe  into cont subset
    data_cont = data_to_scale[cont_feat]
    
    def findrange(data):
        min_values = data.min()
        max_values = data.max()

        feature_ranges = max_values - min_values

        print("Feature Minimums:\n", min_values)
        print("\nFeature Maximums:\n", max_values)
        print("\nFeature Ranges:\n", feature_ranges)
        
    findrange(data_cont)
    print(data_cont.head())
    print(f"\nthe shape of data cont is \n {data_cont.shape}")
    
    # scaling cont features
    scaler = StandardScaler()
    scaler.fit(data_cont)
    
    joblib.dump(scaler, '../models/scaler.joblib')
    
    loaded_scaler = joblib.load('../models/scaler.joblib')
    
    data_cont_scaled = pd.DataFrame(loaded_scaler.transform(data_cont), columns=data_cont.columns)
    
    print("\nlet's see how scaled data looks like:")
    print(data_cont_scaled.head())
    print("see if range is reduced or not")
    findrange(data_cont_scaled)
    print("shape of data scaled :",data_cont_scaled.shape)
    
    return data_cont_scaled


def encode_data(data_to_encode,cate_feat,X_train_cate):
    # Categorical features need to be encoded to numerical values.

    # seperate the dataframe  into categorical subset
    data_cate = data_to_encode[cate_feat]

    print(data_cate.head())

    print(f"\nthe shape of data to encode is \n {data_cate.shape}")
    
    #encoding the categorical values
    encoder = OneHotEncoder(sparse_output=False, drop='first')  # drop='first' to avoid dummy variable trap
    joblib.dump(encoder, '../models/encoder.joblib')
    
    loaded_encoder = joblib.load('../models/encoder.joblib')
    loaded_encoder.fit(X_train_cate)
    
    data_cate_encoded = pd.DataFrame(loaded_encoder.transform(data_cate),
                                    columns=loaded_encoder.get_feature_names_out(data_cate.columns))
    print(f"\nthe shape after encoding   \n {data_cate_encoded.shape}")
    
    print("see if the data is encoded or not")
    print(data_cate_encoded.head())
    
    return data_cate_encoded

def model_train_and_predict(X_train_preprocessed,X_test_preprocessed,y_train):
    model = RandomForestRegressor(random_state=42)
    model.fit(X_train_preprocessed, y_train)
    y_test_pred = model.predict(X_test_preprocessed)
    return y_test_pred,model
    
    
def build_model_for_train(X_train,y_train):
    
    print("\n==================== train data ============================")
    
    model_analysis(X_train)
        
    #feature selection
    cate_feat,cont_feat = feature_selection(X_train)
    
    #feature processing
    checkformissingval_in_data(cont_feat,X_train)
    checkformissingval_in_data(cate_feat,X_train)
    
    #scaling the continuous data
    X_train_cont_scaled = scale_data(X_train,cont_feat)
    
    X_train_cate = X_train[cate_feat]
    
    #encoding the categorical data
    X_train_cate_encoded = encode_data(X_train,cate_feat,X_train_cate)
    
    # Concatenate back the scaled and encoded features
    X_train_preprocessed = pd.concat([X_train_cont_scaled, X_train_cate_encoded], axis=1)
    
    return X_train_preprocessed ,cate_feat   

def build_model_for_test(X_test,y_test,X_train_cate):
    
    print("\n==================== test data ============================")
    print(f"\n shape after splitting test data : {X_test.shape} {y_test.shape}")
    model_analysis(X_test)
    
    #feature selection
    cate_feat,cont_feat = feature_selection(X_test)
    
    #feature processing
    checkformissingval_in_data(cont_feat,X_test)
    checkformissingval_in_data(cate_feat,X_test)
    
    # scaling the continuous data
    X_test_cont_scaled = scale_data(X_test,cont_feat)
    
    #encoding the categorical data
    X_test_cate_encoded = encode_data(X_test,cate_feat,X_train_cate)
    
    # Concatenate back the scaled and encoded features
    X_test_preprocessed = pd.concat([X_test_cont_scaled, X_test_cate_encoded], axis=1)
    
    return X_test_preprocessed

def model_evaluation(y_test_pred,y_test,X_test_preprocessed,model):
    test_mse = mean_squared_error(y_test, y_test_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    print(f"Test MSE: {test_mse}")
    print(f"Test R^2: {test_r2}")
    print(f"model score is {model.score(X_test_preprocessed,y_test)}")

def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    return round(rmsle, precision)    
    
def build_model(data: pd.DataFrame) -> dict[str, str]:
    
    model_analysis(data)
           
    X_train_preprocessed,cate_feat = build_model_for_train(X_train,y_train)
    
    X_train_cate = X_train[cate_feat]
    
    X_test_preprocessed = build_model_for_test(X_test,y_test,X_train_cate)
    
    # train and predict the model
    y_test_pred,model = model_train_and_predict(X_train_preprocessed,X_test_preprocessed,y_train)
    
    #evaluate the model
    model_evaluation(y_test_pred,y_test,X_test_preprocessed,model)
    
    test_rmsle = compute_rmsle(y_test, y_test_pred)
    
    print(f"Test RMSLE: {test_rmsle}")
    
    # Persist the trained model
    joblib.dump(model, '../models/model.joblib')
    
    rmse_dict = {"rmse":test_rmsle}
    return rmse_dict
    
rmse_dict = build_model(train_data)

shape after splitting : 
 train data (1168, 80) (1168,) 
 test data (292, 80) (292,)

shape of data (1460, 81)

data coloumns : Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu'

Test MSE: 868949106.3762934
Test R^2: 0.8867128877045563
model score is 0.8867128877045563
Test RMSLE: 0.17


# 2.Model Inference
<br>

In [80]:
# read test data
test_data = pd.read_csv('../data/house-prices-advanced-regression-techniques/test.csv')

def imputation(test_data):
    test_data['TotalBsmtSF'].fillna(test_data['TotalBsmtSF'].median(), inplace=True)
    test_data['GarageArea'].fillna(test_data['GarageArea'].median(), inplace=True)
    test_data['KitchenQual'].fillna(test_data['KitchenQual'].mode()[0], inplace=True)
    
def make_predictions(test_data: pd.DataFrame,X_train) -> np.ndarray:
    
    model_analysis(test_data)
    
    cate_feat,cont_feat = feature_selection(test_data)
    
    print(f"cont_values selected are : {cont_feat} \ncate feat selected are {cate_feat}")
    
    #feature processing
    checkformissingval_in_data(cont_feat,test_data)
    checkformissingval_in_data(cate_feat,test_data)
    
    #since there are missing values,perform imputation
    
    imputation(test_data)
    print("\nafter doing imputation the missing value status are :\n")
    checkformissingval_in_data(cont_feat,test_data)
    checkformissingval_in_data(cate_feat,test_data)
    
    # seperate the dataframe  into cont subset
    test_cont = test_data[cont_feat]
    test_cate = test_data[cate_feat]
    
    print(test_cont.columns)
    
    # scaling the continuous data
    loaded_scaler = joblib.load('../models/scaler.joblib')
    test_cont_scaled = pd.DataFrame(loaded_scaler.transform(test_cont), columns=test_cont.columns)
    print("\n",test_cont_scaled.head())
    
    
    #encoding categorical values
    X_train_cate = X_train[cate_feat]
    
    loaded_encoder = joblib.load('../models/encoder.joblib')
    loaded_encoder.fit(X_train_cate)
    
    test_cate_encoded = pd.DataFrame(loaded_encoder.transform(test_cate),
                                   columns=loaded_encoder.get_feature_names_out(test_cate.columns))
    print(f"\nthe shape after encoding X_test_cate_encoded is {test_cate_encoded.shape}")
    
    test_preprocessed = pd.concat([test_cont_scaled, test_cate_encoded], axis=1)
    print(test_preprocessed.head())
    
    loaded_model = joblib.load('../models/model.joblib')
    test_pred = loaded_model.predict(test_preprocessed)
    
    print("The first 10 predicted SalePrice values of the test set are :\n",test_pred[:10])
    
make_predictions(test_data,X_train)


shape of data (1459, 80)

data coloumns : Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'Ga