# Importing modules and loading datasets

In [63]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# Read the house-prices training and test CSV files into DataFrames.
# NOTE(review): both paths are absolute locations on the author's machine, so
# this cell only runs where those files exist.
train_df = pd.read_csv(
    '/Users/purnimaprabha/Downloads/house-prices-advanced-regression-techniques/train.csv'
)
test_df = pd.read_csv(
    '/Users/purnimaprabha/Downloads/house-prices-advanced-regression-techniques/test.csv'
)

# Bare last expression: the first rows of the training data become the cell output.
train_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


# Model building function

In [67]:
def build_model(data: pd.DataFrame) -> dict[str, float]:
    """Train a random-forest price model, persist it, and report validation RMSLE.

    Two numeric and two categorical columns are selected, 20% of the rows are
    held out for validation, and the preprocessing objects and the regressor
    are fitted on the remaining rows only. The four fitted objects are written
    with ``joblib`` to the ``models/`` directory (relative to the current
    working directory), which is created if it does not exist yet.

    Args:
        data: Frame containing the columns ``GrLivArea``, ``1stFlrSF``,
            ``SaleCondition``, ``HouseStyle`` and the target ``SalePrice``.

    Returns:
        A one-entry dict ``{'rmsle': value}``, where ``value`` is what
        ``compute_rmsle`` returns for the validation split.

    Raises:
        KeyError: If one of the required columns is missing from ``data``.
    """
    num_features = ['GrLivArea', '1stFlrSF']
    cat_features = ['SaleCondition', 'HouseStyle']
    select_features = num_features + cat_features

    X = data[select_features]
    y = data['SalePrice']
    # Fixed seed so the train/validation partition is the same on every run.
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

    # Preprocessing objects; each one is fitted on the training split only.
    cat_imputer = SimpleImputer(strategy='most_frequent')
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    scaler = StandardScaler()

    # Fit and transform on training data. The imputer output is wrapped in a
    # DataFrame again so the encoder is fitted on named columns.
    X_train_cat = pd.DataFrame(cat_imputer.fit_transform(X_train[cat_features]), columns=cat_features)
    X_train_cat_encoded = encoder.fit_transform(X_train_cat)
    X_train_num_scaled = scaler.fit_transform(X_train[num_features])

    # Column layout of the model input: encoded categoricals first, then the
    # scaled numerics. The same layout must be used at prediction time.
    X_train_transformed = np.hstack([X_train_cat_encoded, X_train_num_scaled])

    # Train model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train_transformed, y_train)

    # Save the model and preprocessing objects. The target directory is
    # created first so that a fresh checkout does not fail on the first dump.
    os.makedirs('models', exist_ok=True)
    joblib.dump(model, 'models/model.joblib')
    joblib.dump(cat_imputer, 'models/cat_imputer.joblib')
    joblib.dump(encoder, 'models/encoder.joblib')
    joblib.dump(scaler, 'models/scaler.joblib')

    # Evaluate model on validation data, using transform only (no refitting).
    X_valid_cat = pd.DataFrame(cat_imputer.transform(X_valid[cat_features]), columns=cat_features)
    X_valid_cat_encoded = encoder.transform(X_valid_cat)
    X_valid_num_scaled = scaler.transform(X_valid[num_features])

    X_valid_transformed = np.hstack([X_valid_cat_encoded, X_valid_num_scaled])

    y_pred = model.predict(X_valid_transformed)
    # Clamp at zero: the log-based metric is not defined for negative values.
    y_pred = np.maximum(0, y_pred)

    rmsle = compute_rmsle(y_valid, y_pred)

    return {'rmsle': rmsle}

# Making predictions function

In [73]:
def make_predictions(input_data: pd.DataFrame) -> np.ndarray:
    """Predict sale prices for new rows using the artifacts stored in ``models/``.

    The fitted regressor, categorical imputer, one-hot encoder and scaler are
    loaded from the ``models/`` directory (the files ``build_model`` writes),
    applied to the same four feature columns in the same order as during
    training, and the resulting predictions are floored at zero.

    Args:
        input_data: Frame containing the columns ``GrLivArea``, ``1stFlrSF``,
            ``SaleCondition`` and ``HouseStyle``.

    Returns:
        Array with one non-negative prediction per input row.
    """
    # Load the four persisted objects in a fixed order.
    model, cat_imputer, encoder, scaler = (
        joblib.load(artifact_path)
        for artifact_path in (
            'models/model.joblib',
            'models/cat_imputer.joblib',
            'models/encoder.joblib',
            'models/scaler.joblib',
        )
    )

    numeric_cols = ['GrLivArea', '1stFlrSF']
    categorical_cols = ['SaleCondition', 'HouseStyle']
    features = input_data[numeric_cols + categorical_cols]

    # Impute the categoricals and restore the column names before encoding.
    imputed_categoricals = pd.DataFrame(
        cat_imputer.transform(features[categorical_cols]),
        columns=categorical_cols,
    )

    # Same column layout as in training: encoded categoricals, then scaled numerics.
    design_matrix = np.hstack([
        encoder.transform(imputed_categoricals),
        scaler.transform(features[numeric_cols]),
    ])

    # Negative predictions are replaced by zero before returning.
    return np.maximum(0, model.predict(design_matrix))


### Saving the preprocessed dataframe before refactoring

The numeric features are scaled with `StandardScaler`, so that each one has a mean of 0 and a standard deviation of 1 on the data the scaler was fitted on.  

In [14]:
import pandas as pd
# Snapshot the processed training matrix so that the output of the refactored
# pipeline can later be compared against it.
# NOTE(review): X_train_processed is not defined in any cell shown in this
# notebook; it has to exist in the kernel already — confirm where it comes from.
column_names = [f"feature_{i}" for i in range(X_train_processed.shape[1])]

# Build the frame once, with explicit string column names. Parquet column names
# are strings, and some pandas versions refuse to write a frame whose column
# labels are integers.
processed_df = pd.DataFrame(X_train_processed, columns=column_names)
processed_df.to_parquet('/Users/purnimaprabha/Downloads/processed_df.parquet', index=False)

print("Processed dataframe saved before refactoring.")

Processed dataframe saved before refactoring.


In [18]:
# Regression check: the processed matrix currently in memory must equal the
# snapshot that was written to Parquet before the refactoring.
expected_processed_df = pd.read_parquet(
    '/Users/purnimaprabha/Downloads/processed_df.parquet'
)

# Rebuild the in-memory data with the snapshot's column labels and dtypes so
# that only the values themselves can make the comparison fail.
actual_processed_df = (
    pd.DataFrame(X_train_processed, columns=expected_processed_df.columns)
    .astype(expected_processed_df.dtypes)
)
pd.testing.assert_frame_equal(actual_processed_df, expected_processed_df)

print("Data verification successful: The processed dataframe remains unchanged after refactoring.")

Data verification successful: The processed dataframe remains unchanged after refactoring.


The model is a `RandomForestRegressor` trained on the processed training data.

## Models

In [96]:
import os
import joblib

# Write the fitted regressor and its preprocessing objects to the project's
# models folder, creating the folder when it is missing.
# NOTE(review): model, cat_imputer, encoder and scaler are not assigned at
# notebook level in the cells shown here — confirm they exist in the kernel.
models_folder = '/Users/purnimaprabha/dsp-purnima-prabha/models/'
os.makedirs(models_folder, exist_ok=True)

for artifact, filename in (
    (model, 'model.joblib'),
    (cat_imputer, 'cat_imputer.joblib'),
    (encoder, 'encoder.joblib'),
):
    joblib.dump(artifact, os.path.join(models_folder, filename))

# Kept as the final bare expression so the cell still echoes the saved path.
joblib.dump(scaler, os.path.join(models_folder, 'scaler.joblib'))

['/Users/purnimaprabha/dsp-purnima-prabha/models/scaler.joblib']

## Model Evaluation

In [39]:
def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    """Return the root mean squared logarithmic error, rounded.

    Args:
        y_test: Ground-truth target values.
        y_pred: Predicted target values, aligned with ``y_test``.
        precision: Number of decimal places kept in the result (default 2).

    Returns:
        The square root of ``mean_squared_log_error(y_test, y_pred)``,
        rounded to ``precision`` decimals.
    """
    msle = mean_squared_log_error(y_test, y_pred)
    return round(np.sqrt(msle), precision)

# Testing build_model and make_predictions functions

In [75]:
# End-to-end check of the two functions on the house-prices CSV files.
# NOTE(review): both paths are absolute locations on the author's machine.
data = pd.read_csv('/Users/purnimaprabha/Downloads/house-prices-advanced-regression-techniques/train.csv')
test_data = pd.read_csv('/Users/purnimaprabha/Downloads/house-prices-advanced-regression-techniques/test.csv')

# build_model has to run first: it writes the artifacts that make_predictions loads.
model_performance = build_model(data)
print(f'Model Performance (RMSLE): {model_performance}')

predictions = make_predictions(test_data)
print(predictions)

Model Performance (RMSLE): {'rmsle': 0.25}
[125899.63798701 156717.         175613.         ... 121937.85
 137930.9        175929.        ]
