# Importing modules and loading datasets

In [1]:
import pandas as pd
import numpy as np
# Keep the dataset location in a single constant so it only has to be edited in one place
# when the notebook is run on another machine.
DATA_DIR = '/Users/purnimaprabha/Downloads/house-prices-advanced-regression-techniques'

# Load the raw training and test splits; the paths resolve to the same files as before.
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

# Preview the first rows of the training data.
train_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


# Feature Selection

In [3]:
# Two numeric and two categorical columns are used as model inputs.
num_features = ['GrLivArea', '1stFlrSF']
cat_features = ['SaleCondition', 'HouseStyle']

# Numeric columns come first, then categorical ones.
select_features = [*num_features, *cat_features]

# Feature matrix and regression target taken from the training frame.
X = train_df[select_features]
Y = train_df['SalePrice']

In [29]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# Preprocessing: imputing, encoding, and scaling features

In [8]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [12]:
# Learn the most frequent value of each categorical column from the training split only.
cat_imputer = SimpleImputer(strategy='most_frequent')
cat_imputer.fit(X_train[cat_features])

# Apply the fitted imputer to both splits and restore the column labels
# (the resulting frames get a fresh 0..n-1 index).
X_train_cat = pd.DataFrame(
    cat_imputer.transform(X_train[cat_features]),
    columns=cat_features,
)
X_valid_cat = pd.DataFrame(
    cat_imputer.transform(X_valid[cat_features]),
    columns=cat_features,
)

In [19]:
# Dense one-hot encoding; categories not seen during fit are ignored at transform time.
encoder = OneHotEncoder(
    handle_unknown='ignore',
    sparse_output=False,
)
# Fit on the imputed training categories only.
encoder.fit(X_train_cat)

In [21]:
# Encode both splits with the encoder that was fitted on the training data.
X_train_cat_encoded, X_valid_cat_encoded = (
    encoder.transform(split_cat) for split_cat in (X_train_cat, X_valid_cat)
)

In [25]:
# Learn the scaling statistics from the training split only.
scaler = StandardScaler()
scaler.fit(X_train[num_features])

# Apply the same fitted scaling to both splits.
X_train_num_scaled = scaler.transform(X_train[num_features])
X_valid_num_scaled = scaler.transform(X_valid[num_features])

In [33]:
# Join the encoded categorical columns and the scaled numeric columns side by side
# (categorical block first, numeric block second).
X_train_transformed = np.concatenate([X_train_cat_encoded, X_train_num_scaled], axis=1)
X_valid_transformed = np.concatenate([X_valid_cat_encoded, X_valid_num_scaled], axis=1)

### Saving the preprocessed dataframe before refactoring

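The numeric features were scaled with `StandardScaler` (fitted on the training split) so that they have a mean of 0 and a standard deviation of 1. The processed training data is saved below so that it can be compared against the output after refactoring.  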

In [14]:
import pandas as pd
# Snapshot the preprocessed training matrix so a later refactor can be checked against it.
# `X_train_transformed` is the matrix built in the preprocessing cells above; the previous
# name, `X_train_processed`, is not defined anywhere in this notebook and fails on a fresh kernel.
column_names = [f"feature_{i}" for i in range(X_train_transformed.shape[1])]

# Keep the explicit string labels: rebuilding the frame without them (as was done before)
# discards the names and leaves integer column labels in the parquet file.
processed_df = pd.DataFrame(X_train_transformed, columns=column_names)
processed_df.to_parquet('/Users/purnimaprabha/Downloads/processed_df.parquet', index=False)

print("Processed dataframe saved before refactoring.")

Processed dataframe saved before refactoring.


In [18]:
# Reload the snapshot that was written before refactoring.
expected_processed_df = pd.read_parquet('/Users/purnimaprabha/Downloads/processed_df.parquet')

# Rebuild a frame from the current preprocessing output, reusing the snapshot's column labels.
# `X_train_transformed` is the matrix produced by the preprocessing cells; the previous name,
# `X_train_processed`, is not defined anywhere in this notebook and fails on a fresh kernel.
actual_processed_df = pd.DataFrame(X_train_transformed, columns=expected_processed_df.columns)

# Align dtypes with the snapshot so the comparison is about values, not storage types.
actual_processed_df = actual_processed_df.astype(expected_processed_df.dtypes)

# Raises AssertionError if the two frames differ.
pd.testing.assert_frame_equal(actual_processed_df, expected_processed_df)

print("Data verification successful: The processed dataframe remains unchanged after refactoring.")

Data verification successful: The processed dataframe remains unchanged after refactoring.


The model is a `RandomForestRegressor` trained on the processed training data.

## Model Training

In [35]:
from sklearn.ensemble import RandomForestRegressor
# Random forest with 100 trees; the fixed seed makes training repeatable.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)
# Fit on the preprocessed training matrix and the training targets.
model.fit(X_train_transformed, y_train)

The provided `compute_rmsle()` function is used to evaluate model performance on the validation set. The model's predictions are clipped to be non-negative so that the RMSLE is valid.

## Model Evaluation

In [39]:
import numpy as np
from sklearn.metrics import mean_squared_log_error

def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    """Return the root mean squared logarithmic error, rounded.

    Args:
        y_test: True target values.
        y_pred: Predicted target values.
        precision: Number of decimal places to round the result to.

    Returns:
        The square root of ``mean_squared_log_error(y_test, y_pred)``,
        rounded to ``precision`` decimals.
    """
    msle = mean_squared_log_error(y_test, y_pred)
    return round(np.sqrt(msle), precision)

In [41]:
# Validation split: floor predictions at zero before scoring (per the notebook's note,
# this keeps the RMSLE computation valid).
y_pred = np.clip(model.predict(X_valid_transformed), 0, None)
rmsle = compute_rmsle(y_valid, y_pred)
print(f"Validation RMSLE: {rmsle}")

# Training split: same procedure, for comparison with the validation score.
y_train_pred = np.clip(model.predict(X_train_transformed), 0, None)
train_rmsle = compute_rmsle(y_train, y_train_pred)
print(f"Training RMSLE: {train_rmsle}")

Validation RMSLE: 0.25
Training RMSLE: 0.11


## Model Inference 

In [49]:
# Read the raw test set again, so the SalePrice column added below always
# lands on a freshly loaded frame.
test_df = pd.read_csv('/Users/purnimaprabha/Downloads/house-prices-advanced-regression-techniques/test.csv')

# Run the test rows through the imputer, encoder and scaler that were fitted on the
# training split (transform only, no refitting).
X_test = test_df[select_features]
X_test_cat = pd.DataFrame(
    cat_imputer.transform(X_test[cat_features]),
    columns=cat_features,
)
X_test_cat_encoded = encoder.transform(X_test_cat)
X_test_num_scaled = scaler.transform(X_test[num_features])

# Same column order as during training: encoded categoricals, then scaled numerics.
X_test_transformed = np.concatenate([X_test_cat_encoded, X_test_num_scaled], axis=1)

# Making predictions
y_test_pred = model.predict(X_test_transformed)

# Store the predictions, floored at zero, next to the test rows.
test_df['SalePrice'] = np.clip(y_test_pred, 0, None)

print(test_df[['Id', 'SalePrice']].head())  # Display first 5 predictions

     Id      SalePrice
0  1461  125899.637987
1  1462  156717.000000
2  1463  175613.000000
3  1464  167000.000000
4  1465  168810.000000
