In [1]:
import pandas as pd
import numpy as np
# Read the raw training and test tables.
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")
print(train.shape)
# Not the last expression of the cell, so this preview is evaluated but
# never rendered in the cell output.
train.head()

# Columns used as model inputs: five categorical descriptors followed by
# four numeric ones.
features = [
    "LotShape",
    "HouseStyle",
    "LandContour",
    "Utilities",
    "RoofStyle",
    "LotArea",
    "MSSubClass",
    "OverallQual",
    "OverallCond",
]

# Feature matrix and regression target.
X, y = train[features], train["SalePrice"]

(1460, 81)


In [2]:
# Show the dtype of every column in the test frame.
test.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MiscVal            int64
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
Length: 80, dtype: object

In [3]:
from sklearn.preprocessing import StandardScaler
# Split the selected columns by kind so that each group can receive its own
# preprocessing.
num_features = [
    "LotArea",
    "MSSubClass",
    "OverallQual",
    "OverallCond",
]
cat_features = [
    "LotShape",
    "HouseStyle",
    "LandContour",
    "Utilities",
    "RoofStyle",
]

# Numeric and categorical views of the full (unsplit) feature matrix.
X_num, X_cat = X[num_features], X[cat_features]


In [4]:
from sklearn.model_selection import train_test_split
# The model is trained on the natural log of the target.
y_log = np.log(y)

# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable.
X_train, X_val, y_train_log, y_val_log = train_test_split(
    X, y_log, test_size=0.2, random_state=42
)
print(f"X_train shape: {X_train.shape}")
print(f"X_val shape: {X_val.shape}")

# Column groups (these repeat the lists defined in the previous cell).
num_features = ["LotArea", "MSSubClass", "OverallQual", "OverallCond"]
cat_features = ["LotShape", "HouseStyle", "LandContour", "Utilities", "RoofStyle"]

# Numeric / categorical slices of each split.
X_train_num, X_train_cat = X_train[num_features], X_train[cat_features]
X_val_num, X_val_cat = X_val[num_features], X_val[cat_features]

X_train shape: (1168, 9)
X_val shape: (292, 9)


In [5]:
from sklearn.impute import SimpleImputer
# ---------------------------------------------------------------------------
# Numerical features: mean-impute, then standardise.
# The imputer and scaler are fitted on the training split only and reused
# unchanged on the validation split.
# ---------------------------------------------------------------------------
num_imputer = SimpleImputer(strategy='mean')
X_train_num_imputed = pd.DataFrame(
    num_imputer.fit_transform(X_train_num),
    columns=X_train_num.columns,
    index=X_train_num.index)
scaler = StandardScaler()
X_train_num_scaled = scaler.fit_transform(X_train_num_imputed)
# Wrap the scaled array back into a DataFrame so the original row labels and
# column names survive for the later concat.
X_train_num_processed = pd.DataFrame(
    X_train_num_scaled,
    columns=X_train_num.columns,
    index=X_train_num_imputed.index)
print("Processed Training Numerical Features (Head):")
print(X_train_num_processed.head())

# Validation split: transform only, using the statistics learned above.
X_val_num_imputed = pd.DataFrame(
    num_imputer.transform(X_val_num),
    columns=X_val_num.columns,
    index=X_val_num.index)
X_val_num_scaled = scaler.transform(X_val_num_imputed)
X_val_num_processed = pd.DataFrame(
    X_val_num_scaled,
    columns=X_val_num.columns,
    index=X_val_num_imputed.index)
print("\nProcessed Validation Numerical Features (Head):")
print(X_val_num_processed.head())

# ---------------------------------------------------------------------------
# Categorical features: fill gaps with a 'Missing' level, then one-hot encode.
# The training split defines the dummy columns; drop_first=True removes the
# first level of each feature as the baseline.
# ---------------------------------------------------------------------------
X_train_cat_imputed = X_train_cat.fillna('Missing')
X_train_cat_processed = pd.get_dummies(X_train_cat_imputed, drop_first=True)
print("\nProcessed Training Categorical Features (Head):")
print(X_train_cat_processed.head())

X_val_cat_imputed = X_val_cat.fillna('Missing')
# Encode ALL validation levels (no drop_first here). Dropping the first level
# independently on this split would pick the baseline from the validation data;
# if that differs from the training baseline, a column the model was trained on
# would be dropped and then re-created as all zeros by the reindex below,
# silently mis-encoding those rows as the baseline level.
X_val_cat_processed = pd.get_dummies(X_val_cat_imputed)
# Align to the training layout: this removes the training baseline columns and
# any levels unseen in training, and adds (as zeros) levels that only occur in
# the training split.
X_val_cat_processed = X_val_cat_processed.reindex(
    columns=X_train_cat_processed.columns,
    fill_value=0)
print("\nProcessed Validation Categorical Features (Head):")
print(X_val_cat_processed.head())

# ---------------------------------------------------------------------------
# Final design matrices: numeric block followed by the dummy block.
# ---------------------------------------------------------------------------
X_train_processed = pd.concat([X_train_num_processed, X_train_cat_processed], axis=1)
X_val_processed = pd.concat([X_val_num_processed, X_val_cat_processed], axis=1)
# The model is fitted on one matrix and scored on the other, so their columns
# must match exactly, including order.
assert X_val_processed.columns.equals(X_train_processed.columns), \
    "validation columns are not aligned with training columns"
print("\nFinal Combined Training Features (Head):")
print(X_train_processed.head())

Processed Training Numerical Features (Head):
       LotArea  MSSubClass  OverallQual  OverallCond
254  -0.212896   -0.866764    -0.820445     0.372217
1066 -0.265245    0.074110    -0.088934     1.268609
638  -0.177841   -0.631546    -0.820445     1.268609
799  -0.324474   -0.161109    -0.820445     1.268609
380  -0.529035   -0.161109    -0.820445     0.372217

Processed Validation Numerical Features (Head):
       LotArea  MSSubClass  OverallQual  OverallCond
892  -0.211594   -0.866764    -0.088934     2.165000
1105  0.145643    0.074110     1.374088    -0.524174
413  -0.160826   -0.631546    -0.820445     0.372217
522  -0.529035   -0.161109    -0.088934     1.268609
1036  0.205338   -0.866764     2.105599    -0.524174

Processed Training Categorical Features (Head):
      LotShape_IR2  LotShape_IR3  LotShape_Reg  HouseStyle_1.5Unf  \
254          False         False          True              False   
1066         False         False         False              False   
638          

In [6]:
from sklearn.linear_model import Ridge 
# Ridge regression on the log target; only random_state is set explicitly,
# every other hyperparameter keeps its library default.
model = Ridge(random_state=42).fit(X_train_processed, y_train_log)
print("Model trained successfully.")

Model trained successfully.


In [7]:
from sklearn.metrics import mean_squared_log_error
def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 4) -> float:
    """Compute the root mean squared logarithmic error, rounded.

    Args:
        y_test: Ground-truth target values.
        y_pred: Predicted target values.
        precision: Number of decimal places kept in the result.

    Returns:
        The square root of ``mean_squared_log_error(y_test, y_pred)``,
        rounded to ``precision`` decimals.
    """
    msle = mean_squared_log_error(y_test, y_pred)
    return round(np.sqrt(msle), precision)
# Predict on the validation matrix; the model outputs log-scale values.
y_val_log_pred = model.predict(X_val_processed)

# Undo the log transform on both predictions and held-out targets so the
# metric is computed on the original price scale.
y_val_pred, y_val_actual = np.exp(y_val_log_pred), np.exp(y_val_log)

rmsle_score = compute_rmsle(y_val_actual, y_val_pred)
print(f"\nFinal Validation RMSLE Score: {rmsle_score}")


Final Validation RMSLE Score: 0.2196


In [11]:
# Persist the processed training feature matrix to parquet; index=False
# leaves the DataFrame index out of the written file.
# NOTE(review): the index holds the original row labels of the training
# split, so without it the saved rows cannot be matched back to
# y_train_log by label. Confirm that is intended.
X_train_processed.to_parquet("../data/processed_df.parquet", index=False)