In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
import kagglehub
import os

## Load dataset

In [2]:
path = kagglehub.dataset_download("shashanknecrothapa/ames-housing-dataset")

print("Path to dataset files:", path)

Path to dataset files: /Users/leo/.cache/kagglehub/datasets/shashanknecrothapa/ames-housing-dataset/versions/1


In [5]:
df = pd.read_csv(os.path.join(path, "AmesHousing.csv"))

## Feature Engineering

In [25]:
columns_to_use = [
    "Overall Qual",
    "Exter Qual",
    "Bsmt Qual",
    "Total Bsmt SF",
    "1st Flr SF",
    "Gr Liv Area",
    "Kitchen Qual",
    "Garage Cars",
    "Garage Area"
]

In [61]:
X = df[columns_to_use]
y = df['SalePrice']
transformers = dict()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns


numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')), # Change for different strategies
    ('scaler', StandardScaler())
])


categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Change for different strategies
    ('labelencoder', LabelEncoder())
])


def apply_labelencoder(df, categorical_columns):
    le = LabelEncoder()
    for col in categorical_columns:
        df[col] = le.fit_transform(df[col].astype(str))
    return df

def apply_labelencoder_fixed(df, categorical_columns):
    
    for col in categorical_columns:
        if col in transformers:
            print(f"using cache for col: {col}")
            le = transformers[col]
            df.loc[:,col] = le.transform(df[col].astype(str))
        else:
            print(f"creating cache for col: {col}")
            le = LabelEncoder()
            df.loc[:,col] = le.fit_transform(df[col].astype(str))
            transformers[col] = le
    return df

# Can you spot the error?
X_train = apply_labelencoder(X_train, categorical_features)
X_test = apply_labelencoder(X_test, categorical_features)


X_train[numeric_features] = numeric_transformer.fit_transform(X_train[numeric_features])
X_test[numeric_features] = numeric_transformer.transform(X_test[numeric_features])



In [56]:
X_train.head()

Unnamed: 0,Overall Qual,Exter Qual,Bsmt Qual,Total Bsmt SF,1st Flr SF,Gr Liv Area,Kitchen Qual,Garage Cars,Garage Area
381,0.673941,3,2,0.098487,0.558877,-0.243522,3,0.339211,0.046722
834,-0.76675,3,4,-0.077964,0.013468,1.254956,3,0.339211,0.324575
1898,-1.487095,3,5,-2.399331,0.179688,-0.53291,3,0.339211,-0.033338
678,-1.487095,3,5,-2.399331,-0.298194,-0.897619,3,0.339211,-0.231132
700,-1.487095,1,1,-1.024386,-0.139766,0.412558,1,-2.3413,-2.209071


In [57]:
X_test.head()

Unnamed: 0,Overall Qual,Exter Qual,Bsmt Qual,Total Bsmt SF,1st Flr SF,Gr Liv Area,Kitchen Qual,Garage Cars,Garage Area
1357,1.394286,3,3,-1.051884,-0.835812,0.343184,2,-1.001045,-1.135333
2367,-0.046404,3,3,-1.267293,-1.716258,-0.91744,4,-1.001045,-0.965795
2822,0.673941,2,2,-0.575237,-0.905936,0.458146,2,0.339211,0.691907
2126,-1.487095,3,2,-0.130671,-0.428054,-0.996725,4,-2.3413,-2.209071
1544,-0.046404,3,3,-0.350662,-0.612454,-1.137455,4,-1.001045,-1.290742


In [58]:
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42)
}

# Train and evaluate each model
for name, model in models.items():
    # Fit the model
    model.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = model.predict(X_test)
    
    # Calculate evaluation metrics
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    
    print(f"{name} Results:")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  R²: {r2:.4f}")
    print("-" * 50)
    
    # Cross-validation score
    cv_score = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
    final_score = cv_score.mean()
    print(f"  Cross-Validation MSE (5-fold): {final_score:.4f}")
    print("=" * 50)


Linear Regression Results:
  RMSE: 36604.4655
  R²: 0.8329
--------------------------------------------------
  Cross-Validation MSE (5-fold): 1201208803.0700
Decision Tree Results:
  RMSE: 42760.4585
  R²: 0.7719
--------------------------------------------------
  Cross-Validation MSE (5-fold): 1417568023.8754
Random Forest Results:
  RMSE: 33711.4491
  R²: 0.8583
--------------------------------------------------
  Cross-Validation MSE (5-fold): 903171246.1877
