In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the CSV at ../data/train.csv into `df`; the cells below modify
# columns of this frame in place.
df = pd.read_csv(filepath_or_buffer='../data/train.csv')

### Bin Garage Build Years (missing values become `NoGarage`)

In [3]:
def bin_garage_year(year):
    """Map a garage build year to a coarse era label.

    Parameters
    ----------
    year : number or null
        The build year. Any value that ``pd.isnull`` treats as missing
        (``None``, ``NaN``, ...) is labelled ``'NoGarage'``.

    Returns
    -------
    str
        ``'NoGarage'`` for a missing year, otherwise the label of the first
        era whose exclusive upper bound is greater than ``year``; years of
        2010 or later map to ``'2010+'``.
    """
    if pd.isnull(year):
        return 'NoGarage'

    # (exclusive upper bound, label) pairs in ascending order; the first
    # bound that exceeds the year decides the label.
    era_bounds = (
        (1940, 'Before1940'),
        (1960, '1940-1959'),
        (1980, '1960-1979'),
        (2000, '1980-1999'),
        (2010, '2000-2009'),
    )
    for upper_bound, era_label in era_bounds:
        if year < upper_bound:
            return era_label

    # No bound exceeded the year.
    return '2010+'

# Replace each garage build year with its era label and store the result as
# a categorical column. The original year values are overwritten, so reload
# `df` before re-running this cell.
df['GarageYrBlt'] = (
    df['GarageYrBlt']
    .apply(bin_garage_year)
    .astype('category')
)

### Handle MasVnrArea

In [4]:
# Fill missing MasVnrArea entries with 0.
# NOTE(review): presumably a missing area means "no masonry veneer" - confirm.
df['MasVnrArea'] = df['MasVnrArea'].fillna(value=0)

### Handle Lot Frontage

In [5]:
from sklearn.ensemble import RandomForestRegressor

# Split into rows with a recorded LotFrontage (training data) and rows
# without one (rows to impute). The mask is computed once so the rows used to
# build X_missing are exactly the rows that receive the predictions below.
lot_frontage_missing = df['LotFrontage'].isnull()
df_known = df[~lot_frontage_missing]
df_missing = df[lot_frontage_missing]

# Use only relevant predictors; non-numeric ones are one-hot encoded.
features = ['LotArea', 'Neighborhood', 'Street', 'LotConfig']
X_train = pd.get_dummies(df_known[features])
y_train = df_known['LotFrontage']
X_missing = pd.get_dummies(df_missing[features])

# The two subsets are encoded separately, so a category level may appear in
# only one of them. Force X_missing onto the training columns: levels unseen
# in training are dropped, levels absent from the missing rows become 0.
X_missing = X_missing.reindex(columns=X_train.columns, fill_value=0)

# Always fit, so that `model` is a fitted estimator for later cells even when
# there is nothing to impute.
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(X_train, y_train)

# Only predict when at least one row needs imputing: scikit-learn's input
# validation raises ValueError on a zero-row array, which would otherwise make
# this cell fail when re-run after LotFrontage has already been filled.
if lot_frontage_missing.any():
    df.loc[lot_frontage_missing, 'LotFrontage'] = model.predict(X_missing)

In [6]:
import joblib

# Persist the fitted LotFrontage model to the current working directory.
# joblib files are pickle-based, so only load this artifact from a trusted
# source.
joblib.dump(value=model, filename='lot_frontage_imputer.pkl')

['lot_frontage_imputer.pkl']