In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [5]:
# Read the labelled training data and the unlabelled test data.
# Both files are indexed by their 'Id' column.
X_full, X_test_full = (
    pd.read_csv(csv_path, index_col='Id')
    for csv_path in (
        'home-data-for-ml-course/train.csv',
        'home-data-for-ml-course/test.csv',
    )
)

In [6]:
# Rows with no SalePrice have no target to learn from, so discard them first.
X_full.dropna(subset=['SalePrice'], inplace=True)

# pop() removes the target column from the feature frame in place and
# returns it, leaving X_full with predictors only.
y = X_full.pop('SalePrice')

In [10]:
# Split the labelled rows into training (80%) and validation sets;
# the fixed random_state keeps the split reproducible between runs.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X_full,
    y,
    train_size=0.8,
    random_state=0,
)

In [13]:
# Separate numerical cols from categorical cols.
# Column choice is driven by the training split only (X_train_full), so it
# never depends on validation or test rows.
numerical_cols = [col for col in X_train_full.columns
                  if X_train_full[col].dtype in ['int64', 'float64']]

# Keep only text columns with fewer than 10 distinct values so that one-hot
# encoding does not produce a very wide matrix.
# These must read from X_train_full: X_train is not created until below, so
# referencing it here raises NameError on a fresh kernel.
categorical_cols = [col for col in X_train_full.columns
                    if X_train_full[col].dtype == 'object'
                    and X_train_full[col].nunique() < 10]

# Restrict every split to the same columns; .copy() gives independent frames
# so later edits do not touch the *_full originals.
selected_cols = numerical_cols + categorical_cols
X_train = X_train_full[selected_cols].copy()
X_valid = X_valid_full[selected_cols].copy()
# The test features come from the test file (X_test_full), not from the
# validation split.
X_test = X_test_full[selected_cols].copy()

In [14]:
# Preview the first five rows of the selected training features.
X_train.head(n=5)

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
619,20,90.0,11694,9,5,2007,2007,452.0,48,0,...,Attchd,Unf,TA,TA,Y,,,,New,Partial
871,20,60.0,6600,5,5,1962,1962,0.0,0,0,...,Detchd,Unf,TA,TA,Y,,,,WD,Normal
93,30,80.0,13360,5,7,1921,2006,0.0,713,0,...,Detchd,Unf,TA,TA,Y,,,,WD,Normal
818,20,,13265,8,5,2002,2002,148.0,1218,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
303,20,118.0,13704,7,5,2001,2002,150.0,0,0,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal


In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [24]:
# Step 1: Define your preprocessing steps
# Numeric columns: fill missing values with the column median.
numerical_transformer = SimpleImputer(strategy='median')

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' encodes categories that were
# not seen during fit as all zeros instead of raising.
# The encoder is left at its default output type: the old `sparse=` keyword
# was renamed to `sparse_output=` in scikit-learn 1.2 and removed in 1.4, so
# passing either spelling breaks on some versions. Dense output is requested
# on the ColumnTransformer below instead.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Step 2: Bundle the preprocessor into a Transformer
# sparse_threshold=0 makes the combined feature matrix always dense, which
# matches what the encoder used to produce with sparse=False.
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numerical_transformer, numerical_cols),
        ('categorical', categorical_transformer, categorical_cols)
    ],
    sparse_threshold=0
)

# Step 3: Define your model
# random_state is fixed so the forest (and the reported MAE) is reproducible.
model = RandomForestRegressor(n_estimators=150, random_state=0)

# Step 4: Define your Pipeline
# Chaining preprocessing and model means the imputers/encoder are fitted on
# the training data only and re-applied unchanged at predict time.
pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('model', model)
])

# Step 5: Fit the data through the Pipeline
pipeline.fit(X_train, y_train)

# Step 6: Make predictions on the validation split and score them
predictions = pipeline.predict(X_valid)

print('MAE', mean_absolute_error(y_valid, predictions))

MAE 17364.280456621007


In [25]:
# Generate test predictions: run X_test through the fitted pipeline, which
# applies the preprocessing steps and then the model in a single call.
test_preds = pipeline.predict(X_test)

