# Load data

In [11]:
import pandas as pd 
from sklearn.model_selection import train_test_split

# Read the training set, then separate the prediction target from the predictors.
train_df = pd.read_csv("./input/train.csv")

X = train_df.drop(columns=['SalePrice'])
y = train_df['SalePrice']

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

# Choose attributes

Features include **low-cardinality categorical** columns (fewer than 10 unique values) and **numerical** columns.


In [12]:
# Categorical features: object-dtype columns with fewer than 10 distinct
# values in the training split.
categorical_cols = [
    cname
    for cname, col_dtype in X_train_full.dtypes.items()
    if col_dtype == 'object' and X_train_full[cname].nunique() < 10
]

# Numerical features: every int64 / float64 column.
# NOTE(review): this also selects any integer identifier column present in
# the data as a feature — confirm that is intended.
numerical_cols = [
    cname
    for cname, col_dtype in X_train_full.dtypes.items()
    if col_dtype in ['int64','float64']
]

# Restrict both splits to the selected columns; .copy() gives independent frames.
expected_cols = [*categorical_cols, *numerical_cols]
X_train = X_train_full[expected_cols].copy()
X_valid = X_valid_full[expected_cols].copy()

# Pipeline

## Define preprocessing steps

In [13]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Numerical columns: fill missing values with a constant. No fill_value is
# passed, so the imputer's default for the 'constant' strategy applies.
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' keeps transform from failing on categories
# that were not seen during fit.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ]
)

## Define model

In [14]:
from sklearn.ensemble import RandomForestRegressor 
# Random forest with 100 trees; the fixed seed makes training repeatable.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=0,
)

## Create pipeline

In [16]:
from sklearn.metrics import mean_absolute_error as mae 

# Chain preprocessing and the regressor so both are fitted on the same data.
my_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

# Fit on the training split (fit returns the pipeline itself), then predict
# on the held-out validation split.
preds = my_pipeline.fit(X_train, y_train).predict(X_valid)

# Mean absolute error between validation targets and predictions.
score = mae(y_valid, preds)
print("MAE:",score)


MAE: 17867.65054794521


## Submit

In [17]:
# Refit the pipeline on every labelled row (training and validation together),
# restricted to the selected feature columns.
last_X = X.loc[:, expected_cols]
my_pipeline.fit(last_X, y)

In [21]:
# Load the unlabelled test set and keep the same feature columns the
# pipeline was fitted on.
test_df = pd.read_csv("./input/test.csv")
X_test = test_df.loc[:, expected_cols]

# Predict sale prices for the test rows with the fitted pipeline.
preds = my_pipeline.predict(X_test)

In [23]:
# Key each prediction by the record identifier stored in the test file.
# test_df is read without an index column, so X_test.index is only the
# 0-based row position and would not match the identifiers in the data.
# Falls back to the row position when the file has no 'Id' column.
# NOTE(review): the header is written as 'Id' on the assumption that the
# submission is scored against an 'Id,SalePrice' layout — confirm.
submission_ids = test_df['Id'] if 'Id' in test_df.columns else X_test.index
output = pd.DataFrame({'Id':submission_ids,
                       'SalePrice':preds})

# index=False: the identifiers are already a regular column.
output.to_csv("./output/submission.csv",index=False)