# Load the data

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

# Load data.
# Directory holding the Kaggle train/test CSVs; edit this one constant to relocate the data.
DATA_DIR = '/Users/chuwen/Desktop/kaggle/House_pricing/data'

X_full = pd.read_csv(DATA_DIR + '/train.csv', index_col='Id')
X_test_full = pd.read_csv(DATA_DIR + '/test.csv', index_col='Id')
# Drop the rows with null target values. Rebinding the name (instead of inplace=True)
# avoids mutating the freshly loaded frame in place.
X_full = X_full.dropna(axis=0, subset=['SalePrice'])

## take a look at the data

In [None]:
# Preview the first few rows of the training frame.
X_full.head()

## take a look at the target value "SalePrice"

In [None]:
%matplotlib inline
# Put the raw target and its log1p transform in one frame so that
# hist() draws both distributions for comparison.
sale_price = X_full["SalePrice"]
prices = pd.DataFrame({
    "price": sale_price,
    "log(price + 1)": np.log1p(sale_price),
})
prices.hist()

After the log(1 + x) transform, the target distribution is close to a bell curve. Train on `log1p(SalePrice)` and remember to convert predictions back to prices with `expm1`.

In [2]:
# Take the target out of the feature frame and keep it on the log1p scale.
# pop() removes the column from X_full, so an unguarded second run of this cell
# would raise KeyError; the guard makes re-running a no-op that keeps the existing y.
if 'SalePrice' in X_full.columns:
    y = np.log1p(X_full.pop('SalePrice'))

## handle the features

In [None]:
# List the column names currently in X_full.
X_full.columns

### Some columns, such as 'MSSubClass', hold categorical data but are recognized as numerical. We need to change their type.

In [3]:
# MSSubClass is stored as integers but treated as categorical here; cast it to str
# in both the training and the test frame so they stay consistent.
print('befor:', X_full['MSSubClass'].dtypes)
for frame in (X_full, X_test_full):
    frame['MSSubClass'] = frame['MSSubClass'].astype(str)
print('after:', X_full['MSSubClass'].dtypes)

befor: int64
after: object


### Separate numerical data and low-cardinality categorical data; make the train and validation sets

In [4]:
# Hold out 20% of the labelled rows for validation; the fixed seed makes the split repeatable.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X_full, y, train_size=0.8, test_size=0.2, random_state=0
)

# Categorical features: object-typed columns with fewer than 15 distinct values
# in the training split.
categorical_cols = [
    cname
    for cname in X_train_full.columns
    if X_train_full[cname].dtype == "object"
    and X_train_full[cname].nunique() < 15
]

# Numerical features: every int64 / float64 column.
numerical_cols = [
    cname
    for cname in X_train_full.columns
    if X_train_full[cname].dtype in ['int64', 'float64']
]

# Restrict all three frames to the selected columns, working on copies.
my_cols = categorical_cols + numerical_cols
X_train, X_valid, X_test = (
    frame[my_cols].copy()
    for frame in (X_train_full, X_valid_full, X_test_full)
)

## Implement pipeline

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Numerical columns: fill missing values with SimpleImputer's 'constant' strategy
# (no fill_value is passed, so the imputer's default constant is used).
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical columns: impute the most frequent value, then one-hot encode.
# handle_unknown='ignore' keeps transform from failing on categories not seen during fit.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])


## build model

In [6]:
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import Ridge

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import Ridge

# Model 1: ridge regression with bagging.
ridge = Ridge(15)

# Candidate ensemble sizes; for each one, record the mean 10-fold CV MAE
# (measured on the log1p-transformed target).
params = [1, 10, 15, 20, 25, 30, 40]
test_scores = []
for param in params:
    # The base learner is passed positionally: its keyword was renamed from
    # `base_estimator` to `estimator` in scikit-learn 1.2, and the positional form
    # works on both sides of that change. random_state pins the bootstrap draws
    # so the curve is reproducible.
    my_pipeline = Pipeline(steps=[('precrocessor', preprocessor),
                                  ('model', BaggingRegressor(ridge, n_estimators=param, random_state=0))])
    # The scorer returns negated MAE; flip the sign to get an error value.
    test_score = -cross_val_score(my_pipeline, X_train, y_train, cv=10, scoring='neg_mean_absolute_error')
    test_scores.append(np.mean(test_score))


In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# Mean CV error for each candidate n_estimators value; axis labels let the
# figure stand on its own when the notebook is skimmed.
plt.plot(params, test_scores)
plt.xlabel("n_estimators")
plt.ylabel("mean CV MAE (log1p target)")
plt.title("n_estimator vs CV Error");

In [None]:
# model 2:xgboost
from xgboost import XGBRegressor
# Sweep the number of boosting rounds and collect the mean 10-fold CV MAE for each setting.
params = [200, 400, 600, 800, 1000, 1200]
test_scores = []
for n_rounds in params:
    xgb_model = XGBRegressor(n_estimators=n_rounds, learning_rate=0.05, n_jobs=4)
    my_pipeline = Pipeline(steps=[('precrocessor', preprocessor), ('model', xgb_model)])
    # The scorer yields negated MAE per fold; negate to recover the error.
    fold_mae = -cross_val_score(
        my_pipeline, X_train, y_train, cv=10, scoring='neg_mean_absolute_error'
    )
    test_scores.append(np.mean(fold_mae))

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# Mean CV error for each candidate n_estimators value; axis labels let the
# figure stand on its own when the notebook is skimmed.
plt.plot(params, test_scores)
plt.xlabel("n_estimators")
plt.ylabel("mean CV MAE (log1p target)")
plt.title("n_estimator vs CV Error");

In [None]:
# model 3:gradient boost
from sklearn.ensemble import GradientBoostingRegressor
# Coarse sweep over the number of trees for gradient boosting (mean 10-fold CV MAE per setting).
params = [200, 400, 600, 800, 1000, 1200]
test_scores = []
for n_trees in params:
    pipeline_steps = [
        ('precrocessor', preprocessor),
        ('model', GradientBoostingRegressor(n_estimators=n_trees)),
    ]
    my_pipeline = Pipeline(steps=pipeline_steps)
    # Negate the scorer output to turn negated MAE back into an error.
    fold_mae = -cross_val_score(
        my_pipeline, X_train, y_train, cv=10, scoring='neg_mean_absolute_error'
    )
    test_scores.append(np.mean(fold_mae))

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# Mean CV error for each candidate n_estimators value; axis labels let the
# figure stand on its own when the notebook is skimmed.
plt.plot(params, test_scores)
plt.xlabel("n_estimators")
plt.ylabel("mean CV MAE (log1p target)")
plt.title("n_estimator vs CV Error");

In [None]:
# Finer sweep over smaller tree counts for gradient boosting; print the
# mean 10-fold CV MAE obtained for each setting.
params = [50, 100, 150, 200]
test_scores = []
for n_trees in params:
    gbr_model = GradientBoostingRegressor(n_estimators=n_trees)
    my_pipeline = Pipeline(steps=[('precrocessor', preprocessor), ('model', gbr_model)])
    fold_mae = -cross_val_score(
        my_pipeline, X_train, y_train, cv=10, scoring='neg_mean_absolute_error'
    )
    test_scores.append(np.mean(fold_mae))
print(test_scores)

## Make predictions with the tuned models

In [7]:
def print_or_MAE(predic, val):
    """Print the mean absolute error on the original (un-logged) scale.

    Both inputs are mapped back with ``np.expm1`` before scoring, so they are
    expected to be on the log1p scale.

    Parameters
    ----------
    predic : array-like
        Predicted values on the log1p scale.
    val : array-like
        Reference (true) values on the log1p scale.

    Returns
    -------
    None
        The score is printed as ``MAE: <value>``.
    """
    predict_or = np.expm1(predic)
    val_or = np.expm1(val)
    # scikit-learn documents the argument order as (y_true, y_pred). MAE is
    # symmetric, so the printed value is unchanged by using the documented order.
    score = mean_absolute_error(val_or, predict_or)
    print('MAE:', score)

In [8]:
# Validate model 1 (bagged ridge, 25 estimators) on the hold-out split.
# The base learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2; the positional form works
# on both sides of that change. random_state=0 pins the bootstrap draws, matching
# the seeded XGBoost and gradient-boosting validation models.
my_pipeline1 = Pipeline(steps=[('precrocessor', preprocessor),
                               ('model', BaggingRegressor(Ridge(15), n_estimators=25, random_state=0))])
my_pipeline1.fit(X_train, y_train)
predict1 = my_pipeline1.predict(X_valid)

print_or_MAE(predict1, y_valid)

MAE: 22111.659457968846


In [9]:
# Validate model 2 (XGBoost, 400 rounds) on the hold-out split.
xgb_validation_model = XGBRegressor(
    n_estimators=400, learning_rate=0.05, n_jobs=4, random_state=0
)
my_pipeline2 = Pipeline(steps=[('precrocessor', preprocessor),
                               ('model', xgb_validation_model)])
# fit() returns the pipeline itself, so fitting and predicting can be chained.
predict2 = my_pipeline2.fit(X_train, y_train).predict(X_valid)
print_or_MAE(predict2, y_valid)

MAE: 16857.09432523546


In [10]:
# Validate model 3 (gradient boosting, 200 trees) on the hold-out split.
gbr_validation_model = GradientBoostingRegressor(n_estimators=200, random_state=0)
my_pipeline3 = Pipeline(steps=[('precrocessor', preprocessor),
                               ('model', gbr_validation_model)])
# fit() returns the pipeline itself, so fitting and predicting can be chained.
predict3 = my_pipeline3.fit(X_train, y_train).predict(X_valid)
print_or_MAE(predict3, y_valid)

MAE: 16775.110576072184


In [12]:
# Equal-weight blend of the three models' validation predictions, averaged on the log1p scale.
log_scale_predictions = (predict1, predict2, predict3)
predict_val = sum(log_scale_predictions) / 3
print_or_MAE(predict_val, y_valid)

MAE: 15753.761407504324


## Fit on all training data and predict on the test set

In [13]:
# Refit each tuned model on every labelled row, then predict the test set.
# All three models are seeded with random_state=0 so the submission is reproducible.

# Model 1: bagged ridge. The base learner is passed positionally because its keyword
# was renamed from `base_estimator` to `estimator` in scikit-learn 1.2; the positional
# form works on both sides of that change.
my_pipeline1 = Pipeline(steps=[('precrocessor', preprocessor),
                               ('model', BaggingRegressor(Ridge(15), n_estimators=25, random_state=0))])
my_pipeline1.fit(X_full[my_cols], y)
predict1 = my_pipeline1.predict(X_test)

# Model 2: XGBoost.
my_pipeline2 = Pipeline(steps=[('precrocessor', preprocessor),
                               ('model', XGBRegressor(n_estimators=400, learning_rate=0.05, n_jobs=4, random_state=0))])
my_pipeline2.fit(X_full[my_cols], y)
predict2 = my_pipeline2.predict(X_test)

# Model 3: gradient boosting.
my_pipeline3 = Pipeline(steps=[('precrocessor', preprocessor),
                               ('model', GradientBoostingRegressor(n_estimators=200, random_state=0))])
my_pipeline3.fit(X_full[my_cols], y)
predict3 = my_pipeline3.predict(X_test)

# Average the three predictions on the log1p scale, then invert the transform
# with expm1 to get values on the original SalePrice scale.
predict_out = np.expm1((predict1+predict2+predict3)/3)

In [14]:
# Build the submission table (test Id plus predicted SalePrice) and write it
# without the DataFrame's own index column.
submission_columns = {
    'Id': X_test.index,
    'SalePrice': predict_out,
}
output = pd.DataFrame(submission_columns)
output.to_csv('submission_ad.csv', index=False)

In [15]:
# Shell escape: submit submission_ad.csv to the home-data-for-ml-course competition via the kaggle CLI.
!kaggle competitions submit -c home-data-for-ml-course -f submission_ad.csv -m "Message"

100%|██████████████████████████████████████| 33.6k/33.6k [00:00<00:00, 72.6kB/s]
Successfully submitted to Housing Prices Competition for Kaggle Learn Users