In [1]:
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from mlxtend.regressor import StackingRegressor

# Read data
train = pd.read_csv('AmesHousing.csv')
test = pd.read_csv('test.csv')
train2 = pd.read_csv('train.csv')
train.columns = train.columns.str.replace(' ', '')
train = train.rename(columns = {"YearRemod/Add": "YearRemodAdd"}) # keep variables' name constant

# Remove duplicates in data
data = pd.concat([train,train2,test], axis = 0, sort = False)
useless = ['Id','PID','Order','SalePrice'] 
data = data.drop(useless, axis = 1) # only keep features
duplicate = data[data.duplicated(keep = 'last')].index
duplicate = duplicate[0:390]
train = train.drop(duplicate, axis = 0)
training = pd.concat([train,train2], axis = 0, sort = False)
useless = ['Id','PID','Order'] 
training = training.drop(useless, axis = 1) # final training dataset

# Separating Target and Features
target = training['SalePrice']
test_id = test['Id']
test = test.drop(['Id'],axis = 1)
training2 = training.drop(['SalePrice'], axis = 1)

# Concatenating train & test set
train_test = pd.concat([training2,test], axis = 0, sort = False)

In [7]:
# Converting non-numeric predictors stored as numbers into string
train_test['MSSubClass'] = train_test['MSSubClass'].apply(str)
train_test['YrSold'] = train_test['YrSold'].apply(str)
train_test['MoSold'] = train_test['MoSold'].apply(str)
train_test['OverallQual'] = train_test['OverallQual'].apply(str)
train_test['OverallCond'] = train_test['OverallCond'].apply(str)

# Fill missing values
# Categorical features
# Fill nan based on the description file 
train_test['Functional'] = train_test['Functional'].fillna('Typ')
train_test['Electrical'] = train_test['Electrical'].fillna("SBrkr")
train_test['KitchenQual'] = train_test['KitchenQual'].fillna("TA")

# Fill nan with most frequnt class
train_test['Exterior1st'] = train_test['Exterior1st'].fillna(train_test['Exterior1st'].mode()[0])
train_test['Exterior2nd'] = train_test['Exterior2nd'].fillna(train_test['Exterior2nd'].mode()[0])
train_test['SaleType'] = train_test['SaleType'].fillna(train_test['SaleType'].mode()[0])

# Fill nan with none
train_test["PoolQC"] = train_test["PoolQC"].fillna("None")
train_test["Alley"] = train_test["Alley"].fillna("None")
train_test['FireplaceQu'] = train_test['FireplaceQu'].fillna("None")
train_test['Fence'] = train_test['Fence'].fillna("None")
train_test['MiscFeature'] = train_test['MiscFeature'].fillna("None")
for col in ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']:
    train_test[col] = train_test[col].fillna('None')
    
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    train_test[col] = train_test[col].fillna('None')
    
# Numeric features   
# Fill nan with zero
for col in ('GarageArea', 'GarageCars'):
    train_test[col] = train_test[col].fillna(0)
        
for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtFullBath', 'BsmtHalfBath', 'MasVnrArea','BsmtUnfSF', 'TotalBsmtSF'):
    train_test[col] = train_test[col].fillna(0)
    
# Fill nan with median vlues 
train_test['LotFrontage'] = train_test['LotFrontage'].fillna(train['LotFrontage'].median())

# Add new features based on common knowledge
train_test["SqFtPerRoom"] = train_test["GrLivArea"] / (train_test["TotRmsAbvGrd"] +
                                                       train_test["FullBath"] +
                                                       train_test["HalfBath"] +
                                                       train_test["KitchenAbvGr"])

train_test['Total_Home_Quality'] = train_test['OverallQual'] + train_test['OverallCond']

train_test['Total_Bathrooms'] = (train_test['FullBath'] + (0.5 * train_test['HalfBath']) +
                               train_test['BsmtFullBath'] + (0.5 * train_test['BsmtHalfBath']))

train_test["HighQualSF"] = train_test["1stFlrSF"] + train_test["2ndFlrSF"]
train_test['renovated']=train_test['YearRemodAdd']+train_test['YearBuilt']

# Removing the useless variables
useless = ['GarageYrBlt','YearRemodAdd', 
    'MSSubClass', 'OverallCond', 'BsmtFinSF2', 'LowQualFinSF', 'BsmtHalfBath', 
    'KitchenAbvGr', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 
    'MoSold', 'YrSold'
          ]
train_test = train_test.drop(useless, axis = 1)

In [8]:
# Creating dummy variables from categorical features
train_test_dummy = pd.get_dummies(train_test)

from scipy.stats import skew
numeric_features = train_test_dummy.dtypes[train_test_dummy.dtypes != object].index
skewed_features = train_test_dummy[numeric_features].apply(lambda x: skew(x)).sort_values(ascending=False)
high_skew = skewed_features[skewed_features > 0.5]
skew_index = high_skew.index

# Normalize skewed features using log_transformation
for i in skew_index:
    train_test_dummy[i] = np.log1p(train_test_dummy[i] )

In [9]:
# SalePrice after transformation
target_log = np.log1p(target)

In [10]:
final_train = train_test

In [11]:
# Train-Test separation
X_train = train_test_dummy[0:4000]
X_test = train_test_dummy[4000:]

# Creation of the RMSE metric:
def rmse(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

def cv_rmse(model):
    rmse = np.sqrt(-cross_val_score(model, X_train, target_log, scoring = "neg_mean_squared_error", cv = kf))
    return (rmse)

In [22]:
# XGB Regressor
xgb = XGBRegressor()
kf = KFold(n_splits = 10, random_state = 42, shuffle = True)
score_xgb = cv_rmse(xgb)
print(score_xgb.mean())
print(score_xgb.std())

# Train model
submission = pd.read_csv("sample_submission.csv")
xgb_model = xgb.fit(X_train, target_log)
submission.iloc[:,1] = np.floor(np.expm1(xgb_model.predict(X_test)))
print(submission.head())
#submission.to_csv("xgb_oliver_08041414.csv", index = False)
#0.09156863522810113
#0.1230436287439903

0.0958417839320341
0.019434673101199708
     Id  SalePrice
0  1461   112625.0
1  1462   167654.0
2  1463   191887.0
3  1464   195845.0
4  1465   179472.0


In [2]:
# Gradient Boosting
gradientboost = GradientBoostingRegressor()
kf = KFold(n_splits = 10, random_state = 42, shuffle = True)
score_gradientboost = cv_rmse(gradientboost)
print(score_gradientboost.mean())
print(score_gradientboost.std())

# Train model
submission = pd.read_csv("sample_submission.csv")
gradientboost_model = gradientboost.fit(X_train, target_log)
submission.iloc[:,1] = np.floor(np.expm1(gradientboost_model.predict(X_test)))
print(submission.head())
#submission.to_csv("oliver_08061722_gradientboost.csv", index = False)

# RandomizedSearch to improve model
para_grid = { 'max_depth': [3,6,10], 'learning_rate': [0.01, 0.05, 0.1],
           'n_estimators': [100, 500, 1000], 'subsample': [0.3, 0.7], 'alpha': np.logspace(-9, 0, 10)}
grid_search_gradientboost = RandomizedSearchCV(gradientboost, para_grid, scoring = 'neg_mean_squared_error', n_iter = 250, verbose = 1)
grid_search_gradientboost.fit(X_train, target_log)
print("Best parameters:", grid_search_gradientboost.best_params_)
print("Lowest RMSE: ", (-grid_search_gradientboost.best_score_)**(1/2.0))
submission.iloc[:,1] = np.floor(np.expm1(grid_search_gradientboost.predict(X_test)))
print(submission.head())
#submission.to_csv("oliver_08061722_gradientboost.csv", index = False)


NameError: name 'cv_rmse' is not defined

In [28]:
# GridSearch/RandomizedSearch to improve the model
para_grid = { 'max_depth': [3,6,10], 'learning_rate': [0.01, 0.05, 0.1],
           'n_estimators': [100, 500, 1000], 'colsample_bytree': [0.3, 0.7], 'alpha': np.logspace(0, 8, 100)}
xgb = XGBRegressor()
#grid_search_xgb = GridSearchCV(xgb, para_grid, cv = 5, scoring = 'neg_mean_squared_error')
#grid_search_xgb = RandomizedSearchCV(xgb, para_grid, scoring = 'neg_mean_squared_error', n_iter = 250, verbose = 1)
grid_search_xgb = RandomizedSearchCV(xgb, para_grid, scoring = 'neg_mean_squared_error', n_iter = 250, verbose = 1)
grid_search_xgb.fit(X_train, target_log)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


RandomizedSearchCV(estimator=XGBRegressor(base_score=None, booster=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None, gamma=None,
                                          gpu_id=None, importance_type='gain',
                                          interaction_constraints=None,
                                          learning_rate=None,
                                          max_delta_step=None, max_depth=None,
                                          min_child_weight=None, missing=nan,
                                          monotone_constraints=None,
                                          n_estimators=100, n_jobs...
       6.13590727e+06, 7.39072203e+06, 8.90215085e+06, 1.07226722e+07,
       1.29154967e+07, 1.55567614e+07, 1.87381742e+07, 2.25701972e+07,
       2.71858824e+07, 3.27454916e+07, 3.94420606e+07, 4.75081016e+0

In [29]:
print("Best parameters:", grid_search_xgb.best_params_)
print("Lowest RMSE: ", (-grid_search_xgb.best_score_)**(1/2.0))
submission.iloc[:,1] = np.floor(np.expm1(grid_search_xgb.predict(X_test)))
print(submission.head())
#submission.to_csv("xgb_oliver_08051226.csv", index = False)

Best parameters: {'n_estimators': 1000, 'max_depth': 10, 'learning_rate': 0.1, 'colsample_bytree': 0.7, 'alpha': 13.530477745798075}
Lowest RMSE:  0.1230436287439903
     Id  SalePrice
0  1461   119191.0
1  1462   154002.0
2  1463   196317.0
3  1464   196199.0
4  1465   181732.0


In [28]:
stack = StackingRegressor(regressors = (XGBRegressor(n_estimators = 1000, max_depth = 10, learning_rate = 0.1, colsample_bytree = 0.7, alpha = 13.530477745798075), 
                                        GradientBoostingRegressor(subsample = 0.3, n_estimators = 500, max_depth = 10, learning_rate = 0.05, alpha = 0.0001)), 
                          meta_regressor = XGBRegressor(), 
                          use_features_in_secondary = True)

In [25]:
stack_score = cv_rmse(stack)
print(stack_score.mean())
print(stack_score.std())
# 0.08279143055492903
# 0.01901712515315616

0.08703126483397647
0.020400837690123658


In [29]:
# Train model
submission = pd.read_csv("sample_submission.csv")
#stack_model = stack.fit(X_train, target_log)
#submission.to_csv("oliver_08281253_stack.csv", index = False)

# GridSearch/RandomizedSearch to improve the model
para_grid = { 'meta_regressor__max_depth': [3,6,10], 'meta_regressor__learning_rate': [0.01, 0.05, 0.1],
           'meta_regressor__n_estimators': [100, 500, 1000], 'meta_regressor__colsample_bytree': [0.3, 0.7], 'meta_regressor__alpha': np.logspace(0, 8, 100)}
#stack = XGBRegressor()
grid_search_stack = RandomizedSearchCV(stack, para_grid, scoring = 'neg_mean_squared_error', n_iter = 250, verbose = 1)
grid_search_stack.fit(X_train, target_log)
submission.iloc[:,1] = np.floor(np.expm1(grid_search_stack.predict(X_test)))
print(submission.head())
#submission.to_csv("oliver_08281257_stack.csv", index = False)

Fitting 5 folds for each of 250 candidates, totalling 1250 fits
     Id  SalePrice
0  1461   107795.0
1  1462   167184.0
2  1463   191392.0
3  1464   192861.0
4  1465   186055.0


In [30]:
#submission.to_csv("oliver_08281831_stack.csv", index = False)