In [1]:

import numpy as np                # linear algebra
import pandas as pd               # data processing, CSV file I/O (e.g. pd.read_csv)
import sys

from sklearn.linear_model    import SGDRegressor
from sklearn.linear_model    import LinearRegression
from sklearn.linear_model    import ElasticNet
from sklearn.linear_model    import BayesianRidge
from sklearn.linear_model    import Lasso
from sklearn.neural_network  import MLPRegressor
from sklearn.ensemble        import GradientBoostingRegressor
from sklearn.ensemble        import VotingRegressor
from sklearn.preprocessing   import StandardScaler
from sklearn.preprocessing   import PolynomialFeatures
from sklearn.preprocessing   import Imputer
from sklearn                 import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.ensemble        import RandomForestRegressor
from sklearn.model_selection import GridSearchCV


from sklearn.metrics         import mean_absolute_error
from xgboost                 import XGBRegressor

# suppress all warnings
import warnings
warnings.filterwarnings("ignore")


%matplotlib inline
import matplotlib.pyplot     as plt

In [2]:
def modelAccuracyOnTrainingData(x_train, y_train, model):
    y_predict = model.predict(x_train)
    
    #for i in range(0, len(y_train)):
    #    print(str(y_train[i]) + " " + str(y_predict[i]))
    
    return mean_absolute_error(y_train, y_predict) 

def computeKFoldCrossValidationScore(model, x_train, y_train, K, comment):
    
    scores = cross_val_score(model, x_train, y_train, cv=K)
    print("Cross-Validation Score(" + comment + "): ")
    print(scores)
    print("Cross-Validation Accuracy(" + comment + "):")
    print ("%0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    

In [3]:

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

'''
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
''' 

doGridSearch = False # do grid search once and save the results for future use

X_train = pd.read_csv("/kaggle/input/home-data-for-ml-course/train.csv")
X_test  = pd.read_csv("/kaggle/input/home-data-for-ml-course/test.csv")

y_train = X_train.iloc[:, -1]
X_train = X_train.iloc[:, :-1] # training set after removing last column

# Get a summary of the traning data
#print ("Summary of training data : ")
#print (X_train.info())

# Training data 
#print ("Training data overview")
#print (X_train.head())

# plot histogram for numerical values
#X_train.hist(bins=50, figsize=(40, 30))
#plt.show()

##################################################### DATA PREPOROCESSING ######################################################

# do one-hot-encoding to handle categorical values
X_train_encoded = pd.get_dummies(X_train)
X_test_encoded  = pd.get_dummies(X_test)

# Impute missing values
imputer = Imputer(strategy="mean")
imputer.fit (X_train_encoded)
X_train_encoded_tr = pd.DataFrame( imputer.transform(X_train_encoded), columns = X_train_encoded.columns)

imputer.fit (X_test_encoded)
X_test_encoded_tr  = pd.DataFrame( imputer.transform(X_test_encoded), columns = X_test_encoded.columns)

#print(imputer.statistics_)
#print(X_train_encoded.median().values)

# statistical analysis of traning set
#print(X_train_encoded.describe())

# normalize data for better performance of GD
#X_train_encoded_tr_norm = (X_train_encoded_tr - X_train_encoded_tr.mean()) / X_train_encoded_tr.std()
scaler = StandardScaler()
scaler.fit(X_train_encoded_tr)
X_train_encoded_tr_norm =  pd.DataFrame(scaler.transform(X_train_encoded_tr), columns = X_train_encoded.columns)

train_data_headers = list(X_train_encoded_tr_norm.columns)
test_data_headers  = list(X_test_encoded_tr.columns)
##################################################################################################################################

######################################################## TRAINING ################################################################

'''
# [1]. apply linear regression
regressor = LinearRegression()  
regressor.fit(X_train_encoded_tr_norm, y_train) #training the algorithm
#print(regressor.score(X_train_encoded_tr, y_train))
#print(regressor.coef_)

#print(X_test_encoded_tr.head().values)

print("Linear Regression Mean Absolute Error on Training Data: " + str(modelAccuracyOnTrainingData(X_train_encoded_tr_norm, y_train, regressor)))

## computing cross-validation score of our model -- linear regression
computeKFoldCrossValidationScore(regressor, X_train_encoded_tr_norm, y_train, 5, "Linear Regression")

# [2]. apply regularized linear regression
reg_regressor = Lasso(normalize=True)  
reg_regressor.fit(X_train_encoded_tr_norm, y_train) #training the algorithm

print("Linear Regression with Reg Mean Absolute Error on Training Data: " + str(modelAccuracyOnTrainingData(X_train_encoded_tr_norm, y_train, reg_regressor)))

## computing cross-validation score of our model -- linear regression
computeKFoldCrossValidationScore(reg_regressor, X_train_encoded_tr_norm, y_train, 5, "Regularized Lin Reg")
'''

# [3]. Gradient Boosting Regressor

gb_regressor = None
if doGridSearch: # perform grid search to find the best parameter set
    params = [ {'n_estimators': [300, 400, 500, 600, 700], 'max_depth': [4,5, 6], 
            'min_samples_split': [2,3], 'learning_rate': [ 0.01], 'loss': ['ls']} ]
    
    gb_regressor       = GradientBoostingRegressor()
    gb_grid_search     = GridSearchCV(gb_regressor, params, cv = 5, scoring='neg_mean_squared_error')
    gb_grid_search.fit(X_train_encoded_tr_norm, y_train)
    
    print("Grid Search Best Parameters(GB Regressor): ")
    print(gb_grid_search.best_params_)
    
    cvres = gb_grid_search.cv_results_
    with open("GB_Regressor_GridSearch.txt", "w+") as outFile:
        for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
            outFile.write(str(np.sqrt(-mean_score)) +  str(params))
        
else:
    params       = {'n_estimators': 700, 'max_depth': 4, 'min_samples_split': 2, 'learning_rate': 0.01, 'loss': 'ls'}
    gb_regressor = GradientBoostingRegressor(**params)
    gb_regressor.fit(X_train_encoded_tr_norm, y_train)
    print("GB Regressor Mean Absolute Error on Training Data: " + 
           str(modelAccuracyOnTrainingData(X_train_encoded_tr_norm, y_train, gb_regressor)))
    
    ## computing cross-validation score of our model -- 
    computeKFoldCrossValidationScore(gb_regressor, X_train_encoded_tr_norm, y_train, 5, "GB Regressor")

# [4]. XGBoost

xgb_regressor = None
if doGridSearch: # perform grid search to find the best parameter set
    
    params = [ {'objective' : ['reg:linear'],'colsample_bytree' : [0.3], 'learning_rate' : [0.01, 0.05, 0.1],
                'max_depth' : [3, 4 ,5, 6], 'alpha' : [10],  'n_estimators' : [100, 200, 300, 400, 500,600] }]
    
    xgb_regressor       = XGBRegressor()
    xgb_grid_search     = GridSearchCV(xgb_regressor, params, cv = 5, scoring='neg_mean_squared_error')
    xgb_grid_search.fit(X_train_encoded_tr_norm, y_train)
    
    print("Grid Search Best Parameters(XGB Regressor): ")
    print(xgb_grid_search.best_params_)
    
    cvres = xgb_grid_search.cv_results_
    with open("XGB_Regressor_GridSearch.txt", "w+") as outFile:
        for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
            outFile.write(str(np.sqrt(-mean_score)) + str(params))
        
else:
    xgb_regressor = XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3,  learning_rate = 0.05, max_depth = 3, alpha = 10, 
                                 n_estimators = 600)
    xgb_regressor.fit(X_train_encoded_tr_norm, y_train)
    print("XGB Regressor Mean Absolute Error on Training Data: " + 
           str(modelAccuracyOnTrainingData(X_train_encoded_tr_norm, y_train, xgb_regressor)))
    
    ## computing cross-validation score of our model -- 
    computeKFoldCrossValidationScore(xgb_regressor, X_train_encoded_tr_norm, y_train, 5, "XGB Regressor")

# [5]. RandomForestRegressor

rf_regressor = None
if doGridSearch: # perform grid search to find the best parameter set
    
    params = [ {'max_depth': [3, 4, 5, 6], 'random_state' : [0], 'n_estimators' : [100, 200, 300, 400, 500, 600] }]
    
    rf_regressor       = RandomForestRegressor()
    rf_grid_search     = GridSearchCV(rf_regressor, params, cv = 5, scoring='neg_mean_squared_error')
    rf_grid_search.fit(X_train_encoded_tr_norm, y_train)
    
    print("Grid Search Best Parameters(RF Regressor): ")
    print(rf_grid_search.best_params_)
    
    cvres = rf_grid_search.cv_results_
    with open("RF_Regressor_GridSearch.txt", "w+") as outFile:
        for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
            outFile.write(str(np.sqrt(-mean_score)) + str(params))
else:
    rf_regressor = RandomForestRegressor(max_depth=6, random_state=0, n_estimators=300)
    rf_regressor.fit(X_train_encoded_tr_norm, y_train)  
    print("RF Regressor Mean Absolute Error on Training Data: " + str(modelAccuracyOnTrainingData(X_train_encoded_tr_norm, y_train, rf_regressor)))
    ## computing cross-validation score of our model -- 
    computeKFoldCrossValidationScore(rf_regressor, X_train_encoded_tr_norm, y_train, 5, "RF Regressor")

if doGridSearch:
    sys.exit(0)

#################################################################################################################################

######################################################## PREDICTION #############################################################
# augment missing columns to test dataset
index = 0
for col in X_train_encoded_tr_norm.columns:
    if col not in X_test_encoded_tr.columns:
        X_test_encoded_tr.insert(index, col, [0.0] * X_test_encoded.shape[0], True)
    index += 1
    
# test code
for i in range(len(X_train_encoded_tr_norm.columns)):
    assert  X_train_encoded_tr_norm.columns[i] == X_test_encoded_tr.columns[i]

scaler = StandardScaler()
scaler.fit(X_test_encoded_tr)
X_test_encoded_tr_norm =  pd.DataFrame(scaler.transform(X_test_encoded_tr), columns = X_test_encoded_tr.columns)

# NOTE : Voting classifier didn't give good result.
#vr = VotingRegressor([('gb', gb_regressor), ('xgb', xgb_regressor)])
#predictions = vr.fit(X_train_encoded_tr_norm, y_train).predict(X_test_encoded_tr_norm) #
predictions = xgb_regressor.predict(X_test_encoded_tr_norm)
#print("Voting Regressor Mean Absolute Error on Training Data: " + str(modelAccuracyOnTrainingData(X_train_encoded_tr_norm, y_train, vr)))
#computeKFoldCrossValidationScore(vr, X_train_encoded_tr_norm, y_train, 5, "Voting Regressor")

outputDF = pd.DataFrame({ 'Id' : list(X_test_encoded_tr['Id']), 'SalePrice' : predictions})
outputDF = outputDF.astype({'Id': 'int64'})
print(outputDF)


# dump output to CSV
outputDF.to_csv("price_submission.csv", index=False)

# Any results you write to the current directory are saved as output.

########################################################################################################################################

GB Regressor Mean Absolute Error on Training Data: 9165.93482391279
Cross-Validation Score(GB Regressor): 
[0.89853514 0.84743567 0.8926072  0.90853114 0.88068371]
Cross-Validation Accuracy(GB Regressor):
0.89 (+/- 0.04)
XGB Regressor Mean Absolute Error on Training Data: 7753.905795162671
Cross-Validation Score(XGB Regressor): 
[0.9225105  0.83783023 0.90267499 0.91473808 0.88908022]
Cross-Validation Accuracy(XGB Regressor):
0.89 (+/- 0.06)
RF Regressor Mean Absolute Error on Training Data: 14588.249414972543
Cross-Validation Score(RF Regressor): 
[0.8495295  0.83355811 0.85891236 0.8554533  0.79927979]
Cross-Validation Accuracy(RF Regressor):
0.84 (+/- 0.04)
        Id      SalePrice
0     1461  127439.742188
1     1462  163052.078125
2     1463  186650.765625
3     1464  196859.703125
4     1465  192786.968750
...    ...            ...
1454  2915   81889.226562
1455  2916   82291.742188
1456  2917  175641.578125
1457  2918  124890.953125
1458  2919  219678.015625

[1459 rows x 2 col