In [96]:
#
#This code was used to output predictions for the "House Prices - Advanced Regression Techniques" Kaggle Competition.
#Its Kaggle submission scores 0.13894 (RMSE between the logarithm of the predicted and the logarithm of the observed sale price).
#


# Importing pandas to explore data
import pandas as pd

# Load the competition's training set
data = pd.read_csv("train.csv")

# List the available column names
print(data.columns)

# Separate the predictors (X) from the target (y).
# Only SalePrice is removed, so every other column (including Id) stays in X.
X = data.drop(columns=['SalePrice'])
y = data['SalePrice']


Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [97]:
#importing scikit built-in class to split data
from sklearn.model_selection import train_test_split


# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [98]:
# Categorical features: object-typed columns with fewer than 10 distinct values
categorical = [
    name for name, col_dtype in X_train.dtypes.items()
    if X_train[name].nunique() < 10 and col_dtype == "object"
]

# Numerical features: columns stored as int64 or float64
numerical = [
    name for name, col_dtype in X_train.dtypes.items()
    if col_dtype in ['int64', 'float64']
]

# Columns kept for modelling (categorical first, then numerical)
my_cols = categorical + numerical

# Restrict both splits to the selected columns, working on copies
X_train = X_train[my_cols].copy()
X_valid = X_valid[my_cols].copy()



In [99]:
#importing preprocessing classes
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder


# Numerical columns: impute missing values with a constant.
# No fill_value is passed, so the imputer uses its default
# (0 for numeric data according to the scikit-learn documentation).
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown='ignore' lets the encoder accept categories it did not see during fit.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group through its own transformer
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical),
    ('cat', categorical_transformer, categorical),
])



In [100]:
# We're using mean squared error to calculate error
from sklearn.metrics import mean_squared_error

# Importing regression methods
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR


def get_models():
    """Fit each candidate regressor and print its validation RMSE.

    Four regressors are compared: random forest, XGBoost, linear regression
    and SVR. Custom parameters are set only for the two most promising ones
    (random forest and XGBoost). Each model is placed behind the shared
    ``preprocessor`` in a pipeline, fitted on ``X_train``/``y_train`` and
    scored on ``X_valid``/``y_valid``.

    Prints one line per model in the form ``<model repr>: <rmse>``.
    Returns nothing.
    """
    candidates = [
        RandomForestRegressor(n_estimators=1000, random_state=0),
        xgb.XGBRegressor(n_estimators=301, max_depth=3, learning_rate=0.095),
        LinearRegression(),
        SVR(),
    ]

    for model in candidates:
        my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                      ('model', model)])
        my_pipeline.fit(X_train, y_train)
        preds = my_pipeline.predict(X_valid)
        # Take the square root explicitly instead of passing squared=False:
        # that keyword was deprecated in scikit-learn 1.4 and later removed,
        # whereas this form works on every version.
        rmse = mean_squared_error(y_valid, preds) ** 0.5
        print(f"{model}: {rmse}")


get_models()

RandomForestRegressor(n_estimators=1000, random_state=0): 33446.55783490342
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.095, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=301, n_jobs=12,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None): 33213.862278805864
LinearRegression(): 61175.77397900344
SVR(): 85096.07858146935


In [101]:
# XGB scored (slightly) better than RFR on the validation split,
# so refit it on the full training data (X, y) before predicting.
model = xgb.XGBRegressor(n_estimators=301, max_depth=3, learning_rate=0.095)
my_pipeline_full = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model),
])
my_pipeline_full.fit(X, y)

# Load the competition's test set
validate_data = pd.read_csv("test.csv")

# Predict on the same feature columns used during validation and export the submission
test_preds = my_pipeline_full.predict(validate_data[my_cols])
final_data = pd.DataFrame(data={'Id': validate_data.Id, 'SalePrice': test_preds})
final_data.to_csv('your_name.csv', index=False)

# Submitting the exported file to Kaggle yields a score of 0.13894.
# The score is the RMSE between the logarithm of the predicted price and the logarithm of the observed sale price.
# It could likely be improved with further preprocessing work, such as trying other encoding methods.