In [3]:
import pandas as pd
import numpy as np
import sklearn
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.base import TransformerMixin
import mlxtend
from scipy import stats
from sklearn import ensemble

# Importing and Inspecting Data:
housing_data = pd.read_excel("training.xls")
test = pd.read_csv('randTestHousing-NoSale.csv')
print(housing_data)

# Removing NaNs
housing_data = housing_data.fillna(housing_data.mean()) # Removes NaNs for numerical data (mean imputation)
test = test.fillna(test.mean())
class DataFrameImputer(TransformerMixin):

    def __init__(self):
        """Impute missing values.

        """
    def fit(self, X, y=None):

        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
            index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)

housing_data = DataFrameImputer().fit_transform(housing_data) # Removes NaNs for categorical columns (mode imputation)
test = DataFrameImputer().fit_transform(test)
print(housing_data)
# Isolating Numerical Columns:
numerical = housing_data.select_dtypes(include=[np.number])
# Isolating Categorical Columns:
categorical = ['MS Zoning', 'Street', 'Alley', 'Lot Shape',

                                'Land Contour', 'Utilities', 'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',

                                'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl', 'Exterior 1st', 'Exterior 2nd',

                               'Mas Vnr Type', 'Exter Qual', 'Exter Cond', 'Foundation', 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1', 'Bsmt Qual',

                               'BsmtFin Type 2', 'Heating', 'Heating QC', 'Central Air', 'Electrical', 'Kitchen Qual', 'Functional', 'Fireplace Qu', 'Garage Type', 'Garage Finish', 'Garage Qual',

                               'Garage Cond', 'Paved Drive', 'Pool QC', 'Fence', 'Misc Feature', 'Sale Type', 'Sale Condition']
            
'''
# Normalize:
#scaler = StandardScaler()
#filled[numerical.columns] = scaler.fit_transform(filled[numerical.columns])
# One hot encoding:
housing_data = pd.get_dummies(housing_data)
'''

# Hashing
label_encoder = LabelEncoder()
for i in categorical:
    housing_data[i] = label_encoder.fit_transform(housing_data[i])
    test[i] = label_encoder.fit_transform(test[i])
'''           
One hot encoding:
housing_data = pd.get_dummies(housing_data)  
'''
nan_check = []
for i in housing_data.isna().sum(): # Checks that there are no NaN values in the data anymore
    nan_check.append(i)
print(nan_check)
print(housing_data['SalePrice'].isna().sum())

      MS SubClass MS Zoning  Lot Frontage  Lot Area Street Alley Lot Shape  \
0              60        RL          71.0      8220   Pave   NaN       IR1   
1              20        RL          87.0     10037   Pave   NaN       Reg   
2              60        RL          46.0     20544   Pave   NaN       IR1   
3              20        RL         129.0     16870   Pave   NaN       IR1   
4             120        RM           NaN      4224   Pave   NaN       Reg   
...           ...       ...           ...       ...    ...   ...       ...   
1695           20        RL           NaN     16635   Pave   NaN       IR1   
1696           20        RL          60.0      7200   Pave   NaN       Reg   
1697           20        RL          74.0      7785   Pave   NaN       IR1   
1698           80        RL          65.0      8385   Pave   NaN       Reg   
1699           80        RL          74.0      9620   Pave   NaN       Reg   

     Land Contour Utilities Lot Config  ... Pool Area Pool QC  

In [5]:
# Train Test Split:
housing_data = housing_data[['SalePrice', 'Overall Qual', 'Kitchen AbvGr', 'Bedroom AbvGr', 'Bsmt Full Bath', 'Overall Cond', 'Full Bath', 'Half Bath', 'TotRms AbvGrd', 'Bldg Type', 'Bsmt Qual', 'Bsmt Unf SF', 'BsmtFin Type 1', 'Exter Qual', 'Foundation', 'Heating QC', 'Kitchen Qual', 'Lot Shape', 'Total Bsmt SF']]
test = test[['Overall Qual', 'Kitchen AbvGr', 'Bedroom AbvGr', 'Bsmt Full Bath', 'Overall Cond', 'Full Bath', 'Half Bath', 'TotRms AbvGrd', 'Bldg Type', 'Bsmt Qual', 'Bsmt Unf SF', 'BsmtFin Type 1', 'Exter Qual', 'Foundation', 'Heating QC', 'Kitchen Qual', 'Lot Shape', 'Total Bsmt SF']]
#housing_data = housing_data[['SalePrice', 'Overall Qual', 'Kitchen AbvGr', 'Bedroom AbvGr', 'Bsmt Full Bath', 'Overall Cond', 'Full Bath', 'Half Bath', 'TotRms AbvGrd']]
#test = test[['Overall Qual', 'Kitchen AbvGr', 'Bedroom AbvGr', 'Bsmt Full Bath', 'Overall Cond', 'Full Bath', 'Half Bath', 'TotRms AbvGrd']]
y = housing_data['SalePrice']
X = housing_data.loc[:, housing_data.columns != 'SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3) 
# Gradient Boosting: (I just experiment with different methods here)
params = {'n_estimators': 500, 'max_depth': 3, 'min_samples_split': 2,
          'learning_rate': 0.1, 'loss': 'ls'}
gdboost = ensemble.GradientBoostingRegressor(**params)
#reg = BaggingRegressor(n_estimators=10,max_samples=0.5, max_features=0.5)
#voting = VotingRegressor(estimators=[('gb', gdboost), ('bg', reg), ('las', lasso)])
gdboost.fit(X_train, y_train)
# Linear Regression:
linreg = LinearRegression()
lasso = Lasso()
ridge = Ridge()
elastic = ElasticNet()
#lasso.fit(X_train,y_train)
pred = gdboost.predict(X_test)
y_test = np.array(y_test)
#print(y_test.isna().sum()
print(pred)
print(y_test)
mse = mean_squared_error(pred, y_test)
rmse = sqrt(mse)
print(rmse)

[270333.42391135 222203.8181243  130532.31039627 143274.37907889
 140649.97882031 164868.70982324 157024.40117658 199471.40603734
 112056.65841832 274978.93667701 263915.25733494 205124.7076912
  83725.00347396  94763.92512073 221688.97658987 170258.55618336
 128526.37117729 228587.36874288 113287.40777548 344306.87564094
 162792.77891692 160378.51658703 342369.2715434   91073.94207418
 200516.3940211  172038.60131173  80577.93369997 238801.21347004
 365228.75744357 209352.26523881 176810.88235349 163051.84096354
 125241.7728654  227239.81709793 147641.32985943 148520.29585565
 189025.16330953 133066.86794451 143437.11298893 477983.08781511
 131969.11902937 252520.68605513 116728.40763059 103974.5084892
 135829.01278595 146529.95072876 144397.75189407 311743.90402818
 184176.06872315 140766.64612281 329977.93888292 175617.54749762
 184722.92706645 238387.01056802 182878.90600403 193854.95786272
 126711.56990377 143149.40156355 178705.88963787 167290.93707062
 189535.41192466  99404.021

In [6]:
# Test on the testing data:
test_pred = gdboost.predict(test)
print(test_pred)
print(len(test_pred))

[174900.35822876  68150.72822616 197400.85632044 ... 216137.55491907
 128526.48405203 226355.28607663]
1230
