This problem is a kaggle competition:
Ask a home buyer to describe their dream house, and they probably won't begin with the height of the basement ceiling or the proximity to an east-west railroad. But this playground competition's dataset proves that much more influences price negotiations than the number of bedrooms or a white-picket fence.

With 79 explanatory variables describing (almost) every aspect of residential homes in Ames, Iowa, this competition challenges aim to predict the final price of each home.

Random forest regression, linear regression, lasso regression, and xgboost are performed and results are written in seprate files.

In [13]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import r2_score
from sklearn import metrics

# Exploring Data

In [26]:
# Read data
train=pd.read_csv("train.csv")
test=pd.read_csv("test.csv")

print(train.shape)
print(test.shape)

y_train=train["SalePrice"]
# Labelindex=train.columns.get_loc("SalePrice")
x_train=train.drop(["SalePrice"], axis=1)

#Keep column Id for output
IdTest=pd.DataFrame(test['Id'])
IdTest.columns =['Id']

(1460, 81)
(1459, 80)


Replace missing values:

In [27]:
train.dtypes
obj_df = train.select_dtypes(include=['object']).copy()
obj_df.head()
# obj_df[obj_df.isnull().any(axis=1)]

 
imp = SimpleImputer(strategy="most_frequent")
x_train=imp.fit_transform(x_train)
test=imp.fit_transform(test)

Apply PCA for feature reduction:

In [21]:
from sklearn.decomposition import PCA
pca = PCA(n_components=50)
pca.fit(x_train)
x_train = pca.transform(x_train) # transformed data
x_train

array([[-7.11556429e+02, -2.14000529e+02,  2.16212543e+02, ...,
        -6.23138176e-02,  2.68360578e-01, -1.52067532e-01],
       [-7.34023594e+02, -5.35635115e+00,  3.24440568e+02, ...,
         5.18993445e-01,  3.63570209e-01,  1.26484763e-01],
       [-7.32872338e+02,  7.71530495e+01,  2.59561309e+01, ...,
        -1.10588030e-02,  1.79512793e-02, -1.87074364e-01],
       ...,
       [ 7.12069818e+02,  2.32046528e+02, -3.33854895e+02, ...,
        -4.00526354e-01, -6.51314704e-01,  7.05793174e-01],
       [ 7.43273411e+02, -1.54748721e+02,  1.85502683e+02, ...,
        -1.02672634e+00,  6.18045574e-01, -1.51936169e+00],
       [ 7.20347998e+02,  9.44045822e+01,  3.73019641e+02, ...,
         2.71224953e-01, -6.26060001e-01,  4.29664427e-01]])

Change Categorical Values to numerical:

In [28]:
oe_style = OrdinalEncoder()
x_train = oe_style.fit_transform(x_train)
test=oe_style.fit_transform(test)

# Building Model

Random Forest Regression

In [29]:
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
rf.fit(x_train, y_train)

RandomForestRegressor(n_estimators=1000, random_state=42)

predicting values and Making outputs

In [30]:
predictions=rf.predict(test)
xs=pd.DataFrame(predictions,columns=['SalePrice'])
x=IdTest['Id']
y= xs["SalePrice"]
z=pd.concat([x,y], axis=1, ignore_index=True)
z.columns =['Id','SalePrice']
z.to_csv('RandomForest.csv', index=False) 

# HyperParameter Tuning

In [31]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(x_train, y_train)
predictions=rf_random.predict(test)
xs=pd.DataFrame(predictions,columns=['SalePrice'])
x=IdTest['Id']
y= xs["SalePrice"]
z=pd.concat([x,y], axis=1, ignore_index=True)
z.columns =['Id','SalePrice']
z.to_csv('RandomForestOptimal.csv', index=False) 

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}
Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [32]:
rf_random.best_params_

{'n_estimators': 800,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 100,
 'bootstrap': False}

Using other regression models: linear regression

In [33]:
from sklearn import datasets, linear_model
reg = linear_model.LinearRegression()
reg.fit(x_train, y_train)
predictions=reg.predict(test)
xs=pd.DataFrame(predictions,columns=['SalePrice'])
x=IdTest['Id']
y= xs["SalePrice"]
z=pd.concat([x,y], axis=1, ignore_index=True)
z.columns =['Id','SalePrice']
z.to_csv('linearResult1.csv', index=False) 

Xgboost

In [34]:
import xgboost
print(xgboost.__version__)
from xgboost import XGBRegressor

# create an xgboost regression model
model = XGBRegressor(n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8)
model.fit(x_train, y_train)
predictions=model.predict(test)
xs=pd.DataFrame(predictions,columns=['SalePrice'])
Id=IdTest['Id']
salePrice= xs["SalePrice"]
combine=pd.concat([Id,salePrice], axis=1, ignore_index=True)
combine.columns =['Id','SalePrice']
combine.to_csv('XgboostResult2.csv', index=False) 

1.0.1


Using other regression models: lasso

In [36]:
lasso = linear_model.Lasso()
lasso.fit(x_train, y_train)
predictions=lasso.predict(test)
xs=pd.DataFrame(predictions,columns=['SalePrice'])
Id=IdTest['Id']
salePrice= xs["SalePrice"]
combine=pd.concat([Id,salePrice], axis=1, ignore_index=True)
combine.columns =['Id','SalePrice']
combine.to_csv('lassoResult.csv', index=False) 

In [None]:
# Evaluates train eror
print("R_Squared_test_Error:", r2_score(y_test, y_pred))
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, predictions))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))