In [None]:
# Competition description

'''
Ask a home buyer to describe their dream house, and they probably won't begin with the height of the basement ceiling 
or the proximity to an east-west railroad. But this playground competition's dataset proves that much more influences 
price negotiations than the number of bedrooms or a white-picket fence.

With 79 explanatory variables describing (almost) every aspect of residential homes in Ames, Iowa, this competition 
challenges you to predict the final price of each home.
'''

# Importing libraries
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder
from scipy import stats
from sklearn.preprocessing import StandardScaler
import sklearn.linear_model as linear_model
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer, r2_score
from sklearn.model_selection import KFold, cross_val_score
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, GradientBoostingRegressor
import xgboost as xgb
from sklearn.model_selection import GridSearchCV



# Ignore warnings: silence every warning for the rest of the notebook
import warnings
warnings.filterwarnings('ignore')


'''
Steps:

1. Collecting data
2. Cleaning data
3. Exploratory data analysis
4. Model building

'''

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Loading data

In [None]:
# Read the competition's train and test splits from the Kaggle input directory.
train, test = (
    pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv'),
    pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv'),
)

In [None]:
# Keep the target column of the training set under its own name.
SalePrice = train['SalePrice']

In [None]:
# Number of target values (one per training row).
SalePrice.shape

In [None]:
## Concatenating train and test so that cleaning and encoding are applied to both at once.
# ignore_index=True gives the combined frame a unique 0..n-1 index. Without it the row
# labels of `test` repeat those of `train`, which makes any label-based alignment
# ambiguous. Training rows still come first, so positional slicing is unaffected.

df = pd.concat((train, test), ignore_index=True)
print("Shape of df: ", df.shape)

In [None]:
# Preview the first ten rows of the combined frame.
df.head(10)

In [None]:
# (rows, columns) of the combined train + test frame.
df.shape

In [None]:
# List every column name of the combined frame.
df.columns

In [None]:
## Checking the number of numerical variables

# Sub-frame holding only the numeric-dtype columns of df.
numericalFeatures = df.select_dtypes(include=[np.number])
n_numerical = numericalFeatures.shape[1]
print(f"The number of numerical features is: {n_numerical}")

In [None]:
# Names of the numeric columns.
numericalFeatures.columns

In [None]:
## Checking the number of categorical variables

# Sub-frame holding every column of df that is NOT of a numeric dtype.
categoricalFeatures = df.select_dtypes(exclude=[np.number])
n_categorical = categoricalFeatures.shape[1]
print(f"The number of categorical features is: {n_categorical}")

In [None]:
# Names of the non-numeric (categorical) columns.
categoricalFeatures.columns

# Data distribution

In [None]:
## Checking data distribution only in the training set

plt.subplots(figsize=(12,9))
sns.distplot(train['SalePrice'], fit=stats.norm)

# Get the fitted parameters used by the function
(mu, sigma) = stats.norm.fit(train['SalePrice'])

# Legend with the fitted normal parameters. The label is a raw string with
# backslashes so matplotlib's mathtext renders the Greek letters mu and sigma.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f})'.format(mu, sigma)], loc='best')

# Probability plot
fig=plt.figure()
stats.probplot(train['SalePrice'], plot=plt)
plt.show()

# Correlation


In [None]:
# Pairwise correlation between all numeric columns (SalePrice included).
corr = numericalFeatures.corr()

sns.set(style="white")

# Generate a mask for the upper triangle.
# The builtin `bool` is used as dtype: the `np.bool` alias no longer exists in
# current NumPy releases.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(20, 10))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

In [None]:
## Features whose absolute correlation with SalePrice is greater than 0.5

# Take the absolute value BEFORE comparing, so that strong negative correlations
# are selected as well as strong positive ones.
top_feature = corr.index[corr['SalePrice'].abs() > 0.5]
plt.subplots(figsize=(12,8))
top_corr = df[top_feature].corr()
sns.heatmap(top_corr, annot=True)
plt.show()

In [None]:
catFeatures = categoricalFeatures.columns
# A missing category becomes its own level ('Missing') so those rows take part in the test.
train[catFeatures] = train[catFeatures].fillna('Missing')

# One-way ANOVA of SalePrice across the levels of each categorical feature.
anova_columns = {'feature': [], 'f': [], 'p': []}
for cat in catFeatures:
    # One array of sale prices per distinct level of this feature.
    prices_by_level = [
        train.loc[train[cat] == level, 'SalePrice'].values
        for level in train[cat].unique()
    ]
    f_stat, p_value = stats.f_oneway(*prices_by_level)
    anova_columns['feature'].append(cat)
    anova_columns['f'].append(f_stat)
    anova_columns['p'].append(p_value)

# Most significant (smallest p-value) features first.
anova = pd.DataFrame(anova_columns)[['feature', 'f', 'p']].sort_values('p')

In [None]:
# Show the ANOVA table, sorted by ascending p-value.
anova

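Null hypothesis (H0): the mean sale price is the same for every category of the feature, i.e. the feature makes no difference.

If P<0.05 we can reject H0.

The features Street, LandSlope and Utilities have P>0.05, which means we cannot reject H0: there is no statistically significant evidence that they make a difference in sale price.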

In [None]:
# Remove the target and the row identifier so that only predictors remain in df.
df = df.drop(columns=['SalePrice', 'Id'])

In [None]:
# Checking columns with null values: percentage of missing entries per column.
null_share = df.isnull().sum() / len(df) * 100
# Keep only columns that have missing values, the (up to) 30 worst first, rounded to 2 decimals.
nullValues = null_share[null_share != 0].sort_values(ascending=False).head(30).round(2)
missingData = pd.DataFrame({'Percente of null values': nullValues})
missingData.head(30)

In [None]:
# Shape of the predictor frame after dropping the target and the Id column.
df.shape

In [None]:
### Percentage of null values

f, ax = plt.subplots(figsize=(15, 12))
# The rotation is passed as a number: matplotlib accepts a float or the words
# 'vertical'/'horizontal', not a numeric string such as '90'.
plt.xticks(rotation=90)
sns.barplot(x=nullValues.index, y=nullValues)
plt.xlabel('Features', fontsize=15)
plt.ylabel('Percent of null values', fontsize=15)
plt.title('Percent null values by feature', fontsize=15)
plt.show()

In [None]:
## Label-encode the categorical features related to the quality of the house

# Every column is cast to str first, so missing values become the text 'nan' and
# receive their own integer code.
# NOTE(review): LabelEncoder numbers the labels in sorted order, so the codes are
# identifiers rather than a quality ranking — confirm that is acceptable here.
number = LabelEncoder()
qualityFeatures = [
    'Alley', 'LotShape', 'LandContour', 'Utilities', 'LandSlope',
    'ExterQual', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
    'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'KitchenQual',
    'Functional', 'FireplaceQu', 'GarageFinish', 'GarageQual',
    'GarageCond', 'PavedDrive', 'PoolQC',
]
# Each column is listed (and therefore encoded) exactly once.
for feature in qualityFeatures:
    df[feature] = number.fit_transform(df[feature].astype('str'))


In [None]:
# Number of training-set sales for every (year, month) pair, drawn as a bar chart.
sales_per_month = train.groupby(['YrSold', 'MoSold'])['Id'].count()
sales_per_month.plot.bar(figsize=(14, 4))
plt.title("Sale date")
plt.show()

In [None]:
# Conversion from numeric features to category features:
# the values of these three columns are turned into their string form.
for column in ('MSSubClass', 'MoSold', 'YrSold'):
    df[column] = df[column].map(str)

In [None]:
# Label-encode the three converted columns with the shared LabelEncoder instance.
for column in ['MSSubClass', 'MoSold', 'YrSold']:
    as_text = df[column].astype('str')
    df[column] = number.fit_transform(as_text)

In [None]:
# Columns that still contain at least one missing value.
df.columns[df.isnull().any()]

In [None]:
# Values are assigned back to the column (df[col] = ...) instead of calling
# fillna(inplace=True) on a selected column: the in-place form acts on an
# intermediate object and is not guaranteed to update df.

# Lot Frontage: a missing value is replaced with the median of the lot's neighborhood
df['LotFrontage'] = df.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))

# Garage Year Built, if missing we can set it to zero
df['GarageYrBlt'] = df['GarageYrBlt'].fillna(0)

# Masonry Veneer Area: a missing value is set to zero
df['MasVnrArea'] = df['MasVnrArea'].fillna(0)

In [None]:
# Re-check which columns still contain missing values.
df.columns[df.isnull().any()]

In [None]:
# Frequency of each Electrical category.
df.Electrical.value_counts()

In [None]:
# Fill missing Electrical values with the most frequent category. The result is
# assigned back to the column rather than using fillna(inplace=True) on a
# selected column, which is not guaranteed to update df.
df['Electrical'] = df['Electrical'].fillna(df['Electrical'].mode()[0])

In [None]:
# Frequency of each masonry veneer type.
df.MasVnrType.value_counts()

In [None]:
# First let's correct our assignment: a 'Missing' veneer type is treated as 'None'
df['MasVnrType'] = df['MasVnrType'].replace({'Missing': 'None'})

# Second, reconcile veneer type and veneer area
# A 'None' type with a real area is inconsistent: assign a concrete type instead
df.loc[(df.MasVnrType == 'None') & (df.MasVnrArea > 1), 'MasVnrType'] = 'BrkFace' # Most common
# 1 sq ft is basically 0: zero the AREA (the type correctly stays 'None').
# Writing 0 into MasVnrType here would put an integer into the text column.
df.loc[(df.MasVnrType == 'None') & (df.MasVnrArea == 1), 'MasVnrArea'] = 0

# Missing types are skipped here (they can never match ==) and are filled below.
for vnr_type in df.MasVnrType.dropna().unique():
    of_this_type = df.MasVnrType == vnr_type
    # so here we set a zero area equal to the mean area of the given veneer type
    df.loc[of_this_type & (df.MasVnrArea == 0), 'MasVnrArea'] = df.loc[of_this_type, 'MasVnrArea'].mean()

# Remaining missing types take the most frequent type. Assigned back to the column
# because fillna(inplace=True) on a selected column is not guaranteed to update df.
df['MasVnrType'] = df['MasVnrType'].fillna(df['MasVnrType'].mode()[0])

In [None]:
# Frequency of each garage type.
df.GarageType.value_counts()

In [None]:
# Fill missing GarageType values with the most frequent category (assigned back to
# the column; fillna(inplace=True) on a selected column may not update df).
df['GarageType'] = df['GarageType'].fillna(df['GarageType'].mode()[0])

In [None]:
# Frequency of each fence category.
df.Fence.value_counts()

In [None]:
# A missing Fence value becomes its own explicit category. The fill value is the
# string 'None' (text, like the column's other levels) instead of the integer 0,
# so the column does not end up mixing strings and numbers. Assigned back to the
# column because fillna(inplace=True) on a selected column may not update df.
df['Fence'] = df['Fence'].fillna('None')

In [None]:
# Frequency of each miscellaneous-feature category.
df.MiscFeature.value_counts()

In [None]:
# A missing MiscFeature value becomes its own explicit category. The fill value is
# the string 'None' (text, like the column's other levels) instead of the integer
# 0, so the column does not end up mixing strings and numbers. Assigned back to the
# column because fillna(inplace=True) on a selected column may not update df.
df['MiscFeature'] = df['MiscFeature'].fillna('None')

In [None]:
# Fill missing GarageArea values with the column mean (assigned back to the
# column; fillna(inplace=True) on a selected column may not update df).
df['GarageArea'] = df['GarageArea'].fillna(df['GarageArea'].mean())

In [None]:
# Frequency of each sale type.
df.SaleType.value_counts()

In [None]:
# Fill missing SaleType values with the most frequent category (assigned back to
# the column; fillna(inplace=True) on a selected column may not update df).
df['SaleType'] = df['SaleType'].fillna(df['SaleType'].mode()[0])

In [None]:
# Frequency of each garage capacity (number of cars).
df.GarageCars.value_counts()

In [None]:
# Fill missing GarageCars values with the most frequent value (assigned back to
# the column; fillna(inplace=True) on a selected column may not update df).
df['GarageCars'] = df['GarageCars'].fillna(df['GarageCars'].mode()[0])

In [None]:
# Frequency of each BsmtFinSF1 value.
df.BsmtFinSF1.value_counts()

In [None]:
# Fill the missing finished-basement areas with their column means (assigned back
# to the columns; fillna(inplace=True) on a selected column may not update df).
for column in ['BsmtFinSF1', 'BsmtFinSF2']:
    df[column] = df[column].fillna(df[column].mean())

In [None]:
# Frequency of each basement full-bathroom count.
df.BsmtFullBath.value_counts()

In [None]:
# Remaining gaps. Results are assigned back to each column because
# fillna(inplace=True) on a selected column is not guaranteed to update df.

# These columns take their most frequent value ...
modeFilled = ['BsmtFullBath', 'BsmtHalfBath', 'Exterior1st', 'Exterior2nd', 'BsmtUnfSF', 'MSZoning']
for column in modeFilled:
    df[column] = df[column].fillna(df[column].mode()[0])

# ... while the total basement area takes the column mean.
df['TotalBsmtSF'] = df['TotalBsmtSF'].fillna(df['TotalBsmtSF'].mean())

In [None]:
# Final check for columns that still contain missing values.
df.columns[df.isnull().any()]

In [None]:
# Calculating total square feet (area)

# Basement plus above-ground living area.
df['Total_SF'] = df['TotalBsmtSF'].add(df['GrLivArea'])
# First plus second floor.
df['TotalFloorSF'] = df['1stFlrSF'].add(df['2ndFlrSF'])
# Open, enclosed, three-season and screen porches together.
open_and_enclosed = df['OpenPorchSF'] + df['EnclosedPorch']
df['TotalPorchSF'] = open_and_enclosed + df['3SsnPorch'] + df['ScreenPorch']

In [None]:
# Now let's create some boolean features (Yes-No type), stored as 0/1 integers

# Flags that are 1 exactly when the corresponding area column is greater than zero.
area_flags = {
    'HasBasement': 'TotalBsmtSF',
    'HasGarage': 'GarageArea',
    'HasPorch': 'TotalPorchSF',
    'HasPool': 'PoolArea',
}
for flag, area_column in area_flags.items():
    df[flag] = df[area_column].gt(0).astype(np.int64)

# Remodel year differs from the build year.
df['WasRemodeled'] = df['YearRemodAdd'].ne(df['YearBuilt']).astype(np.int64)
# Built after the year 2000.
df['IsNew'] = df['YearBuilt'].gt(2000).astype(np.int64)
# Sale condition is anything other than 'Partial'.
df['WasCompleted'] = df['SaleCondition'].ne('Partial').astype(np.int64)

In [None]:
# Names of the engineered 0/1 indicator columns; used below to keep them out of the numeric feature list.
booleanFeatures = ['HasBasement','HasGarage','HasPorch','HasPool','WasRemodeled','IsNew','WasCompleted']

In [None]:
# Numeric feature names without the row id and the target, plus the categorical names.
numericalFeatures = numericalFeatures.drop(columns=['Id', 'SalePrice'])
numFeatures, catFeatures = numericalFeatures.columns, categoricalFeatures.columns

In [None]:
# Plain list of numeric feature names, leaving out any 0/1 indicator column.
numFeatures = [name for name in numFeatures if name not in booleanFeatures]

In [None]:
# Total Bathrooms: full baths count 1, half baths count 0.5, above ground and in the basement

above_ground_baths = df['FullBath'] + 0.5 * df['HalfBath']
df['TotalBathrooms'] = above_ground_baths + df['BsmtFullBath'] + 0.5 * df['BsmtHalfBath']

In [None]:
# Apply log(1 + x) to every numeric feature in one vectorized step.
# Whole columns are replaced (df[cols] = ...) rather than written through
# df.loc[:, col]: the latter tries to keep the column's existing dtype, which is
# deprecated when float results are written into an integer column.
# NOTE(review): numFeatures was derived before the label encoding, so it appears to
# include MSSubClass / MoSold / YrSold, whose values are now label codes — confirm
# that log-transforming those codes is intended.
df[numFeatures] = np.log1p(df[numFeatures])

In [None]:
# Log-transform the target. It is computed from the untouched training column
# rather than from SalePrice itself, so re-running this cell does not apply the
# logarithm a second time.
SalePrice = np.log1p(train['SalePrice'])

In [None]:
# One-hot encode the remaining categorical columns; the result replaces df.
df = pd.get_dummies(df).copy()

In [None]:
# Remember the final feature names; used later as the index of the Lasso coefficient table.
dfColumns = df.columns

In [None]:
# Preview the encoded feature frame.
df.head()

In [None]:
## Checking the distribution of the (log-transformed) target of the training set

plt.subplots(figsize=(12,9))
sns.distplot(SalePrice, fit=stats.norm)

# Get the fitted parameters used by the function
(mu, sigma) = stats.norm.fit(SalePrice)

# Legend with the fitted normal parameters. The label is a raw string with
# backslashes so matplotlib's mathtext renders the Greek letters mu and sigma.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f})'.format(mu, sigma)], loc='best')

# Probability plot
fig=plt.figure()
stats.probplot(SalePrice, plot=plt)
plt.show()

In [None]:
# Scaling the numeric features with StandardScaler (not a robust scaler).
# The scaler is fitted on df, i.e. on the training and test rows together.

scaler = StandardScaler()

# Overwrite the numeric columns with their scaled values.
df.loc[:, numFeatures] = scaler.fit_transform(df[numFeatures])

In [None]:
# The training rows come first in df, so split it back by position.
trainLen = len(train)
x_train, x_test = df.iloc[:trainLen], df.iloc[trainLen:]
y_train = SalePrice


# Sanity check: feature matrix shapes and the number of targets.
for size in (x_train.shape, x_test.shape, len(y_train)):
    print(size)

# Building models

In [None]:
def test_model(model, x_train, y_train):
    """Cross-validate `model` and return its mean R^2 score.

    Uses 3 shuffled folds with a fixed random state (45), so repeated calls
    evaluate on the same splits.

    Parameters:
        model: estimator passed to cross_val_score.
        x_train: feature matrix.
        y_train: target values.

    Returns:
        A one-element list holding the mean R^2 over the folds.
    """
    folds = KFold(n_splits=3, shuffle=True, random_state=45)
    fold_scores = cross_val_score(
        model, x_train, y_train, cv=folds, scoring=make_scorer(r2_score)
    )
    return [fold_scores.mean()]

In [None]:
def rsme(model, x, y):
    """Root mean squared error of `model` for each fold of a 10-fold cross-validation.

    Parameters:
        model: estimator passed to cross_val_score.
        x: feature matrix.
        y: target values.

    Returns:
        Array with one RMSE value per fold.
    """
    # The scorer reports the NEGATIVE mean squared error, hence the sign flip.
    neg_mse_per_fold = cross_val_score(model, x, y, scoring='neg_mean_squared_error', cv=10)
    return np.sqrt(-neg_mse_per_fold)

In [None]:
## Tuning parameters: search the Lasso regularisation strength (alpha) in two passes

# Pass 1: coarse grid spanning several orders of magnitude.
coarse_alphas = [0.0001, 0.001, 0.01, 1., 5., 10., 25.]
param_grid = {'alpha': coarse_alphas, 'max_iter': [50000]}
coarse_search = GridSearchCV(Lasso(), cv=5, param_grid=param_grid, scoring='neg_mean_squared_error')
coarse_search.fit(x_train, y_train)
alpha = coarse_search.best_params_['alpha']

# Pass 2 (home in): 50% to 145% of the best coarse alpha, in 5% steps.
fine_alphas = [pct / 100. * alpha for pct in range(50, 150, 5)]
param_grid = {'alpha': fine_alphas, 'max_iter': [50000]}
fine_search = GridSearchCV(Lasso(), cv=5, param_grid=param_grid, scoring='neg_mean_squared_error')
fine_search.fit(x_train, y_train)
alpha = fine_search.best_params_['alpha']
# Keep the estimator refitted with the best alpha.
lasso = fine_search.best_estimator_

mean_rsme = rsme(lasso, x_train, y_train).mean()
print('Lasso -> Train RSME: {:,.5f}| alpha {:,.5f}'.format(mean_rsme, alpha))

In [None]:
# Table of Lasso coefficients, one row per feature, with sign and magnitude.
coefs = pd.DataFrame({'coefs':lasso.coef_,'Positive':lasso.coef_>0}, index=dfColumns)
coefs['coefs_abs'] = np.abs(coefs.coefs)
print('Lasso dropped {} of {} features.'.format(sum(coefs.coefs==0), coefs.shape[0]))

# The 30 features with the largest absolute coefficient.
top_coefs = coefs.sort_values('coefs_abs', ascending=False).head(30)
plt.figure(figsize=(8,10))
# x / y are passed by keyword: current seaborn releases no longer accept them positionally.
sns.barplot(x=top_coefs.coefs_abs, y=top_coefs.index, orient='h', hue=top_coefs.Positive)
# Call plt.title(...). Assigning to it (plt.title = ...) would replace the function
# with a string, leave the figure untitled and break every later plt.title call.
plt.title('Lasso Regression: Top Features')
plt.xlabel('Absolute Coeficient')
plt.show()

In [None]:
# Linear Regression

LR = LinearRegression()

# Mean R^2 (3-fold) and per-fold RMSE (10-fold) on the training data.
acc_LR = test_model(LR, x_train, y_train)
LR_rsme = rsme(LR, x_train, y_train)

print(f'Score: {acc_LR[0]:.5f}')
print(f'RSME: {LR_rsme.mean():.5f}')

In [None]:
# Support Vector Regressor (RBF kernel)

svr_reg = SVR(kernel='rbf')

# Mean R^2 (3-fold) and per-fold RMSE (10-fold) on the training data.
acc_SVR = test_model(svr_reg, x_train, y_train)
svr_rsme = rsme(svr_reg, x_train, y_train)

print(f'Score: {acc_SVR[0]:.5f}')
print(f'RSME: {svr_rsme.mean():.5f}')

In [None]:
# Decision Tree (fixed random state for reproducible splits)
dt_reg = DecisionTreeRegressor(random_state=21)

# Mean R^2 (3-fold) and per-fold RMSE (10-fold) on the training data.
acc_tree = test_model(dt_reg, x_train, y_train)
dt_rsme = rsme(dt_reg, x_train, y_train)

print(f'Score: {acc_tree[0]:.5f}')
print(f'RSME: {dt_rsme.mean():.5f}')

In [None]:
# Random Forest with 1000 trees
rf_reg = RandomForestRegressor(n_estimators=1000, random_state=51)

# Mean R^2 (3-fold) and per-fold RMSE (10-fold) on the training data.
acc_rf = test_model(rf_reg, x_train, y_train)
rf_rsme = rsme(rf_reg, x_train, y_train)

print(f'Score: {acc_rf[0]:.5f}')
print(f'RSME: {rf_rsme.mean():.5f}')

In [None]:
# Bagging Regressor with 1000 base estimators
br_reg = BaggingRegressor(n_estimators=1000, random_state=51)

# Mean R^2 (3-fold) and per-fold RMSE (10-fold) on the training data.
acc_br = test_model(br_reg, x_train, y_train)
br_rsme = rsme(br_reg, x_train, y_train)

print(f'Score: {acc_br[0]:.5f}')
print(f'RSME: {br_rsme.mean():.5f}')

In [None]:
# Gradient Boosting Regressor
# The loss is left at the estimator's default, which is the squared-error loss
# that used to be spelled 'ls'. Current scikit-learn releases reject the old
# 'ls' spelling, while the default works across versions.
gbr_reg = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.1, random_state=51)

# Mean R^2 (3-fold) and per-fold RMSE (10-fold) on the training data.
acc_gbr = test_model(gbr_reg, x_train, y_train)

gbr_rsme = rsme(gbr_reg, x_train, y_train)
print('Score: {:.5f}'.format((acc_gbr[0])))
print('RSME: {:.5f}'.format(gbr_rsme.mean()))

In [None]:
# XGBoost

# `verbosity=0` and `random_state=42` are the current names of the options that
# used to be passed as `silent=1` and `seed=42`.
xgb_reg = xgb.XGBRegressor(colsample_bytree=0.2,
                        gamma=0.0,
                        learning_rate=0.05,
                        max_depth=6,
                        min_child_weight=1.5,
                        n_estimators=7200,
                        reg_alpha=0.9,
                        reg_lambda=0.6,
                        subsample=0.2,
                        random_state=42,
                        verbosity=0)

# NOTE(review): unlike the other models, XGBoost is evaluated only on the 30
# features with the largest Lasso coefficients (top_coefs.index) — confirm this
# is intentional, since it makes the comparison below uneven.
acc_xgb = test_model(xgb_reg,x_train[top_coefs.index], y_train)
xgb_rsme = rsme(xgb_reg, x_train[top_coefs.index], y_train)

print('Score: {:.5f}'.format((acc_xgb[0])))
print('RSME: {:.5f}'.format(xgb_rsme.mean()))

In [None]:
# Summary table of all models.
# Each *_rsme variable holds one RMSE per cross-validation fold. The table reports
# the MEAN over the folds, the same figure printed for each model above, rather
# than the value of the first fold only.
fold_rsme_by_model = [LR_rsme, svr_rsme, dt_rsme, rf_rsme, br_rsme, gbr_rsme, xgb_rsme]

results = pd.DataFrame({
    'Model': ['Linear Regression', 'Support Vector Regressor', 
              'Decision Tree', 'Random Forest', 'Bagging Regressor', 'Gradient Boosting Regressor ','XGBoost'],
    'Score': [acc_LR[0], acc_SVR[0], acc_tree[0], acc_rf[0], acc_br[0], acc_gbr[0], acc_xgb[0]],
    'RSME': [fold_rsme.mean() for fold_rsme in fold_rsme_by_model]
})

# Best (lowest error) model first.
result = results.sort_values(by='RSME', ascending=True)
result = result.set_index('Model')
display(result.head(8))