# House Prices: Advanced Regression Techniques

Predict sales prices and practice feature engineering, RFs, and gradient boosting

https://www.kaggle.com/c/house-prices-advanced-regression-techniques/

### Importing the Libraries

In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from mlxtend.regressor import StackingCVRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import ExtraTreesRegressor
from mlxtend.regressor import StackingCVRegressor

from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

### Getting the Data

In [None]:
# Locations of the Kaggle competition splits, relative to the working directory.
TRAIN_PATH = './train.csv'
TEST_PATH = './test.csv'

# Read both splits into DataFrames (comma-separated files).
df_train = pd.read_csv(TRAIN_PATH, sep=',')
df_test = pd.read_csv(TEST_PATH, sep=',')

### Data Exploration/Analysis

In [None]:
# Column dtypes and non-null counts of the freshly loaded training frame.
df_train.info()

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
df_train.describe()

### Missing Data

In [None]:
# Build a per-column table of missing values: absolute count and percentage,
# ordered so the most affected columns come first.
null_mask = df_train.isnull()

total = null_mask.sum().sort_values(ascending=False)
percent_1 = null_mask.sum() / null_mask.count() * 100
percent_2 = percent_1.round(1).sort_values(ascending=False)

missing_data = pd.concat(
    [total, percent_2],
    axis=1,
    keys=['Total', '%'],
)
missing_data

Visualizing the null values using HeatMaps

In [None]:
# One row per sample, one column per feature; coloured cells mark nulls.
sns.heatmap(
    df_train.isnull(),
    cmap='Blues',
    cbar=False,
    yticklabels=False,
)
plt.xticks(rotation=90)
plt.show()

Correlation heatmap of dataset

In [None]:
def correlation_heatmap(df, figsize=(7, 5)):
    """Plot an annotated heatmap of the pairwise Pearson correlations.

    Only numeric columns are correlated. Text (object) columns are left out
    explicitly, because ``DataFrame.corr`` raises on non-numeric data in
    pandas >= 2.0 instead of silently dropping it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns are correlated.
    figsize : tuple of (float, float), default (7, 5)
        Figure size in inches; enlarge it for frames with many columns so the
        annotations stay readable.
    """
    numeric_df = df.select_dtypes(include=[np.number])

    _, ax = plt.subplots(figsize=figsize)
    colormap = sns.diverging_palette(220, 10, as_cmap=True)

    sns.heatmap(
        numeric_df.corr(),
        cmap=colormap,
        square=True,
        cbar_kws={'shrink': .9},
        ax=ax,
        annot=True,
        linewidths=0.1, vmax=1.0, linecolor='white',
        annot_kws={'fontsize': 12},
    )

    plt.title('Pearson Correlation of Features', y=1.05, size=15)
    plt.xticks(rotation=90)
    plt.show()

#correlation_heatmap(df_train)

Fill the null values of the non-text (numeric) columns with the column mean

In [None]:
def appty_mean(df):
    """Fill missing values in numeric columns with the column mean.

    Only numeric columns are considered. Categorical, datetime and text
    columns are left untouched, since a mean is undefined for them (and
    requesting one raises for categoricals).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient rebinding by the caller.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        missing = df[col].isnull()
        # Skip complete columns so they are never written to at all.
        if missing.any():
            # Series.mean() ignores NaN, so this is the mean of observed values.
            df.loc[missing, col] = df[col].mean()
    return df

# Impute the training frame; appty_mean returns the frame it was given.
df_train = appty_mean(df_train)

Fill the null values of the text (object) columns with the column mode (most frequent value)

In [None]:
def appty_mode(df):
    """Fill missing values in text (object) columns with the column mode.

    When several values tie for most frequent, the first one returned by
    ``Series.mode()`` (which returns the modes in sorted order) is used.
    Columns without missing values, or without any observed value to take a
    mode from, are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient rebinding by the caller.
    """
    text_cols = df.select_dtypes(['object']).columns
    for col in text_cols:
        missing = df[col].isnull()
        modes = df[col].mode()
        # Nothing to fill, or nothing to fill with (column is entirely null).
        if not missing.any() or modes.empty:
            continue
        # mode() yields every tied value; assigning that whole array only
        # works when its length happens to match the number of null rows,
        # so a single scalar is used instead.
        df.loc[missing, col] = modes.iloc[0]
    return df

# Impute the training frame's text columns; the same frame is returned.
df_train = appty_mode(df_train)

Visualizing the null values again, after imputation, using a heatmap

In [None]:
# Same null map as before, drawn from the current state of df_train.
sns.heatmap(
    df_train.isnull(),
    cmap='Blues',
    cbar=False,
    yticklabels=False,
)
plt.xticks(rotation=90)
plt.show()

Optionally drop any remaining rows that still contain null values (this step is currently disabled: the code below is commented out)

In [None]:
#df_train.dropna(inplace=True)

### Dealing with categorical column, using LabelEncoder

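Each text (object) column is replaced by integer codes produced by a `LabelEncoder` fitted on that column.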

In [None]:
def to_categorial_columns(df):
    """Replace every text (object) column with integer label codes.

    A fresh ``LabelEncoder`` is fitted on each column, so the codes depend
    only on the values present in ``df``.

    NOTE(review): because the encoders are fitted per call, the same category
    can receive different codes in different frames (e.g. train vs. test)
    whenever their sets of values differ. Confirm this is acceptable, or
    share the fitted encoders between frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with former object columns holding integers.
    """
    text_cols = df.select_dtypes(['object']).columns
    for col in text_cols:
        # Rebind the column rather than using `df.loc[:, col] = codes`: since
        # pandas 2.0 the .loc form writes into the existing object array, so
        # the column would keep dtype object instead of becoming integer.
        df[col] = LabelEncoder().fit_transform(df[col])
    return df

# Encode the training frame's text columns; the same frame is returned.
df_train = to_categorial_columns(df_train)

In [None]:
# Re-inspect dtypes and non-null counts after imputation and encoding.
df_train.info()

In [None]:
# Preview the encoded training frame.
df_train.head()

### Removing columns

In [None]:
# NOTE: this cell is not re-runnable without reloading df_train, because
# 'SalePrice' is dropped from df_train right after it is read.

# Target: natural log of the sale price.
y_train = np.log(df_train['SalePrice'])

# Features: everything except the row identifier and the target.
df_train = df_train.drop(columns=['Id', 'SalePrice'])

# Scale every feature to [0, 1]; the fitted scaler is reused for the test set.
scaler = MinMaxScaler(feature_range=(0, 1))
x_train = scaler.fit_transform(df_train.values)
df_train.head()

### Test Data

In [None]:
# Run the test split through the same three steps as the training split:
# mean imputation, then mode imputation, then label encoding.
# NOTE(review): each helper derives its statistics/encodings from the frame it
# is given, i.e. from the test data itself rather than from the training data.
df_test = to_categorial_columns(appty_mode(appty_mean(df_test)))
df_test.info()

In [None]:
# Preview the preprocessed test frame.
df_test.head()

In [None]:
# Keep the integer row identifiers for the submission file, then remove them
# from the features so the columns match what the scaler was fitted on.
Id = df_test['Id'].values.astype(int)
df_test = df_test.drop(columns=['Id'])

# Apply the scaling learned on the training features (no refit).
x_test = scaler.transform(df_test.values)

In [None]:
# Test-set identifiers, later written to the submission file.
Id

### Regression

Initial evaluation of several candidate models with 5-fold cross-validation

In [None]:
X, y = x_train, y_train
RANDOM_SEED = 42

# Candidate regressors. The boosted models are seeded like the random forest
# so that repeated runs give the same cross-validation scores.
lasso = Lasso()
xgbr = XGBRegressor(random_state=RANDOM_SEED)
gbr = GradientBoostingRegressor(random_state=RANDOM_SEED)
svr_linear = SVR(kernel='linear')
knnR = KNeighborsRegressor(n_neighbors=20, n_jobs=-1)
rf = RandomForestRegressor(n_estimators=900, random_state=RANDOM_SEED)
svr_rbf = SVR(kernel='rbf', gamma='auto', tol=0.001, C=100.0, max_iter=-1)
# max_features=1.0 (consider every feature at each split) is what 'auto' meant
# for regressors; the 'auto' spelling is rejected by recent scikit-learn.
et = ExtraTreesRegressor(n_estimators=950, max_features=1.0, max_leaf_nodes=None,
                         n_jobs=-1, random_state=0, verbose=0)
# The `normalize` argument no longer exists in recent scikit-learn releases;
# x_train has already been min-max scaled before this cell.
lr = LinearRegression(fit_intercept=True, copy_X=True, n_jobs=-1)

# Alternative stack kept for reference:
#reg = StackingCVRegressor(regressors=[xgbr , gbr, lr, et],
#                          random_state=RANDOM_SEED,
#                          meta_regressor=lr)

# Stacked ensemble. The original referenced an undefined name `svr`; the RBF
# SVR defined above is used as the SVR base learner.
reg = StackingCVRegressor(regressors=(svr_rbf, lasso, rf),
                          random_state=RANDOM_SEED,
                          meta_regressor=lasso)

# Materialised as a list so the pairs can be iterated more than once.
alg = list(zip([lasso, gbr, svr_linear, knnR, rf, svr_rbf, et, reg],
               ['Lasso', 'GBR', 'SVM Linear', 'KNN', 'RF', 'SVM RBF', 'ET', 'Stack']))

# Scores are negated MSE on the log target: closer to zero is better.
print('5-fold cross validation scores:\n')
for clf, label in alg:
    scores = cross_val_score(clf, X, y, cv=5, scoring='neg_mean_squared_error')
    print('Neg. MSE Score: {:.5f} (+/- {:.5f}) [{}]'.format(scores.mean(), scores.std(), label))

### Training and Predicting

In [None]:
#model = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=.1)
#model.fit(x_train, y_train)

# cross_val_score only fits clones of the estimator it is given, so `reg`
# itself is still unfitted at this point. Train it on the full training set
# before predicting.
reg.fit(x_train, y_train)

# The target was log(SalePrice), so exponentiate to get prices back.
y_pred = np.exp(reg.predict(x_test))
y_pred

### Writing the predictions to a csv file

In [None]:
# Two-column submission table: one predicted price per test-set Id.
dict_ = {'Id': Id, 'SalePrice': y_pred}
df = pd.DataFrame(dict_)

# Write without the positional index so the file has exactly the two columns.
df.to_csv('result.csv', index=False)

In [None]:
# Preview the first rows of the submission table.
df.head()