In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import style
style.use("ggplot")

In [None]:
# Load the house-prices training CSV into a DataFrame (path is relative to the working directory).
data = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# Summarise the frame's columns (dtypes and non-null counts).
data.info()

In [None]:
# Print the dataset's description file so the column meanings are visible inline.
# The context manager guarantees the file handle is closed, and end="" avoids
# emitting a blank line after every line (each line already ends in a newline).
with open("../input/house-prices-advanced-regression-techniques/data_description.txt") as f:
    for line in f:
        print(line, end="")

In [None]:
def plot_missing(data, height=15):
    """Draw a bar chart of the percentage of missing values per column.

    Columns appear in ascending order of their missing-value count.

    Parameters
    ----------
    data : DataFrame whose columns are inspected.
    height : figure height passed to ``figsize`` (the width is fixed at 20).
    """
    n_rows = data.shape[0]
    missing_counts = data.isna().sum()
    ordered_counts = missing_counts.sort_values(ascending=True)
    missing_pct = ordered_counts / n_rows * 100
    plt.figure(figsize=(20, height))
    missing_pct.plot(kind="bar")
    plt.show()

In [None]:
# Missing-value percentages per column, before any column has been dropped.
plot_missing(data)

In [None]:
# Drop the "Id" column together with every column whose share of missing
# values exceeds 10%.
missing_share = data.isna().sum() / data.shape[0]
to_drop = ["Id"] + [col for col in data.columns if missing_share[col] > 0.1]
# errors="ignore" keeps this cell re-runnable: on a second run "Id" is already
# gone and a strict drop would raise KeyError.
data = data.drop(columns=to_drop, errors="ignore")

In [None]:
# Missing-value percentages again, now that the sparse columns have been removed.
plot_missing(data)

In [None]:
# Impute every column that still has gaps: numeric columns get their median,
# all other columns get their most frequent value.
for col in data.columns:
    if data[col].isna().any():
        # Choose the strategy from the dtype explicitly instead of relying on
        # an exception from median() to detect non-numeric columns.
        if pd.api.types.is_numeric_dtype(data[col]):
            fill_value = data[col].median()
        else:
            fill_value = data[col].mode()[0]
        # Assign the result back: an in-place fillna on the intermediate
        # data[col] object is chained assignment and may leave `data` untouched.
        data[col] = data[col].fillna(fill_value)

In [None]:
# Total number of missing cells left in the frame after imputation.
data.isna().sum().sum()

In [None]:
# One figure per column: a count plot for object-typed columns, a histogram otherwise.
for col in data.columns:
    is_object_column = data[col].dtype == object
    draw = sns.countplot if is_object_column else sns.histplot
    plt.figure(figsize=(20, 9))
    draw(x=col, data=data)
    plt.show()

In [None]:
# Partition the column names into numeric and non-numeric lists.
# is_numeric_dtype covers integer as well as float columns; comparing the
# dtype to `float` alone would place every integer column in `cat`.
cat = []
numer = []
for col in data.columns:
    if pd.api.types.is_numeric_dtype(data[col]):
        numer.append(col)
    else:
        cat.append(col)

In [None]:
# Target is the SalePrice column; the features are every other column.
y = data["SalePrice"]
x = data.drop(columns=["SalePrice"])

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 15% of the rows for testing. A fixed random_state makes the split,
# and therefore every score reported further down, reproducible across runs.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.15, random_state=11)

In [None]:
from category_encoders import CountEncoder
# The encoder is fitted on the training split only and then applied unchanged to the test split.
# NOTE(review): normalize=True presumably yields relative frequencies rather than raw counts — confirm against the category_encoders docs.
enc = CountEncoder(normalize=True)
xtrain = pd.DataFrame(enc.fit_transform(xtrain), columns=xtrain.columns)
# xtrain.columns is reused here so both frames carry the same column labels.
xtest = pd.DataFrame(enc.transform(xtest), columns=xtrain.columns)

# Baseline Models

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, max_error

In [None]:
def regression_report(ytrue, ypred):
    """Print R2, mean absolute error and max error for a set of predictions.

    Parameters
    ----------
    ytrue : observed target values.
    ypred : predicted target values, aligned with ``ytrue``.
    """
    metrics = (
        ("R2", r2_score),
        ("MAE", mean_absolute_error),
        ("Max Error", max_error),
    )
    for label, metric in metrics:
        print(f"{label}: {metric(ytrue, ypred): .2f}")

In [None]:
def train_and_evaluate(model, xtr, ytr, xts, yts):
    """Fit ``model`` on the training data and print a report for both splits.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``.
    xtr, ytr : training features and target.
    xts, yts : test features and target.
    """
    model.fit(xtr, ytr)
    # Both prediction sets are computed before anything is printed.
    sections = (
        ("Training Results:\n", ytr, model.predict(xtr)),
        ("\n\nTesting Results:\n", yts, model.predict(xts)),
    )
    for heading, actual, predicted in sections:
        print(heading)
        regression_report(actual, predicted)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
# Two reference models evaluated on the encoded features.
lin_baseline = LinearRegression()
tree_baseline = DecisionTreeRegressor(max_depth=25, random_state=11)
for heading, baseline in (("Linear Regression:\n\n", lin_baseline),
                          ("\n\n\nDecision Tree:\n\n", tree_baseline)):
    print(heading)
    train_and_evaluate(baseline, xtrain, ytrain, xtest, ytest)

# Feature Engineering

## Feature Extraction

### 1) Manual Feature Extraction

In [None]:
# Feature names available after encoding.
list(xtrain.columns)

In [None]:
# Inspect the OverallQual column of the training features.
xtrain['OverallQual']

In [None]:
# Inspect the OverallCond column of the training features.
xtrain['OverallCond']

In [None]:
# Independent working copies for the manual feature-engineering experiment.
# Deep copies (the default) are used because the copies are mutated below
# (rows dropped, columns rescaled, a column added); a shallow copy shares its
# underlying data with xtrain/xtest/ytrain/ytest.
xtr = xtrain.copy()
ytr = ytrain.copy()
xts = xtest.copy()
yts = ytest.copy()

In [None]:
# Training target plotted against OverallQual.
plt.scatter(xtr['OverallQual'], ytrain)

In [None]:
# Training target plotted against OverallCond.
plt.scatter(xtr['OverallCond'], ytrain)

In [None]:
# Training target plotted against LotArea.
plt.scatter(xtr["LotArea"], ytrain)

In [None]:
# Row labels to discard: lots larger than 25000 and targets above 600000.
large_lot_rows = xtr.index[xtr["LotArea"] > 25000]
high_price_rows = ytr.index[ytr > 600000]
to_drop = list(large_lot_rows) + list(high_price_rows)

In [None]:
# Show the row labels selected for removal.
to_drop

In [None]:
# Remove the selected rows from the working training features and target alike.
for frame in (xtr, ytr):
    frame.drop(index=to_drop, inplace=True)

In [None]:
# LotArea against the target once the selected rows have been removed.
plt.scatter(xtr["LotArea"], ytr)

In [None]:
# Rescale the three features as (value - min) / std, with min and std measured
# on the training frame only. The identical statistics are then applied to the
# test frame, so anything derived from these columns (and any model trained on
# them) sees train and test on the same scale.
for col in ["OverallQual", "OverallCond", "LotArea"]:
    col_min = xtr[col].min()
    col_std = xtr[col].std()
    xtr[col] = (xtr[col] - col_min) / col_std
    xts[col] = (xts[col] - col_min) / col_std

In [None]:
# OverallQual against the target after rescaling.
plt.scatter(xtr['OverallQual'], ytr)

In [None]:
# OverallCond against the target after rescaling.
plt.scatter(xtr['OverallCond'], ytr)

In [None]:
# LotArea against the target after rescaling.
plt.scatter(xtr["LotArea"], ytr)

In [None]:
# Product of the three features plotted against the target.
interaction = xtr["LotArea"] * xtr["OverallCond"] * xtr["OverallQual"]
plt.scatter(interaction, ytr)
plt.show()

In [None]:
# Store the three-way product as a new "extrafeat" column on both working frames.
for frame in (xtr, xts):
    frame["extrafeat"] = frame["LotArea"] * frame["OverallCond"] * frame["OverallQual"]

In [None]:
# Evaluate a decision tree on the working frames that carry "extrafeat".
# random_state fixes the estimator's internal randomness so the printed scores
# are repeatable (same seed as the baseline tree).
train_and_evaluate(DecisionTreeRegressor(random_state=11), xtr, ytr, xts, yts)

### 2) Feature Extraction Algorithms

#### a) PCA

In [None]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
# Fit the scaler on the training features only, then apply that same fitted
# transform to the test features; re-fitting on the test set would scale the
# two splits with different statistics and leak test information.
# index= keeps the original row labels so the frames stay aligned with
# ytrain / ytest (a bare ndarray would otherwise get a fresh 0..n-1 index).
xtrain = pd.DataFrame(ss.fit_transform(xtrain), columns=xtrain.columns, index=xtrain.index)
xtest = pd.DataFrame(ss.transform(xtest), columns=xtest.columns, index=xtest.index)

In [None]:
from sklearn.decomposition import PCA
# No component limit is passed here; the projections are truncated in a later cell.
pca = PCA()
# Fit on the training features, then project the test features with the same fitted PCA.
xtrain_pca = pca.fit_transform(xtrain)
xtest_pca = pca.transform(xtest)

In [None]:
# Display the projected training data as a frame.
pd.DataFrame(xtrain_pca)

In [None]:
# Bar chart of the explained-variance ratio of each principal component.
plt.figure(figsize=(20,10))
# Derive the number of bars from the fitted PCA instead of hard-coding it, so
# the chart still works when the number of feature columns changes.
n_components = len(pca.explained_variance_ratio_)
plt.bar(x=range(n_components), height=pca.explained_variance_ratio_)
plt.show()

In [None]:
# Keep only the first 30 columns (components) of each projection.
xtrain_pca = xtrain_pca[:, :30]
xtest_pca = xtest_pca[:, :30]

In [None]:
# Linear regression on the truncated PCA projection.
train_and_evaluate(LinearRegression(), xtrain_pca, ytrain, xtest_pca, ytest)

In [None]:
# Decision tree on the truncated PCA projection; random_state makes the
# printed scores repeatable (same seed as the baseline tree).
train_and_evaluate(DecisionTreeRegressor(max_depth=20, random_state=11), xtrain_pca, ytrain, xtest_pca, ytest)

In [None]:
# Append the first PCA component as an extra feature column on both frames.
xtrain["Main_PCA_Component"] = xtrain_pca[:, 0]
xtest["Main_PCA_Component"] = xtest_pca[:, 0]

In [None]:
# Linear regression on the features plus the appended PCA component.
train_and_evaluate(LinearRegression(), xtrain, ytrain, xtest, ytest)

In [None]:
# Decision tree on the features plus the appended PCA component; random_state
# makes the printed scores repeatable (same seed as the baseline tree).
train_and_evaluate(DecisionTreeRegressor(max_depth=30, random_state=11), xtrain, ytrain, xtest, ytest)

In [None]:
# Remove the temporary PCA column again so later sections see the plain features.
for frame in (xtrain, xtest):
    frame.drop(columns="Main_PCA_Component", inplace=True)

# Feature Selection

## Univariate Selection: Correlation example

In [None]:
from scipy.stats import pearsonr

In [None]:
# Pearson correlation of every feature with the target, printed and then charted.
print("Correlations between features and target:\n")
correlations = {}
for feature in xtrain.columns:
    # pearsonr returns (coefficient, p-value); only the coefficient is kept.
    correlations[feature] = pearsonr(xtrain[feature], ytrain)[0]
    print(f"{feature}: {correlations[feature]: .4f}")

feature_names = list(correlations)
plt.figure(figsize=(20, 13))
sns.barplot(y=feature_names, x=[correlations[name] for name in feature_names])
plt.title("Correlations between features and target:\n", fontsize=30)
plt.show()

In [None]:
# Columns left out of this run.
# NOTE(review): presumably chosen for their weak correlation with the target in the chart above — confirm.
weak_features = ["BsmtFinSF2", "BsmtCond", "MiscVal", "BsmtHalfBath"]
# The column list is defined once so train and test cannot drift apart;
# random_state makes the printed scores repeatable (same seed as the baseline tree).
train_and_evaluate(DecisionTreeRegressor(max_depth=30, random_state=11),
                   xtrain.drop(columns=weak_features),
                   ytrain,
                   xtest.drop(columns=weak_features),
                   ytest)

## Forward & Backward Selection

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector

In [None]:
# Forward sequential selection of 30 features, scored by 7-fold cross-validated R2.
# The estimator is seeded so repeated runs pick the same features.
selector = SequentialFeatureSelector(estimator=DecisionTreeRegressor(max_depth=30, random_state=11),
                                     k_features=30,
                                     forward=True,
                                     cv=7,
                                     scoring="r2")
selector.fit(xtrain, ytrain)

In [None]:
# Names of the features kept by the forward selector.
selector.k_feature_names_

In [None]:
# Reduce both splits to the features chosen by the fitted selector.
xtrain_sel = selector.transform(xtrain)
xtest_sel = selector.transform(xtest)

In [None]:
# Linear regression on the forward-selected features.
train_and_evaluate(LinearRegression(), xtrain_sel, ytrain, xtest_sel, ytest)

In [None]:
# Decision tree on the forward-selected features; random_state makes the
# printed scores repeatable (same seed as the baseline tree).
train_and_evaluate(DecisionTreeRegressor(max_depth=30, random_state=11), xtrain_sel, ytrain, xtest_sel, ytest)

In [None]:
# Backward sequential selection (forward=False) down to 30 features, scored by
# 7-fold cross-validated R2. The estimator is seeded so repeated runs pick the
# same features.
selector = SequentialFeatureSelector(estimator=DecisionTreeRegressor(max_depth=30, random_state=11),
                                     k_features=30,
                                     forward=False,
                                     cv=7,
                                     scoring="r2")
selector.fit(xtrain, ytrain)

In [None]:
# Names of the features kept by the backward selector.
selector.k_feature_names_

In [None]:
# Reduce both splits to the features chosen by the backward selector.
xtrain_sel = selector.transform(xtrain)
xtest_sel = selector.transform(xtest)

In [None]:
# Linear regression on the backward-selected features.
train_and_evaluate(LinearRegression(), xtrain_sel, ytrain, xtest_sel, ytest)

In [None]:
# Decision tree on the backward-selected features; random_state makes the
# printed scores repeatable (same seed as the baseline tree).
train_and_evaluate(DecisionTreeRegressor(max_depth=30, random_state=11), xtrain_sel, ytrain, xtest_sel, ytest)