In [None]:
import numpy as np 
import pandas as pd
from dateutil.parser import parse
from typing import List
import matplotlib.pyplot as plt
import seaborn as sns

# Introduction

When I was a beginner to machine learning and Kaggle, I made a real mess of attempting this competition. Now I hope to go through the data and create a prediction in a more skillful way. I hope you can learn from my notebooks as I learned from other people's.

In [None]:
# Load the competition's training split and preview the first rows.
train_csv_path = "../input/house-prices-advanced-regression-techniques/train.csv"
df = pd.read_csv(train_csv_path)
df.head()

Let's look to see if there are any object columns and what we might do to fix them

In [None]:
# Show only the object-dtype columns, i.e. the ones the next cell encodes.
df.select_dtypes(include='object')

Below I've defined a function that converts object columns to more useful representations: a column with fewer than 6 unique values is replaced by one-hot encoded columns, and every other object column is replaced by integer category codes.

In [None]:
def process_df(df, cat_col_list=None):
    """Encode every object-dtype column of ``df`` numerically.

    An object column with fewer than 6 distinct values is one-hot encoded via
    ``pd.get_dummies`` unless its name is already in ``cat_col_list``. Every
    other object column is replaced by its integer category codes and its name
    is recorded in ``cat_col_list``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is not modified; a copy is returned.
    cat_col_list : list of str, optional
        Names of columns that must be code-encoded regardless of how many
        distinct values they have. When a list is passed it is extended in
        place with any newly code-encoded columns. When omitted, a fresh list
        is created for this call, so state no longer leaks between calls
        through a shared default.

    Returns
    -------
    tuple
        ``(new_df, cat_col_list)``: the encoded copy of ``df`` and the list of
        code-encoded column names.
    """
    if cat_col_list is None:
        cat_col_list = []
    new_df = df.copy()
    for col in df.select_dtypes(include='object').columns:
        if col not in cat_col_list and df[col].nunique() < 6:
            # Low-cardinality column: expand into one indicator column per value.
            new_df = pd.get_dummies(new_df, columns=[col])
        else:
            if col not in cat_col_list:
                cat_col_list.append(col)
            # pandas assigns the code -1 to missing values.
            new_df[col] = df[col].astype('category').cat.codes
    return new_df, cat_col_list
# Rebind df to its encoded version and keep the names of the code-encoded columns.
df, cat_col_list = process_df(df)

# Exploratory Data Analysis

I will now take a quick look at the data with a multiplot:

In [None]:
# Quick-look line plots: first 100 rows of the first 24 columns, 4 panels per row.
PREVIEW_ROWS = 100
PREVIEW_COLS = 24
GRID_WIDTH = 4

pltdf = df.copy()
# Truncate column names to 10 characters so the panel legends stay compact.
rename = [cname[0:10] for cname in df.columns]
pltdf.columns = rename
preview = pltdf.iloc[:PREVIEW_ROWS, :PREVIEW_COLS]
# Size the grid from the number of plotted columns. A fixed 20x4 layout
# reserves 80 panels for 24 series, leaving most of the figure blank and the
# visible plots squeezed into the top rows.
n_rows = max(1, -(-preview.shape[1] // GRID_WIDTH))  # ceiling division
preview.plot(subplots=True, layout=(n_rows, GRID_WIDTH), figsize=(25, 20))

plt.show()

Let's look at our target value of Sale Price

In [None]:
# Distribution plot of the target column.
sns.displot(x=df["SalePrice"])

Let's look at the important correlations

In [None]:
# Keep the columns whose correlation with SalePrice exceeds the threshold
# (SalePrice itself always qualifies), then plot how those columns correlate
# with one another. Only positive correlations are selected; strongly
# negative ones are not picked up by this filter.
CORR_THRESHOLD = 0.6
cols = [col for col in df.columns if df[col].corr(df["SalePrice"]) > CORR_THRESHOLD]
corrdf = df[cols].corr()

sns.heatmap(corrdf, cmap="Blues")

In [None]:
# Line plots of the first 200 rows, one panel per selected column.
# The number of rows follows len(cols): a fixed 7-row layout raises a
# ValueError as soon as more than seven columns are selected.
df[cols].iloc[:200, :].plot(subplots=True, layout=(len(cols), 1), figsize=(25, 20))
plt.show()

Let's look at the most important relationship there: OverallQual and SalePrice. It doesn't seem a big surprise that they are linked.

In [None]:
# Pairwise view of the selected columns, restricted to the first 200 rows.
# diag_sharey=False gives the diagonal panels their own y-axis.
g = sns.PairGrid(df[cols].iloc[:200,:], diag_sharey=False)
# Upper triangle: scatter plots with small markers.
g.map_upper(sns.scatterplot, s=15)
# Lower triangle: bivariate kernel density estimates.
g.map_lower(sns.kdeplot)
# Diagonal: univariate kernel density estimates.
g.map_diag(sns.kdeplot, lw=2)

Let's explore with a simple decision tree

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree

# Fill missing values with 0, then split into features and target without
# mutating any frame in place, so the cell can be re-run safely.
tree_filled = df.fillna(0)
target = tree_filled["SalePrice"]
tree_set = tree_filled.drop(columns=["SalePrice"])

# A shallow tree (depth 3) keeps the printed rules short enough to read.
tree_clf = DecisionTreeRegressor(max_depth=3, random_state=1)
tree_clf.fit(tree_set, target)
text_representation = tree.export_text(tree_clf, feature_names=tree_set.columns.tolist())
print(text_representation)
# score() on a regressor returns R^2, and here it is computed on the same data
# the tree was fitted on, so it is a training score rather than an accuracy.
print("training R^2: " + str(tree_clf.score(tree_set, target)))

That's a pretty good score (R² measured on the training data) considering how simple a tree that is.

# Prediction

For this notebook I will use PyCaret to create a ridge-regression-based model.

In [None]:
!pip install pycaret

In [None]:
# Import only the PyCaret functions this notebook calls, instead of a star import.
from pycaret.regression import (
    blend_models,
    create_model,
    ensemble_model,
    finalize_model,
    plot_model,
    predict_model,
    setup,
)

# Reload the raw training data: df was rebound to the manually encoded frame
# earlier, and the PyCaret experiment is set up on the original columns.
df = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")
# Assigning the result keeps setup's return value out of the cell output.
regression_setup = setup(data = df.copy(),
                         target = "SalePrice",
                         silent = True, normalize = True, session_id=42)

In [None]:
# Train the ridge regression model ('ridge' is PyCaret's model ID).
ridge = create_model('ridge')

In [None]:
# Train a second model, PyCaret model ID 'kr' (kernel ridge).
kernel_ridge = create_model('kr')

In [None]:
# Combine the ridge and kernel ridge models into a single blended estimator.
blender = blend_models(estimator_list = [ridge, kernel_ridge])

Reasonable Performance. Now we can create an ensemble for better performance.

In [None]:
# Build an ensemble of 50 estimators on top of the blended model
# (ensemble method left at PyCaret's default).
ensemble_ridge = ensemble_model(blender, n_estimators=50)

In [None]:
# Called without `data`, so PyCaret presumably scores its own hold-out split;
# the trailing semicolon suppresses the returned frame in the cell output.
predict_model(ensemble_ridge);

# Look at Model Metrics

In [None]:
# Prediction error plot for the ensemble.
plot_model(ensemble_ridge, plot = 'error')

In [None]:
# Residuals plot for the ensemble.
plot_model(ensemble_ridge, plot = 'residuals')

In [None]:
# Feature importance plot, drawn for the plain ridge model rather than the ensemble.
# NOTE(review): presumably because the ensemble exposes no coefficients — confirm.
plot_model(ridge, plot = 'feature')

# Train Model on Full Dataset

In [None]:
# Refit the ensemble on the full dataset.
bagged_ridge = finalize_model(ensemble_ridge)
# Score the finalized model. The un-finalized ensemble_ridge was evaluated
# here before, which left bagged_ridge unused.
predict_model(bagged_ridge);

# Create Submission

In [None]:
test_data = pd.read_csv("../input/house-prices-advanced-regression-techniques/test.csv")
# Predict with the finalized model trained on the full dataset, not with the
# pre-finalization ensemble_ridge.
predictions = predict_model(bagged_ridge, data=test_data)
predictions.head()

In [None]:
# Start from the sample submission and overwrite its SalePrice column with the
# model's predicted labels (aligned on the row index).
submissiondf = pd.read_csv(
    "../input/house-prices-advanced-regression-techniques/sample_submission.csv"
).assign(SalePrice=predictions["Label"])
submissiondf.head()

In [None]:
# Write the submission file; index=False leaves the DataFrame index out of the CSV.
submissiondf.to_csv("submission.csv", index=False)