In [None]:
import numpy as np
import pandas as pd
# Read the three competition files in one pass: training set, test set, and the
# sample submission that is later used as a reference ("guideline") column.
hp_train, hp_test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/house-prices-advanced-regression-techniques/train.csv",
        "../input/house-prices-advanced-regression-techniques/test.csv",
        "../input/house-prices-advanced-regression-techniques/sample_submission.csv",
    )
)
# Keep the target aside; it is dropped from the feature frame before train and test are stacked.
saleprice = hp_train.loc[:, "SalePrice"]

# **1. Clean Data**

In [None]:
# Stack train (without the target) on top of test so both receive identical preprocessing.
hp = pd.concat(
    [hp_train.drop(columns=["SalePrice"]), hp_test],
    axis=0,
    ignore_index=True,
    sort=False,
)
# Each MSSubClass number encodes a type of dwelling, so it is categorical rather than numeric.
hp["MSSubClass"] = hp["MSSubClass"].astype("str")

# Separate categorical and numeric variables.
# select_dtypes inspects the column dtypes directly; the previous np.issubdtype(hp[i], ...)
# call depended on NumPy coercing a Series to a dtype, which fails for pandas extension dtypes.
numeric_column = hp.select_dtypes(include=np.number).columns.tolist()
_numeric_names = set(numeric_column)
category_column = [name for name in hp.columns if name not in _numeric_names]


In [None]:
#Tell how many NaN values in our data
def _missing_summary(frame, columns):
    """Summarise missing values for the given columns of `frame`.

    Returns a DataFrame with one row per column that has at least one NaN:
    `col` (column name), `NaValue` (NaN count) and `rate` (NaN share of all rows, in percent).
    Columns keep the order in which they appear in `columns`.
    """
    na_counts = frame[list(columns)].isna().sum()
    na_counts = na_counts[na_counts > 0]
    return pd.DataFrame({
        "col": na_counts.index.to_numpy(),
        "NaValue": na_counts.to_numpy(),
        "rate": na_counts.to_numpy() / len(frame) * 100,
    })


# Built in one step: DataFrame.append (used previously, row by row) no longer exists in pandas >= 2.0.
dummy_catagory = _missing_summary(hp, category_column)
dummy_numeric = _missing_summary(hp, numeric_column)

print(dummy_catagory)
print("")
print(dummy_numeric)
"""
As the results show, both types of variables contain NaN values.
For the categorical ones this is relatively easy to handle: when transforming them into dummy variables
we simply treat NaN as a category of its own.
For the numeric variables, we fill the missing values with KMeans cluster means.
"""

In [None]:
#Transform categorial variables into dummies
# Every level is prefixed with its column name; NaN becomes its own "<column>_NoValue" level
# (fillna is a no-op for columns without NaN, so no branching is needed).
# dtype is pinned to uint8: pandas >= 2.0 defaults to bool indicators, and bool columns
# cannot be subtracted in the min-max scaling applied to hp_clean later on.
dummy_blocks = [
    pd.get_dummies(
        (name + "_" + hp[name]).fillna(name + "_NoValue"),
        drop_first=True,
        dtype=np.uint8,
    )
    for name in category_column
]
# Single concat instead of growing the frame once per column; Id stays as the first column.
hp_dummy = pd.concat([hp["Id"], *dummy_blocks], axis=1)
hp_dummy

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import ticker as mtick
import seaborn as sns
import math
from sklearn.cluster import KMeans

#Concatenate dummies and numeric variables
hp_clean = pd.concat([hp_dummy, hp[numeric_column].drop("Id", axis=1)], axis=1)
# Clustering features: every column without missing values.
# "Id" is excluded as well: hp_dummy carries it along, but it is a row identifier and
# would otherwise contribute to the Euclidean distances KMeans minimises.
df = hp_clean.drop(columns=["Id", *dummy_numeric["col"]])

#Determine the optimal number of clusters
cluster_range = range(1, 16)
# random_state fixes the k-means++ initialisations so the elbow curve is reproducible.
sse = [
    KMeans(
        init="k-means++",
        n_clusters=n_clusters,
        max_iter=500,
        n_init=50,
        random_state=0,
    ).fit(df).inertia_
    for n_clusters in cluster_range
]
plt.plot(cluster_range, sse, marker="o")
##by Elbow Method, the author reads the optimum at 4 to 5 clusters and chooses 5
##(re-check this reading whenever the clustering features change).

In [None]:
#Patch NaN by KMeans
# random_state makes the cluster assignment (and therefore the imputed values) reproducible.
kmeans_cluster = KMeans(
    init="k-means++", n_clusters=5, max_iter=500, n_init=50, random_state=0
).fit(df)

# insert() raises if the column already exists, so overwrite it when the cell is re-run.
if "labels" in hp_clean.columns:
    hp_clean["labels"] = kmeans_cluster.labels_
else:
    hp_clean.insert(1, "labels", kmeans_cluster.labels_)

# Replace each missing numeric value with the mean of that column within the row's cluster.
# transform("mean") looks the mean up by cluster label, whereas the previous row-wise loop
# indexed the group means by position.
for name in dummy_numeric["col"]:
    cluster_mean = hp_clean.groupby("labels")[name].transform("mean")
    hp_clean[name] = hp_clean[name].fillna(cluster_mean)

In [None]:
#Spot highly linear dependent variable(because these features can only provide little information)
hp_corr = hp_dummy.drop("Id", axis=1).corr(method="pearson")

# Keep only the strictly lower triangle so each pair is inspected once and the diagonal
# (always 1) is ignored. A mask is used instead of writing through `.values`, which is not
# guaranteed to modify the DataFrame (and is read-only under pandas copy-on-write).
lower_triangle = np.tril(np.ones(hp_corr.shape, dtype=bool), k=-1)
hp_corr = hp_corr.where(lower_triangle, 0)

abs_corr = hp_corr.abs()
# For every column, list the later variables whose |correlation| with it exceeds 0.9.
linear_dependence = {
    name: list(abs_corr.index[(abs_corr[name] > 0.9).to_numpy()])
    for name in abs_corr.columns
    if abs_corr[name].max() > 0.9
}
linear_dependence
##we define high linear dependency for the correlation of any two variables which is greater than 0.9 or less than -0.9.

In [None]:
#Remove the variables flagged as highly correlated and finish the data cleaning process.
x = {name for partners in linear_dependence.values() for name in partners}
hp_clean = hp_clean.drop(x, axis=1)

# Min-max scaling: every column is mapped onto [0, 1] using its own minimum and range.
col_min = hp_clean.min()
col_range = hp_clean.max() - col_min
hp_clean = (hp_clean - col_min) / col_range

# **2. Reduce Features**

In [None]:

#We now have 267 variables, but not all of them provide useful information.
#PCA (Principal Component Analysis) is used to reduce them and extract the crucial features.
from sklearn.decomposition import PCA

# "Id" and the KMeans "labels" helper column are not features, so they are left out of the fit.
pca_decomp = PCA(n_components=265).fit(hp_clean.drop(["Id", "labels"], axis=1))

explained_pct = pca_decomp.explained_variance_ratio_ * 100
cumulated_pct = np.cumsum(pca_decomp.explained_variance_ratio_) * 100
pd.DataFrame({
    "pca": list(map("{:.4f}".format, explained_pct)),
    "cumulated_rate": list(map("{:.4f}".format, cumulated_pct)),
}).head(166)  ##per the author, the first 166 components explain over 99% of the variance

In [None]:
#Finish the feature reduction process.
# fit_transform refits the PCA on the same features and returns the component scores.
x = pd.DataFrame(pca_decomp.fit_transform(hp_clean.drop(["Id", "labels"], axis=1)))

# Names for the 166 retained components: pca1 ... pca166.
y = ["pca" + str(component) for component in range(1, 167)]

hp_pca19 = x.iloc[:, 0:166]
hp_pca19.columns = y
hp_pca19

# **3. Regression Analysis and Prediction**

# 3-1. Simple Linear Regression

In [None]:
# The first len(saleprice) rows came from train.csv, the remainder from test.csv.
n_train_rows = len(saleprice)
hp_pca19_train = hp_pca19.iloc[:n_train_rows]
hp_pca19_test = hp_pca19.iloc[n_train_rows:]

from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import statsmodels.formula.api as smf

#OLS Regression fitting
X = sm.add_constant(hp_pca19_train)
##The target is log(price): exp(prediction) is then always positive,
##and differences in log price read as percentage changes of the house price.
lm = sm.OLS(np.log(saleprice), X).fit()

#Breusch-Pagan test for heteroscedasticity, run on the OLS residuals
from statsmodels.stats.diagnostic import het_breuschpagan
bp_test = het_breuschpagan(lm.resid, lm.model.exog)

print(lm.summary())
print("")
bp_table = pd.DataFrame({
    "stats": ["Lagrange", "Lagrange-score", "f-test", "f-score"],
    "score": [format(value, ".6f") for value in bp_test],
})
print(bp_table)  ##author's reading: the null hypothesis is rejected, i.e. the regression is heteroscedastic

##Author's reading of the output: the overall F-test of the OLS regression and almost all
##coefficients are significant, and R-squared says the model explains over 90% of the variance
##of the (log) house price.
##However, the Breusch-Pagan test indicates heteroscedasticity. Under heteroscedasticity the OLS
##coefficients remain unbiased and consistent, but they are no longer BLUE (best linear unbiased
##estimator): they are inefficient, and the usual standard errors are unreliable.

In [None]:
#Plot residuals
# Column 0 is log(price), column 1 the OLS residuals (renamed from the default label 0).
x = pd.concat([np.log(saleprice), lm.resid], axis=1).rename(columns={0: "lm_resid"})

fig, axe = plt.subplots(figsize=(10, 6))
log_price_values, residual_values = x.iloc[:, 0], x.iloc[:, 1]
axe.scatter(log_price_values, residual_values, s=3)
# Dashed zero line as a visual reference for the residuals.
axe.plot([10.5, 13.5], [0, 0], color="black", alpha=0.5, linestyle="--")
plt.show()
##Author's reading of the plot: the OLS residuals show a positive relationship with log price,
##so the residual variance is not constant, which violates the Gauss-Markov assumptions.

In [None]:
#GLS Regression fitting
# Feasible GLS: model log(residual^2) of the OLS fit on the regressors, then use the
# exponentiated fitted values as the estimated error variance of each observation.
resid = sm.OLS(np.log(np.asarray(lm.resid) ** 2), X).fit()  # auxiliary variance regression

##we estimate sigma to weight our regression model (Generalized least squares).
##this process is a feasible way to alleviate the heteroscedastic problem.
# sigma is kept as the 1-D vector of variances; statsmodels GLS treats a length-n vector as
# the diagonal of the covariance matrix, so no dense n x n matrix (or its inverse) is needed.
sigma = np.exp(np.asarray(resid.predict(X)))
glm = sm.GLS(np.log(saleprice), X, sigma=sigma).fit()

# Re-run Breusch-Pagan on the residuals scaled by 1/sqrt(variance), i.e. the whitened residuals.
whitened_resid = np.asarray(glm.resid) / np.sqrt(sigma)
bp_test2 = het_breuschpagan(whitened_resid, glm.model.exog)

print(glm.summary())
print("")
print(
    pd.DataFrame({
    "stats":["Lagrange","Lagrange-score","f-test","f-score"],
    "score":["{:.6f}".format(i) for i in bp_test2]
})
) #author's reading: the null hypothesis is not rejected


In [None]:
#Compare OLS and GLS coefficients
def _significance_stars(pvalues):
    """Translate p-values into significance labels.

    "***" for p < 0.01, "**" for p < 0.05, "*" for p < 0.1, otherwise "insignificant"
    (NaN p-values fall through every comparison and are therefore "insignificant").
    Returns a list with one label per input value, in input order.
    """
    labels = []
    for p in pvalues:
        if p < 0.01:
            labels.append("***")
        elif p < 0.05:
            labels.append("**")
        elif p < 0.1:
            labels.append("*")
        else:
            labels.append("insignificant")
    return labels


pd.DataFrame(
    {
        "ols": list(lm.params),
        "gls": list(glm.params),
        "ols_pvalue": _significance_stars(lm.pvalues),
        "gls_pvalue": _significance_stars(glm.pvalues),
    },
    index=lm.params.index,
)
##Author's observation: GLS tends to shrink insignificant coefficients towards 0 and to elevate significant ones.

# 3-2. Spline Regression

In [None]:
#Linear regression assumes the relationship between the explanatory variables and the outcome is linear and continuous.
#The author considers this assumption too strong to hold in most cases:
#because the slope of the relationship usually fluctuates, the change needs to be captured.
#Here a B-spline of the first component is used to describe the potential change over log(saleprice).

import patsy

# Knots at the 0%, 10%, ..., 90% quantiles of the first principal component.
# The name `knot` must stay: the patsy formulas below look it up by name.
knot = [np.quantile(X.iloc[:, 1], q=decile / 10) for decile in range(10)]

pca1 = X.iloc[:, 1]
log_price = np.log(saleprice)

# Baseline: straight-line fit on the constant and the first component only.
simpleOLS = sm.OLS(log_price, X.iloc[:, [0, 1]]).fit().predict(X.iloc[:, [0, 1]])

# B-spline fits of degree 1, 2 and 3 on the same knots.
b = pd.DataFrame(patsy.dmatrix("bs(x,knots=knot,degree=1)", {"x": pca1}))
spline1 = sm.OLS(log_price, b).fit().predict(b)
b = pd.DataFrame(patsy.dmatrix("bs(x,knots=knot,degree=2)", {"x": pca1}))
spline2 = sm.OLS(log_price, b).fit().predict(b)
b = pd.DataFrame(patsy.dmatrix("bs(x,knots=knot,degree=3)", {"x": pca1}))
spline3 = sm.OLS(log_price, b).fit().predict(b)

fig, axe = plt.subplots(figsize=(14, 8))
axe.scatter(pca1, log_price, color="#696969", s=2)
# Fitted curves are drawn in legend order; only the higher-order splines are semi-transparent.
fitted_layers = [
    (simpleOLS, {"color": '#B22222'}),
    (spline1, {"color": '#008000'}),
    (spline2, {"color": '#FF8C00', "alpha": 0.8}),
    (spline3, {"color": '#5686BF', "alpha": 0.8}),
]
for fitted_values, look in fitted_layers:
    axe.scatter(pca1, fitted_values, s=3, **look)
axe.legend(["Data Points","Simple OLS","Spline Regression (order 1)","Spline Regression (order 2)","Spline Regression (order 3)"])

#Author's reading of the graph: spline regressions can describe how the relationship changes between lower and higher prices.
#The higher the order, the closer the fit; but overfitting may also arise.

In [None]:
#We don't have a module for spline regression (python 3.7 or higher version didn't support sklearn_contrib_py_earth)
#But we can write one to achieve our goal.
def _bs_columns(values, knot, degree, prefix):
    """Return the B-spline basis of `values` as a DataFrame named <prefix>_1, <prefix>_2, ...

    patsy's intercept column is dropped because the design already has a constant.
    The parameter names `knot` and `degree` must not change: the formula string resolves
    them by name from this function's local variables.
    """
    basis = pd.DataFrame(
        patsy.dmatrix("bs(x,knots=knot,degree=degree)", {"x": values})
    ).drop(0, axis=1)
    basis.columns = [prefix + "_" + str(j + 1) for j in range(basis.shape[1])]
    # Share the index of the source column so a column-wise concat lines up row for row.
    basis.index = values.index
    return basis


def spline_parm(X=X,y=saleprice,q=40,degree=1): #q for maxium knots
    """Greedily choose the number of B-spline knots for every "pca<k>" column by AIC.

    Columns are processed one after another (coordinate descent). For each column the
    linear term is replaced by a B-spline basis of the given `degree`, starting with no
    interior knots and adding one knot at a time while the AIC of the OLS fit on log(y)
    does not increase. Knots sit at the quantiles j/(n+1), j = 1..n, of the column, the
    same placement the cross-validation and prediction cells use for `parm`.

    Parameters
    ----------
    X : DataFrame holding a constant column plus columns named "pca1", "pca2", ...
        (one iteration per non-constant column, i.e. len(X.columns) - 1 of them).
    y : target prices; the model is fitted on np.log(y).
    q : search limit; the knot counts 0, 1, ..., q - 2 are tried per column
        (at least the 0-knot basis is always evaluated).
    degree : degree of the B-spline basis.

    Returns
    -------
    (parm, spline) : `parm[k]` is the selected knot count for "pca<k+1>", and `spline`
        is the statsmodels OLS result fitted on the design that uses exactly those counts.

    Note: the first version kept the *rejected* candidate (the one whose AIC had gone up)
    both in `parm` and in the design carried to the next column; the best one is kept now.
    """
    log_y = np.log(y)
    design = X
    spline = sm.OLS(log_y, design).fit()

    parm = []
    for k in range(len(X.columns) - 1):
        name = "pca" + str(k + 1)
        other_columns = design.drop(name, axis=1)

        best_count, best_design, best_fit = None, None, None
        for n_knots in range(max(q - 1, 1)):
            knot = [
                np.quantile(X[name], q=j / (n_knots + 1))
                for j in range(1, n_knots + 1)
            ]
            candidate = pd.concat(
                [other_columns, _bs_columns(X[name], knot, degree, name)], axis=1
            )
            fit = sm.OLS(log_y, candidate).fit()
            # Greedy stop: the first knot count that makes the AIC worse ends the search.
            if best_fit is not None and fit.aic > best_fit.aic:
                break
            best_count, best_design, best_fit = n_knots, candidate, fit

        parm.append(best_count)
        design, spline = best_design, best_fit
    return parm, spline # parm holds the selected number of knots for each component

# Run the knot search once per B-spline order (1, 2 and 3), in that order.
(parm1, spline1), (parm2, spline2), (parm3, spline3) = (
    spline_parm(degree=order) for order in (1, 2, 3)
)

# In-sample R-squared of the three selected models, side by side.
r2_by_order = {
    "r2_order1": [spline1.rsquared],
    "r2_order2": [spline2.rsquared],
    "r2_order3": [spline3.rsquared],
}
pd.DataFrame(r2_by_order)
#Author's reading: the higher orders (order 2 and order 3) get better R-squared scores;
#however, they may also cause an overfitting problem.

# **4. Random Forest**

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest on the PCA components (the OLS constant column is not a feature), target log(price).
# random_state pins the bootstrap samples and feature draws so the fitted forest, its
# cross-validation scores and its predictions are reproducible across runs.
rfr = RandomForestRegressor(n_estimators=1000, random_state=0).fit(
    X.drop("const", axis=1), np.log(saleprice)
)
##It's very easy to run since we have a powerful tool "sklearn" in Python.
##we use 1000 trees to determine the house prices.


# **5. Compare Results**

# 5-1 Cross Validation

In [None]:
#Cross Validation
##we can compare the expected performance of each model (OLS, GLS, RFR) by CV.
##The author found no off-the-shelf CV function for GLS, so one is written here.

X = sm.add_constant(hp_pca19_train)
import random

def cv_data(cv=20,x=X,y=saleprice,seed=None): #generating CV data
    """Randomly split (x, y) into `cv` folds and return the per-fold train/validation sets.

    The first cv - 1 folds hold floor(len(x) / cv) rows each; the last fold takes the rest.
    Rows are selected by position, so `x` and `y` only need to be in the same row order.

    Parameters
    ----------
    cv : number of folds.
    x : feature DataFrame.
    y : target Series; np.log(y) is what gets returned.
    seed : optional seed for a private random generator. With None (the default) the
        module-level `random` state is used, as before.

    Returns
    -------
    x_train, y_train, x_val, y_val : four lists of length `cv`. Training rows keep their
        original order; validation rows are in shuffled order (x_val[i] and y_val[i] match).
    """
    n_rows = len(x)
    fold_size = n_rows // cv
    sampler = random if seed is None else random.Random(seed)
    shuffled = sampler.sample(range(n_rows), n_rows)

    # Consecutive slices of one random permutation form disjoint random folds.
    cv_index = [shuffled[fold_size * j:fold_size * (j + 1)] for j in range(cv - 1)]
    cv_index.append(shuffled[fold_size * (cv - 1):])

    x_train,y_train,x_val,y_val=[],[],[],[]
    for positions in cv_index:
        in_fold = np.zeros(n_rows, dtype=bool)
        in_fold[positions] = True
        x_val.append(x.iloc[positions, :])
        x_train.append(x.iloc[~in_fold])
        y_train.append(np.log(y.iloc[~in_fold]))
        y_val.append(np.log(y.iloc[positions]))
    return x_train,y_train,x_val,y_val

x_train,y_train,x_val,y_val=cv_data(x=X,y=saleprice)

def OLS(xtrain=x_train,ytrain=y_train,xval=x_val,yval=y_val):
    """Return the out-of-fold R-squared of an OLS fit, one value per fold.

    For every fold the model is fitted on (xtrain[i], ytrain[i]) and scored on
    (xval[i], yval[i]) as 1 - SSE/SST, with SST taken around the validation mean.
    """
    r2 = []
    for fold, fold_x in enumerate(xtrain):
        actual = yval[fold]
        predicted = sm.OLS(ytrain[fold], fold_x).fit().predict(xval[fold])
        squared_error = ((predicted - actual) ** 2).sum()
        total_variation = ((actual.mean() - actual) ** 2).sum()
        r2.append(1 - (squared_error / total_variation))
    return r2

def GLS(xtrain=x_train,ytrain=y_train,xval=x_val,yval=y_val):
    """Return the out-of-fold R-squared of a feasible GLS fit, one value per fold.

    Per fold: fit OLS, regress log(residual^2) on the regressors, use the exponentiated
    fitted values as `sigma` for GLS, then score the GLS predictions on the validation
    fold as 1 - SSE/SST (SST around the validation mean).
    """
    r2 = []
    for fold, fold_x in enumerate(xtrain):
        fold_y = ytrain[fold]
        actual = yval[fold]

        # Step 1: OLS residuals on the training fold.
        ols_fitted = sm.OLS(fold_y, fold_x).fit().predict(fold_x)
        resid = ols_fitted - fold_y
        # Step 2: estimated error variances from the log-squared-residual regression.
        variance_fit = sm.OLS(np.log(resid ** 2), fold_x).fit()
        sigma = np.exp(variance_fit.predict(fold_x))

        # Step 3: weighted fit and validation score.
        predicted = sm.GLS(fold_y, fold_x, sigma=sigma).fit().predict(xval[fold])
        squared_error = ((predicted - actual) ** 2).sum()
        total_variation = ((actual.mean() - actual) ** 2).sum()
        r2.append(1 - (squared_error / total_variation))
    return r2


In [None]:
#CV for spline regression
def spline_regression_cv(parm,degree=2,x_train=x_train,y_train=y_train,x_val=x_val,y_val=y_val):
    """Return the out-of-fold R-squared of the B-spline regression, one value per fold.

    Parameters
    ----------
    parm : knot counts, parm[i] for column "pca<i+1>" (as returned by spline_parm).
    degree : degree of the B-spline basis.
    x_train, y_train, x_val, y_val : per-fold lists as produced by cv_data; the frames in
        x_train / x_val need a "const" column and the "pca<i>" columns.

    For each fold, knots are the j/(parm[i]+1) quantiles of the training column. The
    validation basis is built with the training minimum and maximum appended as two extra
    rows, so the spline bounds cover at least the training range; those two rows are
    removed again before predicting. The score is 1 - SSE/SST on the validation fold.

    The input lists are not modified: re-indexed copies are used internally (the first
    version overwrote the entries of the shared default lists in place).
    """
    r2=[]
    for k in range(len(x_train)):

        fold_x_train = x_train[k].reset_index(drop=True)
        fold_y_train = y_train[k].reset_index(drop=True)
        fold_x_val = x_val[k].reset_index(drop=True)
        fold_y_val = y_val[k].reset_index(drop=True)
        n_val = len(fold_x_val)

        # Helper rows carrying the training range (named to avoid shadowing min()/max()).
        train_min = pd.DataFrame(fold_x_train.min(), columns=["min"]).T
        train_max = pd.DataFrame(fold_x_train.max(), columns=["max"]).T
        tmp = pd.concat([fold_x_val, train_min, train_max])

        train_parts = [fold_x_train["const"]]
        val_parts = [fold_x_val["const"]]
        for i in range(len(parm)):
            name = "pca" + str(i + 1)
            # `knot` and `degree` are read by name from this frame by the patsy formulas.
            knot = [
                np.quantile(fold_x_train[name], q=j / (parm[i] + 1))
                for j in range(1, parm[i] + 1)
            ]
            b0 = pd.DataFrame(patsy.dmatrix("bs(x,knots=knot,degree=degree)",{"x":fold_x_train[name]})).drop(0,axis=1)
            b0_val = pd.DataFrame(patsy.dmatrix("bs(x,knots=knot,degree=degree)",{"x":tmp[name]})).drop(0,axis=1)
            column_name = [name + "_" + str(j + 1) for j in range(len(b0.columns))]
            b0.columns = column_name
            b0_val.columns = column_name
            train_parts.append(b0)
            # Keep only the real validation rows; the last two rows are the min/max helpers.
            val_parts.append(b0_val.iloc[:n_val])

        # One concat per fold instead of one per column.
        b1 = pd.concat(train_parts, axis=1)
        b1_val = pd.concat(val_parts, axis=1)

        yv = sm.OLS(fold_y_train, b1).fit().predict(b1_val)

        sse = ((yv - fold_y_val) ** 2).sum()
        sst = ((fold_y_val.mean() - fold_y_val) ** 2).sum()
        r2.append(1 - sse / sst)
    return r2

In [None]:
#Compare the predictive performance of OLS, GLS, the spline regressions and Random Forest
from sklearn.model_selection import cross_val_score

# Hand-written CV on the shared random folds (OLS first, then GLS).
cvOLS, cvGLS = OLS(), GLS()

# Spline regressions of order 1..3, each with the knot counts selected for that order.
knots_by_order = {1: parm1, 2: parm2, 3: parm3}
cvSpline1, cvSpline2, cvSpline3 = [
    spline_regression_cv(parm=knot_counts, degree=order)
    for order, knot_counts in knots_by_order.items()
]

# Random forest through scikit-learn's own 20-fold CV, scored by R-squared.
cvRFR = cross_val_score(rfr, X, np.log(saleprice), cv=20, scoring="r2")

In [None]:
# One column of per-fold R-squared scores per model; describe() summarises each column.
cv_scores = {
    "OLS_cv": cvOLS,
    "GLS_cv": cvGLS,
    "Spline1_cv": cvSpline1,
    "Spline2_cv": cvSpline2,
    "Spline3_cv": cvSpline3,
    "RFR_cv": cvRFR,
}
pd.DataFrame(cv_scores).describe()

##R2 score is used to gauge the performance.

##Author's reading of the table:
##Spline1 (B-spline regression of order 1) gets the highest score.
##Even though Spline2 and Spline3 had better in-sample R2 scores earlier,
##overfitting reduces their predictive performance.

##There is no significant difference between OLS and GLS.

##Also, RFR's performance is relatively stable compared with the others (the smallest std).
##The author attributes this to RFR being a bagging algorithm, which reduces variance and stabilises the fluctuation.

# 5-2 Prediction

In [None]:
hp_pca19_test_lm = sm.add_constant(hp_pca19_test)

# The order-1 spline model and its knot counts were already computed (parm1, spline1) from
# the same training data and settings, so they are reused instead of repeating the search.
parm, spline = parm1, spline1

# Append the TRAINING minimum and maximum as two helper rows so the spline bounds cover the
# range the model was fitted on (same approach as spline_regression_cv). The helper frames
# are named so that the builtins min()/max() are not overwritten.
train_min = pd.DataFrame(hp_pca19_train.min(), columns=["min"]).T
train_max = pd.DataFrame(hp_pca19_train.max(), columns=["max"]).T
tmp = sm.add_constant((pd.concat([hp_pca19_test, train_min, train_max])).reset_index(drop=True))
n_test = len(hp_pca19_test)

basis_parts = [tmp["const"]]
for i in range(len(parm)):
    name = "pca" + str(i + 1)
    # Knots come from the training column (the data the model was fitted on), not from the
    # test data. The name `knot` is looked up by the patsy formula below.
    knot = [
        np.quantile(hp_pca19_train[name], q=j / (parm[i] + 1))
        for j in range(1, parm[i] + 1)
    ]

    b = pd.DataFrame(patsy.dmatrix("bs(x,knots=knot,degree=1)",{"x":tmp[name]})).drop(0,axis=1)
    b.columns = [name + "_" + str(j + 1) for j in range(len(b.columns))]
    basis_parts.append(b)

# One concat for all columns; the last two rows (min/max helpers) are dropped.
b_test = pd.concat(basis_parts, axis=1).iloc[:n_test]


In [None]:
# All models were fitted on log(price), so predictions are exponentiated back to prices.
lm_test, glm_test = (
    np.exp(model.predict(hp_pca19_test_lm)) for model in (lm, glm)
)
rfr_test = np.exp(rfr.predict(hp_pca19_test))
spline1_test = np.exp(spline.predict(b_test))

# Side-by-side table; the sample submission's prices serve as the "guideline" column.
# The OLS/GLS predictions are re-indexed from 0 so all columns line up row by row.
prediction_columns = {
    "guideline": sample_submission["SalePrice"],
    "OLS": lm_test.reset_index(drop=True),
    "GLS": glm_test.reset_index(drop=True),
    "Spline1": spline1_test,
    "RFR": rfr_test,
}
regress_test = pd.DataFrame(prediction_columns)
regress_test

In [None]:
fig, axe = plt.subplots(figsize=(15, 6))

# Horizontal offset and colour of each bar within a group, in regress_test column order.
bar_layout = [
    (-0.3, "#A9A9A9"),
    (-0.15, "#6495ED"),
    (0, "#FF4D40"),
    (0.15, "#008080"),
    (0.3, "#9ACD32"),
]
guideline = regress_test["guideline"]
for i in range(1, 10):
    # Rows whose guideline price lies in the i-th decile band [q_low, q_up).
    q_low = np.quantile(guideline, (i - 1) / 10)
    q_up = np.quantile(guideline, i / 10)
    y = regress_test[(guideline < q_up) & (guideline >= q_low)].median()
    for position, (offset, colour) in enumerate(bar_layout):
        axe.bar(i + offset, y.values[position], width=0.15, color=colour, edgecolor="black", align="center")

from matplotlib import ticker as mtick
# One tick per decile band, labelled q10 ... q90.
axe.xaxis.set_major_locator(mtick.FixedLocator(list(range(1, 10))))
axe.set_xticklabels(["q" + str(band * 10) for band in range(1, 10)], fontsize=11)
# Freeze the current y ticks and print them with thousands separators.
val = [int(tick) for tick in axe.get_yticks()]
axe.yaxis.set_major_locator(mtick.FixedLocator(list(val)))
axe.set_yticklabels(["{:,}".format(tick) for tick in val], fontsize=11)
axe.set_xlabel(xlabel="Quantiles for each house price", fontsize=16)
axe.set_ylabel(ylabel="")
# Legend entries follow the column names of regress_test.
axe.legend(y.index, loc=4)

In [None]:
# Submission file: one row per test house, predicted with the order-1 spline model.
# The ids are taken from the test file itself (same row order as the predictions) rather
# than a hard-coded range, and the header is "Id" to match sample_submission.csv.
submission = pd.DataFrame({
    "Id": hp_test["Id"].to_numpy(),
    "SalePrice": np.asarray(spline1_test),
})
submission.to_csv('submission.csv', index=False)
submission["SalePrice"].describe()