In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error,mean_squared_log_error
import sklearn
import sklearn.feature_selection
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns

# Read the competition files; the 'Id' column becomes the row index of train/test
train = pd.read_csv(
    '../input/house-prices-advanced-regression-techniques/train.csv',
    index_col='Id',
)
test = pd.read_csv(
    '../input/house-prices-advanced-regression-techniques/test.csv',
    index_col='Id',
)
sample = pd.read_csv('../input/house-prices-advanced-regression-techniques/sample_submission.csv')

# Separate the target from the features: Y holds SalePrice, train keeps everything else
Y = train['SalePrice']
train = train.drop(columns=['SalePrice'])

Hello everyone, this is my attempt at the House Prices competition. I will explain my thought process behind how I designed my model, and point out areas that I think can still be improved. Feel free to fork this notebook for your own attempt and try to improve the model yourself! If this notebook has helped you, please upvote, and leave a comment if you notice any mistakes!

Cheers,
Eugene

In [None]:
# First look at the training features: how many rows/columns there are, then the first few rows
N, M = train.shape   # N = number of samples, M = number of features
print('Number of Samples', N, 'Number of Features', M)
train.head()

In [None]:
# Same look at the test set: N_2 rows, m_2 feature columns
N_2, m_2 = test.shape
test.head()

The first thing I noticed was that there are a lot of NaNs in some of the features. This can be quite a problem, since a NaN can imply a lack of knowledge about that sample. However, given that we have so many features, we may be able to remove a few of them because of their missing values.

In [None]:
# Percentage of missing entries per feature, ordered from least to most missing
missing_fraction = train.isna().mean()
percent_missing = missing_fraction.mul(100).sort_values()

In [None]:
# Bar chart with one bar per feature showing its percentage of missing values
positions = np.arange(len(percent_missing))

plt.figure(figsize=(15, 6))
plt.bar(positions, percent_missing)
plt.xticks(positions, percent_missing.index, rotation=90)
plt.ylabel('Percentage of Missing Values')
plt.show()

So clearly, there are a few features where a large number of samples are missing values. Rather than just removing them outright, I took a slightly different approach. I am first going to measure the mutual information between each feature and our target variable, and then select the features that pass a cutoff. To do this, we are first going to normalize and transform the values before computing the mutual information.

In [None]:

#Normalization and feature selection
#
#For every column of `train` we
#  1. impute missing values and encode (categorical) or scale (numeric) the column,
#  2. measure its mutual information with the target Y,
#  3. keep it -- applying the same train-fitted transform to `test` -- only when
#     the score is above MI_CUTOFF.
#Produces: mutual (one score per column of train, in column order), train_x, test_x.

MI_CUTOFF = 0.1               #informational cutoff
CLIP_LOW, CLIP_HIGH = -2, 4   #Outlier cutoff after robust scaling, feel free to play around with this
MI_SEED = 42                  #mutual_info_regression adds random noise; a fixed seed makes the selection repeatable

mutual = []
train_parts = []   #selected feature blocks for the training set, one 2-D array per kept column
test_parts = []    #the matching blocks for the test set, in the same order

for name in train.columns:

    #Every column goes down exactly one of the two branches, so `mutual` always
    #ends up with one entry per column. Comparing dtypes against the strings
    #'O' / 'int' / 'float' can miss columns (e.g. non-default integer widths or a
    #dedicated string dtype), which would silently drop the column.
    if pd.api.types.is_numeric_dtype(train[name]):

        #Continuous variables: we are going to use a robust scaler
        norm = preprocessing.RobustScaler()

        #Since we are going to normalize each variable, we fill the NaNs with the mean value.
        #Assign the result back instead of fillna(inplace=True) on a column selection,
        #which is chained assignment and may not update the frame.
        train_mean = train[name].mean()
        train[name] = train[name].fillna(train_mean)

        scaled = norm.fit_transform(train[name].values.reshape(-1, 1))
        scaled = np.clip(scaled, CLIP_LOW, CLIP_HIGH)

        #Mutual information with the target
        m = sklearn.feature_selection.mutual_info_regression(scaled, Y, random_state=MI_SEED)[0]
        mutual.append(m)

        if m > MI_CUTOFF:
            print('Variable Added')
            print('Type:',train[name].dtypes,'Variable',name,'Average Value',train[name].mean())
            train_parts.append(scaled)

            #Repeat for the test set, reusing the train mean and the train-fitted scaler
            test[name] = test[name].fillna(train_mean)
            scaled = norm.transform(test[name].values.reshape(-1, 1))
            test_parts.append(np.clip(scaled, CLIP_LOW, CLIP_HIGH))

    else:

        #Categorical features: label encoder to embed our categories
        le = preprocessing.LabelEncoder()

        #Rather than filling our missing values with known quantities, leave them as a separate category
        train[name] = train[name].fillna('missing').astype(str)
        encoded = le.fit_transform(train[name]).reshape(-1, 1)

        #Mutual information with the target
        m = sklearn.feature_selection.mutual_info_regression(encoded, Y, random_state=MI_SEED)[0]
        mutual.append(m)

        if m > MI_CUTOFF:
            print('Variable Added')
            print('Type:',train[name].dtypes,'Variable',name,'Most Frequent Label:',train[name].mode().values,'Number of Labels:', len(train[name].unique()))

            #Since they are categorical variables, we perform one-hot encoding on the label codes
            One_hot = OneHotEncoder()
            train_parts.append(One_hot.fit_transform(encoded).toarray())

            #Same thing for the test set. A NaN in test is treated like a NaN in train
            #('missing') whenever the encoder saw that category during fitting; otherwise
            #we fall back to the most frequent train label, because the fitted
            #LabelEncoder cannot transform a label it has never seen.
            most_frequent = train[name].mode().values[0]
            fill_label = 'missing' if 'missing' in set(le.classes_) else most_frequent
            test[name] = test[name].fillna(fill_label).astype(str)

            encoded = le.transform(test[name]).reshape(-1, 1)
            test_parts.append(One_hot.transform(encoded).toarray())


#Stack the kept blocks side by side once, instead of re-copying the whole matrix
#with np.append on every iteration. With nothing selected the result has 0 columns.
train_x = np.hstack(train_parts) if train_parts else np.zeros((len(train), 0))
test_x = np.hstack(test_parts) if test_parts else np.zeros((len(test), 0))

In [None]:
# Report the (rows, columns) of the assembled feature matrices
for label, matrix in (('Final Training Shape:', train_x), ('Final Test Shape:', test_x)):
    print(label, matrix.shape)

Now let's take a look at the mutual information of each variable. We see that there are quite a few standouts and quite a few variables that show some promise. Some might argue that the 0.1 cutoff was too lenient or too harsh. That might be true, but the variables are already quite correlated, so adding less informative variables most likely won't help much.

In [None]:
# Bar chart of the mutual-information score of every feature.
# The number of bars is taken from `mutual` itself rather than a hard-coded 79,
# so the plot keeps working if the set of columns changes.
positions = np.arange(len(mutual))

plt.figure(figsize=(15,6))

plt.bar(positions, mutual)
plt.xticks(positions, train.columns, rotation=90)
plt.ylabel('Mutual Information')
plt.show()

Now let's scale our target variable. I tried several methods, but the log method won in the end and produced the best results.

In [None]:

# Model log(1 + SalePrice) rather than the raw price
# (standardising the raw price with a StandardScaler was also tried).
# NOTE: Y is overwritten here, so re-running only this cell applies the log a second time.
Y = np.log1p(Y)

# Hold out 10% of the training rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    train_x,
    Y,
    test_size=0.1,
    random_state=42,
)

**Model Design:**

We are going to make use of two models: ElasticNet and XGBRegressor. Our final predictions are going to be an average of the two models. One could add more models and most likely improve the accuracy of the final prediction.

In [None]:
import xgboost as xgb

# Linear model: elastic net whose regularisation strength and L1/L2 mix are picked by 5-fold CV
lr = linear_model.ElasticNetCV(
    l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
    n_alphas=10,
    cv=5,
)

# Gradient-boosted trees trained on squared error
xg_reg = xgb.XGBRegressor(
    n_estimators=20000,
    objective='reg:squarederror',
)

In [None]:
# Train both models on the same training split (boosted trees first, then the elastic net)
xg_reg.fit(X=X_train, y=y_train)
lr.fit(X=X_train, y=y_train)

In [None]:
# Predictions on the held-out validation split, still on the log scale.
# Despite the "train" in their names, both arrays hold validation predictions.
y_train_pred, y_train_pred_l = xg_reg.predict(X_val), lr.predict(X_val)

In [None]:
# Validation check: scatter true vs. predicted prices and print the mean squared
# log error for each model (XGBRegressor first, then ElasticNetCV).
# The target was transformed with np.log1p, so np.expm1 -- not np.exp -- is the
# exact inverse that brings values back to the price scale.
true_prices = np.expm1(y_val)

validation_runs = [
    ('XGBRegressor', y_train_pred),
    ('ElasticNetCV', y_train_pred_l),
]
for figure_number, (model_title, log_predictions) in enumerate(validation_runs, start=1):
    predicted_prices = np.expm1(log_predictions)

    plt.figure(figure_number)
    plt.title(model_title)
    plt.plot(true_prices, predicted_prices, '.')
    plt.xlabel('True Values')
    plt.ylabel('Predictions')
    print(mean_squared_log_error(true_prices, predicted_prices))

In [None]:
# Log-scale predictions for the test set from each model
# (y_pred_l: elastic net, y_pred: boosted trees)
y_pred_l, y_pred = lr.predict(test_x), xg_reg.predict(test_x)


In [None]:
# Blend the two models: element-wise mean of their log-scale predictions
stacked_predictions = np.stack((y_pred_l, y_pred))
pred_avg = stacked_predictions.mean(axis=0)


In [None]:
#Submission
# The models were trained on log1p(SalePrice), so expm1 (the exact inverse of
# log1p) converts the averaged predictions back to prices; np.exp would leave
# every price off by one.
submission = pd.DataFrame({
    'Id': test.index,
    'SalePrice': np.expm1(pred_avg),
})
submission.to_csv("submission.csv", index=False)
