In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# House Price Prediction

This project tries to predict house prices and to find out whether some unexpected features turn out to predict the price well.

Some coding credit belongs to:
https://github.com/krishnaik06/Advanced-House-Price-Prediction-
https://www.kaggle.com/code/apapiu/regularized-linear-models
https://www.kaggle.com/code/lavanyashukla01/how-i-made-top-0-3-on-a-kaggle-competition

## Part 0: Load data and packages

In [None]:
#Load packages 

!pip install pydotplus
from IPython.display import Image  
from sklearn import tree
import pydotplus
import pandas as pd
import numpy as np
import collections
from math import sqrt
import scipy.stats as ss
from scipy import stats
from scipy.stats import norm, skew

import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns


from sklearn import preprocessing, tree
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder, MinMaxScaler,StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, LassoLarsCV, Ridge, RidgeCV 
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.utils import shuffle
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.tree import DecisionTreeRegressor

from xgboost import plot_importance
from matplotlib import pyplot
import xgboost

#Special display options: passing None removes the cap on how many columns pandas shows,
#so wide frames are displayed in full
pd.set_option("display.max_columns", None) 

In [None]:
#Load the training and testing sets from the competition's input folder
DATA_DIR = '/kaggle/input/house-prices-advanced-regression-techniques'
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

In [None]:
#Split the columns into year, numerical and categorical groups up front, so that the
#feature-engineering cells can be run without re-running the EDA cells.
#(How to recognise year features was learned from the EDA below.)

#Year features: any column whose name mentions "Year" or "Yr"
col_year = [name for name in train.columns if ("Year" in name) or ("Yr" in name)]
col_year_test = [name for name in test.columns if ("Year" in name) or ("Yr" in name)]

#Training data: non-year columns are numerical unless their dtype is object
col_NonYr = [name for name in train.columns if name not in col_year]
col_num = [name for name in col_NonYr if train[name].dtype != 'O']
col_cat = [name for name in col_NonYr if train[name].dtype == 'O']

#Testing data: same rule
col_NonYr_test = [name for name in test.columns if name not in col_year_test]
col_num_test = [name for name in col_NonYr_test if test[name].dtype != 'O']
col_cat_test = [name for name in col_NonYr_test if test[name].dtype == 'O']

## Part 1: EDA (Exploratory Data Analytics)


### Light EDA of overall data

In [None]:
#Report the dimensions of both data sets as (rows, columns)
for label, frame in (('Training', train), ('Testing', test)):
    print(f'{label} set Number of (rows,columns): {frame.shape}')

In [None]:
#Summary statistics (count, mean, std, quartiles, min/max) of the training data;
#describe() covers the numerical columns by default
train.describe()

Observations: Many columns have a median of 0 while their mean is not 0. Those columns probably contain a lot of zero values.

### Check dependent variable sale price 
First we will check the distribution of sale price. Histogram, skewness, kurtosis are good descriptive statistic tools to use. And since for linear regression model, the dependent variable should be normally distributed, I used probability plot too. 


In [None]:
#Check skewness & kurtosis of dependent variable SalePrice
#Note: pandas' kurt() reports *excess* kurtosis, i.e. a normal distribution scores 0 (not 3)
sale_price = train['SalePrice']
print("Training data \"SalePrice\" skewness: %f" % sale_price.skew())
print("Training data \"SalePrice\" kurtosis: %f" % sale_price.kurt())

#Check distribution of dependent variable SalePrice against a fitted normal curve.
#sns.distplot is deprecated, so the histogram + KDE come from sns.histplot and the
#normal fit that `fit=norm` used to draw is plotted explicitly.
fig, ax = plt.subplots(figsize=(6,3))
sns.histplot(sale_price, stat="density", kde=True, ax=ax)
mu, sigma = norm.fit(sale_price)
grid = np.linspace(sale_price.min(), sale_price.max(), 200)
ax.plot(grid, norm.pdf(grid, mu, sigma), color="black", label="Normal fit")
ax.legend();

In [None]:
#Normal probability (Q-Q) plot of SalePrice: points on the line indicate normality
fig, ax = plt.subplots(figsize=(6,3))
res = stats.probplot(train['SalePrice'], plot=ax)

In [None]:
#A boxplot shows the distribution, spread and outliers of a numerical variable
fig, ax = plt.subplots(figsize=(6,3))
train.boxplot(column=['SalePrice'], ax=ax)

Observation: The sale price is extremely right skewed (skewness > 1) and has a clearly positive kurtosis (note that pandas' `kurt()` reports excess kurtosis, for which a normal distribution scores 0 rather than 3). The sale price isn't normally distributed and there are lots of outliers. Since linear regression has a normality assumption and is very sensitive to outliers, we can't use the raw price to train our model. Log transformation is usually a good way to fix this. Let's try it and see if things improve.

In [None]:
#Log-transform the sale price and see how its distribution improves
#Since this is still EDA, the transformation is applied to a copy of the data only
data = train.copy()
data['SalePrice'] = np.log(data["SalePrice"])
log_price = data['SalePrice']
#Note: pandas' kurt() reports *excess* kurtosis, i.e. a normal distribution scores 0 (not 3)
print("Training data \"SalePrice\" skewness: %f" % log_price.skew())
print("Training data \"SalePrice\" kurtosis: %f" % log_price.kurt())

#sns.distplot is deprecated: draw histogram + KDE with sns.histplot and overlay the
#fitted normal curve that `fit=norm` used to provide
fig, ax = plt.subplots(figsize=(6,3))
sns.histplot(log_price, stat="density", kde=True, ax=ax)
mu, sigma = norm.fit(log_price)
grid = np.linspace(log_price.min(), log_price.max(), 200)
ax.plot(grid, norm.pdf(grid, mu, sigma), color="black", label="Normal fit")
ax.legend();

In [None]:
#Normal probability (Q-Q) plot of the log-transformed SalePrice
fig, ax = plt.subplots(figsize=(6,3))
res = stats.probplot(data['SalePrice'], plot=ax)

In [None]:
#Boxplot of the log-transformed SalePrice to re-check the outliers
fig, ax = plt.subplots(figsize=(6,3))
data.boxplot(column=['SalePrice'], ax=ax)

Observation: After log transformation, the sale price is much closer to a normal distribution and its kurtosis is much lower than before (again, pandas' `kurt()` reports excess kurtosis, so the reference value for a normal distribution is 0, not 3). The skewness changed from extremely right skewed (skewness > 1) to almost symmetrical (-0.5 < skewness < 0.5). There are fewer high sale price outliers. I am satisfied with the transformation results. Later, when we train the model, we will use the log transformation on the sale price.

### Missing values and duplicated values

In [None]:
# Count missing cells, duplicated rows and duplicated column names in the training data
n_missing = train.isnull().sum().sum()
n_dup_rows = train.duplicated().sum()
n_dup_cols = train.columns.duplicated().sum()
print(f'Missing value: {n_missing}')
print(f'Duplicated rows: {n_dup_rows}')
print(f'Duplicated columns: {n_dup_cols}')

In [None]:
#Columns with at least one missing value, in training and in testing data
col_train_with_missing = train.columns[train.isnull().any()].tolist()
col_test_with_missing = test.columns[test.isnull().any()].tolist()

#Compare the two lists in both directions
only_in_test = set(col_test_with_missing) - set(col_train_with_missing)
only_in_train = set(col_train_with_missing) - set(col_test_with_missing)
print(f"Have missing values in testing but not in training data: { only_in_test }\n")
print(f"Have missing values in training but not in testing data: { only_in_train }")

[](http://)

In [None]:
#Plot the percentage of missing values for every column that has any.
#isnull().mean() is a fraction in [0, 1]; it is scaled by 100 so that the values
#match the "%" in the axis label and title.
fig, ax = plt.subplots(figsize=(6,3))
missing_pct = train[col_train_with_missing].isnull().mean().mul(100).sort_values(ascending=False)
missing_pct.plot.bar(ax=ax, ylabel="Missing value %", title="Missing value% of columns with missing values", color='cadetblue');

In [None]:
#Show every training row that has a missing value in at least one of the columns
#listed in col_train_with_missing (any(axis=1) tests row-wise)
train[train[col_train_with_missing].isnull().any(axis=1)]

Observations: 
There are columns with lots of missing values. From the output above, it seems that a value is missing when it isn't relevant (e.g., if "Fireplaces" is 0, "FireplaceQu" is "NaN"). So, for each numerical feature, I'll fill in the missing values and also add a column that indicates where missing values were filled. For categorical features, filling them with "Missing" is enough to serve both purposes. I'll do this later in feature preprocessing.

Also, it seems testing data has similar columns with missing values but just a few more. So, we can deal with them similarly.

### Datetime variables


In [None]:
#Check the dtypes and non-null counts of the year columns in the training data
train[col_year].info()

In [None]:
#Plot the median SalePrice per value of each year feature.
#The plots only read from `train`, so no per-iteration copy of the frame is made.
fig = plt.figure(figsize=(30,5))
for count, feature in enumerate(col_year, 1):
    ax = fig.add_subplot(1, len(col_year), count)
    median_price = train.groupby(feature)["SalePrice"].median()
    if feature == 'YrSold':
        #Highlight YrSold, whose trend is the counter-intuitive one
        ax.plot(median_price, color='fuchsia')
        ax.annotate("Price drops with yr sold makes no sense!", xy=[2008.0,164000], xytext=[2006.0, 160000], arrowprops={'arrowstyle':"->",'color':"blue"})
    else:
        ax.plot(median_price, color="slategray")
    ax.set_xlabel(feature, fontsize='xx-large')
    ax.set_ylabel("SalePrice ($)")

plt.show()


Observations: "YrSold" doesn't seem to be a good feature to use directly, since the sale price drops as the year increases. But the difference between the year sold and the other year features gives various "ages" of the house, which might be good features for predicting the sale price. Let's draw some plots and see how it goes.

In [None]:
#Boxplots of the year columns, one box per column, to compare their ranges and spot outliers
sns.boxplot(data=train[col_year])

In [None]:
#Plot median SalePrice against "years before the sale" for each year feature,
#i.e. YrSold minus the feature. Nothing is written back to `train` here.
age_base_features = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']

fig = plt.figure(figsize=(30,5))
for count, feature in enumerate(age_base_features, 1):
    years_before_sale = train['YrSold'] - train[feature]
    #Size the grid by the number of panels actually drawn
    ax = fig.add_subplot(1, len(age_base_features), count)
    ax.plot(train['SalePrice'].groupby(years_before_sale).median(), color="slategray")
    ax.set_xlabel("YearSold- " + str(feature), fontsize='xx-large')
    ax.set_ylabel("SalePrice ($)")
#plt.show must be *called*; the bare attribute reference did nothing
plt.show()

Observations: Now the relationship seems reasonable. We will use those new features in later feature preprocessing.

## Numerical variables

I will first try to see if I can use linear regression model. There are 4 basic assumptions of linear regression, I should check and see if I can transform data accordingly to fit those assumptions in order for our model to perform well: <br>
1. Linearity <br>
2. Homoscedasticity <br>
3. Independence <br>
4. Normality <br>

#### Other important things for linear regression model  
1. Feature scaling - the regularized models used below (Lasso, Ridge) penalize the size of the coefficients, so the features need to be on comparable scales <br>
2. Impact of missing values - linear regression cannot handle missing values, so they need to be filled in carefully <br>
3. Impact of outliers - linear regression needs the relationship between the independent variables and the dependent variable to be linear, so outliers have a big impact on model performance, although regularization helps.<br>



In [None]:
#Report how many numerical and categorical fields each data set has
print(f"There are {len(col_num)} numerical fields, {len(col_cat)} categorical fields in training data")
print(f"There are {len(col_num_test)} numerical fields, {len(col_cat_test)} categorical fields in testing data")

## Discrete and continuous numerical variables
#### I will treat discrete variables more like categorical variables so I need to distinguish them first

In [None]:
#Number of distinct values (NaN counts as one value) per numerical training feature
uniques = {feature: len(train[feature].unique()) for feature in col_num}

#Order the features by their distinct-value count and print each (feature, count) pair
uniques_sorted = dict(sorted(uniques.items(), key=lambda name_count: name_count[1]))
for pair in uniques_sorted.items():
    print(pair)

Observations: In this case there is a jump in the unique value counts from 24 to 76, so I will set the cut-off point at 25. When a numerical feature has 25 or fewer unique values, I will define it as a discrete feature.

In [None]:
#Separate numerical features into discrete and continuous ones (training and testing data).
#A feature is discrete when it has at most 25 distinct values.
#The counts are taken from `train` itself rather than from the scratch variable `data`,
#which earlier EDA cells overwrite with modified copies.
col_disc=[feature for feature in col_num if len(train[feature].unique())<=25]
col_cont=[feature for feature in col_num if feature not in col_disc and feature != "Id"]

#"Id" is an identifier, not a measurement, so it is excluded here as well
col_disc_test=[feature for feature in col_num_test if len(test[feature].unique())<=25]
col_cont_test=[feature for feature in col_num_test if feature not in col_disc_test and feature != "Id"]

print(f"Discrete features: {col_disc}\n")
print(f"Continuous features: {col_cont}")

### Linear regression assumption check - normality

In [None]:
#Draw histograms for all continuous numerical features (training data)
grh_per_row = 3
n_rows = len(col_cont)//grh_per_row + 1
#squeeze=False keeps `ax` two-dimensional even when the grid has a single row
fig, ax = plt.subplots(n_rows, grh_per_row, figsize=(30, 30), squeeze=False)

for count, feature in enumerate(col_cont):
    row, col = divmod(count, grh_per_row)
    ax[row,col].hist(train[feature], color="thistle")
    ax[row,col].set_xlabel(feature, fontsize='xx-large')
    ax[row,col].set_ylabel("Count")

#Hide the grid cells that did not receive a feature
for spare in ax.flat[len(col_cont):]:
    spare.set_visible(False)

plt.show()

Observation: Seems most of the numerical fields aren't normally distributed.

### Linear regression assumption check - linearity, homoscedasticity

In [None]:
#Draw scatter plots of SalePrice against every continuous numerical feature (training data)
#Scatter plots are a good way to judge linearity, homoscedasticity, and even some normality
grh_per_row = 3
#The target is not plotted against itself
scatter_features = [feature for feature in col_cont if feature != "SalePrice"]
#Ceiling division, so the grid always has room for every feature
n_rows = max(1, int(np.ceil(len(scatter_features) / grh_per_row)))
fig, ax = plt.subplots(n_rows, grh_per_row, figsize=(40,40), squeeze=False)

for count, feature in enumerate(scatter_features):
    row, col = divmod(count, grh_per_row)
    ax[row,col].scatter(train[feature], train['SalePrice'], color="tan")
    ax[row,col].set_xlabel(feature, fontsize='xx-large')
    ax[row,col].set_ylabel("SalePrice ($)")

#Hide the grid cells that did not receive a feature
for spare in ax.flat[len(scatter_features):]:
    spare.set_visible(False)

plt.show()

Observations: 
It seems most features violate the following: <br>
1. Homoscedasticity. The variance of residual isn't the same for all values of x <br>
2. Normality 

One way to fix this is to use a log transformation on both the dependent and the independent variables. This would probably improve both homoscedasticity and normality a bit. We will try this later in feature engineering.

In [None]:
#Draw the value distribution of each discrete feature (training data).
#The earlier `bins=data[feature].unique().sort()` passed bins=None, because ndarray.sort()
#sorts in place and returns None. One bar per distinct value shows what was intended.
grh_per_row = 3
fig, ax = plt.subplots(len(col_disc)//grh_per_row+1, grh_per_row, figsize=(30, 30), squeeze=False)

for count, feature in enumerate(col_disc):
    row, col = divmod(count, grh_per_row)
    value_counts = train[feature].value_counts().sort_index()
    #String labels space the bars evenly even when the values are far apart
    ax[row,col].bar(value_counts.index.astype(str), value_counts.values, color="lightsteelblue")
    ax[row,col].set_xlabel(feature, fontsize='xx-large')
    ax[row,col].set_ylabel("Count")

plt.show()

In [None]:
#Draw scatter plots of SalePrice against every discrete numerical feature (training data).
#The plots only read from `train`, so no per-iteration copy of the frame is made.
grh_per_row = 3
fig, ax = plt.subplots(len(col_disc)//grh_per_row+1, grh_per_row, figsize=(30,30), squeeze=False)

for count, feature in enumerate(col_disc):
    row, col = divmod(count, grh_per_row)
    ax[row,col].scatter(train[feature], train['SalePrice'], color="cornflowerblue")
    ax[row,col].set_xlabel(feature, fontsize='xx-large')
    ax[row,col].set_ylabel("SalePrice ($)")

plt.show()

Observations: We can see housing price can be different for some categories in some discrete numerical features. For example, price seem to be higher for higher "OverallCond" and "OverallQual". Those would be good predictive features to use to train models.

### Linear regression assumption check - Independence
Linear regression requires the features to be independent of each other. Since we have so many features, it's likely that many of them depend on each other. Let's take a look.

In [None]:
# Pairwise correlations of the numerical features, drawn as an annotated heatmap
corr = train[col_num].corr()
fig, ax = plt.subplots(figsize=(25,23))
ax.set_title("Housing data numerical feature correlation")
sns.heatmap(data=corr, annot=True, cmap="BuPu", ax=ax)

Observations: According to this article (https://www.dummies.com/article/academics-the-arts/math/statistics/how-to-interpret-a-correlation-coefficient-r-169792), here is how I would interpret the absolute value of a correlation r:

- no linear relationship: |r| < 0.3
- weak: 0.3 <= |r| < 0.5
- moderate: 0.5 <= |r| < 0.7
- strong: |r| >= 0.7

It seems more than 50% of the features have moderate correlations with a few other features, and a few have strong correlations with each other. For example, "1stFlrSF" is highly correlated with "TotalBsmtSF". We might keep just one of them, since they probably carry very similar information. For two features that are highly correlated with each other, Lasso regression would probably drop one by setting its coefficient to 0 and keep the other. We will need regularization later in model training.

## Categorical variables

In [None]:
#Bar charts of the mean SalePrice per category, for every categorical feature
grh_per_row = 3
fig, ax = plt.subplots(len(col_cat)//grh_per_row+1, grh_per_row, figsize=(40, 120), squeeze=False)

for count, feature in enumerate(col_cat):
    row, col = divmod(count, grh_per_row)
    mean_price = train.groupby(feature)['SalePrice'].mean()

    ax[row,col].bar(mean_price.index, mean_price.values, color="burlywood")
    ax[row,col].set_xlabel(feature, fontsize='xx-large')
    ax[row,col].set_ylabel("SalePrice ($)")
    #Style the existing tick labels instead of replacing them with set_xticklabels,
    #which needs fixed tick positions to be set first
    ax[row,col].tick_params(axis='x', labelsize="large", labelrotation=30)

plt.show()

Observations: 
Some categories in some categorical features have obviously higher or lower sale prices, for example in "SaleCondition", "Neighborhood", and "CentralAir". But how much this means depends on the count of each unique value. For example, if there aren't many houses with the "Partial" sale condition, the "Partial" sale condition may not really imply a higher sale price.

Some features have a lot of unique values. Since we already have so many features, we need to be careful not to create too many additional columns when encoding the categorical features. I will try to group the rare categories of each categorical feature together.

In [None]:
#Print how often each category occurs, to find a cut-off count for rare categories.
#value_counts() only reads from `train`, so the frame is not copied on every iteration.
for feature in col_cat:
    print(feature)
    print(train[feature].value_counts())
    print("______________________")

Observations: A category count below 15 seems to be a good cut-off point. Later, in feature engineering, I will group the categories that cover less than about 1% of the rows (count < 15) into a single "rare" category.

## Part 2: Feature preprocessing

### Missing values - Numerical variables

In [None]:
#As seen in the EDA, missingness itself carries information. Each imputed feature gets a
#companion "<feature>_nan" indicator (1 where the value was missing) and the gap is
#filled with the median.
#Medians are learned from the training data only and reused for the testing data, so
#both sets are imputed with the same values.
num_medians = train[col_num].median()
train_num_missing = [feature for feature in col_num if train[feature].isnull().any()]

for feature in train_num_missing:
    train[feature+"_nan"] = np.where(train[feature].isnull(), 1, 0)
    #Assign the result back: fillna(inplace=True) on a selected column is chained assignment
    train[feature] = train[feature].fillna(num_medians[feature])

for feature in col_num_test:
    #Also create the indicator when only the training data had gaps, so that every
    #indicator column of the training set exists in the testing set too
    if test[feature].isnull().any() or feature in train_num_missing:
        test[feature+"_nan"] = np.where(test[feature].isnull(), 1, 0)
        fill_value = num_medians[feature] if feature in num_medians.index else test[feature].median()
        test[feature] = test[feature].fillna(fill_value)

#Both counts should now be 0
print(train[col_num].isnull().sum().sum())
print(test[col_num_test].isnull().sum().sum())

### Missing values - Categorical variables

In [None]:
#Fill in categorical missing values with the explicit category "Missing".
#The result is assigned back, because fillna(inplace=True) on a selected column is
#chained assignment and may leave the frame unchanged.
train[col_cat] = train[col_cat].fillna("Missing")
test[col_cat_test] = test[col_cat_test].fillna("Missing")

#Check missing values again (both counts should be 0)
print(train[col_cat].isnull().sum().sum())
print(test[col_cat_test].isnull().sum().sum())

### Missing values - Year variables 

In [None]:
#Fill in missing values for year variables the same way as for the numerical ones:
#a "<feature>_nan" indicator plus the median, with medians learned from the training data only
year_medians = train[col_year].median()
train_year_missing = [feature for feature in col_year if train[feature].isnull().any()]

for feature in train_year_missing:
    train[feature+"_nan"] = np.where(train[feature].isnull(), 1, 0)
    #Assign the result back: fillna(inplace=True) on a selected column is chained assignment
    train[feature] = train[feature].fillna(year_medians[feature])

for feature in col_year_test:
    #Keep the indicator columns of the two sets aligned
    if test[feature].isnull().any() or feature in train_year_missing:
        test[feature+"_nan"] = np.where(test[feature].isnull(), 1, 0)
        fill_value = year_medians[feature] if feature in year_medians.index else test[feature].median()
        test[feature] = test[feature].fillna(fill_value)

#Check missing values again (whole frames this time)
print(train.isnull().sum().sum())
print(test.isnull().sum().sum())

## Part 3: Feature engineering

### Create some needed additional fields

In [None]:
#As discussed in the EDA, build "YrSold-<feature>" columns: the number of years between
#each year feature and the sale.
#A few records have a year feature later than YrSold (e.g. GarageYrBlt > YrSold); those
#are data errors, and their negative differences are floored at 0.
for frame, year_cols in ((train, col_year), (test, col_year_test)):
    for feature in year_cols:
        if feature == "YrSold":
            continue
        frame["YrSold-"+feature] = (frame["YrSold"] - frame[feature]).clip(lower=0)

### Categorical Features Encoding
As seen in the EDA, some categorical features have very rare categories. The cut-off count for them is about 15 rows, or 1%. I will group the categories that cover at most 1% of the training rows together for the later encoding.

In [None]:
#Replace every category that covers at most 1% of the training rows with 'Rare_var'.
#The list of frequent categories comes from the training data and is applied to both sets.
rare_label = 'Rare_var'
n_train_rows = len(train)
for feature in col_cat:
    share = train.groupby(feature)['SalePrice'].count() / n_train_rows
    frequent = share.loc[share.gt(0.01)].index
    train[feature] = train[feature].where(train[feature].isin(frequent), rare_label)
    test[feature] = test[feature].where(test[feature].isin(frequent), rare_label)

In [None]:
#Encode each category as its rank (0 = lowest) when the categories of a feature are
#ordered by their mean training SalePrice
for feature in col_cat:
    ranked_labels = train.groupby([feature])['SalePrice'].mean().sort_values().index
    rank_of = dict(zip(ranked_labels, range(len(ranked_labels))))
    train[feature] = train[feature].map(rank_of)
    #Categories that appear only in the testing data fall back to 0.0
    test[feature] = test[feature].map(lambda label: rank_of.get(label, 0.0))

In [None]:
#NOTE: this cell overwrites columns with their logs, so running it twice applies the
#transform twice; re-run the notebook from the top instead.

#log transform the target variable:
train["SalePrice"] = np.log1p(train["SalePrice"])

#log transform skewed numeric features (log1p, since some features contain zeros).
#"Id" and the target are left out of the candidates: "Id" is not a feature, and
#"SalePrice" does not exist in the testing data, so selecting it would make the
#test transform below fail.
skew_candidates = [feature for feature in col_num if feature not in ("Id", "SalePrice")]
skewness = train[skew_candidates].apply(lambda x: skew(x.dropna())) #compute skewness
skewed_feats = skewness[skewness > 0.75].index
train[skewed_feats] = np.log1p(train[skewed_feats])
test[skewed_feats] = np.log1p(test[skewed_feats])

#log transform the (already rank-encoded) categorical features:
train[col_cat] = np.log1p(train[col_cat])
test[col_cat] = np.log1p(test[col_cat])

In [None]:
#Draw scatter plots of the (log) SalePrice against every column of the training data.
#The plots only read from `train`, so the frame is not copied on every iteration.
grh_per_row = 4
fig, ax = plt.subplots(len(train.columns)//grh_per_row+1, grh_per_row, figsize=(30, 100), squeeze=False)

for count, feature in enumerate(train.columns):
    row, col = divmod(count, grh_per_row)
    panel = ax[row,col]
    panel.scatter(train[feature], train['SalePrice'], color="mediumpurple")
    panel.set_xlabel(feature, fontsize='xx-large')
    panel.set_ylabel("SalePrice with log normalization ($)")
    panel.grid()

plt.show()

### Remove some outliers

In [None]:
#The plots above show some outliers; drop the rows matching any of these rules.
#All thresholds are on the transformed scale the columns have at this point.
outlier_mask = (
    (train['LotFrontage'] > 5)
    | (train['LotArea'] > 11.5)
    | ((train['YearBuilt'] < 1900) & (train['SalePrice'] > 12.4))
    | ((train['1stFlrSF'] > 8) & (train['SalePrice'] < 12.5))
)
train = train.loc[~outlier_mask]

train.shape

## Part 4: Model training

### Linear Regression with regularizations

In [None]:
#Target and feature matrix for the training data ("Id" is not a feature)
y_train = train['SalePrice']
X_train = train.drop(columns=["SalePrice", "Id"])

#Feature matrix for the testing data
X_test = test.drop(columns=["Id"])

In [None]:
#Cross-validated error of one scaler + model combination
def get_score(model, scaler, alpha):
    """Return the mean 3-fold cross-validated MSE of `model(alpha=alpha)` preceded by `scaler()`.

    `model` and `scaler` are classes (not instances); they are instantiated here and
    chained in a Pipeline that is evaluated on the notebook-level X_train / y_train.
    """
    pipeline_steps = [('My scaler', scaler()), ('My classifier', model(alpha=alpha))]
    neg_mse_per_fold = cross_val_score(
        Pipeline(steps=pipeline_steps),
        X_train,
        y_train,
        cv=3,
        scoring='neg_mean_squared_error',
    )
    #The scorer returns negated MSE; flip the sign back before averaging
    return (-1 * neg_mse_per_fold).mean()

In [None]:
#Sweep a list of alphas for one model/scaler pair, so models can be compared later
def model_scores(model, scaler, alphas):
    """Score every alpha with get_score, print and plot the results, return the best score.

    The best alpha is the first one with the lowest cross-validated MSE. The MSE of
    each alpha is plotted against alpha on a new figure.
    """
    score_by_alpha = {}
    best_alpha, best_score = 0, float('inf')
    for candidate in alphas:
        mse = get_score(model=model, scaler=scaler, alpha=candidate)
        score_by_alpha[candidate] = mse
        if mse < best_score:
            best_alpha, best_score = candidate, mse

    print(f"\nBest alpha: {best_alpha} with score of {best_score}")
    fig, ax = plt.subplots(figsize=(10,5))
    ax.plot(list(score_by_alpha), list(score_by_alpha.values()), markersize=5, marker="o", color="royalblue")
    ax.set_title(str(model) + str(scaler))
    ax.set_xlabel("alpha")
    ax.set_ylabel("MSE score")
    return best_score


In [None]:
def draw_results_lasso(model, scaler, alpha):
    """Scatter-plot predicted against actual log prices on a 20% hold-out split.

    A `scaler()` + `model(alpha=alpha)` pipeline is fitted on 80% of the notebook-level
    X_train / y_train (fixed random_state) and evaluated on the remaining 20%.
    Despite the name, this works for any model class that accepts `alpha`.
    """
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(X_train, y_train, test_size=0.2, random_state=123)
    fitted_pipeline = Pipeline(steps=[('My scaler', scaler()), ('My classifier', model(alpha=alpha))])
    fitted_pipeline.fit(X_fit, y_fit)
    holdout_pred = fitted_pipeline.predict(X_holdout)

    plt.figure(figsize=(10,10))
    plt.scatter(y_holdout, holdout_pred, c='royalblue')
    plt.xlabel("log(Actual Sale Price) $ ")
    plt.ylabel("log(Predicted Sale Price) $ ")
    plt.title(str(model)+str(scaler))
    plt.grid()
    plt.show()
    

In [None]:
#Score dictionary used to compare the models later: the cells below store one
#error value per model under a short model key
scores={}

In [None]:
#Lasso + StandardScaler: sweep 10 log-spaced alphas between 1e-3 and 1e-2
scores["LS_Std"] = model_scores(
    model=Lasso,
    scaler=StandardScaler,
    alphas=np.logspace(-3, -2, num=10),
)

In [None]:
#Compare the columns of X_train and X_test after feature pre-processing.
#Both directions are reported: extra test columns can simply be dropped, but a column
#that exists only in the training set would make the column selection in the next
#step fail.
train_cols = set(X_train.columns)
test_cols = set(X_test.columns)
{"only_in_test": sorted(test_cols - train_cols), "only_in_train": sorted(train_cols - test_cols)}

In [None]:
#The testing set has extra columns: some of its columns have missing values (and thus
#"_nan" indicators) where the same training columns have none.
#Keep only the training columns, in the training order.
X_test = X_test.loc[:, X_train.columns]
print(X_test.shape)

In [None]:
#Actual vs. predicted values - Lasso with StandardScaler (alpha is hard-coded)
draw_results_lasso(model=Lasso, scaler=StandardScaler, alpha=0.0027825594022071257)

In [None]:
#Lasso + MinMaxScaler: sweep 12 log-spaced alphas between 10**-3.6 and 10**-3.2
scores["LS_MM"] = model_scores(
    model=Lasso,
    scaler=MinMaxScaler,
    alphas=np.logspace(-3.6, -3.2, num=12),
)

In [None]:
#Actual vs. predicted values - Lasso with MinMaxScaler (alpha is hard-coded)
draw_results_lasso(model=Lasso, scaler=MinMaxScaler, alpha=0.000296980047740645)

In [None]:
#Ridge + StandardScaler: sweep 10 log-spaced alphas between 10**1.7 and 10**1.9
scores["Rig_Std"] = model_scores(
    model=Ridge,
    scaler=StandardScaler,
    alphas=np.logspace(1.7, 1.9, num=10),
)

In [None]:
#Actual vs. predicted values - Ridge with StandardScaler (alpha is hard-coded)
draw_results_lasso(model=Ridge, scaler=StandardScaler, alpha=58.434141337351754)

In [None]:
#Ridge + MinMaxScaler: sweep 10 log-spaced alphas between 10**-1.4 and 10**0.5
scores["Rid_MM"] = model_scores(
    model=Ridge,
    scaler=MinMaxScaler,
    alphas=np.logspace(-1.4, 0.5, num=10),
)

In [None]:
#draw actual vs. predicted values - Ridge with MinMax Scaler
#The alpha is re-derived from the grid searched for this model/scaler pair. The value
#hard-coded here before (58.43...) was the one used for Ridge + StandardScaler and
#lies outside this grid, whose largest alpha is 10**0.5.
ridge_mm_alphas = np.logspace(-1.4, 0.5, num=10)
ridge_mm_best_alpha = min(ridge_mm_alphas, key=lambda alpha: get_score(Ridge, MinMaxScaler, alpha))
draw_results_lasso(Ridge, MinMaxScaler, alpha=ridge_mm_best_alpha)

### Tree models

#### Decision Tree

In [None]:
#First let's take a look at a simple decision tree. max_depth is limited to 3 so that
#the whole tree can be visualised clearly
dt_regr = DecisionTreeRegressor(max_depth=3, random_state=1234)
dt_regr.fit(X_train, y_train)

fig, ax = plt.subplots(figsize=(20,5))
#feature_names is given as a plain list of strings rather than a pandas Index
_ = tree.plot_tree(dt_regr, feature_names=list(X_train.columns), filled=True, ax=ax)


#### Random Forest

In [None]:
#Random Forest, evaluated on a 20% hold-out split of the training data
X_cv, X_test_train, y_cv, y_test_train= train_test_split(X_train, y_train, test_size=0.2, random_state=123)
#Fixed seed so that the forest (and its score) is reproducible between runs
rf=RandomForestRegressor(random_state=123)
rf.fit(X_cv, y_cv)
y_pred=rf.predict(X_test_train)

#mean_squared_error returns the MSE. That is what is stored, so the value is on the
#same scale as the cross-validated MSEs of the other models; the RMSE is printed too.
mse_test = mean_squared_error(y_test_train, y_pred)
print('Test set MSE of rf: {:.4f} (RMSE: {:.4f})'.format(mse_test, sqrt(mse_test)))
scores["RF"]=mse_test

#Let's see feature importance of Random Forest: the 10 least and 10 most important features
coef = pd.Series(data=rf.feature_importances_, index = X_train.columns)
imp_coef = pd.concat([coef.sort_values().head(10), coef.sort_values().tail(10)])
matplotlib.rcParams['figure.figsize'] = (10, 12)
imp_coef.plot(kind = "barh", color="violet")
plt.title("Feautre Importances in the Random Forest Model")

#### Gradient Boosting

In [None]:
#Use Random Search to tune Gradient Boost (LightGBM)
#Candidate values for each hyper-parameter
num_leaves=list(range(4,6))
learn_rate_list=[0.003,0.005,0.01]
n_estimators_list=[5000,6000,7000]
max_bin=[150,200,250]
bagging_fraction=np.linspace(0.7,0.8,5)
bagging_freq=[4,5,6]
bagging_seed=[6,7,8]
feature_fraction=np.linspace(0.15,0.25,5)
feature_seed=[6,7,8]

params_grid={"num_leaves":num_leaves,
        "learning_rate":learn_rate_list,
        "n_estimators":  n_estimators_list,
        "max_bin": max_bin,
        "bagging_fraction":bagging_fraction,
        "bagging_freq":bagging_freq,
        "bagging_seed":bagging_seed,
        "feature_fraction":feature_fraction,
        "feature_fraction_seed":feature_seed  }

#n_iter=40 parameter combinations are drawn at random, each scored with 2-fold CV.
#random_state fixes which combinations are drawn, so the search is reproducible.
random_GBM_class=RandomizedSearchCV(estimator = LGBMRegressor(objective='regression',
                                       max_bin=200,
                                       bagging_fraction=0.75,
                                       bagging_freq=5,
                                       bagging_seed=7,
                                       feature_fraction=0.2,
                                       feature_fraction_seed=7,
                                       verbose=-1),
                              param_distributions=params_grid,
                              n_iter=40,
                              scoring='neg_mean_squared_error',
                              cv=2,
                              refit=True,
                              return_train_score=True,
                              random_state=42 )
random_GBM_class.fit(X_train,y_train)

In [None]:
#Gradient Boost search results, best-ranked parameter sets first (top 15 shown)
cv_results_df = (
    pd.DataFrame(random_GBM_class.cv_results_)
    .sort_values(by="rank_test_score")
)
cv_results_df.iloc[:15]

In [None]:
#Best cross-validated score (sign flipped back from negated MSE) and its parameters
best_score = -random_GBM_class.best_score_
scores["GB"] = best_score
print(f"best score is {best_score}")
print(f"best parameters are {random_GBM_class.best_params_}")

In [None]:
#Feature importance of the refitted best Gradient Boost model:
#the 10 least and the 10 most important features
coef = pd.Series(data=random_GBM_class.best_estimator_.feature_importances_, index=X_train.columns)
ranked_importance = coef.sort_values()
imp_coef = pd.concat([ranked_importance.head(10), ranked_importance.tail(10)])
matplotlib.rcParams['figure.figsize'] = (10, 12)
imp_coef.plot.barh(color="violet")
plt.title("Feautre Importances in the Gradient Boost Model")

#### XGBoost

In [None]:
#Use Random Search to tune XGBoost
#Candidate values for each hyper-parameter
max_depth_list=[4,5]
learn_rate_list=[0.005,0.01]
n_estimators_list=[6000,7000,8000]
colsample_bytree_list=[0.9]
min_child_weight_list=[0,1]
subsample_list=[0.9,1]
#"alpha" and "reg_alpha" are two names for the same L1 penalty, so searching both let
#them override each other; their candidate values are merged into one list.
reg_alpha_list=[0,0.00005,0.00006,0.00007,1,2]
#"lambda" is the L2 penalty; the sklearn wrapper's own name, reg_lambda, is used
reg_lambda_list=[7,8]
gamma_list=[0.5,0.6,0.7]

params_grid={"max_depth":max_depth_list,
        "learning_rate":learn_rate_list,
        "n_estimators":  n_estimators_list,
        "colsample_bytree":colsample_bytree_list,
        "min_child_weight": min_child_weight_list,
        "subsample":subsample_list,
        "reg_alpha": reg_alpha_list,
        "reg_lambda":reg_lambda_list,
        "gamma":gamma_list }

#n_iter=20 parameter combinations are drawn at random, each scored with 2-fold CV.
#Changes to the estimator: 'reg:squarederror' replaces its deprecated name 'reg:linear',
#n_jobs replaces the deprecated nthread, and only one seed (random_state) is given
#instead of the conflicting seed=27 / random_state=42 pair.
random_XGBst_class=RandomizedSearchCV(estimator = XGBRegressor(
                       objective='reg:squarederror',
                       n_jobs=-1,
                       scale_pos_weight=1,
                       random_state=42),
                              param_distributions=params_grid,
                              n_iter=20,
                              scoring='neg_mean_squared_error',
                              cv=2,
                              refit=True,
                              return_train_score=True,
                              random_state=42 )
random_XGBst_class.fit(X_train,y_train)

In [None]:
#XGBoost search results, best-ranked parameter sets first (top 7 shown)
cv_results_df = (
    pd.DataFrame(random_XGBst_class.cv_results_)
    .sort_values(by="rank_test_score")
)
cv_results_df.iloc[:7]

In [None]:
#Best cross-validated score (sign flipped back from negated MSE) and its parameters
best_score = -random_XGBst_class.best_score_
scores['XGB'] = best_score
print(f"best score is {best_score}")
print(f"best parameters are {random_XGBst_class.best_params_}")

In [None]:
#Feature importance of the refitted best XGBoost model:
#the 10 least and the 10 most important features
coef = pd.Series(data=random_XGBst_class.best_estimator_.feature_importances_, index=X_train.columns)
ranked_importance = coef.sort_values()
imp_coef = pd.concat([ranked_importance.head(10), ranked_importance.tail(10)])
matplotlib.rcParams['figure.figsize'] = (10, 12)
imp_coef.plot.barh(color="violet")
plt.title("Feautre Importances in the XGBoost Model")

In [None]:
#Display the error collected for every model so far (lower is better)
scores

In [None]:
#Predict using different models and blend their predictions
X_cv, X_test_train, y_cv, y_test_train= train_test_split(X_train, y_train, test_size=0.2, random_state=123)

#Lasso with Standard Scaler
my_pipeline=Pipeline(steps=[('My scaler', StandardScaler()), ('My classifier',Lasso(alpha =0.0027825594022071257))])
my_pipeline.fit(X_train,y_train)
y_pred_LS=my_pipeline.predict(X_test)
y_pred_LS_train=my_pipeline.predict(X_test_train)

#Random Forest
y_pred_RF=rf.predict(X_test)
y_pred_RF_train=rf.predict(X_test_train)

#Gradient Boost
y_pred_GB=random_GBM_class.predict(X_test)
y_pred_GB_train=random_GBM_class.predict(X_test_train)

#XGBoost
y_pred_XGB=random_XGBst_class.predict(X_test)
y_pred_XGB_train=random_XGBst_class.predict(X_test_train)

#Weighted blend: 30% Lasso, 10% Random Forest, 50% Gradient Boost, 10% XGBoost.
#The last term now uses the XGBoost predictions; before, Gradient Boost was added
#twice and the XGBoost predictions were never used.
y_pred_train=0.3*y_pred_LS_train+0.1*y_pred_RF_train+0.5*y_pred_GB_train+0.1*y_pred_XGB_train
y_pred =0.3*y_pred_LS+0.1*y_pred_RF+0.5*y_pred_GB+0.1*y_pred_XGB

#mean_squared_error returns the MSE (not the RMSE).
#NOTE: the Lasso pipeline and the refitted search estimators were fitted on all of
#X_train, which contains these hold-out rows, so this value is optimistic.
mse_blend = mean_squared_error(y_test_train, y_pred_train)
print(mse_blend)


## Part 5: Get Results on Testing Data

In [None]:
#Convert the blended predictions back from log scale to "SalePrice".
#The target was transformed with np.log1p, whose exact inverse is np.expm1
#(np.exp would leave every predicted price too high by 1).
y_test_pred=np.expm1(y_pred)

In [None]:
#Save the predictions as a two-column submission file (any filename would do)
submission_columns = {'Id': test['Id'], 'SalePrice': y_test_pred}
my_submission = pd.DataFrame(submission_columns)
my_submission.to_csv('submission4.csv', index=False)