In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Easy to Follow EDA and Machine Learning Using Python

*Sam Park*

*16 July 2021*

# Import Libraries and Data

Pandas and numpy are the most important libraries to be familiar with and will be used for most data projects. If you are confused or have questions about any of these libraries, take a second to google each one in order to get a better understanding before reading the rest of the notebook.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.model_selection import cross_val_score
import xgboost as xgb

# Load the competition's training and test splits from the Kaggle input directory.
train_path = '../input/house-prices-advanced-regression-techniques/train.csv'
test_path = '../input/house-prices-advanced-regression-techniques/test.csv'
train, test = pd.read_csv(train_path), pd.read_csv(test_path)

Before jumping right into data cleaning and analysis, make a copy of your original data for reference later on. Also, take a look at how the data is organized and the overall shape of the datasets.

In [None]:
# Keep untouched copies of both frames so the original values stay available
# after the cleaning steps overwrite `train` and `test`.
train_og, test_og = train.copy(), test.copy()

train.head()

In [None]:
test.head()

In [None]:
train.shape

In [None]:
test.shape

# Clean the Data and Perform Exploratory Analysis

### Dealing with Null Data

Now that we've imported and inspected our data, it's time to begin cleaning. First we will be dealing with null values. We will want to delete columns with an excessive amount of null values and fill in the null values for the rest of the columns. This will make it easier for us to run machine learning algorithms later on in order to make our final predictions about sale price.

In [None]:
# Stack train and test into one frame so every cleaning step is applied to both.
# The outer index level ('x' = train rows, 'y' = test rows) lets us split them again later.
data = pd.concat([train, test], keys=('x', 'y')).drop(["Id"], axis=1)

# Rank the columns by how much data is missing, both as an absolute count
# and as a fraction of all rows.
is_null = data.isnull()
null_data = is_null.sum().sort_values(ascending=False)
null_percentage = (is_null.sum() / is_null.count()).sort_values(ascending=False)

missing_data = pd.concat([null_data, null_percentage], axis=1, keys=["Total", "Percentage"])
missing_data.head(20)

In [None]:
# These columns have too many missing values to be worth imputing, so remove them completely.
high_null_columns = ["PoolQC", "MiscFeature", "Alley", "Fence", "FireplaceQu", "LotFrontage"]
data = data.drop(columns=high_null_columns)

In [None]:
# Split the columns into numeric and categorical groups before imputing.
# Both are plain lists (in column order) so they can be used directly as DataFrame
# indexers; pandas >= 2.0 rejects a set as an indexer, and a list keeps the order stable.
num_data = data.select_dtypes(include="number").columns.tolist()
cat_data = [col for col in data.columns if col not in num_data]

# Impute missing values: column mean for numeric columns, most frequent value for
# categorical ones. The filled column is assigned back instead of calling
# `fillna(inplace=True)` on a column selection, which is chained assignment and does
# not update `data` when pandas copy-on-write is active.
# NOTE: SalePrice is numeric, so the test rows' missing prices are mean-filled here too;
# that column is dropped from the test split before modeling.
for col in num_data:
    data[col] = data[col].fillna(data[col].mean())

for col in cat_data:
    data[col] = data[col].fillna(data[col].mode()[0])

# Check that no numeric column has missing values left.
data[num_data].isnull().sum()

In [None]:
# Check that no categorical column has missing values left.
# `cat_data` is converted to a list because pandas >= 2.0 raises when a set is used
# as a DataFrame indexer.
data[list(cat_data)].isnull().sum()

In [None]:
# Inspect the class balance of every categorical column. Columns dominated by a single
# class will not be very useful for training, so we apply a simple 80% cutoff:
# any column where one class covers more than 80% of the rows gets removed.
for column in cat_data:
    class_counts = data[column].value_counts()
    print(class_counts)

In [None]:
# Categorical columns selected for removal after inspecting their value counts.
dominated_columns = [
    "LandSlope", "Condition2", "LandContour", "Street", "ExterCond",
    "Condition1", "Functional", "Electrical", "CentralAir",
    "Heating", "GarageQual", "RoofMatl", "BsmtCond", "PavedDrive",
    "Utilities", "GarageCond", "BsmtFinType2",
]
data = data.drop(columns=dominated_columns)

### Check for Normality of Dependent Variable

It is important that our dependent variable has a relatively normal distribution. This will allow us to identify accurate correlations with other variables and make better predictions using machine learning algorithms later on.

In [None]:
# Plot the raw SalePrice distribution of the training set (50 bins) to judge its shape.
plt.figure(figsize=(12,10))

sns.set_style("darkgrid")
# NOTE(review): `cbar` presumably only matters for bivariate histograms - confirm it is needed here.
sns.histplot(data=train, x="SalePrice", bins=50, cbar=True)

It appears that our dependent variable has a pretty severe right skew. We will attempt to adjust the distribution by performing a log transformation on the data in our selected column.

In [None]:
# Log-transform the target in both the training frame and the combined frame.
# log1p computes log(1 + x); predictions are mapped back with expm1 at the end.
# NOTE: this cell is not idempotent - re-running it applies the log a second time.
train['SalePrice'] = np.log1p(train['SalePrice'])
data['SalePrice'] = np.log1p(data['SalePrice'])

# Re-plot the training-set distribution after the transformation.
plt.figure(figsize=(12,10))
sns.set_style("darkgrid")
sns.histplot(data=train, x="SalePrice", bins=50, cbar=True, color='black')

After the log transformation the data now has a relatively normal distribution. We can now continue with our data analysis. However, since we transformed our dependent variable data we will need to remember to undo the log transformation on our predicted values at the end of our project.

### Correlation Analysis

Our next step is to find out which variables are correlated with sale price. Drop any variables that have little to no correlation with sale price and take note of the variables which have high correlation. This will give us an idea of which variables will be weighted heavily in our predictive algorithms.

In [None]:
# Rank variables based on their correlation with sale price.
# Only numeric columns can be correlated. Selecting them explicitly keeps this cell
# working on pandas >= 2.0, where DataFrame.corr() raises on non-numeric columns
# instead of silently dropping them.
corr = train.select_dtypes(include="number").corr()
corr_rank = corr["SalePrice"].sort_values(ascending=False)
corr_rank

In [None]:
# Delete the variables whose correlation with sale price has an absolute value below 0.1.
weak_corr_columns = [
    "PoolArea", "MoSold", "3SsnPorch", "BsmtFinSF2", "BsmtHalfBath",
    "MiscVal", "LowQualFinSF", "YrSold", "OverallCond", "MSSubClass",
]
data = data.drop(columns=weak_corr_columns)

In [None]:
# Select the features whose correlation with sale price is strong in either direction.
# abs() has to wrap the correlation values, not the boolean comparison: the original
# `abs(corr["SalePrice"] > 0.5)` took the absolute value of a True/False mask, so
# strongly negative correlations (< -0.5) could never be selected.
top_features = corr.index[corr["SalePrice"].abs() > 0.5]
plt.figure(figsize=(9, 9))
heat_map = sns.heatmap(data[top_features].corr(), annot=True, cmap="RdYlGn")

According to our correlation heatmap, overall quality (OverallQual) and above ground square footage (GrLivArea) are key variables which have the highest correlation with sale price. We can see that GarageCars and GarageArea also have a high correlation with sale price, but since they essentially measure the same thing we don't have to consider them individually. First floor and basement square footage also have a relatively high correlation with sale price. Based on this analysis it seems like variables dealing with living area have a high impact on the sale price.

In [None]:
# Scatter sale price against the 12 features ranked highest by correlation.
# errors="ignore" keeps the cell re-runnable: on a second run SalePrice has already
# been removed from corr_rank and a plain drop would raise KeyError.
corr_rank = corr_rank.drop(["SalePrice"], errors="ignore")
sorted_corr = corr_rank.index.tolist()

# Plot only the training rows ('x' level of the index). The test rows have no observed
# sale price (the column was mean-imputed during cleaning), so including them would
# add an artificial horizontal band to every panel.
train_rows = data.loc["x"]

fig, axes = plt.subplots(4, 3, figsize=(20, 10), sharey=True)
fig.suptitle("Highest Correlation with Sale Price", fontsize=20)
plt.subplots_adjust(hspace=0.7, wspace=0.1)
# axes.flat walks the 4x3 grid row by row; zip stops after the 12 panels are filled.
for ax, col in zip(axes.flat, sorted_corr):
    sns.scatterplot(y=train_rows['SalePrice'], x=train_rows[col], ax=ax)
    ax.set_title('SalesPrice with ' + col)

Using scatterplots we can take a closer look at the variables with the highest correlation to sale price. You can see that there is a significant upward trajectory in sale price once the overall quality reaches 6 and above. We can also see a clear trend as sale price increases with greater square footage in above ground living area, garage area, basement square footage, as well as 1st floor square footage. However, we can also see that there are a lot of outliers in our data. Next we will create a function to cap the outliers in our numerical data.

### Removing Outliers

Removing outliers in our data will prevent unnecessary biases from arising for our numerical variables. First choose which numerical variables need to have outliers removed. I would recommend leaving out variables such as year built and number of rooms as these are essentially categorical variables with numeric values. Then write a function to convert all outliers to the maximum and minimum values based on interquartile range.

In [None]:
# List every column of `data` whose dtype is not `object`, as candidates for outlier handling.
n_features = data.select_dtypes(exclude = ["object"]).columns
n_features

In [None]:
# Area/size columns whose outliers will be capped; kept as a separate frame for inspection.
outlier_columns = [
    "LotArea", "MasVnrArea", "BsmtFinSF1", "BsmtUnfSF", "TotalBsmtSF",
    "1stFlrSF", "2ndFlrSF", "GrLivArea", "GarageArea", "WoodDeckSF",
    "OpenPorchSF",
]
data_outliers = data[outlier_columns]

In [None]:
def mod_outliers(data, columns=None, whisker=1.5):
    """Cap outliers in selected numeric columns at their interquartile-range fences.

    For every selected column the lower fence is ``Q1 - whisker * IQR`` and the upper
    fence is ``Q3 + whisker * IQR``. Values outside the fences are replaced by the
    nearest fence; no rows are removed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the selected columns. It is not modified.
    columns : list of str, optional
        Columns to cap. Defaults to the notebook's area/size features.
    whisker : float, default 1.5
        Multiple of the IQR used to place the fences.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with the selected columns capped. All other columns and the
        index (including a MultiIndex) are left untouched.

    Raises
    ------
    KeyError
        If any selected column is missing from ``data``.
    """
    if columns is None:
        columns = ["LotArea", "MasVnrArea", "BsmtFinSF1", "BsmtUnfSF", "TotalBsmtSF",
                   "1stFlrSF", "2ndFlrSF", "GrLivArea", "GarageArea", "WoodDeckSF",
                   "OpenPorchSF"]
    columns = list(columns)

    capped = data.copy()
    selected = data[columns]

    q1 = selected.quantile(0.25)
    q3 = selected.quantile(0.75)
    iqr = q3 - q1

    lower_bound = q1 - (whisker * iqr)
    upper_bound = q3 + (whisker * iqr)

    # Vectorised capping: axis=1 aligns the per-column fences with the frame's columns.
    # This replaces element-wise chained assignment (`data[col][i] = ...`), which
    # depended on positional fallback for integer keys and could silently lose writes
    # when an integer column had to hold a fractional fence value.
    clipped = selected.clip(lower=lower_bound, upper=upper_bound, axis=1)

    for col in columns:
        capped[col] = clipped[col]

    return capped

In [None]:
# Cap outliers in the standalone subset (used for the box-plot check) and in the full data set.
data_outliers = mod_outliers(data_outliers)
data = mod_outliers(data)

In [None]:
# Draw one box plot per modified variable to check that the outliers were indeed capped.
for column in data_outliers.columns:
    sns.boxplot(x=data_outliers[column])
    plt.show()

### Prepare Data for Modeling

Now that we have cleaned and analyzed our data, we can prepare the data for modeling. First we have to create dummy variables for all of our categorical variables. Next, separate the test and training sets which we combined earlier for cleaner processing. Then separate sale price as its own series and remove it from the training set. You should end up with training "X", test "X", and training "Y" datasets in order to start modeling.

In [None]:
# One-hot encode every categorical column.
data = pd.get_dummies(data)

# Undo the earlier concat: the 'x' level holds the training rows, 'y' the test rows.
# The test split has no real target, so its SalePrice column is dropped.
train, test = data.loc["x"], data.loc["y"].drop(columns=["SalePrice"])

# Separate the target from the training features.
y = train["SalePrice"]
train_x = train.drop(columns=["SalePrice"])
test_x = test

In [None]:
train_x.shape

In [None]:
test_x.shape

In [None]:
y.shape

# Apply Machine Learning Algorithms and Make Final Prediction

The final step is to apply some regression algorithms to our data and make a final prediction. Since we have a high number of dimensions we are going to skip simple linear regression. We will start by cross validating the ridge and lasso algorithms in order to find the best tuning parameters for our data. Lasso and Ridge regression are both designed to reduce the coefficients of non-important variables. The alpha parameter determines the degree to which the coefficients are reduced. Then we will train a separate XGBoost model, blend its predictions with those of the better performing algorithm between lasso and ridge, and save our final prediction.

### Ridge Regression

In [None]:
def rmse_cv(model):
    """Return the per-fold root mean squared error of `model` under 5-fold cross validation.

    The model is scored with `cross_val_score` on the notebook-level `train_x` and `y`
    using negative mean squared error; each fold's score is then converted to an RMSE
    so the accuracy of different models can be compared.
    """
    neg_mse_per_fold = cross_val_score(
        model, train_x, y, scoring="neg_mean_squared_error", cv=5
    )
    return np.sqrt(-neg_mse_per_fold)
# Let RidgeCV pick the best regularisation strength from this grid, then report
# the mean cross-validated RMSE.
ridge_alphas = [0.05, 0.1, 0.3, 1, 5, 10, 15, 30, 50, 75]
ridge = RidgeCV(alphas=ridge_alphas)
ridge.fit(train_x, y)
rmse_cv(ridge).mean()

### Lasso Regression

In [None]:
# Let LassoCV pick the best regularisation strength from this grid, then report
# the mean cross-validated RMSE.
lasso_alphas = [1, 0.1, 0.01, 0.001, 0.0001]
lasso = LassoCV(alphas=lasso_alphas)
lasso.fit(train_x, y)
rmse_cv(lasso).mean()

Using lasso regression we were able to achieve a lower rmse, and therefore we will blend our lasso model's predictions with those of an XGBoost model to make our final prediction. Although lasso and ridge both reduce the coefficients of variables based on alpha, it should be noted that only lasso can completely remove variables by reducing their coefficients to zero. Since this dataset has high dimensionality, this could be a reason why lasso was able to perform slightly better than ridge.

### Blending an XGBoost Model with our Lasso Model

In [None]:
# Gradient-boosted trees: 360 shallow (depth-2) trees with a learning rate of 0.1.
xgb_params = {"n_estimators": 360, "max_depth": 2, "learning_rate": 0.1}
model_xgb = xgb.XGBRegressor(**xgb_params)
model_xgb.fit(train_x, y)

In [None]:
# Both models predict log1p(SalePrice); expm1 maps the predictions back to the original
# price scale before they are blended 70% lasso / 30% XGBoost.
lasso_preds, xgb_preds = (
    np.expm1(fitted.predict(test_x)) for fitted in (lasso, model_xgb)
)
preds = 0.7*lasso_preds + 0.3*xgb_preds

### Prepare Predictions for Submission

In [None]:
# Build the submission table. The identifier column is named "Id" to match the
# column name in the source data and the competition's submission format
# (header "Id,SalePrice"); the original lowercase "id" did not match either.
submission = pd.DataFrame({"Id": test_og.Id, "SalePrice": preds})
submission.head(5)

In [None]:
# Write the submission to "submission.csv" (relative path) without the DataFrame index column.
submission.to_csv("submission.csv", index = False)