# House Sales in King County, USA

Here the [House Sales in King County, USA](https://www.kaggle.com/harlfoxem/housesalesprediction) dataset by [harlfoxem](https://www.kaggle.com/harlfoxem) is used to perform `EDA` on housing prices and to create a `machine learning model` that predicts house prices.

**About data source**: This dataset contains house sale prices for King County, which includes Seattle. It includes homes sold between `May 2014 and May 2015`.

![](https://media.giphy.com/media/3o6Mba1qerHR51rl9C/giphy.gif)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold

import xgboost

from scipy.stats import zscore, pearsonr

from joblib import dump

In [None]:
# Pandas display configuration
def pandas_config():
    """Cap DataFrame output at 10 rows and remove the column display limit."""
    display_options = {
        'display.max_rows': 10,
        'display.max_columns': None,
    }
    for option_name, option_value in display_options.items():
        pd.set_option(option_name, option_value)


pandas_config()

In [None]:
# Loading the dataset
# The path points into the Kaggle input directory of this dataset.
file_path = '/kaggle/input/housesalesprediction/kc_house_data.csv'
df = pd.read_csv(file_path)
# Show 5 randomly sampled rows as a quick sanity check.
df.sample(5)

In [None]:
# Print the frame summary (column dtypes and non-null counts)
df.info()

No missing data

## Data preparation

In [None]:
# Drop a single column from a DataFrame
def drop_df_column(df, column_name, inplace=True):
    """Remove ``column_name`` from ``df``.

    Args:
        df: DataFrame to drop the column from.
        column_name: Label of the column to remove.
        inplace: When true (the default) ``df`` itself is modified.

    Returns:
        Whatever ``DataFrame.drop`` returns: ``None`` for an in-place drop,
        otherwise a new DataFrame without the column.
    """
    return df.drop(columns=[column_name], inplace=inplace)

In [None]:
# Drop the `id` column in place (drop_df_column defaults to inplace=True)
drop_df_column(df, 'id')

In [None]:
# List the object-dtype columns, then discard `date` in place
object_columns = df.select_dtypes('object').columns.tolist()
print(object_columns)
df.drop(columns=['date'], inplace=True)

In [None]:
def plot_corr(df, figsize=(16, 12)):
    """Draw an annotated heatmap of the pairwise column correlations of ``df``.

    Args:
        df: DataFrame whose column correlations are plotted.
        figsize: ``(width, height)`` passed to ``plt.subplots``.
    """
    # `DataFrame.corr` uses the Pearson correlation by default
    correlation_matrix = df.corr()
    colour_map = sns.cubehelix_palette(start=.5, rot=-.5, as_cmap=True)

    _, axis = plt.subplots(1, 1, figsize=figsize)
    heatmap = sns.heatmap(correlation_matrix, ax=axis, annot=True, cmap=colour_map)

    # Rotate the x tick labels by 75 degrees
    for tick_label in heatmap.get_xticklabels():
        tick_label.set_rotation(75)


plot_corr(df)

In [None]:
# Print the column names on a single line (instead of getting a long
# list of column names)
def get_column_names(df):
    """Print every column name of ``df`` followed by `` | ``, with no newline."""
    separated_names = ''.join(
        f'{column_name} | ' for column_name in df.columns.tolist()
    )
    print(separated_names, end='')

In [None]:
# Print the names of the columns that are left in `df`
get_column_names(df)

In [None]:
def plot_base_relation(df, figsize=(20, 200)):
    """Plot four diagnostic charts for every column of ``df``, one row each.

    Grid columns, left to right: histogram, box plot, scatter plot against
    ``price``, and count plot.

    Args:
        df: DataFrame to visualise. It must contain a ``price`` column,
            which is used as the y-axis of the scatter plots.
        figsize: ``(width, height)`` passed to ``plt.subplots``.
    """
    columns = df.columns.tolist()
    if not columns:
        # Nothing to draw; a grid with zero rows cannot be created
        return

    # squeeze=False keeps `axs` two-dimensional even when `df` has a single
    # column, so `axs[idx][n]` is valid for every grid size
    _, axs = plt.subplots(len(columns), 4, figsize=figsize, squeeze=False)

    for idx, column in enumerate(columns):
        # To get distribution of data
        sns.histplot(
            x=df[column],
            kde=False,
            color='#65b87b', alpha=.7,
            ax=axs[idx][0]
        )

        # To get knowledge about outliers
        sns.boxplot(
            x=df[column],
            color='#6fb9bd',
            ax=axs[idx][1]
        )

        # To get its relation with price
        sns.scatterplot(
            x=column, y='price', data=df,
            color='#706dbd', alpha=.7, s=80,
            ax=axs[idx][2]
        )

        # To get count plot for `column`
        sns.countplot(
            x=column, data=df,
            color='#42b0f5', alpha=.7,
            ax=axs[idx][3]
        )


plot_base_relation(df, figsize=(20, 70))

### Dealing with outliers

Many columns contain outliers. The `IQR` and `Z-score` methods are used to deal with them.

In [None]:
# Removing outliers using IQR method
def rm_outliers_in_col_using_iqr(df, col):
    """Drop the rows of ``df`` whose ``col`` value lies outside the IQR fences.

    Args:
        df: DataFrame the rows are removed from (not modified in place).
        col: The column to inspect, i.e. ``df.column_name`` / ``df[column_name]``.

    Returns:
        Tuple ``(outlier_labels, filtered_df)``: the index labels of the
        removed rows and a new DataFrame without them.
    """
    first_quartile = col.quantile(0.25)
    third_quartile = col.quantile(0.75)
    iqr = third_quartile - first_quartile

    # Values beyond 1.5 * IQR from either quartile count as outliers
    below_lower_fence = col < (first_quartile - 1.5 * iqr)
    above_upper_fence = col > (third_quartile + 1.5 * iqr)
    outlier_labels = col.index[below_lower_fence | above_upper_fence].tolist()

    filtered_df = df.drop(index=outlier_labels)
    return (outlier_labels, filtered_df)


# Removing outliers using the Zscore method
def rm_outliers_in_col_using_zscore(df, col, column_name):
    """Drop the rows of ``df`` whose ``col`` value has an absolute z-score above 3.

    Args:
        df: DataFrame the rows are removed from (not modified in place).
        col: The column to inspect, i.e. ``df.column_name`` / ``df[column_name]``.
        column_name: Name of the inspected column.

    Returns:
        Tuple ``(outlier_labels, filtered_df)``: the index labels of the
        removed rows and a new DataFrame without them.
    """
    scores = zscore(col.to_numpy())
    is_outlier = np.abs(scores) > 3

    # The scores are positional, so map them back onto the labels of `df`
    outlier_labels = df.index[is_outlier].tolist()

    filtered_df = df.drop(index=outlier_labels)
    return (outlier_labels, filtered_df)


# Remove outliers of a column using iqr & zscore methods
def remove_outliers_of_a_column(df, column_name):
    """Repeatedly strip IQR and z-score outliers of one column from ``df``.

    Each pass applies the IQR filter and then the z-score filter to the frame
    left by the previous step. At most 10 passes are made; the loop stops
    early once a full pass removes nothing, because every later pass would
    then see the same frame and remove nothing either.

    Args:
        df: DataFrame to filter (not modified in place).
        column_name: Name of the column whose outliers are removed.

    Returns:
        Tuple ``(rm_idxs, df)``: the index labels of all removed rows and the
        filtered DataFrame.
    """
    rm_idxs = []
    for _ in range(10):
        removed_before_pass = len(rm_idxs)

        outliers_row_idx, df = rm_outliers_in_col_using_iqr(df, df[column_name])
        rm_idxs.extend(outliers_row_idx)

        outliers_row_idx, df = rm_outliers_in_col_using_zscore(df, df[column_name], column_name)
        rm_idxs.extend(outliers_row_idx)

        if len(rm_idxs) == removed_before_pass:
            # Converged: this pass found no outliers
            break
    return rm_idxs, df


# Remove outliers of a df using iqr & zscore methods
def remove_outliers_of_df(df):
    """Strip outliers column by column from every column except ``price``.

    Args:
        df: DataFrame to filter.

    Returns:
        Tuple ``(removed_labels, df)``: the index labels of all removed rows
        and the filtered DataFrame.
    """
    # `price` is deliberately left untouched
    feature_columns = [name for name in df.columns.tolist() if name != 'price']

    removed_labels = []
    for name in feature_columns:
        column_outliers, df = remove_outliers_of_a_column(df, name)
        removed_labels += column_outliers
    return removed_labels, df

In [None]:
# Variant of `remove_outliers_of_df` that drops outlier-heavy columns
# instead of their rows
def remove_outliers_of_df_with_threshold(df, threshold=2):
    """Remove outliers column by column, dropping columns with too many of them.

    Rationale (author's note): removing the outliers of all columns at once
    leaves only about 1/10th of the data. So each column is handled on its
    own: if more than ``threshold`` percent of the current rows are outliers
    of that column, the whole column is dropped; otherwise the outlier rows
    are removed. The threshold is kept low because row removals add up
    across columns. For example, with a threshold of 20%, a column with 15%
    outliers followed by one with 10% outliers would remove about
    15% + 10% = 25% of the rows.

    The outlier share is rounded to two decimals before it is compared, so
    the comparison works in whole percents. The ``price`` column is never
    examined.

    Args:
        df: DataFrame to clean. It is not modified; a new frame is returned.
        threshold: Percentage of outlier rows above which a column is
            dropped entirely.

    Returns:
        Tuple ``(rm_rows_idxs, df)``: the index labels of the removed rows
        and the cleaned DataFrame.
    """
    rm_rows_idxs = []
    for column in df.columns.tolist():
        if column == 'price':
            # As we don't want to do anything with `price`
            continue

        rm_idxs, tmp_df = remove_outliers_of_a_column(df, column)

        # Guard the division: an empty frame has no outliers to speak of
        if len(df):
            outlier_percent = round(len(rm_idxs) / len(df), 2) * 100
        else:
            outlier_percent = 0

        if outlier_percent > threshold:
            # Rebind instead of dropping in place so the caller's frame is
            # never mutated
            df = df.drop(columns=[column])
        else:
            df = tmp_df
            rm_rows_idxs.extend(rm_idxs)

    return rm_rows_idxs, df

In [None]:
print(f'Dataset size before removing outliers: {len(df)}')

# Silence NumPy divide/invalid floating-point warnings during the outlier pass
# (presumably raised by z-scores of zero-variance columns — TODO confirm)
with np.errstate(divide='ignore', invalid='ignore'):
    # threshold=4: columns with more than about 4% outlier rows are dropped
    RM_ROWS_IDXS, df = remove_outliers_of_df_with_threshold(df, threshold=4)

print(f'Dataset size after removing outliers: {len(df)}')

In [None]:
# RM_ROWS_IDXS holds the labels of the rows removed as outliers
print(f'{len(RM_ROWS_IDXS)} rows are dropped while removing outliers')

In [None]:
# Redraw the per-column diagnostic plots on the cleaned frame
plot_base_relation(df, (20, 38))

In [None]:
# Remove columns which have only one unique value as they won't be useful
# NOTE(review): presumably `waterfront` is constant once the outlier rows are
# gone — confirm against the plots
drop_df_column(df, 'waterfront')

In [None]:
# Correlation heatmap of the columns that are left
plot_corr(df)

In [None]:
# Drop these four columns in place, one after another
for unused_column in ('yr_renovated', 'zipcode', 'lat', 'long'):
    drop_df_column(df, unused_column)

## Exploratory Data Analysis

In [None]:
# Preview the first rows of the prepared frame
df.head()

In [None]:
def plot_scatterplot(x, y, ax=None):
    """Draw a scatter plot of ``y`` against ``x`` on ``ax``.

    Args:
        x: Values for the x-axis.
        y: Values for the y-axis.
        ax: Axes to draw on; forwarded to seaborn unchanged.
    """
    marker_style = {'color': '#706dbd', 'alpha': .7, 's': 80}
    sns.scatterplot(x=x, y=y, ax=ax, **marker_style)
    
    
def plot_boxplot(x, ax=None):
    """Draw a box plot of ``x`` on ``ax`` (forwarded to seaborn unchanged)."""
    sns.boxplot(
        x=x,
        color='#6fb9bd',
        ax=ax,
    )
    
    
def plot_barplot(x, y, ax=None):
    """Draw a bar plot of ``y`` per ``x`` on ``ax`` using the ``rocket`` palette.

    Note that ``data`` is always the module-level ``df``, not a parameter.
    """
    sns.barplot(
        x=x,
        y=y,
        data=df,
        palette='rocket',
        ax=ax,
    )

In [None]:
_, ax = plt.subplots(2, 2, figsize=(16, 8))

# One bar plot of price per categorical-like column, filling the grid row by row
barplot_columns = ['bedrooms', 'condition', 'bathrooms', 'floors']
for barplot_column, panel in zip(barplot_columns, ax.flat):
    plot_barplot(df[barplot_column], df.price, ax=panel)

In [None]:
_, ax = plt.subplots(2, 3, figsize=(16, 8))

# Price against each of these columns, filling the grid row by row;
# the sixth panel stays empty
scatter_columns = ['sqft_living', 'sqft_above', 'sqft_basement', 'yr_built', 'sqft_living15']
for scatter_column, panel in zip(scatter_columns, ax.flat):
    plot_scatterplot(df[scatter_column], df.price, ax=panel)
In [None]:
# Correlation heatmap of the remaining columns, in a smaller figure
plot_corr(df, figsize=(14, 8))

`sqft_above` has a strong positive correlation with `sqft_living` and a moderate positive correlation with `sqft_living15`, and `sqft_living` is also positively correlated with `sqft_living15`. In short, there is a `multi-collinearity` issue here, so two of these three columns are dropped.

In [None]:
# Of the three correlated area columns only `sqft_living` is kept
drop_df_column(df, 'sqft_above')
drop_df_column(df, 'sqft_living15')

In [None]:
# Average sale price per construction year
tmp_df = df[['yr_built', 'price']].sort_values(by=['yr_built'])

group = tmp_df.groupby(['yr_built'])['price'].mean()
avg_price_of_the_year = group.tolist()

# `tmp_df` is sorted by year, so its unique years line up with the grouped means
built_years = tmp_df.yr_built.unique()
plt.plot(built_years, avg_price_of_the_year, linestyle='solid')
plt.xticks(rotation=16)

## Modelling

In [None]:
# Scaling int & float dtype column
def standard_scaler(column):
    """Standardise one column with a freshly fitted ``StandardScaler``.

    Args:
        column: The pandas Series to scale.

    Returns:
        The result of ``StandardScaler.fit_transform`` on the column values
        arranged as a single feature column.
    """
    # fit_transform needs 2D input, so turn the 1D values into one column
    values_2d = column.values.reshape(-1, 1)
    scaler = StandardScaler()
    return scaler.fit_transform(values_2d)


# Scaling all int & float dtype columns
def scaling_df(df):
    """Standardise every ``int64``/``float64`` column of ``df`` in place.

    Args:
        df: DataFrame to scale; its numeric columns are overwritten.

    Returns:
        The same ``df`` object, after scaling.
    """
    # Only columns of dtype int64 or float64 are selected
    numeric_column_names = df.select_dtypes(
        include=[np.int64, np.float64]
    ).columns.tolist()

    for numeric_column_name in numeric_column_names:
        df[numeric_column_name] = standard_scaler(df[numeric_column_name])
    return df


# Standardise every int64/float64 column of `df` in place
# NOTE(review): this includes the target `price` when it is numeric, so later
# error metrics would be in standardised units — confirm this is intended
scaling_df(df)

df.sample(5)

In [None]:
# Feature matrix: every remaining column except the target
columns = df.columns.tolist()
columns.remove('price')

x = df[columns]
# Target vector
y = df['price']

In [None]:
# Splitting the dataset: 30% is held out for testing; the fixed
# random_state makes the split reproducible
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=6)

In [None]:
# Cross validation

kf = KFold(n_splits=10)

# Baseline: ridge regression on the features as they are
score = cross_val_score(Ridge(), x_train, y_train, cv=kf)
print(score.mean())

# Degree-4 polynomial expansion: fit on the training split only, then reuse
# the fitted transformer for the test split
pr = PolynomialFeatures(degree=4)
x_train_pr = pr.fit_transform(x_train)
x_test_pr = pr.transform(x_test)

# Ridge regression on the expanded features
score = cross_val_score(Ridge(), x_train_pr, y_train, cv=kf)
print(score.mean())

In [None]:
# Using XGBoost

xgb = xgboost.XGBRegressor()
try:
    xgb.fit(x_train_pr, y_train)
except KeyError as err:
    # Stay best-effort (the notebook keeps running) but report the failure
    # instead of hiding it: a model that did not finish fitting cannot be
    # used for the predictions that follow
    print(f'XGBoost fit raised KeyError: {err!r}')

## Evaluation

In [None]:
# Predictions on the polynomial-expanded test features
xgb_y_test_pred = xgb.predict(x_test_pr)

In [None]:
# RMSE as the square root of the MSE; this avoids the `squared=False`
# keyword, which newer scikit-learn releases no longer accept
rms_error = np.sqrt(mean_squared_error(y_test, xgb_y_test_pred))
r2_score_value = r2_score(y_test, xgb_y_test_pred)

print(f"Root mean squared error: {rms_error}")
print(f"R2-score: {r2_score_value}")

In [None]:
# Creating a pipeline

scaling = ('scale', StandardScaler())
ploy = ('ploy', PolynomialFeatures(degree=4))
model = ('model', xgboost.XGBRegressor())

# Steps in the pipeline: scale -> polynomial features -> XGBoost regressor
steps = [scaling, ploy, model]

pipe = Pipeline(steps=steps)

# Fitting the model (`model` is rebound from the step tuple to the fitted pipeline)
model = pipe.fit(x_train, y_train)

# Out-Of-Sample Forecast
y_test_pred = model.predict(x_test)

# Evaluation
# RMSE as the square root of the MSE; this avoids the `squared=False`
# keyword, which newer scikit-learn releases no longer accept
rms_error = np.sqrt(mean_squared_error(y_test, y_test_pred))
r2_score_value = r2_score(y_test, y_test_pred)

print(f"Root mean squared error: {rms_error}")
print(f"R2-score: {r2_score_value}")

In [None]:
# Saving the model: write the fitted pipeline to `model.joblib` in the
# working directory
dump(model, 'model.joblib')

## Visualizing our prediction against actual values

### Visualizing entire prediction vs actual value

In [None]:
# Actual and predicted values side by side, one panel each
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 6))

for panel, values, legend_name in ((ax1, y_test, 'Actual'), (ax2, y_test_pred, 'Prediction')):
    panel.plot(np.arange(len(values)), values, label=legend_name)
    panel.legend()

# Both series overlaid on a single axis
f, ax3 = plt.subplots(nrows=1, ncols=1, figsize=(20, 6))

for values, legend_name in ((y_test, 'Actual'), (y_test_pred, 'Prediction')):
    ax3.plot(np.arange(len(values)), values, label=legend_name)

ax3.legend()

### Visualizing prediction vs actual values in interval of 100

In [None]:
def plot_result(start, end):
    """Plot actual vs. predicted values for one window of the test set.

    Reads the module-level ``y_test`` and ``y_test_pred`` and slices both
    with ``[start:end + 1]``. The x-axis restarts at 0 for every window; the
    title carries the window bounds.

    Args:
        start: First position of the window.
        end: Last position of the window (the slice stops at ``end + 1``).
    """
    window = slice(start, end + 1)
    actual_values = y_test[window]
    predicted_values = y_test_pred[window]

    f, ax3 = plt.subplots(nrows=1, ncols=1, figsize=(13, 5))
    ax3.plot(np.arange(len(actual_values)), actual_values, label='Actual')
    ax3.plot(np.arange(len(predicted_values)), predicted_values, label='Prediction')

    ax3.set_title(f'{start} - {end}')
    ax3.legend()

In [None]:
# Walk the whole test set in windows of 100 samples; the bound comes from
# `y_test` itself rather than a hard-coded row count
for start in range(0, len(y_test), 100):
    end = start + 100
    plot_result(start, end)

---

I'll wrap things up there. If you want to find some other answers, go ahead and `edit` this kernel. If you have any `questions`, do let me know.

If this kernel helped you, don't forget to 🔼 `upvote` and share your 🎙 `feedback` on how the kernel could be improved.

![](https://media.giphy.com/media/cp7bUxkodNBHW/giphy.gif)

---