In [None]:
import pandas as pd
import numpy as np
import scipy as sp
import sklearn as sk
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.preprocessing
import statsmodels.formula.api as smapi
import itertools

In [None]:
df = pd.read_csv('data/train_prepared_light.csv')

In [None]:
df.head()

In [None]:
df.shape

In [None]:
ncategories = sum(df.dtypes == object)
ncategories

In [None]:
target = pd.read_csv('data/train_target.csv')

In [None]:
target.shape

# Sale price distribution
First step is to look at the target sale price for the training data set, i.e. the column we're trying to predict. 

In [None]:
target.describe()

The sale price is in the hundreds of thousands, so let's divide the price by 1000 to get more manageable numbers.

In [None]:
target = target / 1000

In [None]:
sns.distplot(target);
plt.title('SalePrice')

In [None]:
import scipy as sp
# Load the subpackage explicitly: on older SciPy releases a bare
# `import scipy` does not make `sp.stats` available, and the call below would
# only work because another library happened to import `scipy.stats` first.
import scipy.stats

# Sample skewness of the sale price (a positive value means a right-skewed distribution).
sp.stats.skew(target)

In [None]:
sp.stats.skewtest(target)

The distribution is skewed, as demonstrated by the large z-score (and small p-value) of the skewtest. It is right-skewed (the skew is positive). Skewed distributions are not ideal for linear models, which often assume a normal distribution. One way to correct for right-skewness is to take the log [1,2].

- [1] http://fmwww.bc.edu/repec/bocode/t/transint.html 
- [2] https://www.r-statistics.com/2013/05/log-transformations-for-skewed-and-wide-distributions-from-practical-data-science-with-r/
- [3] Alexandru Papiu's notebook https://www.kaggle.com/apapiu/house-prices-advanced-regression-techniques/regularized-linear-models/commentsnotebook 

We apply the function $x \rightarrow \log(1 + x)$ because it is defined and non-negative for every $x \geq 0$ (a plain $\log$ diverges at $x = 0$).

In [None]:
# log(1 + x) transform of the sale price (already divided by 1000 above).
logtarget = np.log1p(target)
# Repeat the skewness diagnostics on the transformed target; `skew` returns
# one value per column, hence the `[0]` (presumably the only column — confirm).
print('skewness of logtarget = ', sp.stats.skew(logtarget)[0])
print('skewness test of logtarget = ', sp.stats.skewtest(logtarget))
# Distribution of the transformed target, titled on the current axes.
sns.distplot(logtarget)
plt.title(r'log(1 + SalePrice)')

# Features
The dataset is wide with 78 features.

In [None]:
df.columns, len(df.columns)

We've got 3 data types: int, float and object

In [None]:
df.dtypes.unique()

Split the data between categorical and numerical features

In [None]:
is_categorical = (df.dtypes == object)
is_numerical = (~ is_categorical)

In [None]:
dfcat = df.loc[:, is_categorical]
dfcat.head()

In [None]:
dfnum = df.loc[:, is_numerical]
dfnum.head()

## Numerical features

In [None]:
dfnum.columns, len(dfnum.columns)

We've got 36 numerical features. We can use the `describe` method to get some statistics:

In [None]:
dfnum.describe()

But that's a lot of numbers to digest. Better get started plotting! To help with plotting, but also to improve linear regression models, we're going to standardize our data. But before that we must deal with the NaN values.
http://sebastianraschka.com/Articles/2014_about_feature_scaling.html

### Deal with NaN values 

In [None]:
# Numerical columns that contain at least one missing value.
cols_with_nulls = dfnum.columns[dfnum.isnull().any()]
cols_with_nulls

In [None]:
dfnum[cols_with_nulls].isnull().sum()
#.plot(kind='bar')

Based on the description, the null values for `MasVnrArea` should be 0 (no masonry veneer). Missing `LotFrontage` values are replaced by the column median.

In [None]:
# Impute by building a new frame instead of calling `fillna(inplace=True)` on
# an attribute-accessed column: that chained form mutates an intermediate
# Series, triggers pandas' chained-assignment warning, and leaves `dfnum`
# unchanged once copy-on-write is active.
dfnum = dfnum.fillna({
    'LotFrontage': dfnum['LotFrontage'].median(),  # median imputation
    'MasVnrArea': 0,  # a missing area means there is no masonry veneer
})

For the GarageYrBlt, replace by the year the house was built. 

In [None]:
# A missing GarageYrBlt is replaced by the year the house was built.
# `fillna` aligns the replacement Series on the row index, so no explicit mask
# is needed, and `assign` returns a new frame, which avoids the chained
# in-place call that does not update `dfnum` under pandas copy-on-write.
dfnum = dfnum.assign(GarageYrBlt=dfnum['GarageYrBlt'].fillna(dfnum['YearBuilt']))
# check: must display False (no missing value left in any numerical column)
dfnum.isnull().sum().any()

### Standardize the data 

In [None]:
def standardize(df):
    """Return *df* with every column scaled by scikit-learn's ``StandardScaler``.

    Parameters
    ----------
    df : pandas.DataFrame
        Numerical data, handed as-is to ``StandardScaler().fit_transform``.

    Returns
    -------
    pandas.DataFrame
        The scaled values, carrying the same column labels and the same row
        index as *df*.
    """
    _values = sk.preprocessing.StandardScaler().fit_transform(df)
    # Keep the caller's row index: rebuilding a fresh 0..n-1 index would
    # silently misalign the result with the input whenever the input is not
    # indexed by a default RangeIndex.
    return pd.DataFrame(data=_values, columns=df.columns, index=df.index)

dfnum_t = standardize(dfnum)

**Plot violinplots for each feature**
The violin plots give us some idea of the distribution of data for each feature. We can look for things like skewness, non-normality, and the presence of outliers. 

In [None]:
def violinplot(df, ax=None):
    """Draw one violin per column of *df* and tilt the x tick labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide-form data; each column becomes one violin.
    ax : matplotlib axes, optional
        Axes to draw on. Defaults to the current axes (``plt.gca()``).
    """
    if ax is None:
        ax = plt.gca()
    # Pass the frame by keyword: the first positional parameter of
    # `sns.violinplot` is not the same across seaborn releases (`x` in older
    # ones, `data` in newer ones), whereas `data=` is accepted by all of them.
    sns.violinplot(data=df, ax=ax)
    # Tilt the feature names so that neighbouring labels do not overlap.
    for xlab in ax.get_xticklabels():
        xlab.set_rotation(30)
        


In [None]:
def featureplot(df, nrows=1, figsize=(10,10), plotfunc=violinplot):
    """Plot the features (columns) of *df*, spread over *nrows* stacked axes.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; columns are assigned to the axes in their current order.
    nrows : int, optional
        Number of vertically stacked axes.
    figsize : tuple of (width, height), optional
        Size of ONE row; the figure height is ``height * nrows``.
    plotfunc : callable, optional
        Called as ``plotfunc(sub_frame, ax=ax)`` once per non-empty group of
        columns.

    Notes
    -----
    Every column is plotted. When the number of columns is not a multiple of
    *nrows* the leading rows receive one extra column each, instead of the
    trailing columns being silently left out. Rows that would receive no
    column (``nrows`` larger than the number of columns) stay empty.
    """
    width, height = figsize
    # squeeze=False always yields a 2-D array of axes, so nrows == 1 needs no
    # special case.
    fig, axes = plt.subplots(nrows, 1, figsize=(width, height * nrows), squeeze=False)
    # Contiguous, near-equal groups of column positions, one group per row.
    column_groups = np.array_split(np.arange(df.shape[1]), nrows)
    for columns, ax in zip(column_groups, axes.ravel()):
        if len(columns):
            plotfunc(df.iloc[:, columns], ax=ax)


In [None]:
dfnum.shape

Many of the features are highly skewed with very long tails.

In [None]:
featureplot(dfnum_t.iloc[:, 0:9])

Using a log transformation for some of the skewed features should help.

In [None]:
# Side by side: distribution of the raw LotArea (left axes) and of its
# log(1 + x) transform (right axes).
fig, axes = plt.subplots(1,2, figsize=(8, 4))
sns.distplot(dfnum['LotArea'], ax=axes[0])
sns.distplot(np.log1p(dfnum['LotArea']), ax=axes[1])


Most of these are right skewed as well. BsmtFullBath has some discrete values (number of bathrooms).

In [None]:
featureplot(dfnum_t.iloc[:, 9:18])

Some features are almost constant so a log transformation won't help reduce the skewness

In [None]:
# Side by side: distribution of the raw BsmtFinSF2 (left axes) and of its
# log(1 + x) transform (right axes), each with its own title.
fig, axes = plt.subplots(1,2, figsize=(8, 4))
sns.distplot(dfnum['BsmtFinSF2'], ax=axes[0])
sns.distplot(np.log1p(dfnum['BsmtFinSF2']), ax=axes[1])
axes[0].set_title('Distribution of BsmtFinSF2')
axes[1].set_title('Distribution of log(1 + BsmtFinSF2)')

Let's come up with a diagnostic for detecting those nearly constant features. 

In [None]:
def test_nearly_constant(series, threshold=0.25):
    """Tell whether a single value dominates *series*.

    Parameters
    ----------
    series : pandas.Series
        Feature values; missing values are not counted (``value_counts``
        drops them).
    threshold : float, optional
        Upper bound (exclusive) on the ratio "occurrences of all other
        values" / "occurrences of the most frequent value".

    Returns
    -------
    bool
        True when the ratio above is strictly below *threshold*. A series
        without any non-missing value is reported as nearly constant.
    """
    counts = series.value_counts()
    if counts.empty:
        # Only missing values: there is no variation at all.
        return True
    # Work with the counts themselves rather than dropping a label:
    # `Series.argmax` returns an integer position (not an index label) in
    # current pandas, so `counts.drop(counts.argmax())` either removed the
    # wrong entry or raised KeyError.
    max_val_count = counts.max()
    other_val_count = counts.sum() - max_val_count
    return bool(other_val_count / max_val_count < threshold)


# The name matches pytest's default `test_*` pattern; opt out of collection so
# that a test module importing this helper does not try to run it as a test.
test_nearly_constant.__test__ = False

is_nearly_constant = dfnum_t.apply(test_nearly_constant)
is_nearly_constant.value_counts()

In [None]:
dropme = dfnum_t.columns[is_nearly_constant]
dropme

We're going to drop these nearly constant features. If we want to have more control we can transform them into categorical features (for example, is there a screen porch or not?).

In [None]:
dfnum_t.columns

In [None]:
dfnum_t = dfnum_t.drop(dropme, axis=1)


In [None]:
dfnum_t.shape

### Log transform the other features if they have a high skewness

In [None]:
zfactors = sp.stats.skewtest(dfnum_t)[0]
sns.distplot(zfactors)

In [None]:
is_skewed = np.abs(zfactors) > 10
pd.Series(data=zfactors, index=dfnum_t.columns)[is_skewed].sort_values().plot(kind='barh')
plt.title('Z-factor for skewtest')

Check the sign of the skewness for all these

In [None]:
assert all(np.sign(sp.stats.skew(dfnum_t)[is_skewed]) > 0)

Let's apply a log1p transform to all these and plot the distributions again

In [None]:
# Start again from the imputed, non-standardized data and apply log(1 + x)
# to the columns flagged in `is_skewed` (a mask aligned with `dfnum_t.columns`).
dfnum2 = dfnum.copy()
for feature in dfnum_t.columns[is_skewed]:
    dfnum2[feature] = np.log1p(dfnum[feature])

# Standardize the transformed data, then remove the nearly constant columns.
dfnum_t2 = standardize(dfnum2).drop(dropme, axis=1)

In [None]:
dfnum_t2.iloc[:, is_skewed].columns

In [None]:
zfactors2 = sp.stats.skewtest(dfnum_t2)[0]
pd.Series(data=zfactors2, index=dfnum_t2.columns)[is_skewed].sort_values().plot(kind='barh')

Now our originally skewed features look more symmetric. 

In [None]:
featureplot(dfnum_t2.iloc[:, is_skewed], nrows=2, figsize=(10,5))

In [None]:
featureplot(dfnum_t2.iloc[:, ~is_skewed], nrows=2, figsize=(10, 5))

### Feature selection
We're now in a good position to identify the key numerical features. Those should be highly correlated with the sale price.

In [None]:
nfeatures = dfnum_t2.columns
target_t = standardize(logtarget)

In [None]:
# Pearson correlation coefficient and p-value between every standardized
# feature and the standardized log price, one row per feature.
# The table is built explicitly instead of through `DataFrame.apply`: current
# pandas expands the (r, p) pairs returned by `pearsonr` into two rows, so a
# 'pearsonr' column holding tuples never exists and the follow-up
# `corr['pearsonr']` lookups fail.
pearson_stats = {
    feature: tuple(sp.stats.pearsonr(dfnum_t2[feature], target_t['SalePrice']))
    for feature in dfnum_t2.columns
}
corr = pd.DataFrame.from_dict(pearson_stats, orient='index',
                              columns=['correlation', 'pvalue'])

In [None]:
corr.sort_values('pvalue', ascending=False)['correlation'].plot(kind='barh')

In [None]:
corr.sort_values('pvalue').head()

In [None]:
corr.sort_values('pvalue').tail()

Let's drop the features that don't have a high correlation with the price (correlation less than 0.2)

In [None]:
# Keep the features whose correlation with the price is strong in EITHER
# direction: filtering on the signed value would also discard strongly
# negatively correlated features, which are just as informative for a linear
# model. The result is ordered from strongest to weakest.
strength = corr['correlation'].abs()
key_features = strength[strength > 0.2].sort_values(ascending=False).index.values
key_features, key_features.size

### Basic linear regression model
We're left with 22 features. The first 4 should all be highly correlated with the price.


In [None]:
# Regression table: the standardized features plus the standardized log price.
data = dfnum_t2.copy()
# Select the column explicitly: `target_t` is a DataFrame, and assigning a
# whole frame to one column only works while that frame has exactly one column.
data['SalePrice'] = target_t['SalePrice']

In [None]:

# Scatter SalePrice against each of the first four key features, filling the
# 2x2 grid of axes row by row.
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
for ax, feature in zip(axes.flat, key_features[:4]):
    ax.plot(data[feature], data['SalePrice'], 'o')
    ax.set(xlabel=feature, ylabel='SalePrice')
    


Let's build a simple linear regression model based on these 4 features.

In [None]:
regression1 = smapi.ols("SalePrice ~ OverallQual + GrLivArea + GarageCars + GarageArea", data=data).fit()
regression1.summary()

R-squared equals 0.79 so it's pretty good for a first try. Let's see what happens if we include all our numerical features.

In [None]:
data.columns

Statsmodels gets confused by column names that start with a digit, so let's rename those columns first.

In [None]:
# Setting `.name` on the selected Series does not relabel the DataFrame
# column; `rename` does. The new label contains no leading digit, so it can be
# used directly in a statsmodels formula.
data = data.rename(columns={'1stFlrSF': 'FirstFlrSF'})

In [None]:
# `rename_axis` sets the NAME of an axis; relabelling individual columns is
# done with `rename`. Labels that are not present in the frame are ignored.
data.rename(columns={'1stFlrSF': 'FirstFlrSF', '2ndFlrSF': 'SndFlrSF'}, inplace=True)

In [None]:
# Formula regressing SalePrice on every other column of `data`, in column order.
desc = 'SalePrice ~ ' + ' + '.join(data.columns.drop('SalePrice'))
desc

As can be seen below, using more numerical features doesn't help us much. We're going to need the categorical features if we want to improve our prediction.

In [None]:
# Fit an OLS model on all the numerical features and display the summary of
# THIS fit (the cell previously called `.summary()` on the undefined name
# `regression`, which raises NameError).
regression2 = smapi.ols(desc, data=data).fit()
regression2.summary()