In [None]:
import itertools
import os
import sys

import pandas as pd
import numpy as np
import scipy as sp
import scipy.stats
import sklearn as sk
import sklearn.preprocessing

import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smapi

In [None]:
sys.path.insert(1, os.path.join(sys.path[0], '..'))  # add parent directory to path
import samlib

Use the cleaner chaining method for transforming the data https://tomaugspurger.github.io/method-chaining.html

# Sale price distribution
First step is to look at the target sale price for the training data set, i.e. the column we're trying to predict. 

In [None]:
target = pd.read_csv('../data/train_target.csv')

In [None]:
target.describe()

The sale price is in the hundreds of thousands, so let's divide the price by 1000 to get more manageable numbers.

In [None]:
# Express the prices in thousands.
# NOTE: this cell is not idempotent -- running it twice divides by 1000 twice.
# Re-run the cell that reads `target` from CSV before re-running this one.
target = target / 1000

In [None]:
sns.distplot(target);
plt.title('SalePrice')

In [None]:
# `sp` (scipy) is imported in the first cell of the notebook, so it is not
# re-imported here. A positive skewness means the right tail is the longer one.
sp.stats.skew(target)

In [None]:
sp.stats.skewtest(target)

The distribution is skewed, as demonstrated by the large z-score (and small p-value) of the skewtest. It is right-skewed (the skew is positive). Skewed distributions are not ideal for linear models, which often assume a normal distribution. One way to correct for right-skewness is to take the log [1,2,3]

- [1] http://fmwww.bc.edu/repec/bocode/t/transint.html 
- [2] https://www.r-statistics.com/2013/05/log-transformations-for-skewed-and-wide-distributions-from-practical-data-science-with-r/
- [3] Alexandru Papiu's notebook https://www.kaggle.com/apapiu/house-prices-advanced-regression-techniques/regularized-linear-models/commentsnotebook 

We apply the function $x \rightarrow \log(1 + x)$ because, unlike $\log(x)$, it is defined at $x = 0$ and is non-negative for all $x \geq 0$.

In [None]:
# log1p(x) computes log(1 + x) element-wise; `target` was divided by 1000 in an earlier cell.
logtarget = np.log1p(target)
# sp.stats.skew is indexed with [0] here, i.e. the value for the first column is printed
# (presumably the only column, SalePrice -- TODO confirm against the CSV).
print('skewness of logtarget = ', sp.stats.skew(logtarget)[0])
print('skewness test of logtarget = ', sp.stats.skewtest(logtarget))
# Plot the distribution of the transformed target for visual comparison with the raw one.
sns.distplot(logtarget)
plt.title(r'log(1 + SalePrice)')

# Merge the training and test datasets for data preparation
We're going to explore the training dataset and apply some transformations to it (fixing missing values, transforming columns etc). We'll need to apply the same transformations to the test dataset. To make that easy, let's put the training and test datasets into one dataframe. 

In [None]:
# Create a pipeline so we can process all the data later in one go if needed
class Pipeline:
    """Ordered list of dataframe -> dataframe functions applied to one input frame.

    Steps are identified by their function name so that re-running a notebook
    cell (which re-defines a function and appends it again) does not add the
    same step twice.
    """

    def __init__(self, df):
        """Store the input dataframe `df` and start with an empty list of steps."""
        self.input_df = df
        self._pipeline = []

    def append(self, func):
        """Append function to pipe.

        If a step with the same name is already registered anywhere in the
        pipe, it is replaced in place by `func` (keeping its position) instead
        of being appended again. This way an edited and re-run cell updates
        its step, and re-running an earlier cell does not create a duplicate
        step at the end. Callables without a usable name (lambdas, partials)
        cannot be identified and are always appended.
        """
        name = self._func_name(func)
        if name is not None:
            for position, step in enumerate(self._pipeline):
                if self._func_name(step) == name:
                    self._pipeline[position] = func
                    return
        self._pipeline.append(func)

    def run(self):
        """Run the pipe: feed `input_df` through every step in order and return the result."""
        df = self.input_df
        for func in self._pipeline:
            df = df.pipe(func)
        return df

    @staticmethod
    def _func_name(func):
        """Return the identifying name of `func`, or None if it has no usable one."""
        name = getattr(func, '__name__', None)
        # every lambda is called '<lambda>', so that name identifies nothing
        return None if name == '<lambda>' else name

    @staticmethod
    def _same_func(f1, f2):
        """Return True if both callables have the same usable name."""
        name = Pipeline._func_name(f1)
        return name is not None and name == Pipeline._func_name(f2)

    def __str__(self):
        return str(self._pipeline)

    def __repr__(self):
        return repr(self._pipeline)
            


In [None]:
def read(train_path='../data/train_prepared_light.csv',
         test_path='../data/test_prepared_light.csv'):
    """Read training and test data and return a dataframe with ['Dataset','Id'] multi-index

    Parameters
    ----------
    train_path, test_path : str
        CSV files to load. The defaults are the prepared datasets used by this notebook.

    Returns
    -------
    pandas.DataFrame
        The training rows followed by the test rows. The first index level
        ('Dataset') is 'train' or 'test'. The second level ('Id') is the index
        `pd.read_csv` gave each file, i.e. the row position within that file
        since no index column is requested.
    """
    raw_train = pd.read_csv(train_path)
    raw_test = pd.read_csv(test_path)
    df = pd.concat([raw_train, raw_test], keys=['train', 'test'])
    df.index.names = ['Dataset', 'Id']
    return df
    
df = read()
pipeline = Pipeline(df.copy())

In [None]:
df.shape

In [None]:
ncategories = sum(df.dtypes == object)
ncategories

In [None]:
df.head()

In [None]:
df.tail()

## Select Numerical features

The dataset is wide with 78 features. Create dataframe containing the numerical features only.

In [None]:
df.columns, len(df.columns)

We've got 3 data types: int, float and object

In [None]:
df.dtypes.unique()

Split the data between categorical and numerical features

In [None]:
is_categorical = (df.dtypes == object)
is_numerical = ~is_categorical

In [None]:
df.dtypes

In [None]:
dfnum = df.loc[:, is_numerical].copy()

In [None]:
dfnum.columns, len(dfnum.columns)

We've got 36 numerical features. We can use the `describe` method to get some statistics:

In [None]:
desc = dfnum.describe()

But that's a lot of numbers to digest. Better get started plotting! To help with plotting, but also to improve linear regression models, we're going to standardize our data. But before that we must deal with the NaN values.
http://sebastianraschka.com/Articles/2014_about_feature_scaling.html

In [None]:
def select_numerical_features(df):
    """Return the columns of `df` whose dtype is not `object`."""
    is_object_column = (df.dtypes == object)
    return df.loc[:, ~is_object_column]

pipeline.append(select_numerical_features)
# Check the pipeline.
# `all(frame_a == frame_b)` iterates over the column labels of the comparison
# frame and is therefore always True. DataFrame.equals compares the values and
# treats NaNs in the same location as equal (dfnum still contains NaNs here).
pipeline.run().equals(dfnum)

### Deal with NaN values 

In [None]:
cols_with_nulls = dfnum.columns[dfnum.isnull().sum() > 0]
cols_with_nulls

In [None]:
dfnum.shape

In [None]:
dfnum[cols_with_nulls].isnull().sum().sort_values(ascending=False)

Based on the description, the null values for `MasVnrArea` should be 0 (no masonry veneer).

In [None]:
# We may want to refine this in the future. Perhaps build a model to predict the missing GarageCars from the other features?
median_list = ('LotFrontage', 'BsmtFullBath', 'BsmtHalfBath', 'GarageCars', 'GarageArea')
zero_list = ('MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'TotalBsmtSF', 'BsmtUnfSF')

def fillnans(dfnum):
    """Return a copy of `dfnum` with its missing values filled.

    - columns in `median_list` go through `samlib.fillna` with 'median'
    - columns in `zero_list` go through `samlib.fillna` with a function returning 0
    - missing `GarageYrBlt` values are replaced by the `YearBuilt` value of the same row
    """
    with_medians = dfnum.pipe(samlib.fillna, 'median', median_list)
    with_zeros = with_medians.pipe(samlib.fillna, lambda df: 0, zero_list)
    # rows without a garage year fall back on the year the house was built
    garage_missing = dfnum.GarageYrBlt.isnull()
    garage_year = dfnum.GarageYrBlt.fillna(dfnum.YearBuilt[garage_missing])
    return with_zeros.assign(GarageYrBlt=garage_year)


In [None]:
dfnum = fillnans(dfnum)

In [None]:
# Check that we got rid of the nulls
assert not samlib.has_nulls(dfnum)

In [None]:
pipeline.append(fillnans)
# Check the pipeline.
# `all(frame_a == frame_b)` only iterates over column labels (always True);
# DataFrame.equals actually compares the values of the two frames.
pipeline, pipeline.run().equals(dfnum)

### Plot violinplots for each feature 
The violin plots give us some idea of the distribution of data for each feature. We can look for things like skewness, non-normality, and the presence of outliers. 

In [None]:
from itertools import chain

def violinplot(df, ax=None):
    """Draw one violin per column of `df` on `ax` (the current axes when `ax` is None)."""
    target_ax = plt.gca() if ax is None else ax
    sns.violinplot(data=df, ax=target_ax)
    # keep the feature names horizontal
    plt.setp(target_ax.get_xticklabels(), rotation=0)
        

def featureplot(df, nrows=1, ncols=1, figsize=(12,8), plotfunc=violinplot):
    """Plot the dataframe features

    Every column of `df` is drawn on an `nrows` x `ncols` grid of axes, filled
    row by row. The columns are spread as evenly as possible over the axes:
    when the number of columns is not a multiple of the number of axes, the
    first axes receive one extra column, so no feature is silently left out.
    When there are fewer columns than axes, the trailing axes stay empty.

    Parameters
    ----------
    df : pandas.DataFrame
        Features to plot, one per column.
    nrows, ncols : int
        Shape of the grid of axes.
    figsize : (width, height)
        `width` is the width of the whole figure, `height` the height of ONE row
        (the figure height is `height * nrows`).
    plotfunc : callable
        Called as `plotfunc(sub_df, ax=ax)` for each group of columns.
    """
    width, height = figsize
    # squeeze=False always yields a 2-D array of axes, whatever nrows/ncols are
    fig, axes = plt.subplots(nrows, ncols, figsize=(width, height * nrows), squeeze=False)
    axes = axes.ravel()  # row-major: left to right, then top to bottom
    # group sizes differ by at most one; groups are empty when columns < axes
    column_groups = np.array_split(np.arange(df.shape[1]), len(axes))
    for ax, positions in zip(axes, column_groups):
        if len(positions):
            plotfunc(df.iloc[:, positions], ax=ax)
    plt.tight_layout()

In [None]:
dfnum.shape

In [None]:
featureplot(dfnum, ncols=7, nrows=4, figsize=(12, 4))

Many of the features are highly skewed with very long tails.

Most of these are right skewed as well. BsmtFullBath has some discrete values (number of bathrooms).

Some features, such as `BsmtFinSF2`, are almost constant (blobs with long tail) as can be seen below

In [None]:
fig, ax = plt.subplots(1,1, figsize=(4, 4))
sns.distplot(dfnum.BsmtFinSF2, ax=ax)
ax.set_title('Distribution of BsmtFinSF2')


### Drop nearly constant features

In [None]:
def test_nearly_constant(series):
    counts = series.value_counts()
    max_val_count = max(counts)
    other_val_count = counts.drop(counts.argmax()).sum()
    return other_val_count / max_val_count < 0.25

is_nearly_constant = dfnum.apply(test_nearly_constant)
is_nearly_constant.value_counts()

In [None]:
dropme = dfnum.columns[is_nearly_constant]
dropme

In [None]:
def drop_constant_features(df):
    """Return `df` without the columns flagged by `test_nearly_constant`."""
    nearly_constant = df.apply(test_nearly_constant)
    return df.drop(df.columns[nearly_constant], axis=1)

pipeline.append(drop_constant_features)
# Check the pipeline against the manual drop. `all(frame_a == frame_b)` only
# iterates over column labels (always True); DataFrame.equals compares values.
dfnum.drop(dropme, axis=1).equals(pipeline.run())

In [None]:
dfnum = dfnum.drop(dropme, axis=1)

### Log transform the other features if they have a high skewness

Using a log transformation for some of the skewed features should help, as illustrated below. We use the raw data (not the standardized one) because we need positive values for the log function (we'll standardize the transformed variables later).

In [None]:
fig, axes = plt.subplots(1,2, figsize=(8, 4))
sns.distplot(dfnum['LotArea'], ax=axes[0])
sns.distplot(np.log1p(dfnum['LotArea']), ax=axes[1])


Use dataframe & series whenever possible for maximum flexibility (see below)

In [None]:
def skewtest(train, sort=True, ascending=True):
    """Return dataframe of zfactor and pvalue for skew test

    One row per column of `train`. With `sort=True` the rows are ordered by
    z-factor (`ascending` sets the direction); otherwise they follow the
    column order of `train`.
    """
    result = sp.stats.skewtest(train)
    stats = pd.DataFrame({'zfactor': result[0], 'pvalue': result[1]},
                         index=train.columns)
    if not sort:
        return stats
    return stats.sort_values(by='zfactor', ascending=ascending)

skewtest(dfnum).head()

In [None]:
def is_skewed(train, min_zfactor=10, plot=False):
    """Return series of booleans indicating whether a column is skewed or not.

    A column counts as skewed when its skewtest z-factor is greater than
    `min_zfactor`. The series is indexed by column name and ordered by
    ascending z-factor (the default ordering of `skewtest`), not by the
    column order of `train`. With `plot=True` the z-factors are also drawn
    as a distribution (figure 1) and as a horizontal bar chart (figure 2).
    """
    zfactors = skewtest(train).zfactor
    if plot:
        plt.figure(1)
        plt.title('Z-factor distribution from skewtest')
        plt.xlabel('Z-factor')
        sns.distplot(zfactors)
        plt.figure(2)
        zfactors.plot(kind='barh')
        plt.title('Z-factor for skewtest')
    return zfactors > min_zfactor

In [None]:
is_skewed(dfnum, min_zfactor=10, plot=True)

Let's apply a log1p transform to all these and plot the distributions again

In [None]:
pipeline

In [None]:
def transform_skewed_colums(dfnum):
    """Return a copy of `dfnum` where the skewed columns are replaced by log1p(column).

    dfnum: dataframe to transform; it is not modified.

    The columns to transform are those flagged by `is_skewed(dfnum)` with its
    default threshold. All other columns, and the column order, are unchanged.
    """
    skewed_flags = is_skewed(dfnum)
    skewed_names = skewed_flags.index[skewed_flags]
    dfnum2 = dfnum.copy()
    if len(skewed_names):
        # Replace whole columns rather than writing through .loc: the log
        # values are floats and some of the original columns are integers,
        # which an in-place .loc write cannot hold without changing dtype.
        dfnum2[skewed_names] = dfnum2[skewed_names].apply(np.log1p)
    return dfnum2

pipeline.append(transform_skewed_colums)

# the transformed dataset has fewer columns and we only want those
dfnum2 = pipeline.run()


In [None]:
dfnum.columns

In [None]:
is_skewed(dfnum, plot=True)

In [None]:
sp.stats.skewtest(dfnum2)[0]

In [None]:
zfactors2 = sp.stats.skewtest(dfnum2)[0]
pd.Series(data=zfactors2, index=dfnum2.columns)[is_skewed(dfnum)].sort_values().plot(kind='barh')

Now our originally skewed features look more symmetric. 

### Check that the distributions are less skewed

In [None]:
skewed = is_skewed(dfnum)
skewed.value_counts()

In [None]:
featureplot(dfnum2.loc[:, skewed], nrows=3, ncols=6, figsize=(10,3))

In [None]:
featureplot(dfnum2.loc[:, ~skewed], nrows=2, ncols=5, figsize=(10, 3))

## Save transformed numerical data
Save the transformed data to a CSV file so that other notebooks can load it (the `%store` magic is used further down for `key_features`).

In [None]:
dfnum2.to_csv('transformed_dataset_dfnum2.csv', index=True)

### Feature selection
We're now in a good position to identify the key numerical features. Those should be highly correlated with the sale price.

### Standardize the data 
This is not in the right place. We don't normally want to standardize the data just for plotting... Although it may be the easiest way. Can we have violin plots in little individual plots instead? Standardization is useful but probably for regression, not here!

In [None]:
def standardize(df):
    """Scale `df` with sklearn's StandardScaler (fitted on `df` itself).

    `df` may be a 2-D table (the scaled values are returned as given by
    `StandardScaler.fit_transform`) or a single pandas Series, which is what
    `DataFrame.apply(standardize)` passes for each column. StandardScaler only
    accepts 2-D input, so a Series is scaled as a one-column table and handed
    back as a Series with the same index and name.
    """
    scaler = sk.preprocessing.StandardScaler()
    if isinstance(df, pd.Series):
        scaled = scaler.fit_transform(df.values.reshape(-1, 1))
        return pd.Series(scaled[:, 0], index=df.index, name=df.name)
    return scaler.fit_transform(df)

dfnum_t = dfnum.apply(standardize)

In [None]:
dfnum_t.head()

In [None]:
nfeatures = dfnum2.columns
target_t = logtarget.apply(standardize)
target_t.head()

In [None]:
dfnum_t2 = dfnum2.apply(standardize)

In [None]:
dfnum_t2.head()

In [None]:
# Pearson correlation (and its p-value) between each standardized training
# feature and the standardized log sale price. The table is built explicitly,
# one row per feature, instead of relying on how DataFrame.apply packs the
# (correlation, pvalue) pairs, which differs between pandas versions.
train_features = dfnum_t2.loc['train', :]
pearson = [sp.stats.pearsonr(train_features[name], target_t['SalePrice'])
           for name in train_features.columns]
corr = pd.DataFrame({'correlation': [result[0] for result in pearson],
                     'pvalue': [result[1] for result in pearson]},
                    index=train_features.columns,
                    columns=['correlation', 'pvalue'])

In [None]:
corr.head()

In [None]:
corr.sort_values('pvalue', ascending=False)['correlation'].plot(kind='barh')

In [None]:
corr.sort_values('pvalue').head()

In [None]:
corr.sort_values('pvalue').tail()

Let's keep only the features that have a high enough correlation with the price (absolute correlation greater than 0.2)

In [None]:
min_correlation = 0.2
# Take the absolute value of the correlation *before* comparing it with the
# threshold, so that strongly negative correlations are kept as well
# (np.abs of the boolean comparison would only keep the positive ones).
is_key_feature = np.abs(corr['correlation']) > min_correlation
key_features = corr[is_key_feature].sort_values(by='correlation', ascending=False).index.values
key_features, key_features.size

In [None]:
%store key_features