In [None]:
import itertools
import os
import sys

import pandas as pd
import numpy as np
import scipy as sp
import sklearn as sk
import sklearn.preprocessing

import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smapi

In [None]:
sys.path.insert(1, os.path.join(sys.path[0], '..'))  # add parent directory to path
import samlib

# Sale price distribution
First step is to look at the target sale price for the training data set, i.e. the column we're trying to predict. 

In [None]:
# Load the training target (the sale price column we are trying to predict).
target = pd.read_csv('../data/train_target.csv')

In [None]:
# Summary statistics of the target.
target.describe()

The sale price is in the hundreds of thousands, so let's divide the price by 1000 to get more manageable numbers.

In [None]:
# Rescale the target by a factor of 1000.
# NOTE: this cell overwrites its own input, so running it a second time
# divides by 1000 again; reload `target` before re-running it.
target = target.div(1000)

In [None]:
# log(1 + x) transform of the rescaled target, applied column by column.
logtarget = target.apply(np.log1p)


# Merge the training and test datasets for data preparation
We're going to explore the training dataset and apply some transformations to it (fixing missing values, transforming columns etc). We'll need to apply the same transformations to the test dataset. To make that easy, let's use a class that maintains the training and test datasets and keeps them in sync (so that when we apply a transformation to the full dataset, it's applied automatically to the training and test datasets).

In [None]:
# Read the prepared training and test CSV files.
raw_train = pd.read_csv('../data/train_prepared_light.csv')
raw_test = pd.read_csv('../data/test_prepared_light.csv')
# Wrap both frames in a single samlib.DataSet.
# NOTE(review): presumably a transformation applied through `ds` reaches both
# the training and the test subset -- confirm against samlib.DataSet.
ds = samlib.DataSet(raw_train, raw_test)

In [None]:
# (rows, columns) of the frame the dataset exposes as `ds.df`.
ds.df.shape

In [None]:
# Number of columns stored with the object dtype.
ncategories = (ds.df.dtypes == object).sum()
ncategories

# Features
The dataset is wide with 78 features.

In [None]:
# Column labels of the dataset, together with how many there are.
ds.columns, len(ds.columns)

We've got 3 data types: int, float and object

In [None]:
# Distinct dtypes found among the columns of `ds.df`.
ds.df.dtypes.unique()

Split the data between categorical and numerical features

In [None]:
# Boolean masks over the columns of `ds.df`: an object-dtype column is treated
# as categorical, every other column as numerical.
is_categorical = ds.df.dtypes.eq(object)
is_numerical = ds.df.dtypes.ne(object)

## Categorical features


In [None]:
# Keep only the columns flagged by `is_categorical` (object dtype).
# NOTE(review): assumes DataSet.apply returns a new DataSet built from the
# function's result rather than modifying `ds` -- confirm in samlib.
dscat = ds.apply(lambda df: df.loc[:, is_categorical])

In [None]:
# Column labels of the categorical dataset, together with how many there are.
dscat.columns, len(dscat.columns)

We've got 42 categorical features. We can use the `describe` method to get some statistics:

In [None]:
# Descriptive statistics of the categorical features, as returned by the
# dataset's `describe` method.
desc = dscat.describe()

In [None]:
# Add a 'nulls' row holding the number of missing values of each feature.
# DataFrame.append was removed in pandas 2.0, so the row is built as a one-row
# frame and concatenated instead. Any 'nulls' row left over from a previous run
# is dropped first, which keeps this cell safe to re-run.
null_counts = dscat.df.isnull().sum().rename('nulls')
desc = pd.concat([desc.drop(index='nulls', errors='ignore'), null_counts.to_frame().T])
desc

**Number of categories per feature**

In [None]:
# Horizontal bar chart of the 'unique' row of `desc`, sorted in descending order.
fig, ax = plt.subplots(figsize=(12, 10))
desc.loc['unique'].sort_values(ascending=False).plot.barh(ax=ax)
ax.set_title('Number of categories per feature')

**Number of nulls per feature**

In [None]:
# Horizontal bar chart of the features that have at least one null, sorted in
# descending order of their null count.
fig, ax = plt.subplots(figsize=(12, 10))
nulls_row = desc.loc['nulls']
nulls_row[nulls_row > 0].sort_values(ascending=False).plot.barh(ax=ax)
ax.set_title('Number of nulls per feature')

But that's a lot of numbers to digest. Better get started plotting! To help with plotting, but also to improve linear regression models, we're going to standardize our data. But before that we must deal with the NaN values.
Background reading on feature scaling: http://sebastianraschka.com/Articles/2014_about_feature_scaling.html

### Deal with NaN values 

In [None]:
# `dsnum` is not defined by any cell above, so this cell raised a NameError on
# a fresh kernel. Build it here as the numerical counterpart of `dscat`, using
# the `is_numerical` column mask.
dsnum = ds.apply(lambda df: df.loc[:, is_numerical])
# Work on a copy: `dsnum.df` stays untouched until the cleaned frame is
# assigned back to it.
dfnum = dsnum.df.copy()

In [None]:
# Labels of the numerical columns that contain at least one missing value.
cols_with_nulls = dfnum.columns[dfnum.isna().any()]
cols_with_nulls

In [None]:
# (rows, columns) of the numerical working copy.
dfnum.shape

In [None]:
# Missing-value count of each affected column, largest first.
dfnum.loc[:, cols_with_nulls].isna().sum().sort_values(ascending=False)

Based on the data description, null values in `MasVnrArea` should be 0 (no masonry veneer).

In [None]:
# Imputation plan for the numerical features that have missing values.
# TODO: refine this later, e.g. build a model that predicts the missing
# GarageCars from the other features.
median_list = (
    'LotFrontage',
    'BsmtFullBath',
    'BsmtHalfBath',
    'GarageCars',
    'GarageArea',
)  # missing values are replaced by the column median
zero_list = (
    'MasVnrArea',
    'BsmtFinSF1',
    'BsmtFinSF2',
    'TotalBsmtSF',
    'BsmtUnfSF',
)  # missing values are replaced by 0


In [None]:
# Replace the missing values of each feature in `median_list` by that
# feature's own median.
# The result is assigned back to the column instead of calling
# `dfnum[feature].fillna(..., inplace=True)`: an in-place update on a column
# selection triggers a chained-assignment warning in pandas 2.x and leaves
# `dfnum` unchanged once Copy-on-Write is enabled.
for feature in median_list:
    dfnum[feature] = dfnum[feature].fillna(dfnum[feature].median())

In [None]:
# Replace the missing values of each feature in `zero_list` by 0.
# The result is assigned back to the column instead of calling
# `dfnum[feature].fillna(0, inplace=True)`: an in-place update on a column
# selection triggers a chained-assignment warning in pandas 2.x and leaves
# `dfnum` unchanged once Copy-on-Write is enabled.
for feature in zero_list:
    dfnum[feature] = dfnum[feature].fillna(0)

For `GarageYrBlt`, replace missing values with the year the house was built (`YearBuilt`).

In [None]:
# A missing garage construction year is replaced by the year the house was built.
# `fillna` aligns the YearBuilt values on the index, so only the rows where
# GarageYrBlt is missing are changed. The result is assigned back to the column
# because an in-place `fillna` on a column selection is unreliable (warning in
# pandas 2.x, no effect under Copy-on-Write).
dfnum['GarageYrBlt'] = dfnum['GarageYrBlt'].fillna(dfnum['YearBuilt'])


In [None]:
# Store the cleaned working copy back on the dataset object.
dsnum.df = dfnum

# Check that everything is in order
def has_nulls(df):
    """Tell whether ``df`` contains at least one missing value."""
    return df.isna().any().any()

# The working copy and every frame exposed by the dataset must now be free of
# missing values.
for frame in (dfnum, dsnum.df, dsnum.train, dsnum.test):
    assert not has_nulls(frame)

### Standardize the data 

In [None]:
def standardize(df):
    """Scale every column of ``df`` with scikit-learn's ``StandardScaler``.

    The scaler is fitted on ``df`` itself and then applied to it.

    Parameters
    ----------
    df : pandas.DataFrame
        Numerical frame without missing values.

    Returns
    -------
    pandas.DataFrame
        The scaled values, with the same column labels and the same index as
        ``df``.
    """
    scaled = sk.preprocessing.StandardScaler().fit_transform(df)
    # fit_transform returns a bare array. Re-attach the index as well as the
    # columns, otherwise the result silently falls back to a fresh 0..n-1 index
    # and no longer lines up with the input rows.
    return pd.DataFrame(data=scaled, index=df.index, columns=df.columns)

# Standardized version of the numerical dataset.
# NOTE(review): whether DataSet.apply passes the full frame or each subset to
# `standardize` decides what the scaler is fitted on -- confirm in samlib.
dsnum_t = dsnum.apply(standardize)


### Plot violinplots for each feature 
The violin plots give us some idea of the distribution of data for each feature. We can look for things like skewness, non-normality, and the presence of outliers. 

In [None]:
def violinplot(df, ax=None):
    """Draw a seaborn violin plot of the columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide-form data, passed to seaborn as ``data``.
    ax : matplotlib.axes.Axes, optional
        Axes to draw on. Defaults to the current axes.
    """
    if ax is None:
        ax = plt.gca()
    # Pass the frame by keyword. The first positional parameter of
    # sns.violinplot is not the same across seaborn releases (`x` in older
    # ones, `data` in newer ones), whereas `data=` is unambiguous.
    sns.violinplot(data=df, ax=ax)
    # Rotate the x tick labels by 30 degrees.
    for xlab in ax.get_xticklabels():
        xlab.set_rotation(30)
        


In [None]:
def featureplot(df, nrows=1, figsize=(12,8), plotfunc=violinplot):
    """Plot the features (columns) of a dataframe on a vertical stack of subplots.

    The columns are split, in order, into ``nrows`` consecutive groups and each
    group is drawn on its own axes by ``plotfunc``. Group sizes differ by at
    most one, so every column is plotted even when the number of columns is not
    a multiple of ``nrows``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted.
    nrows : int, optional
        Number of subplots, stacked in a single column.
    figsize : tuple of (width, height), optional
        Size of one subplot row; the figure height is ``height * nrows``.
    plotfunc : callable, optional
        Called as ``plotfunc(sub_df, ax=ax)`` for each group of columns.
    """
    width, height = figsize
    # squeeze=False always returns a 2-D array of axes, so nrows == 1 needs no
    # special case.
    fig, axes = plt.subplots(nrows, 1, figsize=(width, height * nrows), squeeze=False)
    # Split the column positions into `nrows` near-equal groups. Chunking with
    # `df.shape[1] // nrows` dropped the trailing columns whenever the division
    # left a remainder, and produced a zero step when nrows exceeded the number
    # of columns.
    column_groups = np.array_split(np.arange(df.shape[1]), nrows)
    for positions, ax in zip(column_groups, axes.ravel()):
        if positions.size == 0:
            # More subplots than columns: nothing to draw on this axes.
            ax.set_visible(False)
            continue
        plotfunc(df.iloc[:, positions], ax=ax)
