In [None]:
import itertools
import os
import sys

import pandas as pd
import numpy as np
import scipy as sp
import sklearn as sk
import sklearn.preprocessing

import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smapi

In [None]:
sys.path.insert(1, os.path.join(sys.path[0], '..'))  # add parent directory to path
import samlib

# Sale price distribution
The first step is to look at the target sale price for the training data set, i.e. the column we're trying to predict.

In [None]:
target = pd.read_csv('../data/train_target.csv')

In [None]:
target.describe()

The sale price is in the hundreds of thousands, so let's divide the price by 1000 to get more manageable numbers.

In [None]:
target = target / 1000

In [None]:
logtarget = np.log1p(target)


# Merge the training and test datasets for data preparation


In [None]:
def read(data_dir='../data'):
    """Load the prepared training and test sets into one combined dataframe.

    Parameters
    ----------
    data_dir : str, optional
        Directory containing ``train_prepared_light.csv`` and
        ``test_prepared_light.csv``. Defaults to ``'../data'``, the location
        that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        Training rows followed by test rows, indexed by a two-level
        MultiIndex named ``['Dataset', 'Id']``. The ``Dataset`` level is
        ``'train'`` or ``'test'``; the ``Id`` level is each file's own row
        index as produced by ``pd.read_csv``.
    """
    raw_train = pd.read_csv(os.path.join(data_dir, 'train_prepared_light.csv'))
    raw_test = pd.read_csv(os.path.join(data_dir, 'test_prepared_light.csv'))
    # `keys` adds the outer index level; `names` labels both levels in one step.
    return pd.concat([raw_train, raw_test], keys=['train', 'test'], names=['Dataset', 'Id'])
    
df = read()

In [None]:
pp = samlib.Pipeline(df.copy())  
assert pp == df  # the pipeline output equals df

In [None]:
def select_categorical_features(df):
    """Return only the columns of ``df`` whose dtype is ``object``."""
    is_object_column = df.dtypes == object
    return df.loc[:, is_object_column]

pp.append(select_categorical_features)

## Categorical features


We've got 42 categorical features. We can use the `describe` method to get some statistics:

In [None]:
pp().shape

In [None]:
pp().describe()

In [None]:
pp().isnull().sum()

**Number of categories per feature**

In [None]:
plt.figure(figsize=(12, 10))
pp().describe().loc['unique'].sort_values(ascending=False).plot(kind='barh')
plt.title('Number of categories per feature')

**Number of nulls per feature**

In [None]:
nulls = pp().isnull().sum()

In [None]:
plt.figure(figsize=(12, 10))
ax = nulls[nulls > 0].sort_values(ascending=False).plot(kind='barh')
plt.title('Number of nulls per feature')

That's a lot of numbers to digest, so let's get started plotting! To help with plotting, and also to improve linear regression models, we're going to standardize our data (see http://sebastianraschka.com/Articles/2014_about_feature_scaling.html for background on feature scaling). But before that we must deal with the NaN values.

### Deal with NaN values 
Based on the descriptions, most of the null values are not missing values but an extra "none" category (e.g. no pool, no alley, etc.). In the graph above, this is true for all features from 'BsmtFinType1' onwards.

In [None]:
def replace_bad_nulls(df):
    """Fill nulls that stand for an absent feature with the label 'none'.

    Columns containing at least one null are ordered by ascending null
    count. Every column from 'BsmtFinType1' to the end of that ordering is
    passed to ``samlib.fillna`` together with a filler that always yields
    'none'.
    """
    null_counts = df.isnull().sum()
    columns_with_nulls = null_counts[null_counts > 0]
    ascending_by_count = columns_with_nulls.sort_values()
    # Label-based slice: keep 'BsmtFinType1' and everything ranked after it.
    # NOTE(review): columns tied on null count with 'BsmtFinType1' may land on
    # either side of the cut depending on sort tie order - confirm no ties.
    none_columns = ascending_by_count.loc['BsmtFinType1':].index
    return samlib.fillna(df, lambda x: 'none', none_columns)

pp.append(replace_bad_nulls)

**Replace true nulls with mode**
(work in progress)

In [None]:
def fill_nulls_with_mode(df):
    """Fill the remaining nulls through ``samlib.fillna`` using the mode.

    The filler returns the entry labelled 0 of ``mode()`` for whatever
    object ``samlib.fillna`` hands it (presumably one column at a time -
    confirm against ``samlib``).
    """
    def first_mode(values):
        return values.mode().loc[0]

    return samlib.fillna(df, first_mode)
pp.append(fill_nulls_with_mode)

# Explore categories
See http://seaborn.pydata.org/tutorial/categorical.html for some ideas

## Plot value counts 
Gives an idea of the distribution of values for each categorical variable. We can see that some categories, such as 'Condition2', are almost constant so are unlikely to have a large impact on predicting the sale price. 

In [None]:
df = pp()
df.shape

In [None]:
def plot_value_counts(data=None, **kwargs):
    """Draw a horizontal count plot for the first column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame whose first column is counted. When omitted, the notebook-level
        ``df`` is looked up at call time. The previous ``data=df`` default was
        evaluated once at definition time, so it kept pointing at the old
        frame after ``df`` was rebound.
    **kwargs
        Forwarded unchanged to ``seaborn.countplot``.

    Returns
    -------
    The value returned by ``seaborn.countplot``.
    """
    if data is None:
        data = df  # late-bound: resolves to the current notebook-level frame
    return sns.countplot(y=data.columns[0], data=data, **kwargs)

In [None]:
# Order the columns from most to fewest distinct categories
colz_ordered_by_unique = df.describe().loc['unique'].sort_values(ascending=False).index
# `DataFrame.reindex_axis` was removed in pandas 1.0; `reindex(columns=...)` is the equivalent call.
df = df.reindex(columns=colz_ordered_by_unique)

In [None]:
samlib.featureplot(df, nrows=7, ncols=3, figsize=(12, 4), plotfunc=plot_value_counts)

## Plot sale price distribution for each category

In [None]:
df.loc['train', 'LogSalePrice'] = logtarget.values

In [None]:
df.head()

In [None]:
train = df.loc['train']

In [None]:
def plot_price_dist(data, train=None, **kwargs):
    """Plot the log sale price distribution for each category of a feature.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose first column is the categorical feature to plot.
    train : pandas.DataFrame, optional
        Frame providing the 'LogSalePrice' column, which is attached to a
        copy of ``data`` by index alignment. When omitted, the notebook-level
        ``train`` is looked up at call time rather than being frozen when the
        function is defined.
    **kwargs
        Forwarded unchanged to ``seaborn.violinplot``.

    Returns
    -------
    None
    """
    if train is None:
        # The parameter shadows the notebook-level name, hence the explicit lookup.
        train = globals()['train']
    col = data.columns[0]
    dg = data.copy()  # keep the caller's frame free of the added price column
    dg['LogSalePrice'] = train['LogSalePrice']
    # Order categories by median price, highest first. Only the price column
    # is aggregated, so other non-numeric columns in `data` cannot make the
    # median fail.
    ordered_cats = dg.groupby(col)['LogSalePrice'].median().sort_values(ascending=False).index
    sns.violinplot(x="LogSalePrice", y=col, data=dg, order=ordered_cats, scale='width', **kwargs)

In [None]:
samlib.featureplot(train, nrows=7, ncols=3, figsize=(12, 8), plotfunc=plot_price_dist)

## Change dtypes to `category`

In [None]:
def object_to_category(df):
    """Return a copy of ``df`` with every ``object`` column recast as ``category``.

    Columns of any other dtype are carried over unchanged, and the input
    frame itself is not modified.
    """
    converted = df.copy()
    for name, dtype in zip(df.columns, df.dtypes):
        if dtype != object:
            continue
        converted[name] = df[name].astype('category')
    return converted

pp.append(object_to_category)
assert all(pp().dtypes == 'category')

## Create dummy variables

In [None]:
pd.get_dummies(pp())