In [4]:
# Use this notebook for feature selection.
# First, look at correlation with target variable.  Remove outliers.

In [5]:
# Hide the notebook's input (code) cells by default and render a link that
# toggles them back on.
# HTML is imported in this cell because the notebook's main import cell comes
# after it; without the local import a fresh "Restart & Run All" fails here
# with a NameError.
# NOTE(review): the script relies on jQuery (`$`) and the `div.input` selector,
# which presumably only exist in the classic Notebook front end -- confirm
# before relying on it in JupyterLab.
from IPython.display import HTML

HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
The raw code for this IPython notebook is by default hidden for easier reading.
To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.''')

In [6]:
import pandas as pd
import numpy as np
import scipy as sp
from scipy import stats
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from ipywidgets import interact, interact_manual
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mp
import seaborn as sns
from IPython.display import HTML

In [171]:
# Load the cleaned training and test sets; the first CSV column becomes the index.
train = pd.read_csv('../data/clean_train.csv', index_col=0)
test = pd.read_csv('../data/clean_test.csv', index_col=0)

##### initial modifications #####

# Name of the column being predicted.
target = 'SalePrice'

# From here on the training target holds the natural log of the original values.
train = train.assign(**{target: np.log(train[target])})


##### feature classification #####

# Split the training columns by dtype: numeric columns vs. everything else.
numericals = train.select_dtypes(include=[np.number])
categoricals = train.select_dtypes(exclude=[np.number])

# Non-numeric features side by side with the (numeric) target column.
cat_target = pd.concat([train[[target]], categoricals], axis=1)

# Feature-name lists; the target is numeric, so it is dropped from the numeric list.
num_list = numericals.drop(columns=target).columns.tolist()
cat_list = categoricals.columns.tolist()


### Categorical Variable Classification
# Placeholders: nothing in this cell fills these lists.
nominals = []
ordinals = []



In [163]:
def masking(df, feature, outliers=True, zeroes=True, threshold=3):
    """Return the rows of ``df`` selected by the zero-value and outlier filters.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to filter. It is never modified in place.
    feature : str
        Name of the numeric column the filters are evaluated on.
    outliers : bool, default True
        When True, outliers are kept. When False, rows whose absolute z-score
        of ``feature`` is greater than ``threshold`` are dropped. The z-scores
        are computed after the zero filter has been applied.
    zeroes : bool, default True
        When True, rows where ``feature`` equals 0 are kept; when False they
        are dropped.
    threshold : float, default 3
        Absolute z-score above which a row counts as an outlier.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when no filter is requested, otherwise the filtered rows.
    """
    if not zeroes:
        # A boolean mask selects each row at most once; reselecting by index
        # label would duplicate rows when the index has repeated labels.
        df = df.loc[df[feature] != 0]

    if not outliers and len(df) > 0:
        values = np.asarray(df[feature], dtype=float)
        # With the default nan_policy a single NaN turns every z-score into
        # NaN, which silently disables the filter. 'omit' computes the scores
        # from the non-NaN values only.
        z_scores = stats.zscore(values, nan_policy='omit')
        with np.errstate(invalid='ignore'):
            # NaN scores compare as False, so rows with a missing value are kept.
            is_outlier = np.abs(z_scores) > threshold
        df = df.loc[~is_outlier]

    return df




In [164]:
# Correlation with SalePrice
s = 3    # NOTE: not read by corr_vis in this cell; masking() uses its own cutoff
fs = 15  # font size for titles, axis labels and tick labels


# NOTE(review): a later cell defines another function named corr_vis, which
# rebinds this name in the kernel namespace.
@interact
def corr_vis(feature=num_list, IncludeOutliers=True, IncludeZeroValues=True):
    """Plot SalePrice against a numeric feature, next to the feature's histogram.

    Parameters
    ----------
    feature : str
        Column of ``train`` to plot; offered as a dropdown built from ``num_list``.
    IncludeOutliers : bool
        Passed to ``masking`` as ``outliers``.
    IncludeZeroValues : bool
        Passed to ``masking`` as ``zeroes``.
    """
    dataset = masking(df = train, feature=feature, outliers=IncludeOutliers, zeroes=IncludeZeroValues)
    target = 'SalePrice'
    dfy = dataset[target]
    dfx = dataset[feature]

    # Tick label sizes are taken from rcParams when the axes are created, so
    # they must be set before plt.subplots for this figure to pick them up.
    # These settings are global and persist for later figures in the session.
    plt.rc('xtick', labelsize=fs)
    plt.rc('ytick', labelsize=fs)
    f, (ax1, ax2) = plt.subplots(1, 2, sharey=False, figsize=(20,7.5))

    # Left panel: target vs. feature.
    ax1.scatter(x=dfx, y=dfy)
    ax1.set_title('SalePrice vs {}'.format(feature), fontsize = fs)
    ax1.set_xlabel(feature, fontsize = fs)
    ax1.set_ylabel('SalePrice', fontsize = fs)

    # Right panel: distribution of the feature after filtering.
    ax2.hist(x=dfx)
    ax2.set_title('Distribution of {}'.format(feature), fontsize = fs)
    ax2.set_xlabel(feature, fontsize = fs)
    ax2.set_ylabel('Count', fontsize = fs)

    
    

interactive(children=(Dropdown(description='feature', options=('LotFrontage', 'LotArea', 'YearBuilt', 'YearRem…

In [165]:
# Notes
# FullBath is strictly increasing, and can probably be treated as ordinal categorical
# X3SsnPorch's 0 values greatly span past the maximum SalePrice.  It makes me think this feature isn't necessary.
# MoSold: most sales happen in the summer, but there isn't much effect on the SalePrice
# GarageCars would be strictly increasing were it not for category 4 (4-car-garage)
# 

In [168]:
# Correlation plots
# For the selected data set (train or test), subplots are laid out in a 1 x 2 grid:
# a scatter plot of feature_y against feature_x on the left, and a histogram of
# feature_x on the right.  A summary table for feature_x is printed below the plots.
s = 3    # NOTE: not read by corr_vis in this cell; masking() uses its own cutoff
fs = 15  # font size for titles, axis labels and tick labels
m = 12   # figure width
n = 5    # figure height

setlist = ['train', 'test']

@interact
def corr_vis(feature_y=num_list, feature_x = num_list, IncludeOutliers=True, IncludeZeroValues=True, DatasetSelection=setlist):
    """Plot one numeric feature against another and report their correlation.

    Parameters
    ----------
    feature_y, feature_x : str
        Columns for the y and x axes; offered as dropdowns built from ``num_list``.
    IncludeOutliers : bool
        Passed to ``masking`` as ``outliers`` (evaluated on ``feature_x``).
    IncludeZeroValues : bool
        Passed to ``masking`` as ``zeroes`` (evaluated on ``feature_x``).
    DatasetSelection : str
        ``'train'`` selects ``train``; any other value selects ``test``.
    """
    dataselection = train if DatasetSelection == 'train' else test
    feature = feature_x
    target = feature_y

    dataset = masking(df = dataselection, feature=feature, outliers=IncludeOutliers, zeroes=IncludeZeroValues)

    dfy = dataset[target]
    dfx = dataset[feature]

    f, axarr = plt.subplots(1,2, sharey=False,sharex=False, figsize=(m,n))

    axarr[0].scatter(x=dfx, y=dfy)
    axarr[0].set_title('{} vs {}'.format(target,feature), fontsize = fs)
    axarr[0].set_xlabel(feature, fontsize = fs)
    axarr[0].set_ylabel(target, fontsize = fs)

    axarr[1].hist(x=dfx)
    axarr[1].set_title('Distribution of {} {}ing Set'.format(feature, DatasetSelection), fontsize = fs)
    axarr[1].set_xlabel(feature, fontsize = fs)
    axarr[1].set_ylabel('Count', fontsize = fs)

    # Set the tick label size on these axes directly so the figure does not
    # depend on global rc settings made by other cells.
    for ax in axarr:
        ax.tick_params(axis='both', labelsize=fs)

    plt.tight_layout()

    tbl = dataset[[feature]].describe().round(2).T
    # Correlate only the two selected columns. Building the full correlation
    # matrix of the frame is wasted work for a single value and fails on
    # non-numeric columns in recent pandas versions.
    tbl['Corr with {}'.format(target)] = round(dfx.corr(dfy), 4)
    print(tbl)

interactive(children=(Dropdown(description='feature_y', options=('LotFrontage', 'LotArea', 'YearBuilt', 'YearR…

In [167]:
# Effect of chosen categorical variable on Median SalePrice
# Darker colors indicate higher frequency of category
cat_list = categoricals.columns.tolist()
@interact
def pivots(feature=cat_list):
    """Plot median SalePrice per category next to the category counts.

    Parameters
    ----------
    feature : str
        Categorical column of ``train``; offered as a dropdown built from
        ``cat_list``.
    """
    # Color ramp from light red (lowest count) to dark red (highest count).
    data_normalizer = mp.colors.Normalize()
    color_map = mp.colors.LinearSegmentedColormap(
        "my_map",
        {
        "red": [(0, 1.0, 1.0),
                (1.0, .5, .5)],
        "green": [(0, 0.5, 0.5),
                  (1.0, 0, 0)],
        "blue": [(0, 0.50, 0.5),
                 (1.0, 0, 0)]
        }
    )

    # Number of training rows in each category of the feature.
    data_train = train.groupby(feature)[feature].count()

    # One color per category, scaled between the smallest and largest count.
    # NOTE(review): the colors are matched to the bars by position, which
    # assumes the pivot table below lists the categories in the same order
    # as this groupby -- confirm.
    colors=color_map(data_normalizer((data_train.values)))

    fs = 15

    # Tick label sizes are taken from rcParams when the axes are created, so
    # they must be set before plt.subplots for this figure to pick them up.
    # These settings are global and persist for later figures in the session.
    plt.rc('xtick', labelsize=fs)
    plt.rc('ytick', labelsize=fs)

    f, (ax1, ax2) = plt.subplots(1, 2, sharey=False, figsize=(20,7.5))

    # The string 'median' avoids the deprecation warning recent pandas
    # versions raise for aggfunc=np.median; the result is the same.
    condition_pivot_train = train.pivot_table(index=feature, values='SalePrice', aggfunc='median').reset_index()
    sns.barplot(x=feature,y='SalePrice', palette = colors, data=condition_pivot_train, ax=ax1)
    ax1.set_title('Median SalePrice vs {}'.format(feature), fontsize = fs)
    ax1.set_xlabel(feature, fontsize = fs)
    ax1.set_ylabel('Median SalePrice', fontsize = fs)

    ax2.bar(x=data_train.index, height=data_train)
    ax2.set_title('Distribution of {}'.format(feature), fontsize = fs)
    ax2.set_xlabel(feature, fontsize = fs)
    ax2.set_ylabel('Count', fontsize = fs)


interactive(children=(Dropdown(description='feature', options=('MSSubClass', 'MSZoning', 'Street', 'Alley', 'L…