In [1]:
# Use this notebook for feature selection.
# First, look at correlation with target variable.  Remove outliers.

In [21]:
import pandas as pd
import numpy as np
import scipy as sp
from scipy import stats
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from ipywidgets import interact, interact_manual
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mp
import seaborn as sns

In [19]:
# Load the cleaned training and test sets; the first CSV column becomes the index.
train = pd.read_csv('../data/clean_train.csv', index_col=0)
test = pd.read_csv('../data/clean_test.csv', index_col=0)

# Target variable: natural log of the training SalePrice column.
target = np.log(train['SalePrice'])

##### feature classification #####

# Split the training columns by dtype: numeric columns vs. everything else.
numericals = train.select_dtypes(include=[np.number])
categoricals = train.select_dtypes(exclude=[np.number])

# Raw (un-logged) SalePrice side by side with the non-numeric columns.
cat_target = pd.concat([train[['SalePrice']], categoricals], axis=1)

# Column-name lists used by the widgets below; SalePrice is left out of the
# numeric list.
num_list = numericals.columns.drop('SalePrice').tolist()
cat_list = categoricals.columns.tolist()


### Categorical Variable Classification
# Placeholders for the nominal / ordinal split; both start out empty here.
nominals, ordinals = [], []



In [44]:
# Correlation with log(SalePrice)
s = 3    # z-score cutoff: rows with |z| > s are treated as outliers
fs = 15  # font size for titles, axis labels and tick labels


@interact
def corr_vis(feature=num_list, outliers=True):
    """Plot one numeric feature against the log sale price, next to its histogram.

    Parameters
    ----------
    feature : str
        Column of ``train`` to plot. ``interact`` receives ``num_list`` as the
        default and offers its entries as the choices.
    outliers : bool
        When True every row is plotted. When False, rows whose ``feature``
        z-score exceeds ``s`` in absolute value are dropped from both panels.

    Notes
    -----
    The y values come from ``target``, i.e. ``np.log(train['SalePrice'])``, so
    the scatter panel is labelled as log(SalePrice).
    """
    x = train[feature]
    y = target

    if not outliers:
        # Population z-score (ddof=0, the same convention as scipy.stats.zscore),
        # computed with pandas so that a missing value is skipped instead of
        # turning every z-score into NaN and silently disabling the filter.
        z = (x - x.mean()) / x.std(ddof=0)
        keep = ~(z.abs() > s)
        x, y = x[keep], y[keep]

    # Set the tick-label defaults BEFORE the figure exists so the axes created
    # below are built with them. plt.rc changes global defaults, so later
    # figures in the same kernel are affected as well.
    plt.rc('xtick', labelsize=fs)
    plt.rc('ytick', labelsize=fs)

    f, (ax1, ax2) = plt.subplots(1, 2, sharey=False, figsize=(20, 7.5))

    # Left panel: feature vs. log sale price.
    ax1.scatter(x=x, y=y)
    ax1.set_title('log(SalePrice) vs {}'.format(feature), fontsize=fs)
    ax1.set_xlabel(feature, fontsize=fs)
    ax1.set_ylabel('log(SalePrice)', fontsize=fs)

    # Right panel: distribution of the (optionally filtered) feature.
    ax2.hist(x=x)
    ax2.set_title('Distribution of {}'.format(feature), fontsize=fs)
    ax2.set_xlabel(feature, fontsize=fs)
    ax2.set_ylabel('Count', fontsize=fs)

    
    

interactive(children=(Dropdown(description='feature', options=('MSSubClass', 'LotFrontage', 'LotArea', 'Overal…

In [9]:
# Notes
# FullBath is strictly increasing, and can probably be treated as ordinal categorical
# X3SsnPorch: the houses with a value of 0 already span the full SalePrice range, reaching past the maximum of the nonzero houses.  It makes me think this feature isn't necessary.
# MoSold: most sales happen in the summer, but there isn't much effect on the SalePrice
# GarageCars would be strictly increasing were it not for category 4 (4-car-garage)
# 

In [242]:
# Correlation plots
# Subplots are laid out in a 2 x 2 grid, with training data set on top, and test data set on bottom.
# NOTE: this reuses the name corr_vis from the SalePrice cell; whichever cell
# ran last is the definition left bound in the kernel.
s = 3     # z-score cutoff: rows with |z| > s are treated as outliers
fs = 15   # font size for titles and axis labels
m = 10    # figure width passed to figsize
n = 7.5   # figure height passed to figsize


@interact
def corr_vis(feature_y=num_list, feature_x = num_list, outliers=True):
    """Compare two numeric features on the training and the test set.

    Draws a 2 x 2 grid: the top row uses ``train``, the bottom row uses
    ``test``. In each row the left panel is a scatter of ``feature_y`` against
    ``feature_x`` and the right panel is a histogram of ``feature_x``.
    Afterwards a one-row summary table for ``feature_x`` is printed.

    Parameters
    ----------
    feature_y : str
        Column plotted on the y axis of the scatter panels.
    feature_x : str
        Column plotted on the x axis of the scatter panels and in both
        histograms; also the column the outlier filter is based on.
    outliers : bool
        When True every row is plotted. When False, rows whose ``feature_x``
        z-score exceeds ``s`` in absolute value are dropped; the z-scores are
        computed separately for ``train`` and ``test``.

    Notes
    -----
    The printed table (``describe()`` plus the correlation with
    ``feature_y``) is always computed on the full training set, regardless of
    ``outliers``.
    """
    def _inliers(column):
        # Population z-score (ddof=0, the same convention as scipy.stats.zscore),
        # computed with pandas so a missing value is skipped instead of making
        # every z-score NaN and silently disabling the filter.
        z = (column - column.mean()) / column.std(ddof=0)
        return ~(z.abs() > s)

    x, y = train[feature_x], train[feature_y]
    xtest, ytest = test[feature_x], test[feature_y]

    if not outliers:
        keep_train = _inliers(x)
        keep_test = _inliers(xtest)
        x, y = x[keep_train], y[keep_train]
        xtest, ytest = xtest[keep_test], ytest[keep_test]

    f, axarr = plt.subplots(2, 2, sharey=False, sharex=False, figsize=(m, n))

    # Figure-level title. plt.title would only title the last subplot, whose
    # title is then replaced by that subplot's own set_title call.
    f.suptitle('{} vs {}'.format(feature_y, feature_x), fontsize=fs)

    rows = (('TRAINING SET', x, y), ('TEST SET', xtest, ytest))
    for row, (label, xs, ys) in enumerate(rows):
        scatter_ax, hist_ax = axarr[row, 0], axarr[row, 1]

        scatter_ax.scatter(x=xs, y=ys)
        scatter_ax.set_xlabel(feature_x, fontsize=fs)
        scatter_ax.set_ylabel(feature_y, fontsize=fs)

        # Both histograms show feature_x, so both are labelled with feature_x.
        hist_ax.hist(x=xs)
        hist_ax.set_title('Distribution of {} {}'.format(feature_x, label), fontsize=fs)
        hist_ax.set_xlabel(feature_x, fontsize=fs)
        hist_ax.set_ylabel('Count', fontsize=fs)

    # Leave room at the top for the figure title.
    f.tight_layout(rect=(0, 0, 1, 0.95))

    tbl = train[[feature_x]].describe().round(2).T
    # Correlate just the two selected columns instead of building the full
    # correlation matrix of train on every widget change.
    pair_corr = train[feature_x].corr(train[feature_y])
    tbl['Corr with {}'.format(feature_y)] = round(pair_corr, 4)
    print(tbl)

interactive(children=(Dropdown(description='feature_y', options=('LotFrontage', 'LotArea', 'YearBuilt', 'YearR…

In [43]:
# Effect of chosen categorical variable on Median SalePrice
# Darker colors indicate higher frequency of category
cat_list = categoricals.columns.tolist()
@interact
def pivots(feature=cat_list):
    """Show median SalePrice and row count per category of one categorical feature.

    Left panel: bar chart of the median ``SalePrice`` per category, sorted by
    ascending median and shaded by how many training rows the category has
    (darker means more rows). Right panel: the row count per category, drawn
    in the same category order as the left panel.

    Parameters
    ----------
    feature : str
        Categorical column of ``train`` to summarise. ``interact`` receives
        ``cat_list`` as the default and offers its entries as the choices.
    """
    fs = 15

    # Light red (low count) to dark red (high count).
    data_normalizer = mp.colors.Normalize()
    color_map = mp.colors.LinearSegmentedColormap(
        "my_map",
        {
        "red": [(0, 1.0, 1.0),
                (1.0, .5, .5)],
        "green": [(0, 0.5, 0.5),
                  (1.0, 0, 0)],
        "blue": [(0, 0.50, 0.5),
                 (1.0, 0, 0)]
        }
    )

    # Median SalePrice per category, ascending; this fixes the bar order.
    condition_pivot_train = (
        train.pivot_table(index=feature, values='SalePrice', aggfunc='median')
        .reset_index()
        .sort_values('SalePrice')
    )
    order = condition_pivot_train[feature].tolist()

    # Rows per category, re-ordered to match the bars. Without the reindex the
    # counts follow the category's sort order while the bars follow the median
    # order, so counts and colours end up on the wrong categories.
    data_train = train[feature].value_counts().reindex(order)
    colors = color_map(data_normalizer(data_train.values))

    # Set the tick-label defaults BEFORE the figure exists so the axes created
    # below are built with them. plt.rc changes global defaults, so later
    # figures in the same kernel are affected as well.
    plt.rc('xtick', labelsize=fs)
    plt.rc('ytick', labelsize=fs)

    f, (ax1, ax2) = plt.subplots(1, 2, sharey=False, figsize=(20,7.5))

    sns.barplot(x=feature, y='SalePrice', palette=colors, order=order,
                data=condition_pivot_train, ax=ax1)
    ax1.set_title('Median SalePrice vs {}'.format(feature), fontsize = fs)
    ax1.set_xlabel(feature, fontsize = fs)
    ax1.set_ylabel('Median SalePrice', fontsize = fs)

    ax2.bar(x=order, height=data_train.values)
    ax2.set_title('Distribution of {}'.format(feature), fontsize = fs)
    ax2.set_xlabel(feature, fontsize = fs)
    ax2.set_ylabel('Count', fontsize = fs)

    # Rotate the category labels on both panels. plt.xticks only acts on the
    # current axes, so it never reached the left panel.
    for ax in (ax1, ax2):
        ax.tick_params(axis='x', labelrotation=70)


interactive(children=(Dropdown(description='feature', options=('MSZoning', 'Street', 'Alley', 'LotShape', 'Lan…

In [42]:
# Distinct SaleType values present in the training set.
train['SaleType'].unique()

array(['WD', 'New', 'COD', 'ConLD', 'ConLI', 'CWD', 'ConLw', 'Con', 'Oth'],
      dtype=object)