In [1]:
# Use this notebook for feature selection.
# First, look at correlation with target variable.  Remove outliers.

In [16]:
import pandas as pd
import numpy as np
import scipy as sp
from scipy import stats
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from ipywidgets import interact, interact_manual
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mp
import seaborn as sns

In [162]:
# Load the cleaned training and test sets; the first CSV column is used as the row index.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../data/clean_train.csv', '../data/clean_test.csv')
)


In [163]:
# Initial modifications: pull the target variable (SalePrice) out of the
# training frame as its own Series.
target = train.loc[:, 'SalePrice']


In [167]:
# Feature classification

# Split the training frame by dtype: numeric columns vs. everything else.
numericals = train.select_dtypes(include='number')
categoricals = train.select_dtypes(exclude='number')

# Non-numeric features side by side with the (numeric) target column.
cat_target = pd.concat([train[['SalePrice']], categoricals], axis='columns')

# Column-name lists; the target itself is left out of the numeric list.
num_list = numericals.columns.drop('SalePrice').tolist()
cat_list = categoricals.columns.tolist()


### Categorical Variable Classification
# Containers for nominal / ordinal feature names (left empty in this cell).
nominals, ordinals = [], []


In [7]:
# Correlation of every numeric column with the target variable:
# build the full correlation matrix, then keep only its SalePrice column.

num_corr = numericals.corr().loc[:, 'SalePrice']


In [170]:
# Correlation with SalePrice
s = 3    # z-score threshold: rows with |z| above this count as outliers
fs = 15  # font size for titles, axis labels and tick labels


@interact
def corr_vis(feature=num_list, outliers=True):
    """Plot SalePrice against one numeric feature, next to that feature's histogram.

    Parameters
    ----------
    feature : str
        Column of ``train`` to plot; ``interact`` offers the entries of
        ``num_list`` as a dropdown.
    outliers : bool
        If True, plot every row. If False, drop the rows whose ``feature``
        value lies more than ``s`` standard deviations from the column mean.
    """
    x = train[feature]
    y = target
    if not outliers:
        # NaN-aware population z-score (ddof=0, the same definition as
        # scipy.stats.zscore). scipy's default propagates NaN: a single missing
        # value turns every z-score into NaN, so no row would ever be flagged
        # and the filter would silently do nothing.
        z = (x - x.mean()) / x.std(ddof=0)
        keep = ~(z.abs() > s)
        x, y = x[keep], y[keep]

    f, (ax1, ax2) = plt.subplots(1, 2, sharey=False, figsize=(20, 7.5))

    # Left panel: target vs. feature.
    ax1.scatter(x=x, y=y)
    ax1.set_title('SalePrice vs {}'.format(feature), fontsize=fs)
    ax1.set_xlabel(feature, fontsize=fs)
    ax1.set_ylabel('SalePrice', fontsize=fs)

    # Right panel: distribution of the (possibly filtered) feature.
    ax2.hist(x=x)
    ax2.set_title('Distribution of {}'.format(feature), fontsize=fs)
    ax2.set_xlabel(feature, fontsize=fs)
    ax2.set_ylabel('Count', fontsize=fs)

    # Size the tick labels per axes instead of through plt.rc(), which rewrites
    # the global rcParams and leaks into every other figure in the session.
    for ax in (ax1, ax2):
        ax.tick_params(labelsize=fs)

    
    

interactive(children=(Dropdown(description='feature', options=('LotFrontage', 'LotArea', 'YearBuilt', 'YearRem…

In [9]:
# FullBath is strictly increasing, and can probably be treated as ordinal categorical
# X3SsnPorch: rows with a value of 0 span nearly the whole SalePrice range, up to the maximum SalePrice.  It makes me think this feature isn't necessary.
# MoSold: most sales happen in the summer, but there isn't much effect on the SalePrice
# GarageCars would be strictly increasing were it not for category 4 (4-car-garage)
# 

In [172]:
# Pairwise relationship between two numeric features, train (top row) vs. test (bottom row)
s = 3    # z-score threshold: rows with |z| above this count as outliers
fs = 15  # font size for titles and axis labels
m = 7.5  # figure width in inches
n = 7.5  # figure height in inches


@interact
def corr_vis(feature_y=num_list, feature_x = num_list, outliers=True):
    """Scatter ``feature_y`` against ``feature_x`` and histogram ``feature_x``.

    The top row is drawn from ``train`` and the bottom row from ``test``.

    Parameters
    ----------
    feature_y, feature_x : str
        Columns present in both ``train`` and ``test``; ``interact`` offers the
        entries of ``num_list`` as dropdowns.
    outliers : bool
        If True, plot every row. If False, drop the rows whose ``feature_x``
        value lies more than ``s`` standard deviations from the column mean
        (computed separately for train and test). ``feature_y`` is not used
        for filtering.
    """
    def _inliers(frame):
        """Boolean mask of the rows of `frame` whose feature_x z-score is within +/- s."""
        col = frame[feature_x]
        # NaN-aware population z-score (ddof=0, the same definition as
        # scipy.stats.zscore). scipy's default propagates NaN: one missing
        # value would turn every z-score into NaN and disable the filter.
        z = (col - col.mean()) / col.std(ddof=0)
        return ~(z.abs() > s)

    x, y = train[feature_x], train[feature_y]
    xtest, ytest = test[feature_x], test[feature_y]
    if not outliers:
        keep_train, keep_test = _inliers(train), _inliers(test)
        x, y = x[keep_train], y[keep_train]
        xtest, ytest = xtest[keep_test], ytest[keep_test]

    f, axarr = plt.subplots(2, 2, sharey=False, sharex=False, figsize=(m, n))

    # Figure-level title. plt.title() would land on the last-created axes
    # (bottom-right), where the set_title() call below would overwrite it.
    f.suptitle('{} vs {}'.format(feature_y, feature_x), fontsize=fs)

    # Top-left: train scatter.
    axarr[0,0].scatter(x=x, y=y)
    axarr[0,0].set_xlabel(feature_x, fontsize=fs)
    axarr[0,0].set_ylabel(feature_y, fontsize=fs)

    # Top-right: train distribution of feature_x.
    axarr[0,1].hist(x=x)
    axarr[0,1].set_title('Distribution of {}'.format(feature_x), fontsize=fs)
    axarr[0,1].set_xlabel(feature_x, fontsize=fs)
    axarr[0,1].set_ylabel('Count', fontsize=fs)

    # Bottom-left: test scatter.
    axarr[1,0].scatter(x=xtest, y=ytest)
    axarr[1,0].set_xlabel(feature_x, fontsize=fs)
    axarr[1,0].set_ylabel(feature_y, fontsize=fs)

    # Bottom-right: test distribution of feature_x. The data plotted is xtest,
    # so the title and axis label must name feature_x (not feature_y).
    axarr[1,1].hist(x=xtest)
    axarr[1,1].set_title('Distribution of {}'.format(feature_x), fontsize=fs)
    axarr[1,1].set_xlabel(feature_x, fontsize=fs)
    axarr[1,1].set_ylabel('Count', fontsize=fs)

    # Leave a strip at the top of the figure for the suptitle.
    plt.tight_layout(rect=(0, 0, 1, 0.95))


Id
838      21.000000
435      21.000000
1008     21.000000
650      21.000000
656      21.000000
364      21.000000
1379     21.000000
431      21.000000
236      21.000000
916      21.000000
233      21.000000
76       21.000000
228      21.000000
226      21.000000
490      21.000000
1292     21.000000
1220     21.000000
501      21.000000
1039     21.000000
1450     21.000000
1040     21.000000
1030     21.000000
615      21.000000
759      24.000000
963      24.000000
505      24.000000
196      24.000000
57       24.000000
1105     24.000000
194      24.000000
           ...    
1191    129.227039
530     129.766699
524     130.000000
430     130.000000
967     130.000000
1152    134.000000
160     134.000000
447     137.000000
1174    138.000000
1288    139.854396
278     140.000000
172     141.000000
808     144.000000
910     149.000000
314     150.000000
1212    152.000000
1338    153.000000
1183    160.000000
1108    168.000000
232     174.000000
198     174.000000
1128    1

In [186]:
# Correlation plots: two numeric features, train (top row) vs. test (bottom row).
# Note: this redefines `corr_vis` from the earlier cells, here with a wider figure.
s = 3    # z-score threshold: rows with |z| above this count as outliers
fs = 15  # font size for titles and axis labels
m = 10   # figure width in inches
n = 7.5  # figure height in inches


@interact
def corr_vis(feature_y=num_list, feature_x = num_list, outliers=True):
    """Scatter ``feature_y`` against ``feature_x`` and histogram ``feature_x``.

    The top row is drawn from ``train`` and the bottom row from ``test``.

    Parameters
    ----------
    feature_y, feature_x : str
        Columns present in both ``train`` and ``test``; ``interact`` offers the
        entries of ``num_list`` as dropdowns.
    outliers : bool
        If True, plot every row. If False, drop the rows whose ``feature_x``
        value lies more than ``s`` standard deviations from the column mean
        (computed separately for train and test). ``feature_y`` is not used
        for filtering.
    """
    def _inliers(frame):
        """Boolean mask of the rows of `frame` whose feature_x z-score is within +/- s."""
        col = frame[feature_x]
        # NaN-aware population z-score (ddof=0, the same definition as
        # scipy.stats.zscore). scipy's default propagates NaN: one missing
        # value would turn every z-score into NaN and disable the filter.
        z = (col - col.mean()) / col.std(ddof=0)
        return ~(z.abs() > s)

    x, y = train[feature_x], train[feature_y]
    xtest, ytest = test[feature_x], test[feature_y]
    if not outliers:
        keep_train, keep_test = _inliers(train), _inliers(test)
        x, y = x[keep_train], y[keep_train]
        xtest, ytest = xtest[keep_test], ytest[keep_test]

    f, axarr = plt.subplots(2, 2, sharey=False, sharex=False, figsize=(m, n))

    # Figure-level title. plt.title() would land on the last-created axes
    # (bottom-right), where the set_title() call below would overwrite it.
    f.suptitle('{} vs {}'.format(feature_y, feature_x), fontsize=fs)

    # Top-left: train scatter.
    axarr[0,0].scatter(x=x, y=y)
    axarr[0,0].set_xlabel(feature_x, fontsize=fs)
    axarr[0,0].set_ylabel(feature_y, fontsize=fs)

    # Top-right: train distribution of feature_x.
    axarr[0,1].hist(x=x)
    axarr[0,1].set_title('Distribution of {}'.format(feature_x), fontsize=fs)
    axarr[0,1].set_xlabel(feature_x, fontsize=fs)
    axarr[0,1].set_ylabel('Count', fontsize=fs)

    # Bottom-left: test scatter.
    axarr[1,0].scatter(x=xtest, y=ytest)
    axarr[1,0].set_xlabel(feature_x, fontsize=fs)
    axarr[1,0].set_ylabel(feature_y, fontsize=fs)

    # Bottom-right: test distribution of feature_x. The data plotted is xtest,
    # so the title and axis label must name feature_x (not feature_y).
    axarr[1,1].hist(x=xtest)
    axarr[1,1].set_title('Distribution of {}'.format(feature_x), fontsize=fs)
    axarr[1,1].set_xlabel(feature_x, fontsize=fs)
    axarr[1,1].set_ylabel('Count', fontsize=fs)

    # Leave a strip at the top of the figure for the suptitle.
    plt.tight_layout(rect=(0, 0, 1, 0.95))

    

interactive(children=(Dropdown(description='feature_y', options=('LotFrontage', 'LotArea', 'YearBuilt', 'YearR…

In [None]:
# Features singled out for exclusion -- presumably chosen from the plots above; TODO confirm.
exclusion_list = [
    'BsmtHalfBath',
    'MiscVal',
    'LowQualFinSF',
    'OverallCond',
    'KitchenAbvGr',
]

In [530]:
# All correlations of the categorical features with SalePrice.
# cat_target holds SalePrice plus the non-numeric columns, and Pearson correlation
# is undefined for strings: calling .corr() on it directly either raises
# (pandas >= 2.0) or silently drops every non-numeric column, leaving only
# SalePrice vs. itself. One-hot encode first so each category level becomes a
# 0/1 indicator column that can be correlated with the target.
# NOTE(review): the result is indexed by indicator column (e.g. '<feature>_<level>')
# plus 'SalePrice' itself -- confirm this is the intended granularity.
cat_target_corr = pd.get_dummies(cat_target, dtype=float).corr()['SalePrice']


In [111]:
# Effect of chosen categorical variable on Median SalePrice
cat_list = categoricals.columns.tolist()
@interact
def pivots(feature=cat_list):
    """Show median SalePrice per category next to the category counts.

    Parameters
    ----------
    feature : str
        Categorical column of ``train``; ``interact`` offers the entries of
        ``cat_list`` as a dropdown.

    The left panel is a bar chart of the median SalePrice for each category,
    with bars shaded by how many training rows fall in that category
    (light = few, dark = many). The right panel shows those counts directly.
    """
    fs = 15  # font size for titles, axis labels and tick labels

    # Light-red -> dark-red ramp: RGB (1.0, 0.5, 0.5) at 0 and (0.5, 0, 0) at 1.
    color_map = mp.colors.LinearSegmentedColormap(
        "my_map",
        {
        "red": [(0, 1.0, 1.0),
                (1.0, .5, .5)],
        "green": [(0, 0.5, 0.5),
                  (1.0, 0, 0)],
        "blue": [(0, 0.50, 0.5),
                 (1.0, 0, 0)]
        }
    )
    # With no explicit limits, Normalize rescales the counts it is given to [0, 1].
    data_normalizer = mp.colors.Normalize()

    # Number of training rows per category, and the matching bar colours.
    data_train = train.groupby(feature)[feature].count()
    colors = color_map(data_normalizer(data_train.values))

    f, (ax1, ax2) = plt.subplots(1, 2, sharey=False, figsize=(20, 7.5))

    # 'median' as a string: passing the np.median callable to pandas aggregations
    # is deprecated in recent pandas releases.
    # NOTE(review): colours are matched to bars by position, which assumes the
    # pivot table lists categories in the same order as the groupby above.
    condition_pivot_train = train.pivot_table(index=feature, values='SalePrice', aggfunc='median').reset_index()
    sns.barplot(x=feature, y='SalePrice', palette=colors, data=condition_pivot_train, ax=ax1)
    ax1.set_title('Median SalePrice vs {}'.format(feature), fontsize=fs)
    ax1.set_xlabel(feature, fontsize=fs)
    ax1.set_ylabel('Median SalePrice', fontsize=fs)

    ax2.bar(x=data_train.index, height=data_train)
    ax2.set_title('Distribution of {}'.format(feature), fontsize=fs)
    ax2.set_xlabel(feature, fontsize=fs)
    ax2.set_ylabel('Count', fontsize=fs)

    # Size the tick labels per axes instead of through plt.rc(), which rewrites
    # the global rcParams and leaks into every other figure in the session.
    for ax in (ax1, ax2):
        ax.tick_params(labelsize=fs)


interactive(children=(Dropdown(description='feature', options=('MSSubClass', 'MSZoning', 'Street', 'Alley', 'L…