In [4]:
# Hide all code input cells when the notebook is rendered, and offer a link
# that toggles them back on.
# HTML is imported in this cell because the notebook's main import cell comes
# later; without it a fresh "Restart & Run All" fails here with a NameError.
from IPython.display import HTML

# The HTML object is the cell's last expression, so the notebook renders it.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
The raw code for this IPython notebook is by default hidden for easier reading.
To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.''')

In [13]:
# Use this notebook for feature selection.
import pandas as pd
import numpy as np
import scipy as sp
from scipy import stats
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from ipywidgets import interact, interact_manual
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mp
import seaborn as sns
from IPython.display import HTML

##### read in data #####
train = pd.read_csv('clean_data/train_fe.csv', index_col=0)
# NOTE(review): `test` is read from the very same file as `train`
# ('clean_data/train_fe.csv'), so both frames start out identical. This looks
# like a copy/paste slip -- confirm whether a separate test CSV was intended.
test = pd.read_csv('clean_data/train_fe.csv', index_col=0)

##### initial modifications #####

# set target (name of the column to be predicted)
target = 'SalePrice'

# train without log of target: untouched copy taken before the transform below
train_original = train.copy()

# take log of target variable (natural log, applied to `train` only;
# `test` and `train_original` keep the raw values)
train[target] = np.log(train[target])

##### feature classification #####

# separate data frames with numerical and categorical features
# (split purely by dtype: numeric columns vs. everything else)
numericals = train.select_dtypes(include=[np.number])
categoricals = train.select_dtypes(exclude=[np.number])

# categorical feature data frame including target (numerical) variable
cat_target = pd.concat([train[['SalePrice']],categoricals], axis=1)

# lists of numerical and categorical features
# (num_list leaves out the target column; these lists feed the widget dropdowns)
num_list = numericals.drop('SalePrice', axis = 1).columns.tolist()
cat_list = categoricals.columns.tolist()

##### Global Variables #####
# Both values are read as notebook-level globals by the functions defined below.
s = 3 # number of std deviations to exclude (z-score cutoff used by masking)
fs = 15 # universal fontsize for plot titles, axis labels and tick labels


# functions
def masking(df, feature, outliers=True, zeros=True, n_std=None):
    """Return ``df`` with optional row filters applied on a single column.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to filter. It is never modified; a filtered selection is returned.
    feature : str
        Name of the column the filters are evaluated on.
    outliers : bool, default True
        Keep outliers. When False, rows whose ``feature`` value lies more than
        ``n_std`` standard deviations from the column mean are removed.
    zeros : bool, default True
        Keep zero values. When False, rows where ``feature`` equals 0 are
        removed (this happens before the outlier step, so the mean and
        standard deviation are computed without the zeros).
    n_std : float, optional
        Z-score cutoff for the outlier filter. Defaults to the notebook-level
        global ``s``.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when both flags are True, otherwise the filtered rows.

    Notes
    -----
    Missing values are ignored when computing the mean and the (population)
    standard deviation, and rows with a missing ``feature`` are kept. A single
    NaN therefore no longer turns every z-score into NaN and silently
    disables the outlier filter.
    """
    if not zeros:
        # Boolean mask instead of an index-label lookup: selects the same rows
        # and stays correct if the index contains repeated labels.
        df = df.loc[df[feature] != 0]

    if not outliers:
        # Resolve the global lazily so it is only needed when actually filtering.
        cutoff = s if n_std is None else n_std
        column = df[feature]
        # ddof=0 matches the population std used by scipy.stats.zscore.
        z_scores = (column - column.mean()) / column.std(ddof=0)
        # NaN z-scores compare False, so those rows are retained.
        df = df.loc[~(z_scores.abs() > cutoff)]

    return df




def get_corr_list(df, target=None, thresh=.5):
    """List pairs of numerical columns whose correlation exceeds a threshold.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to analyse. Non-numeric columns are ignored.
    target : str or list of str, optional
        Column(s) to drop before correlating (typically the target variable).
        Nothing is dropped when omitted.
    thresh : float, default 0.5
        Absolute correlation a pair must strictly exceed to be reported.

    Returns
    -------
    pandas.Series
        Correlations indexed by a two-level (column, column) MultiIndex,
        sorted from most positive to most negative. Each unordered pair
        appears exactly once.
    """
    drop_cols = [] if target is None else target

    # Correlate numeric columns only; DataFrame.corr raises on object columns
    # in recent pandas versions.
    numeric = df.drop(drop_cols, axis=1).select_dtypes(include=[np.number])
    corr = numeric.corr()

    # Keep only the strictly-lower triangle: this removes the diagonal
    # (self-correlation) and the mirrored duplicate of every pair by position.
    # Filtering by value instead would also discard genuinely perfect
    # correlations and unrelated pairs that happen to share a coefficient.
    lower_triangle = np.tril(np.ones(corr.shape, dtype=bool), k=-1)
    pairs = corr.where(lower_triangle).unstack().dropna()

    # get list with threshold
    pairs = pairs[pairs.abs() > thresh]

    return pairs.sort_values(ascending=False)


Correlation With Target Variable

In [17]:
# Correlation with SalePrice

@interact
def corr_vis(feature=num_list, IncludeOutliers=True, IncludeZeroValues=True):
    """Plot one numerical feature against the log-transformed SalePrice.

    Left panel: scatter of the feature vs. the target column of ``train``.
    Right panel: histogram of the feature. A one-row summary table
    (describe() statistics plus the correlation with the target) is printed.

    feature            :  numerical column chosen from the dropdown
    IncludeOutliers    :  untick to drop rows beyond ``s`` std deviations
    IncludeZeroValues  :  untick to drop rows where the feature equals 0
    """
    target = 'SalePrice'
    dataset = masking(df = train, feature=feature, outliers=IncludeOutliers, zeros=IncludeZeroValues)
    dfy = dataset[target]
    dfx = dataset[feature]

    # Tick sizes are set before the axes exist so this figure is created
    # with them already in effect.
    plt.rc('xtick', labelsize=fs)
    plt.rc('ytick', labelsize=fs)
    f, (ax1, ax2) = plt.subplots(1, 2, sharey=False, figsize=(20,7.5))

    ax1.scatter(x=dfx, y=dfy)
    ax1.set_title('Log(SalePrice) vs {}'.format(feature), fontsize = fs)
    ax1.set_xlabel(feature, fontsize = fs)
    ax1.set_ylabel('Log({})'.format(target), fontsize = fs)

    ax2.hist(x=dfx)
    ax2.set_title('Distribution of {}'.format(feature), fontsize = fs)
    ax2.set_xlabel(feature, fontsize = fs)
    ax2.set_ylabel('Count', fontsize = fs)

    # print small table
    tbl = dataset[[feature]].describe().round(2).T
    # Correlate just the two columns of interest. Building the full matrix
    # with dataset.corr() is wasted work on every widget change and raises on
    # the frame's non-numeric columns in recent pandas versions.
    tbl['Corr with {}'.format(target)] = round(dfx.corr(dfy), 4)
    print(tbl)

interactive(children=(Dropdown(description='feature', options=('MSSubClass', 'OverallQual', 'YearBuilt', 'Year…

In [7]:
# list of features that require a boolean support column
need_toggle = ['YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'ScreenPorch']

# list of features to drop after engineering (each name listed exactly once)
fe_drop = [
    'PoolArea', 'X3SsnPorch', 'GrLivArea', 'FirePlaces', 'HalfBath',
    'BsmtHalfBath', 'BsmtFullBath', 'LowQualFinSF', 'BsmtUnfSF',
    'EnclosedPorch', 'KitchenAbvGr', 'BedroomAbvGr', 'FullBath',
    'BsmtFinType2', 'TotalBsmtSF', 'X1stFlrSF', 'X2ndFlrSF',
    'TotRmsAbvGrd', 'LotArea', 'LotFrontage',
]

# check out FirePlaces correlation

In [8]:
# Notes
# FullBath is strictly increasing, and can probably be treated as ordinal categorical
# X3SsnPorch's 0 values greatly span past the maximum SalePrice.  It makes me think this feature isn't necessary.
# MoSold: most sales happen in the summer, but there isn't much effect on the SalePrice
# GarageCars would be strictly increasing were it not for category 4 (4-car-garage)
# 

Correlation Plots

In [9]:
# Correlation plots

setlist = ['train', 'test']

@interact
def corr_vis(feature_y=num_list, feature_x = num_list, IncludeOutliers=True, IncludeZeroValues=True, DatasetSelection=setlist):
    """Plot one numerical feature against another for the chosen dataset.

    Left panel: scatter of feature_y vs. feature_x. Right panel: histogram of
    feature_x. A one-row summary table for feature_x (describe() statistics
    plus its correlation with feature_y) is printed.

    feature_y          :  column shown on the y axis
    feature_x          :  column shown on the x axis; the outlier / zero
                          filters are evaluated on this column
    IncludeOutliers    :  untick to drop rows beyond ``s`` std deviations
    IncludeZeroValues  :  untick to drop rows where feature_x equals 0
    DatasetSelection   :  'train' uses ``train``; any other value uses ``test``
    """
    # figure sizing parameters
    m = 12
    n = 5

    # conditionals to switch between datasets
    if DatasetSelection == 'train':
        dataselection = train
    else:
        dataselection = test
    feature = feature_x
    target = feature_y

    # anchor dataset prior to plotting
    dataset = masking(df = dataselection, feature=feature, outliers=IncludeOutliers, zeros=IncludeZeroValues)

    # anchor plotting parameters
    dfy = dataset[target]
    dfx = dataset[feature]

    f, axarr = plt.subplots(1,2, sharey=False,sharex=False, figsize=(m,n))

    axarr[0].scatter(x=dfx, y=dfy)
    axarr[0].set_title('{} vs {}'.format(target,feature), fontsize = fs)
    axarr[0].set_xlabel(feature, fontsize = fs)
    axarr[0].set_ylabel(target, fontsize = fs)

    axarr[1].hist(x=dfx)
    axarr[1].set_title('Distribution of {} {}ing Set'.format(feature, DatasetSelection), fontsize = fs)
    axarr[1].set_xlabel(feature, fontsize = fs)
    axarr[1].set_ylabel('Count', fontsize = fs)

    plt.tight_layout()

    # print small table
    tbl = dataset[[feature]].describe().round(2).T
    # Correlate just the two selected columns. Building the full matrix with
    # dataset.corr() is wasted work on every widget change and raises on the
    # frame's non-numeric columns in recent pandas versions.
    tbl['Corr with {}'.format(target)] = round(dfx.corr(dfy), 4)
    print(tbl)

interactive(children=(Dropdown(description='feature_y', options=('MSSubClass', 'LotFrontage', 'LotArea', 'Over…

In [10]:
num_drop = ['GarageCars']
# Candidate engineered features:
# TotalSpaceSF = TotalBsmtSF + X1stFlrSF + X2ndFlrSF
# LotFA = np.sqrt(LotArea) + LotFrontage
# TotalBath = FullBath + HalfBath
# RmsPerLivSpc = TotRmsAbvGrd/(X1stFlrSF + X2ndFlrSF)
# FPperSF = FirePlaces/TotalSpaceSF

# Kept as the last expression so the notebook actually displays the result;
# placed before the assignment above, its return value was silently discarded.
get_corr_list(train, target = target)



Effect of Chosen Categorical Variable on Mean or Median SalePrice

In [16]:
# Effect of chosen categorical variable on mean or median SalePrice
# Darker colors indicate higher frequency of category
cat_list = categoricals.columns.tolist()
@interact
def pivots(feature=cat_list, Aggregate = ['Mean', 'Median']):
    """Compare SalePrice across the categories of one categorical feature.

    Left panel: mean or median SalePrice (original scale, from
    ``train_original``) per category, with bars shaded by how many rows each
    category has. Right panel: the row count per category.

    feature    :  categorical column chosen from the dropdown
    Aggregate  :  'Median' aggregates with the median; any other value
                  aggregates with the mean
    """
    # figure sizing parameters
    m = 20
    n = 7.5

    # Aggregations are passed to pandas by name; handing it the numpy
    # callables triggers a FutureWarning in recent pandas versions.
    if Aggregate == 'Median':
        agg_name = 'median'
    else:
        agg_name = 'mean'

    dfx = train_original

    # light red (rare category) -> dark red (frequent category)
    data_normalizer = mp.colors.Normalize()
    color_map = mp.colors.LinearSegmentedColormap(
        "my_map",
        {
        "red": [(0, 1.0, 1.0),
                (1.0, .5, .5)],
        "green": [(0, 0.5, 0.5),
                  (1.0, 0, 0)],
        "blue": [(0, 0.50, 0.5),
                 (1.0, 0, 0)]
        }
    )

    # rows per category, ordered by category label
    data_train = dfx.groupby(feature)[feature].count()

    colors=color_map(data_normalizer((data_train.values)))

    # Tick sizes are set before the axes exist so this figure is created with
    # them already in effect. The notebook-wide ``fs`` is used rather than a
    # local copy so every plot shares one font-size setting.
    plt.rc('xtick', labelsize=fs)
    plt.rc('ytick', labelsize=fs)

    f, (ax1, ax2) = plt.subplots(1, 2, sharey=False, figsize=(m,n))

    condition_pivot_train = dfx.pivot_table(index=feature, values='SalePrice', aggfunc=agg_name).reset_index()
    sns.barplot(x=feature,y='SalePrice', palette = colors, data=condition_pivot_train, ax=ax1)
    ax1.set_title('{} SalePrice vs {}'.format(Aggregate, feature), fontsize = fs)
    ax1.set_xlabel('{} Category'.format(feature), fontsize = fs)
    ax1.set_ylabel('{} SalePrice'.format(Aggregate), fontsize = fs)

    ax2.bar(x=data_train.index, height=data_train)
    ax2.set_title('Distribution of {} Categories'.format(feature), fontsize = fs)
    ax2.set_xlabel('{} Category'.format(feature), fontsize = fs)
    ax2.set_ylabel('Count', fontsize = fs)
    

interactive(children=(Dropdown(description='feature', options=('MSZoning', 'Street', 'Alley', 'LotShape', 'Lan…

In [12]:
# TODO: decide how each of these categorical features should be handled.

# categorical features to treat as ordinal
cat_ordinal = ['BsmtQual']

# categorical features to drop outright
cat_drop = [
    'BsmtCond',
    'RoofStyle',
    'Alley',
    'LandContour',
    'Utilities',
    'LandSlope',
    'RoofMatl',
    'BsmtFinType2',
    'Functional',
    'MiscFeature',
]

# categorical features that might be dropped (undecided)
cat_drop_maybe = [
    'Condition1',
    'Condition2',
    'BldgType',
    'ExterCond',
    'Fence',
    'SaleCondition',
]

# Binning ideas (one indicator per feature):
# bin LotConfig to be CulDSac or not
# bin Exterior1st and Exterior2nd to be VinylSd or not
# bin Foundation to be PConc or not
# bin BsmtFinType1 to be GLQ or not
# bin Electrical to be SBrkr or not




In [None]:
# Display the correlated numerical column pairs in train (target column
# dropped first) that pass get_corr_list's default threshold.
get_corr_list(train, target = target)