In [1]:
# Use this notebook for feature selection.
import pandas as pd
import numpy as np
import scipy as sp
from scipy import stats
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from ipywidgets import interact, interact_manual
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mp
import seaborn as sns
from IPython.display import HTML

##### read in data #####
# index_col=0: the first column of each CSV is used as the row index
train = pd.read_csv('../data/clean_train.csv', index_col=0)
test = pd.read_csv('../data/clean_test.csv', index_col=0)

##### initial modifications #####

# name of the target column; used below instead of repeating the literal
target = 'SalePrice'

# snapshot of train taken before the target is log-transformed
train_original = train.copy()

# replace the target with its natural log (train_original keeps the raw values)
train[target] = np.log(train[target])

##### feature classification #####

# separate data frames with numerical and categorical features
numericals = train.select_dtypes(include=[np.number])
categoricals = train.select_dtypes(exclude=[np.number])

# categorical feature data frame including target (numerical) variable
cat_target = pd.concat([train[[target]], categoricals], axis=1)

# lists of numerical and categorical feature names (target excluded)
num_list = numericals.drop(target, axis=1).columns.tolist()
cat_list = categoricals.columns.tolist()

##### Global Variables #####
s = 3 # number of std deviations to exclude
fs = 15 # universal fontsize

In [64]:
# get the list of correlated numerical feature pairs in a dataset, given a correlation threshold

def get_corr_list(df, thresh = .5):
    """Return the strongly correlated column pairs of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to analyse. Only numeric and boolean columns take part in the
        correlation; every other column is ignored.
    thresh : float, default 0.5
        A pair is kept when its correlation is greater than ``thresh`` or
        less than ``-thresh``.

    Returns
    -------
    pandas.Series
        Correlation values indexed by a two-level MultiIndex of column names.
        Each unordered pair appears exactly once, with the column that comes
        first in ``df`` on the outer level. Self-correlations are never
        included; pairs whose correlation is NaN are dropped.
    """
    # Select numeric/boolean columns explicitly: DataFrame.corr in
    # pandas >= 2.0 raises on non-numeric columns instead of skipping them.
    corr = df.select_dtypes(include=['number', 'bool']).corr()

    # Strict upper triangle (k=1) = every unordered pair once, diagonal excluded.
    # Selecting by position rather than by value keeps genuinely perfectly
    # correlated pairs and distinct pairs that share the same correlation value.
    rows, cols = np.triu_indices(len(corr.columns), k=1)
    pairs = pd.Series(
        corr.values[rows, cols],
        index=pd.MultiIndex.from_arrays([corr.index[rows], corr.columns[cols]]),
    )

    # NaN compares False here, so undefined correlations fall out as well.
    return pairs[pairs.abs() > thresh]


In [65]:
# show the column pairs of train whose correlation is above 0.5 or below -0.5 (the default thresh)
get_corr_list(train)

LotFrontage   LotArea         0.678814
YearBuilt     YearRemodAdd    0.592855
              GarageCars      0.537850
              SalePrice       0.586570
YearRemodAdd  SalePrice       0.565608
BsmtFinSF1    TotalBsmtSF     0.522396
              BsmtFullBath    0.649212
TotalBsmtSF   X1stFlrSF       0.819530
              SalePrice       0.612134
X1stFlrSF     GrLivArea       0.566024
              SalePrice       0.596981
X2ndFlrSF     GrLivArea       0.687501
              HalfBath        0.609707
              BedroomAbvGr    0.502901
              TotRmsAbvGrd    0.616423
GrLivArea     FullBath        0.630012
              BedroomAbvGr    0.521270
              TotRmsAbvGrd    0.825489
              SalePrice       0.700927
FullBath      TotRmsAbvGrd    0.554784
              SalePrice       0.594771
BedroomAbvGr  TotRmsAbvGrd    0.676620
TotRmsAbvGrd  SalePrice       0.534422
GarageYrBlt   GarageCars     -0.566935
              GarageArea     -0.530063
GarageCars    GarageArea 

In [7]:
# number of columns correlated against 'LotFrontage' (one entry per row of the correlation matrix);
# numeric/boolean columns are selected explicitly because DataFrame.corr raises on
# non-numeric columns in pandas >= 2.0 instead of skipping them
len(train.select_dtypes(include=['number', 'bool']).corr().unstack()['LotFrontage'])

34

In [None]:
# which numerical variables will 