## Feature Selection Techniques

In [1]:
import pandas as pd
import numpy as np

# view all rows
pd.set_option('display.max_columns', None)

### Removing Constant Features

##### *Informational*

The first feature selection technique involves indicating the constant features. Features that only have one value will not be important to our model so we will remove them from the data. This can be thought of as *informational* dimensionality reduction.

In [2]:
# bring in our data set
data = pd.read_csv('data/data_eda.csv')

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,city,country,state,zip,day,month,year,weekday
0,0,376000.0,3.0,2.0,1340,1384,3.0,0,0,3,1340,0,2008,0,Seattle,USA,WA,98103,9,5,2014,Friday
1,1,800000.0,4.0,3.25,3540,159430,2.0,0,0,3,3540,0,2007,0,Carnation,USA,WA,98014,9,5,2014,Friday
2,3,324000.0,3.0,2.25,998,904,2.0,0,0,3,798,200,2007,0,Seattle,USA,WA,98117,9,5,2014,Friday
3,4,549900.0,5.0,2.75,3060,7015,1.0,0,0,5,1600,1460,1979,0,Seattle,USA,WA,98146,10,5,2014,Saturday
4,5,320000.0,3.0,2.5,2130,6969,2.0,0,0,3,2130,0,2003,0,Maple Valley,USA,WA,98038,10,5,2014,Saturday


In [16]:
# these columns have only 1 value so they are constant and can be dropped from the data.
col = list(data.columns)

# create an empty list that will store constant features 
constant_features = []

for col in data[col]:
    unique = len(data[col].unique())
    if unique == 1:
        constant_features.append(col)
        print(col)

country
state
year


In [18]:
# remove those features
data.drop(columns = constant_features, axis = 1, inplace = True)

In [24]:
data.drop(columns = ['Unnamed: 0'], axis = 1, inplace = True)

In [25]:
data.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,city,zip,day,month,weekday
0,376000.0,3.0,2.0,1340,1384,3.0,0,0,3,1340,0,2008,0,Seattle,98103,9,5,Friday
1,800000.0,4.0,3.25,3540,159430,2.0,0,0,3,3540,0,2007,0,Carnation,98014,9,5,Friday
2,324000.0,3.0,2.25,998,904,2.0,0,0,3,798,200,2007,0,Seattle,98117,9,5,Friday
3,549900.0,5.0,2.75,3060,7015,1.0,0,0,5,1600,1460,1979,0,Seattle,98146,10,5,Saturday
4,320000.0,3.0,2.5,2130,6969,2.0,0,0,3,2130,0,2003,0,Maple Valley,98038,10,5,Saturday


##### *Redundancy*

For the next feature selection techniques, we will use a copy of the data because we will not always be dropping the same features. This section of feature selection techniques covers the *redundency* of variables.

## Pairwise Correlation

If two variables are highly correlated with eachother we can remove a variable without too much information loss. This is because these variables introduce redundancy to the model. If two variables are highly correlated with each other the variable with a higher correlation to the target variable will be kept. The tolerance I will be using is 0.65.

In [27]:
# correlation matrix
corr_matrix = data.select_dtypes(exclude='object').corr()
corr_matrix

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,zip,day,month
price,1.0,0.313803,0.487606,0.633164,0.097828,0.275771,0.068505,0.224112,0.051303,0.544639,0.232691,0.064273,-0.07437,-0.063574,-0.040728,0.037869
bedrooms,0.313803,1.0,0.520278,0.59663,0.062053,0.161719,-0.034603,0.060635,0.009234,0.465052,0.306269,0.162345,-0.07379,-0.153023,-0.010499,0.016215
bathrooms,0.487606,0.520278,1.0,0.728175,0.087904,0.500721,0.011755,0.121398,-0.143904,0.650862,0.223356,0.521959,-0.244874,-0.215463,-0.027841,0.026166
sqft_living,0.633164,0.59663,0.728175,1.0,0.200044,0.337449,0.027987,0.197852,-0.08563,0.857856,0.371716,0.334883,-0.141093,-0.218076,-0.024381,0.032693
sqft_lot,0.097828,0.062053,0.087904,0.200044,1.0,-0.002191,0.026333,0.081267,0.017548,0.19663,0.02916,0.038113,-0.013672,-0.125807,0.017221,-0.020198
floors,0.275771,0.161719,0.500721,0.337449,-0.002191,1.0,0.000287,-0.012413,-0.292564,0.5305,-0.309964,0.499238,-0.246453,-0.078966,-0.030881,0.029729
waterfront,0.068505,-0.034603,0.011755,0.027987,0.026333,0.000287,1.0,0.299774,0.026657,0.013238,0.029862,-0.042625,0.016161,0.021715,0.019098,-0.003893
view,0.224112,0.060635,0.121398,0.197852,0.081267,-0.012413,0.299774,1.0,0.047661,0.080661,0.234446,-0.068373,0.044841,0.112156,0.042816,-0.034697
condition,0.051303,0.009234,-0.143904,-0.08563,0.017548,-0.292564,0.026657,0.047661,1.0,-0.204903,0.205625,-0.402012,-0.185023,0.032565,-0.02675,0.014298
sqft_above,0.544639,0.465052,0.650862,0.857856,0.19663,0.5305,0.013238,0.080661,-0.204903,1.0,-0.15819,0.454603,-0.178274,-0.278393,-0.021329,0.041956


In [61]:
def pairwise(corr_matrix, threshold=0.65):
    # create an empty list that will be our rows
    results = []
    # price is the first column of our correlation matrix
    dependent_var = corr_matrix.columns[0]
    
    # loop through the columns
    for i in range(1, len(corr_matrix.columns)):
        # inner loop - within the range of columns...so we start on the first column and check each row before moving to next column
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: # i is column j is row
                results.append({
                    'first_variable': corr_matrix.columns[i],
                    'second_variable': corr_matrix.columns[j],
                    'first_variable_price_corr': corr_matrix.iloc[0, i],
                    'second_var_price_corr': corr_matrix.iloc[0, j]
                })
    
    return pd.DataFrame(results)



Based on our function we would remove bathrooms and sqft_above.

In [63]:
# view the variables
pairwise(corr_matrix)

Unnamed: 0,first_variable,second_variable,first_variable_price_corr,second_var_price_corr
0,sqft_living,bathrooms,0.633164,0.487606
1,sqft_above,bathrooms,0.544639,0.487606
2,sqft_above,sqft_living,0.544639,0.633164


Now, we will make a copy of the correlation matrix and create a function that will loop through and delete the columns that have a high correlation with another variable but less correlation to the target variable. We will also return the dropped columns to see if it matches the above function (drop sqft_above and bathrooms).

In [68]:
corrMatrix = corr_matrix.copy()

In [84]:
# function to remove highly correlated variables
def remove_highly_correlated(corr_matrix, threshold=0.65):
    # Get the dependent variable (first column)
    dependent_var = corr_matrix.columns[0]
    
    # Create a set to keep track of columns to drop - create set because there is no duplicates
    to_drop = set()
    
    # Iterate through the correlation matrix
    for i in range(1, len(corr_matrix.columns)): # start at column 1 so price is not included
        for j in range(i):                       # for each column in that row
            if abs(corr_matrix.iloc[i, j]) > threshold:
                col1 = corr_matrix.columns[i]
                col2 = corr_matrix.columns[j]
                
                # Compare correlations with the dependent variable
                corr_with_dep_col1 = abs(corr_matrix.iloc[0, i]) # correlation of column
                corr_with_dep_col2 = abs(corr_matrix.iloc[0, j]) # correlation of row
                
                if corr_with_dep_col1 < corr_with_dep_col2:
                    to_drop.add(col1)
                else:
                    to_drop.add(col2)
    
    # Drop the columns from the correlation matrix
    corr_matrix = corr_matrix.drop(columns=to_drop, index=to_drop)
    
    return corr_matrix, list(to_drop)

In [85]:
new_matrix, dropped_columns = remove_highly_correlated(corrMatrix)

In [86]:
dropped_columns

['bathrooms', 'sqft_above']

### Multicollinearity

In this technique we will use variance inflation factor. The VIF of 5 is the general threshold for high multicollinearity.

In [95]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools import add_constant

In [103]:
data_used = data.copy()
# drop price 
data_used = add_constant(data_used)
def calculate_vif(df):
    vif_data = pd.DataFrame()
    vif_data["feature"] = df.columns
    vif_data["VIF"] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
    return vif_data

vif = calculate_vif(data_used.select_dtypes(exclude = 'object')).round(2)
print(vif)

          feature         VIF
0           const  4459530.81
1           price        1.91
2        bedrooms        1.70
3       bathrooms        3.14
4     sqft_living         inf
5        sqft_lot        1.08
6          floors        2.04
7      waterfront        1.11
8            view        1.23
9       condition        1.47
10     sqft_above         inf
11  sqft_basement         inf
12       yr_built        2.31
13   yr_renovated        1.33
14            zip        1.26
15            day        1.42
16          month        1.42


  vif = 1. / (1. - r_squared_i)


In [111]:
from sklearn.linear_model import LinearRegression
data_used = data_used.select_dtypes(exclude='object')
# Function to calculate VIF
def calculate_vif(data, target_col):
    features = data.columns[data.columns != target_col]
    X = data[features]
    y = data[target_col]

    # Fit linear regression model
    lin_reg = LinearRegression().fit(X, y)

    # Calculate VIF
    try:
        vif = 1 / (1 - lin_reg.score(X, y))
    except ZeroDivisionError:
        vif = float('inf')

    return vif

# Calculate VIF for each predictor
vif_data = pd.DataFrame()
vif_data["Variable"] = data_used.columns
vif_data["VIF"] = [calculate_vif(data_used, col) for col in data_used.columns]

# Display the VIF values
print(vif_data)

         Variable       VIF
0           const       inf
1           price  1.910255
2        bedrooms  1.699271
3       bathrooms  3.138355
4     sqft_living       inf
5        sqft_lot  1.078554
6          floors  2.039632
7      waterfront  1.105409
8            view  1.231424
9       condition  1.471221
10     sqft_above       inf
11  sqft_basement       inf
12       yr_built  2.311166
13   yr_renovated  1.331054
14            zip  1.262113
15            day  1.421239
16          month  1.418868


In [109]:
data_used.columns

Index(['const', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
       'floors', 'waterfront', 'view', 'condition', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'city', 'zip', 'day',
       'month', 'weekday'],
      dtype='object')