In [9]:
from datasets import wine_red_dataset
import pandas as pd
# Load the red-wine data and discard rows in which every column is missing.
# dropna() is chained rather than applied with inplace=True so that the object
# returned by the loader is never mutated; `wine_red` is simply bound to the
# cleaned frame.
wine_red = wine_red_dataset().dropna(how='all')

# Feature selection

When a model is trained with collinear features, the coefficients (weights) tend to depend on each other, which reduces the explainability of the model. The goal of this section is to find features that are highly collinear.

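The Variance Inflation Factor (VIF) measures how much the variance of an estimated regression coefficient is inflated because of collinearity with the other features: $\mathrm{VIF}_i = 1 / (1 - R_i^2)$, where $R_i^2$ is obtained by regressing feature $i$ on all remaining features. A VIF of 1 means no collinearity; values above roughly 5–10 are commonly treated as problematic.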

In [12]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
# Feature matrix: complete rows only, without the target ('quality') and the
# row identifier ('ID').
x = wine_red.dropna().drop(columns=['quality', 'ID'])

# variance_inflation_factor() regresses column i on exactly the other columns
# it is handed and does not add an intercept itself. The constant therefore has
# to be part of the design matrix; without it the auxiliary R^2 is uncentered
# and the VIFs of features with a non-zero mean are distorted.
design = add_constant(x)

# One VIF per real feature. The intercept column only makes the auxiliary
# regressions well specified and is not reported. Columns are looked up by name
# because add_constant() prepends 'const' (or adds nothing if a constant column
# already exists), which shifts the positions relative to `x`.
vif_factors = pd.Series(
    [variance_inflation_factor(design.values, design.columns.get_loc(name))
     for name in x.columns],
    index=x.columns,
)

display(vif_factors)

fixed acidity                2.314030
volatile acidity             1.741720
citric acid                  3.132193
residual sugar               1.125396
chlorides                    1.364200
flavanoids              139849.177083
free sulfur dioxide          1.930893
total sulfur dioxide         2.104990
density                      1.653583
pH                           1.004598
sulphates                    1.346137
magnesium                    1.004227
alcohol                     10.631516
lightness                    9.929243
dtype: float64

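=> alcohol and lightness both have a VIF of about 10, which indicates high collinearity. Note that flavanoids has by far the largest VIF in the table (about 140,000); this is not examined further here and should be investigated separately.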

In [20]:
# Pairwise Pearson correlation between the two features with a VIF around 10;
# the bare expression is the cell's displayed result.
wine_red['alcohol'].corr(other=wine_red['lightness'], method='pearson')

-0.9478388402404578

Alcohol and lightness have a strong negative correlation (r ≈ −0.95).

In [21]:
# Same VIF computation as before, but with 'lightness' removed from the
# feature matrix (in addition to the target 'quality' and the identifier 'ID').
x = wine_red.dropna().drop(columns=['quality', 'lightness', 'ID'])

# variance_inflation_factor() does not add an intercept itself, so the constant
# must be included in the design matrix; otherwise the auxiliary regressions
# use an uncentered R^2 and the VIFs are distorted for features with a
# non-zero mean.
design = add_constant(x)

# Report one VIF per real feature, looked up by name because the prepended
# 'const' column shifts positions relative to `x`.
vif_factors = pd.Series(
    [variance_inflation_factor(design.values, design.columns.get_loc(name))
     for name in x.columns],
    index=x.columns,
)

display(vif_factors)

fixed acidity                2.314014
volatile acidity             1.741515
citric acid                  3.130305
residual sugar               1.125355
chlorides                    1.361599
flavanoids              132957.699251
free sulfur dioxide          1.925508
total sulfur dioxide         2.095232
density                      1.648418
pH                           1.004597
sulphates                    1.341383
magnesium                    1.004135
alcohol                      1.679509
dtype: float64

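After removing lightness as a feature, the collinearity involving alcohol disappears: its VIF drops from about 10.6 to about 1.7. The VIF of flavanoids remains extremely high and is not explained by lightness.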

In [95]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_regression
# Keep only complete rows, then split them into the target and the predictors.
wine_red_nan = wine_red.dropna()
y = wine_red_nan['quality']
# Predictors: every column except the row identifier and the target.
x = wine_red_nan.drop(columns=['ID', 'quality'])

def get_feature_names_from_booleans(features, boolean_list):
    """Return the entries of ``features`` whose flag in ``boolean_list`` is truthy.

    Args:
        features: Indexable collection of feature names; it is indexed with the
            positions 0 .. len(boolean_list) - 1.
        boolean_list: Indexable, sized collection of flags, one per feature.

    Returns:
        A new list with ``features[i]`` for every ``i`` where
        ``boolean_list[i]`` is truthy, in ascending order of ``i``.
    """
    return [
        features[position]
        for position in range(len(boolean_list))
        if boolean_list[position]
    ]

kbest_features = []

# Filter-based feature selection using mutual information (information gain)
# between each feature and the target.
# mutual_info_regression adds a small amount of random noise to continuous
# features, so an unseeded call yields slightly different scores (and possibly
# a different ranking) on every run. Pinning random_state makes the cell
# reproducible under "Restart & Run All".
flt = SelectKBest(
    lambda features, target: mutual_info_regression(features, target, random_state=0),
    k=8,
).fit(x, y)
scores = flt.scores_

# One score per input column, ranked from most to least informative.
df = pd.DataFrame(scores, index=x.columns, columns=['score'])
best_features = df.sort_values(by=['score'], ascending=False)

display(best_features)

Unnamed: 0,score
alcohol,0.204938
lightness,0.124397
volatile acidity,0.115596
sulphates,0.086926
total sulfur dioxide,0.060673
fixed acidity,0.047819
citric acid,0.039384
density,0.034821
magnesium,0.02623
chlorides,0.022254


# Instance selection

# Correlating Features

# Feature reduction