In [2]:
# Load the wine dataset

import pandas as pd
import numpy as np

# header=None: the first line of the file is treated as data, so pandas
# assigns integer column labels until names are set explicitly.
df_wine = pd.read_csv('data/wine.data', header=None)


In [3]:
# Name the 14 positional columns: the class label first, followed by the
# 13 measured features.
df_wine.columns = [
    'Class label',
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]

In [4]:
# List the distinct values found in the 'Class label' column.
print('Class labels: ', np.unique(df_wine['Class label']))

Class labels:  [1 2 3]


In [5]:
# Preview the first five rows to check that the column names line up with the data.
df_wine.head()

Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


### Use scikit-learn's built-in `train_test_split` function

In [6]:
from sklearn.model_selection import train_test_split

# Column 0 holds the class label; every remaining column is a feature.
X = df_wine.iloc[:, 1:].values
y = df_wine.iloc[:, 0].values

# Hold out 30% of the rows for testing. stratify=y keeps the class
# proportions the same in both splits; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
    stratify=y,
)

In [7]:
# Min-max normalization: the scaler is fitted on the training split only, and
# the same fitted transform is then applied to the test split.
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
X_train_norm = mms.fit_transform(X_train)
X_test_norm = mms.transform(X_test)

In [8]:
# Manual standardization: subtract the mean, then divide by the standard deviation.
# ndarray.std() defaults to ddof=0, i.e. the population standard deviation.
ex = np.array([0, 1, 2, 3, 4, 5])
print('standardized:', (ex - ex.mean()) / ex.std())

standardized: [-1.46385011 -0.87831007 -0.29277002  0.29277002  0.87831007  1.46385011]


In [9]:
# Manual min-max normalization of the example array `ex`: rescale so the
# minimum maps to 0 and the maximum maps to 1.
print('normalized:', (ex - ex.min()) / (ex.max() - ex.min()))

normalized: [0.  0.2 0.4 0.6 0.8 1. ]


In [10]:
# Standardization with scikit-learn: the scaler is fitted on the training split
# only, and the same fitted transform is then applied to the test split.
from sklearn.preprocessing import StandardScaler

stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.transform(X_test)

### L1 Regularization

In [11]:
from sklearn.linear_model import LogisticRegression

# L1-penalized logistic regression, trained one-vs-rest across the classes.
# NOTE(review): newer scikit-learn releases may deprecate `multi_class` --
# confirm against the installed version.
lr = LogisticRegression(
    penalty='l1',
    C=1.0,
    solver='liblinear',
    multi_class='ovr',
)

In [12]:
# Fit on the standardized training features.
lr.fit(X_train_std, y_train)

In [13]:
# Score on the same standardized data the model was fitted on.
print("train acc: ", lr.score(X_train_std, y_train))

train acc:  1.0


In [14]:
# Score on the held-out test split (standardized with the training-set scaler).
print("test acc: ", lr.score(X_test_std, y_test))

test acc:  1.0


In [15]:
# One intercept per one-vs-rest binary classifier, i.e. one per class.
lr.intercept_

array([-1.2634772 , -1.21563689, -2.37059871])

In [16]:
# One row of weights per class and one column per feature. The L1 penalty
# drives some weights to exactly zero, as the output below shows.
lr.coef_

array([[ 1.24547755,  0.18022768,  0.74626207, -1.16362333,  0.        ,
         0.        ,  1.15959926,  0.        ,  0.        ,  0.        ,
         0.        ,  0.55671664,  2.50937258],
       [-1.53938848, -0.38555583, -0.99587172,  0.36389665, -0.0587057 ,
         0.        ,  0.66734757,  0.        ,  0.        , -1.93162241,
         1.23771069,  0.        , -2.23318612],
       [ 0.13567769,  0.16857695,  0.35719501,  0.        ,  0.        ,
         0.        , -2.43784637,  0.        ,  0.        ,  1.56347859,
        -0.81830596, -0.49281373,  0.        ]])

### Implement Sequential Backward Selection (SBS)

In [17]:
from sklearn.base import clone
from itertools import combinations
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

class SBS:
    """Sequential Backward Selection: greedy backward feature elimination.

    Starting from all features, ``fit`` repeatedly drops the single feature
    whose removal gives the highest score on an internal hold-out split, until
    only ``k_features`` remain.

    Parameters
    ----------
    estimator : object
        Estimator providing ``fit(X, y)`` and ``predict(X)``. It is cloned, so
        the object passed in is not fitted by this class.
    k_features : int
        Number of features to keep. Must be at least 1. If it is not smaller
        than the number of columns in ``X``, no feature is eliminated.
    scoring : callable, default=accuracy_score
        Called as ``scoring(y_true, y_pred)``; a higher value is treated as
        better.
    test_size : default=0.25
        Forwarded unchanged to ``train_test_split`` for the internal split.
    random_state : default=1
        Forwarded unchanged to ``train_test_split`` for the internal split.

    Attributes set by ``fit``
    -------------------------
    indices_ : tuple of int
        Column indices of the finally selected features.
    subsets_ : list of tuple of int
        Selected indices after each step, starting with the full feature set.
    scores_ : list
        Hold-out score of each entry of ``subsets_``, in the same order.
    k_score_ : score of the final subset (the last entry of ``scores_``).
    """

    def __init__(self, estimator, k_features,
                 scoring=accuracy_score, test_size=0.25, random_state=1):
        self.scoring = scoring
        # Work on a copy so the caller's estimator object is left untouched.
        self.estimator = clone(estimator)
        self.k_features = k_features
        self.test_size = test_size
        self.random_state = random_state

    def fit(self, X, y):
        """Run the backward elimination on ``X`` / ``y`` and return ``self``.

        ``X`` is split once into an internal training part and a hold-out part;
        every candidate subset is fitted on the former and scored on the latter.

        Raises
        ------
        ValueError
            If ``k_features`` is smaller than 1.
        """
        # Fail fast: without this check the loop would continue down to an
        # empty feature subset and only then fail inside the estimator, after
        # all the elimination rounds had already been computed.
        if self.k_features < 1:
            raise ValueError(
                'k_features must be at least 1, got {!r}'.format(self.k_features)
            )

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state)

        dim = X_train.shape[1]  # current number of selected features
        self.indices_ = tuple(range(dim))
        self.subsets_ = [self.indices_]
        # Score of the full feature set is the first entry of the history.
        self.scores_ = [
            self._calc_score(X_train, y_train, X_test, y_test, self.indices_)
        ]

        # Greedy elimination: each round removes exactly one feature.
        while dim > self.k_features:
            # Every way of dropping one feature from the current selection.
            candidates = list(combinations(self.indices_, r=dim - 1))
            scores = [
                self._calc_score(X_train, y_train, X_test, y_test, subset)
                for subset in candidates
            ]

            # np.argmax returns the first maximum, so ties go to the earliest
            # candidate in combinations() order.
            best = np.argmax(scores)
            self.indices_ = candidates[best]
            self.subsets_.append(self.indices_)
            self.scores_.append(scores[best])
            dim -= 1

        self.k_score_ = self.scores_[-1]

        return self

    def transform(self, X):
        """Return the columns of ``X`` selected by the most recent ``fit``."""
        return X[:, self.indices_]

    def _calc_score(self, X_train, y_train, X_test, y_test, indices):
        """Fit the estimator on the given columns and score it on the hold-out part.

        The same cloned estimator is refitted on every call.
        """
        self.estimator.fit(X_train[:, indices], y_train)
        y_pred = self.estimator.predict(X_test[:, indices])
        return self.scoring(y_test, y_pred)

In [18]:
# In action! Run SBS with a 5-nearest-neighbours classifier, eliminating one
# feature per round until a single feature is left (k_features=1).
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)
sbs = SBS(knn, k_features=1)
sbs.fit(X_train_std, y_train)


<__main__.SBS at 0x13c857100>

In [19]:
# Feature-index subsets kept after each elimination round, from all 13 down to 1.
sbs.subsets_

[(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12),
 (0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12),
 (0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11),
 (0, 1, 2, 3, 4, 5, 6, 7, 9, 11),
 (0, 1, 2, 3, 4, 5, 7, 9, 11),
 (0, 1, 2, 3, 5, 7, 9, 11),
 (0, 1, 2, 3, 5, 7, 11),
 (0, 1, 2, 3, 5, 11),
 (0, 1, 2, 3, 11),
 (0, 1, 2, 11),
 (0, 1, 11),
 (0, 11),
 (0,)]

In [20]:
# subsets_[i] holds 13 - i features, so the 3-feature subset is at index 13 - 3 = 10.
# NOTE(review): k=3 is described as best-performing, but sbs.scores_ is not
# displayed here to confirm it.
sbs.subsets_[10] # want k=3 (best performance) => 13 - 3 = 10

(0, 1, 11)

In [29]:
# The SBS indices refer to columns of X, which excludes the label column,
# so the names are looked up in df_wine.columns[1:].
k3 = list(sbs.subsets_[10])
print(df_wine.columns[1:][k3])

Index(['Alcohol', 'Malic acid', 'OD280/OD315 of diluted wines'], dtype='object')


In [30]:
# Baseline: evaluate KNN on the full feature set (all 13 standardized features).
# SBS fitted a clone of the estimator, so `knn` itself has to be fitted here.
knn.fit(X_train_std, y_train)
print('Training accuracy', knn.score(X_train_std, y_train))
print('Test accuracy: ', knn.score(X_test_std, y_test))

Training accuracy 0.967741935483871
Test accuracy:  0.9629629629629629


In [31]:
# Refit KNN on only the three SBS-selected features, for comparison with the
# full-feature baseline.
knn.fit(X_train_std[:, k3], y_train)
print('Training accuracy', knn.score(X_train_std[:, k3], y_train))
print('Test accuracy: ', knn.score(X_test_std[:, k3], y_test))


Training accuracy 0.9516129032258065
Test accuracy:  0.9259259259259259
