# Advanced Machine Learning @ UDD
### Instructor: Visiting Professor Rossano Schifanella

## Representing Data and Engineering Features

In [None]:
import matplotlib.pyplot as plt

# Request 'retina' (high-resolution) figure output for this notebook.
# NOTE(review): newer IPython releases deprecate importing this helper from
# IPython.display -- confirm against the installed IPython version.
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('retina')

### Categorical Variables

#### One-Hot-Encoding (Dummy variables)

In [None]:
import pandas as pd
import numpy as np
import os

# adult.csv has no header row, so header=None is combined with an explicit
# list of column names.
_adult_columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'gender',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]
data = pd.read_csv("data/adult.csv", names=_adult_columns, header=None,
                   index_col=False)

# Only a handful of columns are kept to keep the illustration small.
_kept_columns = ['age', 'workclass', 'education', 'gender', 'hours-per-week',
                 'occupation', 'income']
data = data[_kept_columns]

data.head()

#### Checking string-encoded categorical data

In [None]:
# Frequency of each distinct value in the 'gender' column.
print(data["gender"].value_counts())

In [None]:
# Frequency of each distinct value in the 'workclass' column.
print(data["workclass"].value_counts())

In [None]:
# Drop the rows whose workclass is the "?" placeholder.
# Values are stripped before the comparison so that a leading blank left by a
# ", "-separated file cannot turn the filter into a silent no-op
# (NOTE(review): the raw UCI adult file is formatted that way -- confirm for
# data/adult.csv).
# .copy() makes the result an independent frame, so assigning columns to it
# later does not trigger pandas' SettingWithCopyWarning.
data = data[data["workclass"].str.strip() != "?"].copy()

In [None]:
# Re-check the 'workclass' frequencies after the filtering step.
print(data["workclass"].value_counts())

### Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the target column: each distinct income string becomes an integer.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['income'])

# Every column except the target is used as a feature.
features = list(data.columns)
features.remove('income')

to_binarize = ['workclass', 'education', 'occupation', 'gender']

# Encode the categorical features on a copy, with one encoder per column.
# - A dedicated encoder per column keeps each fitted mapping available
#   (e.g. for inverse_transform) instead of overwriting it on every refit,
#   and leaves `label_encoder` fitted on the target.
# - Working on a copy leaves the original strings in `data`, so later cells
#   that read the categorical columns still see the category names.
# NOTE(review): scikit-learn documents LabelEncoder as a target encoder;
# OrdinalEncoder is the feature-side equivalent -- consider switching.
feature_encoders = {}
encoded = data[features].copy()
for f in to_binarize:
    feature_encoders[f] = LabelEncoder()
    encoded[f] = feature_encoders[f].fit_transform(encoded[f])

X = encoded.values
X

### OneHotEncoding

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Use the encoder's default sparse output and densify it explicitly.
# The `sparse=False` constructor flag was renamed to `sparse_output` and later
# removed in newer scikit-learn releases, whereas `.toarray()` on the sparse
# result works across versions and yields the same dense array.
# NOTE(review): X also contains the numeric columns (age, hours-per-week);
# the encoder expands every column of X into one indicator per distinct
# value -- confirm that encoding the numeric columns is intended.
one_hot_encoder = OneHotEncoder(categories='auto')
X_encoded = one_hot_encoder.fit_transform(X).toarray()
X_encoded

It's sometimes easier to perform the one-hot encoding directly with pandas.

In [None]:
# Let pandas create the indicator columns for the listed categorical features.
_dummy_columns = ['gender', 'workclass', 'education', 'occupation']
X_encoded_pandas = pd.get_dummies(data[features], columns=_dummy_columns)
X_encoded_pandas.head()

In [None]:
# (rows, columns) of the frame produced by pd.get_dummies.
X_encoded_pandas.shape

### Binarization

In [None]:
# The 'hours-per-week' column as a NumPy array.
data['hours-per-week'].values


In [None]:
# The same values reshaped into a single column (one row per sample).
data['hours-per-week'].values.reshape(-1, 1)

In [None]:
from sklearn.preprocessing import Binarizer

# Binarize 'hours-per-week' around a threshold of 39.
# The column is reshaped once into the single-column 2-D layout and reused;
# the transform is computed a single time (the earlier duplicate call whose
# result was discarded has been removed).
hours = data['hours-per-week'].values.reshape(-1, 1)
binarizer = Binarizer(threshold=39)
binary_hours_per_week = binarizer.fit_transform(hours)

# Show the first rows next to their binarized value.
# Rows are paired by position: after rows have been filtered out of `data`,
# its index labels are no longer 0..n-1, so a label lookup such as
# data['hours-per-week'][i] can raise KeyError or pick a different row than
# binary_hours_per_week[i].
for raw_hours, flag in zip(hours[:10, 0], binary_hours_per_week[:10]):
    print(raw_hours, flag)

## Scaling numeric features

In [None]:
from sklearn.datasets import load_wine, load_breast_cancer, load_iris
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [None]:
# Load the wine data; swap in one of the commented loaders to try another set.
dataset = load_wine()
# dataset = load_breast_cancer()
# dataset = load_iris()

# Feature matrix and target vector.
X = dataset.data
y = dataset.target

print(dataset.DESCR)

In [None]:
import pandas as pd

# Wrap the feature matrix in a DataFrame labelled with the dataset's feature names.
df = pd.DataFrame(data=X, columns=dataset.feature_names)
df.head()

In [None]:
# (rows, columns) of the feature frame.
df.shape

In [None]:
def plot_corr_matrix(df):
    """Show the pairwise correlation matrix of *df* as a colour-coded grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. Only numeric (and boolean) columns take part in the
        correlation; other columns are ignored.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Correlate the numeric columns only and take the tick labels from the
    # resulting matrix. Labelling with all of df's columns would misalign
    # ticks and cells whenever a non-numeric column is present (and
    # DataFrame.corr raises on such columns in recent pandas).
    corr = df.select_dtypes(include=['number', 'bool']).corr()
    columns_names = corr.columns
    n_columns = len(columns_names)

    # Square figure that grows with the number of columns.
    side = max(8, n_columns / 3)
    fig = plt.figure(figsize=(side, side))
    ax = fig.add_subplot(111)
    cax = ax.matshow(corr, interpolation='nearest')
    fig.colorbar(cax)

    # Set ticks on the axes object directly rather than via pyplot state.
    tick_positions = np.arange(n_columns)
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(columns_names, rotation=30, ha='left')
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(columns_names)
    plt.tight_layout()
    plt.show()
    
def plot_coxbox(X, columns_names, title=None):
    """Draw one box plot per column of X on a single axes.

    X is handed to ``ax.boxplot`` as-is, ``columns_names`` supplies the
    x tick labels, and ``title`` (when truthy) becomes the plot title.
    """
    n_columns = len(columns_names)
    side = max(6, n_columns / 3)
    ax = plt.figure(figsize=(side, side)).add_subplot(111)
    ax.boxplot(X)

    # Box positions are 1-based, hence the ticks run from 1 to n_columns.
    tick_positions = np.arange(1, n_columns + 1)
    plt.xticks(tick_positions, columns_names, rotation=30, ha="right")
    if not title:
        return
    plt.title(title)


In [None]:
# Correlation matrix of the features in df.
plot_corr_matrix(df)

In [None]:
# Box plot of every feature in X, labelled with the dataset's feature names.
plot_coxbox(X, dataset.feature_names)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

# Baseline: k-nearest neighbours (k=10) on the features as they are.
model = KNeighborsClassifier(n_neighbors=10)
model.fit(X, y)

# Mean accuracy over 10 cross-validation folds.
fold_scores = cross_val_score(model, X, y, cv=10)
print('average accuracy: %f' % np.average(fold_scores))

### Standardization

The result of **standardization** (or **Z-score normalization**) is that the features will be rescaled so that they'll have the properties of a standard normal distribution with $\mu = 0$ and $\sigma = 1$

where $\mu$ is the mean (average) and $\sigma$ is the standard deviation from the mean; standard scores (also called ***z*** scores) of the samples are calculated as follows:

\begin{equation} z = \frac{x - \mu}{\sigma}\end{equation} 

Standardizing the features so that they are centered around 0 with a standard deviation of 1 is not only important if we are comparing measurements that have different units, but it is also a general requirement for many machine learning algorithms.

### MinMax Scaling

An alternative approach to Z-score normalization (or standardization) is the so-called **Min-Max scaling** (often also simply called "normalization" - a common cause for ambiguities).  
In this approach, the data is scaled to a fixed range - usually 0 to 1.  
The cost of having this bounded range - in contrast to standardization - is that we will end up with smaller standard deviations, which can suppress the effect of outliers.

A Min-Max scaling is typically done via the following equation:

\begin{equation} X_{norm} = \frac{X - X_{min}}{X_{max}-X_{min}} \end{equation}

In [None]:
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

In [None]:
from sklearn.pipeline import make_pipeline

scale_methods = [StandardScaler(), RobustScaler(), MinMaxScaler()]

models = [DecisionTreeClassifier(random_state=43),
          KNeighborsClassifier()]

for model in models:
    print(type(model).__name__)
    # Reference: the same model cross-validated on the unscaled features.
    reference_score = np.average(cross_val_score(model, X, y, cv=5))

    for scaler in scale_methods:
        # Wrap scaler + model in a pipeline so that cross_val_score fits the
        # scaler on the training part of each fold only. Fitting the scaler
        # on all of X beforehand would leak statistics of the held-out
        # samples into training and bias the reported score.
        scaled_model = make_pipeline(scaler, model)
        scaled_score = np.average(cross_val_score(scaled_model, X, y, cv=5))
        print('%s=> %f (%f)' % (type(scaler).__name__,
                               scaled_score,
                               reference_score))
    print()

## Another visualization of the effect of scaling with synthetic data

In [None]:
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, Normalizer

# Two synthetic clusters, shifted by +3 on both axes.
X, y = make_blobs(n_samples=50, centers=2, random_state=4, cluster_std=1)
X += 3

plt.figure(figsize=(15, 8))
grid_shape = (2, 4)

# Left half of the 2x4 grid: the data before any scaling.
main_ax = plt.subplot2grid(grid_shape, (0, 0), rowspan=2, colspan=2)
main_ax.scatter(X[:, 0], X[:, 1], c=y, cmap="Accent")

# Largest absolute coordinate per axis, used to set the plot limits.
maxx, maxy = np.abs(X).max(axis=0)
main_ax.set_xlim(-maxx + 1, maxx + 1)
main_ax.set_ylim(-maxy + 1, maxy + 1)
main_ax.set_title("Original Data")

# Right half: four small panels, created column by column.
other_axes = []
for col in range(2, 4):
    for row in range(2):
        other_axes.append(plt.subplot2grid(grid_shape, (row, col)))

# One panel per scaler, all drawn with the same fixed limits for comparison.
scalers = [StandardScaler(), RobustScaler(),
           MinMaxScaler(), Normalizer(norm='l2')]
for ax, scaler in zip(other_axes, scalers):
    X_ = scaler.fit_transform(X)
    ax.scatter(X_[:, 0], X_[:, 1], c=y, cmap="Accent")
    ax.set_xlim(-2, 2)
    ax.set_ylim(-2, 2)
    ax.set_title(type(scaler).__name__)

other_axes.append(main_ax)

# Centre the left/bottom spines and hide the right/top ones in every panel.
for ax in other_axes:
    for side in ('left', 'bottom'):
        ax.spines[side].set_position('center')
    for side in ('right', 'top'):
        ax.spines[side].set_color('none')
    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('left')


http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html

For instance, many elements used in the objective function of a learning algorithm (such as the RBF kernel of Support Vector Machines or the L1 and L2 regularizers of linear models) assume that all features are centered around 0 and have variance of the same order. If a feature has a variance that is orders of magnitude larger than others, it might dominate the objective function and make the estimator unable to learn from other features correctly as expected.



Some examples of algorithms where feature scaling matters are:

- k-nearest neighbors with a Euclidean distance measure, if you want all features to contribute equally
- k-means (see k-nearest neighbors)
- logistic regression, SVMs, perceptrons, neural networks etc. if you are using gradient descent/ascent-based optimization, otherwise some weights will update much faster than others
- linear discriminant analysis, principal component analysis, kernel principal component analysis, since you want to find the directions that maximize the variance (under the constraint that those directions/eigenvectors/principal components are orthogonal); you want to have features on the same scale, since otherwise you’d emphasize variables on “larger measurement scales” more. There are many more cases than I can possibly list here … I always recommend that you think about the algorithm and what it’s doing, and then it typically becomes obvious whether you want to scale your features or not.


# Importance of Feature Scaling


Feature scaling through standardization (or Z-score normalization)
can be an important preprocessing step for many machine learning
algorithms. Standardization involves rescaling the features such
that they have the properties of a standard normal distribution
with a mean of zero and a standard deviation of one.

While many algorithms (such as SVM, K-nearest neighbors, and logistic
regression) require features to be normalized, intuitively we can
think of Principal Component Analysis (PCA) as being a prime example
of when normalization is important. In PCA we are interested in the
components that maximize the variance. If one component (e.g. human
height) varies less than another (e.g. weight) because of their
respective scales (meters vs. kilos), PCA might determine that the
direction of maximal variance more closely corresponds with the
'weight' axis, if those features are not scaled. As a change in
height of one meter can be considered much more important than the
change in weight of one kilogram, this is clearly incorrect.

To illustrate this, PCA is performed comparing the use of data with
`StandardScaler` (`sklearn.preprocessing.StandardScaler`) applied,
to unscaled data. The results are visualized and a clear difference noted.
The 1st principal component in the unscaled set can be seen. It can be seen
that feature #13 dominates the direction, being a whole two orders of
magnitude above the other features. This is contrasted when observing
the principal component for the scaled version of the data. In the scaled
version, the orders of magnitude are roughly the same across all the features.

The dataset used is the Wine Dataset available at UCI. This dataset
has continuous features that are heterogeneous in scale due to differing
properties that they measure (e.g. alcohol content and malic acid).

The transformed data is then used to train a classifier (in the code below,
Gaussian naive Bayes on the unscaled data and a random forest on the
standardized data), and a clear difference in prediction accuracies is
observed wherein the dataset which is scaled before PCA vastly outperforms
the unscaled version.




In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.datasets import load_wine
from sklearn.pipeline import make_pipeline

RANDOM_STATE = 42
FIG_SIZE = (10, 7)


features, target = load_wine(return_X_y=True)

# Make a train/test split using 30% test size
X_train, X_test, y_train, y_test = train_test_split(features, target,
                                                    test_size=0.30,
                                                    random_state=RANDOM_STATE)

# Unscaled pipeline: PCA on the raw features, then Gaussian naive Bayes.
unscaled_clf = make_pipeline(PCA(n_components=2), GaussianNB())
unscaled_clf.fit(X_train, y_train)
pred_test = unscaled_clf.predict(X_test)

# Scaled pipeline: standardization, PCA, then a random forest.
# random_state is passed so the forest is reproducible like the split above.
# NOTE(review): the two pipelines differ in the classifier as well as in the
# scaling step, so the accuracy gap cannot be attributed to scaling alone --
# confirm this is intended.
std_clf = make_pipeline(StandardScaler(),
                        PCA(n_components=2),
                        RandomForestClassifier(n_estimators=100,
                                               random_state=RANDOM_STATE))
std_clf.fit(X_train, y_train)
pred_test_std = std_clf.predict(X_test)

# Show prediction accuracies in scaled and unscaled data.
print('\nPrediction accuracy for the normal test dataset with PCA')
print('{:.2%}\n'.format(metrics.accuracy_score(y_test, pred_test)))

print('\nPrediction accuracy for the standardized test dataset with PCA')
print('{:.2%}\n'.format(metrics.accuracy_score(y_test, pred_test_std)))

# Extract the fitted PCA step from each pipeline
pca = unscaled_clf.named_steps['pca']
pca_std = std_clf.named_steps['pca']

# Show first principal components
print('\nPC 1 without scaling:\n', pca.components_[0])
print('\nPC 1 with scaling:\n', pca_std.components_[0])

# Project the training data with each fitted PCA for visualization.
# The unscaled panel must show the PCA projection as well: plotting the first
# two raw columns of X_train would contradict the panel title and the
# "principal component" axis labels.
scaler = std_clf.named_steps['standardscaler']
X_train_pca = pca.transform(X_train)
X_train_std = pca_std.transform(scaler.transform(X_train))

# visualize standardized vs. untouched dataset with PCA performed
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=FIG_SIZE)

# One colour/marker per class, identical in both panels.
class_styles = list(zip(range(0, 3), ('blue', 'red', 'green'), ('^', 's', 'o')))

for ax, projected in ((ax1, X_train_pca), (ax2, X_train_std)):
    for label, colour, marker in class_styles:
        in_class = y_train == label
        ax.scatter(projected[in_class, 0], projected[in_class, 1],
                   color=colour,
                   label='class %s' % label,
                   alpha=0.5,
                   marker=marker)

ax1.set_title('Training dataset after PCA')
ax2.set_title('Standardized training dataset after PCA')

for ax in (ax1, ax2):
    ax.set_xlabel('1st principal component')
    ax.set_ylabel('2nd principal component')
    ax.legend(loc='upper right')
    ax.grid()

plt.tight_layout()

plt.show()

## Add polynomial features

Often it's useful to add complexity to the model by considering nonlinear features of the input data. A simple and common method is to use polynomial features, which add higher-order and interaction terms of the input features. It is implemented in the class ***<mark>PolynomialFeatures</mark>***.

In [None]:
from sklearn.preprocessing import PolynomialFeatures

np.random.seed(43)  # fixed seed so the noisy sample is reproducible

# 60 angles from 60 to 355 degrees in steps of 5, converted to radians and
# stored as a single column.
X = (np.arange(60, 360, 5) * np.pi / 180).reshape(-1, 1)

# Sine of each angle plus Gaussian noise (mean 0, standard deviation 0.15).
y = np.sin(X[:, 0]) + np.random.normal(0, 0.15, len(X))

Plot the dataset

In [None]:
# Scatter plot of the noisy sample: grey markers with a black outline.
fig, ax = plt.subplots()
ax.scatter(X[:, 0], y, c="gray", edgecolors=(0, 0, 0))
ax.set(xlabel='X', ylabel='y')
plt.show()

Add polynomial features

In [None]:
# Expand the single input column into polynomial terms up to degree 5,
# fitting and transforming in one call.
polynomial_features = PolynomialFeatures(degree=5)
X_polynomial = polynomial_features.fit_transform(X)

In [None]:
# Names of the generated polynomial features.
# `get_feature_names` was replaced by `get_feature_names_out` in newer
# scikit-learn releases (and later removed), so prefer the new accessor and
# fall back to the old one only when it is the sole option.
if hasattr(polynomial_features, "get_feature_names_out"):
    feature_names = polynomial_features.get_feature_names_out()
else:
    feature_names = polynomial_features.get_feature_names()
feature_names

In [None]:
# (rows, columns) of the expanded polynomial feature matrix.
X_polynomial.shape


Fit a linear regression with the new polynomial features

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear model on the polynomial features; fit() returns the fitted
# estimator itself, which is what `linear_regression` refers to.
_regressor = LinearRegression()
linear_regression = _regressor.fit(X_polynomial, y)

In [None]:
# Evenly spaced grid over the observed X range (last grid point dropped),
# shaped as a single column, on which the fitted curve is evaluated.
grid = np.linspace(X.min(), X.max(), 1000)
line = grid[:-1].reshape(-1, 1)
line_poly = polynomial_features.transform(line)
fitted = linear_regression.predict(line_poly)

# Data points first, then the fitted curve on top.
plt.plot(X[:, 0], y, 'o')
plt.plot(line, fitted, label='polynomial linear regression')
plt.legend(loc="best")

Exercise: play with the degree of the polynomial function and see the effect in the fit. Could you hypothesize the presence of overfitting?   