# Compressing data via dimensionality reduction

## Plotting function for decision boundaries, test and training samples

In [83]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import ListedColormap

def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    """Draw a classifier's decision regions and the samples on the current figure.

    Parameters
    ----------
    X : array-like
        Sample matrix; only columns 0 and 1 are used as the plot axes.
    y : array-like
        Class label for each row of ``X``. At most five distinct labels are
        supported, because one marker/colour is taken per class from
        five-element tuples.
    classifier : object
        Anything exposing ``predict``; it is called on a two-column grid of
        points covering the plotted area.
    test_idx : index sequence, optional
        Row indices of ``X`` to highlight as test samples. ``None`` (the
        default) disables the highlight.
    resolution : float, optional
        Step size of the grid on which the classifier is evaluated.

    The function only draws; it does not call ``plt.show()``.
    """
    # Set up marker generator and colour map (one entry per class).
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    classes = np.unique(y)
    cmap = ListedColormap(colors[:len(classes)])

    # Plot area: feature range padded by one unit on every side.
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))

    # Predict a label for every grid point, then restore the grid shape so
    # the predictions can be drawn as filled contours.
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)

    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(x1_min, x1_max)
    plt.ylim(x2_min, x2_max)

    # Plot all samples, one scatter call per class.
    for idx, cl in enumerate(classes):
        # `color=` (not `c=`) so a single colour is never mistaken for a
        # sequence of values to be colour-mapped.
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1],
                    alpha=0.8, color=colors[idx], marker=markers[idx], label=cl)

    # Highlight test samples. Compare against None explicitly: truth-testing
    # a NumPy index array raises ValueError.
    if test_idx is not None:
        X_test = X[test_idx, :]
        # Hollow circles: an empty colour string is not a valid colour, so
        # request "no face" explicitly and draw only the outline.
        plt.scatter(X_test[:, 0], X_test[:, 1], alpha=1.0,
                    facecolors='none', edgecolors='black',
                    marker='o', label='test sample')

## Getting the wine data set and standardizing it

In [84]:
class standardizedData():
    """Load a CSV data set, split it into train/test parts and standardize it.

    The CSV is read with pandas' default settings; column 0 is taken as the
    class label and all remaining columns as features.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    test_size : float or int
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int or None, optional
        Forwarded to ``train_test_split``; pass an integer to make the split
        reproducible. ``None`` (default) keeps the split random.
    """

    def __init__(self, path, test_size, random_state=None):
        self.path = path
        self.test_size = test_size
        self.random_state = random_state

    def getData(self):
        """Read, split and standardize the data; return ``self``.

        After the call the instance exposes ``X_train_std``, ``X_test_std``,
        ``y_train`` and ``y_test``.
        """
        import pandas as pd
        from sklearn.preprocessing import StandardScaler
        try:
            from sklearn.model_selection import train_test_split
        except ImportError:
            # Older scikit-learn releases only ship the legacy module.
            from sklearn.cross_validation import train_test_split

        df_wine = pd.read_csv(self.path)
        X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state)

        # Fit the scaler on the training split only and reuse its statistics
        # for the test split.
        std = StandardScaler()
        self.X_train_std = std.fit_transform(X_train)
        self.X_test_std = std.transform(X_test)
        self.y_train = y_train
        self.y_test = y_test
        return self

## Unsupervised dimensionality reduction: Principal component analysis (PCA)

## Computing the covariance matrix

In [85]:
import numpy as np
# np.cov treats each row as one variable, so the data is transposed before
# the call (assumes X_train_std is laid out as samples x features -- confirm).
cov_mat = np.cov(X_train_std.T)
# A covariance matrix is symmetric, so use the symmetric eigensolver: it
# always yields real eigenvalues, whereas the general-purpose `eig` may
# return complex ones. Note that `eigh` returns eigenvalues in ascending order.
eig_values, eig_vectors = np.linalg.eigh(cov_mat)

## Plotting the explained variances

In [86]:
import matplotlib.pyplot as plt
# Share of the total variance carried by each eigenvalue, largest first.
total = sum(eig_values)
explained_var = [value / total for value in sorted(eig_values, reverse=True)]

# One bar per principal component, numbered from 1.
plt.bar(
    np.arange(1, len(explained_var) + 1),
    explained_var,
    alpha=0.5,
    align='center',
    label='individual explained variance',
)
plt.xlabel('Principal components')
plt.ylabel('Explained variance ratio')
plt.legend(loc='best')
plt.show()

## Feature transformation

In [87]:
# Pair every eigenvalue magnitude with its eigenvector (a column of eig_vectors).
eigen_pairs = [(np.abs(value), eig_vectors[:, i])
               for i, value in enumerate(eig_values)]
# Sort on the eigenvalue only: comparing whole tuples would fall through to
# comparing the eigenvector arrays when two eigenvalues are equal, which
# raises ValueError.
eigen_pairs.sort(key=lambda pair: pair[0], reverse=True)
# Projection matrix: the two leading eigenvectors side by side as columns.
W = np.hstack((eigen_pairs[0][1][:, np.newaxis], eigen_pairs[1][1][:, np.newaxis]))
# Project the standardized training data onto the two-dimensional subspace.
X_train_pca = np.dot(X_train_std, W)

## PCA in sklearn

In [88]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

# Load the wine data, hold out 30% for testing and standardize both splits.
data = standardizedData(
    path='/home/shantanu/PycharmProjects/PythonMLBook/Chapter4/wine.csv',
    test_size=0.3,
).getData()
X_train_std, X_test_std = data.X_train_std, data.X_test_std
y_train, y_test = data.y_train, data.y_test

# Fit the two-component PCA on the training split, then apply the same
# projection to the test split.
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

# Train logistic regression on the projected training data and show the
# regions it assigns to each class.
lr = LogisticRegression()
lr.fit(X_train_pca, y_train)

plot_decision_regions(X_train_pca, y_train, classifier=lr)
plt.show()

## Supervised data compression: Linear discriminant analysis