In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Default size (width, height in inches) for every figure in this notebook.
plt.rcParams.update({'figure.figsize': (10, 8)})

In [None]:
from sklearn import datasets
from sklearn.decomposition import PCA

# Import des données

In [None]:
# Load the iris dataset and unpack the pieces used in the following cells.
iris = datasets.load_iris()

X, y = iris.data, iris.target
# `labels` refers to the same integer class codes as `y`; it is used later to
# colour the 3D scatter plot, after `y` has been rebound to the class names.
labels = y
target_names = iris.target_names

In [None]:
# Show the class names. The call form of print works on Python 2 and 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(target_names)

In [None]:
import pandas as pd

# Build a labelled DataFrame straight from the loaded dataset (not from the
# mutable X / y names), so that re-running this cell always gives the same frame.
feature_cols = ['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid']
df = pd.DataFrame(iris.data, columns=feature_cols)

# Translate the integer class codes into species names in a single vectorised
# step. This replaces chained-indexing assignment (df['class'][mask] = ...),
# which writes through a possibly temporary object and raises
# SettingWithCopyWarning.
code_to_name = {code: str(name) for code, name in enumerate(target_names)}
df['class'] = pd.Series(iris.target, index=df.index).map(code_to_name)

# From here on, X is the feature table and y the species name of each row.
X = df[feature_cols]
y = df['class']

# Preview the first rows (must be the last expression of the cell to be shown).
df.head()

In [None]:
# Drop rows in which every column is missing. The frame is built from
# load_iris() rather than read from a file, so this is expected to be a no-op
# and is kept only as a safeguard. Rebinding instead of inplace=True keeps the
# cell free of hidden in-place mutation.
df = df.dropna(how="all")

# Fixed seed so the displayed sample is the same on every run.
df.sample(6, random_state=0)

### Visualisation des données deux à deux

In [None]:
import numpy as np

# scatter_matrix moved to pandas.plotting; recent pandas releases no longer
# provide pandas.tools.plotting or the pd.scatter_matrix alias. Fall back to
# the old location only for very old pandas installations.
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas.tools.plotting import scatter_matrix

# Pairwise scatter plots of the feature columns held in X.
scatter_matrix(X, figsize=(10,10), s=100);

## Répartition en histogrammes

In [None]:
import math

# Species names keyed 1..3 (not referenced by the plotting code below).
label_dict = {1: 'setosa',
              2: 'versicolor',
              3: 'virginica'}

# Axis label for each feature column, keyed by column position in X.
feature_dict = {0: 'sepal length [cm]',
                1: 'sepal width [cm]',
                2: 'petal length [cm]',
                3: 'petal width [cm]'}

# Recent matplotlib releases ship the seaborn style sheets under a
# 'seaborn-v0_8-' prefix; use whichever name this installation provides.
style_name = 'seaborn-whitegrid'
if style_name not in plt.style.available:
    style_name = 'seaborn-v0_8-whitegrid'

with plt.style.context(style_name):
    plt.figure(figsize=(10, 8))
    # One subplot per feature, with the three species overlaid as
    # semi-transparent histograms.
    for cnt in range(4):
        plt.subplot(2, 2, cnt+1)
        for lab in ('setosa', 'versicolor', 'virginica'):
            plt.hist(X[y==lab].iloc[:,cnt],
                     label=lab,
                     bins=10,
                     alpha=0.7,)
        plt.xlabel(feature_dict[cnt])
    # Called once after the loop, so the legend is attached to the last subplot.
    plt.legend(loc='upper right', fancybox=True, fontsize=8)

    plt.tight_layout();

## Normalisation avant l'analyse en composantes principales

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features before PCA. Note that X is rebound here from the
# feature DataFrame to the array returned by the scaler.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Analyse en composantes principales

In [None]:
# Fit a PCA that keeps every component (fit returns the estimator itself),
# then project the data onto the principal axes.
pca = PCA().fit(X)
X_r = pca.transform(X)

### Variance des données expliquée par les composantes principales

In [None]:
# Share of the total variance explained by each principal component.
ratios = pca.explained_variance_ratio_
# Number the components from 1 so the x axis reads as "component 1, 2, ..."
# instead of the 0-based positions matplotlib uses when no x values are given.
component_numbers = list(range(1, len(ratios) + 1))

plt.figure(1, figsize=(10, 4))
plt.plot(component_numbers, ratios, 'o-', linewidth=2)
plt.xticks(component_numbers)
plt.axis('tight')
plt.xlabel('n_composants')
plt.ylabel('variance_expliquee');

On observe que la variance est très bien expliquée par 2 composantes.

In [None]:
# Display the explained-variance ratio of each principal component (the values plotted above).
pca.explained_variance_ratio_

## Visualisation pour n = 2

In [None]:
# Project onto the first two principal components. Fit on X (the standardised
# features) rather than the raw iris.data, so this projection is consistent
# with the normalisation step and the explained-variance analysis.
X_r = PCA(n_components=2).fit_transform(X)

In [None]:
# Scatter plot of the 2-component projection, one colour per species.
plt.figure(figsize=(10, 6))
for lab, col in zip(('setosa', 'versicolor', 'virginica'),
                    ('blue', 'green', 'red')):
    # Select the rows of this species with a positional boolean mask. Indexing
    # X_r with the Series' index labels only works while y has a 0..n-1 index,
    # and computed the same selection twice.
    mask = (y == lab).values
    plt.scatter(X_r[mask, 0],
                X_r[mask, 1],
                label=lab,
                c=col)
plt.xlabel('Comp. 1')
plt.ylabel('Comp. 2')
plt.legend(loc='lower center');

## Visualisation pour n = 3

In [None]:
# Importing Axes3D registers the '3d' projection on older matplotlib versions.
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure(1, figsize=(8, 6))
# Create the 3D axes through the figure: recent matplotlib no longer attaches
# an Axes3D(fig, ...) instance to the figure automatically, which leaves the
# plot empty.
ax = fig.add_subplot(111, projection='3d')
ax.view_init(elev=-150, azim=110)

# Fit on X (the standardised features) so this projection is consistent with
# the normalisation step and the explained-variance analysis.
X_reduced = PCA(n_components=3).fit_transform(X)
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], X_reduced[:, 2], c=labels,
           cmap=plt.cm.rainbow)

# Hide the tick labels. ax.xaxis / yaxis / zaxis replace the w_xaxis /
# w_yaxis / w_zaxis aliases, which recent matplotlib removed.
ax.set_xlabel("Vecteur propre 1")
ax.xaxis.set_ticklabels([])
ax.set_ylabel("Vecteur propre 2")
ax.yaxis.set_ticklabels([])
ax.set_zlabel("Vecteur propre 3")
ax.zaxis.set_ticklabels([]);

L'ajout de la 3ème composante principale apporte peu d'explication quant à la variance des données.