In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn import datasets
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.cluster import MeanShift
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from mpl_toolkits.mplot3d import Axes3D

## 1.1 Preprocesamiento de datos

In [None]:
# Load scikit-learn's bundled Iris dataset and keep only its feature matrix (`.data`).
iris = datasets.load_iris().data

#### 1.b Normalization

In [None]:
# L2-normalize the Iris matrix. preprocessing.normalize works per sample (row) by
# default, so each row is rescaled to unit Euclidean norm.
iris_normalized = preprocessing.normalize(iris, norm='l2')

In [None]:
# Sanity check: print the smallest and largest entries of the normalized matrix.
print("El minimo es %f - El máximo es %f " % (iris_normalized.min(), iris_normalized.max()))

#### 1.c Scaling

##### Escala la matriz en un rango [0,1]

In [None]:
# Learn the per-feature minimum and maximum from the Iris matrix, then apply the
# scaling as a separate step. MinMaxScaler is used with its default settings.
min_max_scaler = preprocessing.MinMaxScaler().fit(iris)
iris_train_minmax = min_max_scaler.transform(iris)
iris_train_minmax

##### Escala la matriz en un rango [-1,1]

In [None]:
# Learn each feature's maximum absolute value from the Iris matrix, then divide
# by it in a separate transform step.
max_abs_scaler = preprocessing.MaxAbsScaler().fit(iris)
iris_train_maxabs = max_abs_scaler.transform(iris)
# Report the overall value range of the scaled matrix before displaying it.
print("El minimo es %f - El máximo es %f " % (np.min(iris_train_maxabs), np.max(iris_train_maxabs)))
iris_train_maxabs

#### 2. PCA

In [None]:
# Read the Otto training table and discard its 'id' column in one expression
# (no in-place mutation of an intermediate frame).
data = pd.read_csv('data/data_otto/train.csv').drop('id', axis=1)
# Every column except the last goes into x; the last column goes into y, which
# later cells use as the per-row product class.
x = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

In [None]:
# For each candidate number of components, fit a PCA on x and report the share
# of the total variance that those components retain.
for c in [19, 28]:
    xp = PCA(n_components=c)
    # Only the fitted statistics are needed here, so fit() is enough; the
    # original fit_transform() computed a projection that was thrown away.
    xp.fit(x)
    # With exactly c components kept, the cumulative ratio at index c-1 is the
    # total explained-variance ratio.
    print(("La variabilidad del %f%% se explica con %d componentes") %
          ((xp.explained_variance_ratio_.cumsum()[c-1]*100), c))

In [None]:
% matplotlib inline

xp = PCA(n_components=2)
xp = x_pca.fit_transform(x)

product_class = np.unique(y)

colors = plt.get_cmap("hsv")

plt.figure(figsize=(10, 4))
for i, p in enumerate(product_class):
    mask = (y == p)
    plt.scatter(xp[mask, 0], xp[mask, 1], 
                c=colors(1. * i / 11), label=p, alpha=0.2)
    
plt.legend(loc="upper left")
plt.xlabel('PC 1')
plt.ylabel('PC 2')

## 1.3 Clustering

#### 3. Mean-Shift sobre el dataset IRIS

In [None]:
# Feature matrix of scikit-learn's bundled Iris dataset, used as clustering input.
X = datasets.load_iris().data

In [None]:
# Cluster X with Mean-Shift using the estimator's default settings.
ms = MeanShift().fit(X)
labels, cluster_centers = ms.labels_, ms.cluster_centers_

# The number of clusters found is the number of distinct labels assigned.
labels_unique = np.unique(labels)
n_clusters_ = labels_unique.size

print("Cantidad de clusters : %d" % n_clusters_)

In [None]:
% matplotlib inline

plt.figure(1)
plt.clf()

colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
    my_members = labels == k
    cluster_center = cluster_centers[k]
    plt.plot(X[my_members, 0], X[my_members, 1], col + '.')
    plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=14)
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()

#### 3. k-means sobre el dataset IRIS

In [None]:
% matplotlib inline

from sklearn import datasets

np.random.seed(5)

centers = [[1, 1], [-1, -1], [1, -1]]
iris = datasets.load_iris()
X = iris.data
y = iris.target

clustering = KMeans(n_clusters=3)
clustering.fit(X)
labels = clustering.labels_

fig = plt.figure(1)
plt.clf()
ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)

plt.cla()

ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=labels.astype(np.float))

ax.w_xaxis.set_ticklabels([])
ax.w_yaxis.set_ticklabels([])
ax.w_zaxis.set_ticklabels([])
ax.set_xlabel('Petal width')
ax.set_ylabel('Sepal length')
ax.set_zlabel('Petal length')

## 1.4 Clasificación

#### 3. Vinos

In [None]:
# Read the red-wine quality table (semicolon-separated).
data = pd.read_csv('data/winequality-red.csv', sep=";")
# Use two of the feature columns (positions 8 and 10 among all columns except
# the last) as the classifier input.
# NOTE(review): these are presumably pH and alcohol - confirm against the CSV header.
# `.values` replaces DataFrame.as_matrix(), which has been removed from pandas.
X = data[data.columns[:-1]].values[:, [8, 10]]
# Target: the "quality" column.
y = data["quality"].values

In [None]:
# Fit a linear-kernel SVM on the two selected wine features. The target is the
# lowercase `y` defined in the previous cell; the original referenced an
# undefined `Y`, which raised NameError.
clf = SVC(kernel='linear').fit(X, y)