In [None]:
from itertools import cycle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D
from sklearn import datasets
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.svm import SVC

# `sklearn.cross_validation` was removed in scikit-learn 0.20; prefer the
# current module and fall back to the legacy one only on old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

## 1.1 Preprocesamiento de datos

In [None]:
# Keep only the feature matrix of the Iris dataset for the preprocessing demos.
iris = datasets.load_iris()["data"]

#### 1.b Normalization

In [None]:
# Rescale every sample (row) so that it has unit L2 norm.
iris_normalized = preprocessing.normalize(iris, norm='l2', axis=1)

In [None]:
# Report the smallest and largest entries of the normalized matrix.
print("El minimo es %f - El máximo es %f " % (iris_normalized.min(), iris_normalized.max()))

#### 1.c Scaling

##### Escala la matriz en un rango [0,1]

In [None]:
# Min-max scaling: fit the per-feature minimum/maximum on the Iris matrix and
# then map each feature onto the (default) [0, 1] interval.
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
min_max_scaler.fit(iris)
iris_train_minmax = min_max_scaler.transform(iris)
iris_train_minmax

##### Escala la matriz en un rango [-1,1]

In [None]:
# Max-abs scaling: divide each feature by its largest absolute value.
max_abs_scaler = preprocessing.MaxAbsScaler()
max_abs_scaler.fit(iris)
iris_train_maxabs = max_abs_scaler.transform(iris)

# Show the overall range of the scaled matrix, then the matrix itself.
print("El minimo es %f - El máximo es %f " % (iris_train_maxabs.min(), iris_train_maxabs.max()))
iris_train_maxabs

#### 2. PCA

In [None]:
# Load the Otto training set and discard the row identifier column.
data = pd.read_csv('data/data_otto/train.csv').drop('id', axis=1)

# Every column except the last is a feature; the last column is the label.
x = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

In [None]:
# For each candidate number of components, fit a PCA on the Otto features and
# print the cumulative share of variance the retained components explain.
for c in (19, 28):
    xp = PCA(n_components=c).fit(x)
    # The fitted model keeps exactly `c` ratios, so the last cumulative value
    # is the total explained by all retained components.
    explained_pct = xp.explained_variance_ratio_.cumsum()[-1] * 100
    print("La variabilidad del %f%% se explica con %d componentes" % (explained_pct, c))

In [None]:
% matplotlib inline

xp = PCA(n_components=2)
xp = x_pca.fit_transform(x)

product_class = np.unique(y)

colors = plt.get_cmap("hsv")

plt.figure(figsize=(10, 4))
for i, p in enumerate(product_class):
    mask = (y == p)
    plt.scatter(xp[mask, 0], xp[mask, 1], 
                c=colors(1. * i / 11), label=p, alpha=0.2)
    
plt.legend(loc="upper left")
plt.xlabel('PC 1')
plt.ylabel('PC 2')

## 1.3 Clustering

#### 3. Mean-Shift sobre el dataset IRIS

In [None]:
# Feature matrix of the Iris dataset, used as input for the clustering cells.
X = datasets.load_iris()["data"]

In [None]:
# Cluster the Iris features with Mean-Shift using the estimator's defaults.
ms = MeanShift().fit(X)
labels, cluster_centers = ms.labels_, ms.cluster_centers_

# The number of clusters is the number of distinct labels assigned.
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

print("Cantidad de clusters : %d" % n_clusters_)

#### Rand index adjusted for chance (Punto 1.5)

In [None]:
# Compare the Mean-Shift assignment with the true Iris species labels.
adjusted_rand_score(labels_true=datasets.load_iris().target,
                    labels_pred=ms.labels_)

In [None]:
% matplotlib inline

plt.figure(1)
plt.clf()

colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
    my_members = labels == k
    cluster_center = cluster_centers[k]
    plt.plot(X[my_members, 0], X[my_members, 1], col + '.')
    plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=14)
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()

#### 3. k-means sobre el dataset IRIS

In [None]:
% matplotlib inline

from sklearn import datasets

np.random.seed(5)

centers = [[1, 1], [-1, -1], [1, -1]]
iris = datasets.load_iris()
X = iris.data
y = iris.target

clustering = KMeans(n_clusters=3)
clustering.fit(X)
labels = clustering.labels_

fig = plt.figure(1)
plt.clf()
ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)

plt.cla()

ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=labels.astype(np.float))

ax.w_xaxis.set_ticklabels([])
ax.w_yaxis.set_ticklabels([])
ax.w_zaxis.set_ticklabels([])
ax.set_xlabel('Petal width')
ax.set_ylabel('Sepal length')
ax.set_zlabel('Petal length')

## 1.4 Clasificación

#### 3. Vinos

In [64]:
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_curve, auc

# Red-wine quality data; the file is semicolon-separated.
data = pd.read_csv('data/winequality-red.csv', sep=";")

# Two predictors and the quality target as NumPy arrays. `.values` replaces
# `DataFrame.as_matrix()`, which no longer exists in current pandas.
X = data[["fixed acidity", "alcohol"]].values
y = data["quality"].values

# One indicator column per listed quality score (3 to 8).
y = label_binarize(y, classes=[3, 4, 5, 6, 7, 8])

n_classes = y.shape[1]

In [65]:
# Hold out 20% of the wine samples for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [66]:
# Restrict the frame to wines rated 6 and display only the second row of the
# summary statistics (the per-column mean).
data = data.loc[data["quality"] == 6]
data.describe().iloc[1:2]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
mean,8.347179,0.497484,0.273824,2.477194,0.084956,15.711599,40.869906,0.996615,3.318072,0.675329,10.629519,6


#### Clasificación con kernel lineal

In [67]:
# One linear-kernel SVM per quality class (one-vs-rest), trained on the
# training split.
classifier = OneVsRestClassifier(SVC(kernel='linear'))
classifier.fit(X_train, y_train)

# Per-class decision scores on the held-out samples.
y_score = classifier.decision_function(X_test)

In [68]:
# Per-class precision / recall / F1 of the classifier on the held-out split.
report = classification_report(y_test, classifier.predict(X_test))
print(report)

             precision    recall  f1-score   support

          0       0.00      0.00      0.00         1
          1       0.00      0.00      0.00        10
          2       0.64      0.76      0.70       130
          3       0.00      0.00      0.00       132
          4       0.00      0.00      0.00        42
          5       0.00      0.00      0.00         5

avg / total       0.26      0.31      0.28       320



  'precision', 'predicted', average, warn_for)


In [None]:
# Draw the decision regions of the one-vs-rest classifier over the two
# predictors, with the samples on top.
h = 1  # step of the evaluation grid, in feature units

x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1

xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

# The classifier was trained on a binarized target, so `predict` returns one
# indicator column per class and cannot be reshaped onto the grid. Reduce each
# grid point to a single class index by taking the highest decision score.
Z = classifier.decision_function(np.c_[xx.ravel(), yy.ravel()]).argmax(axis=1)
Z = Z.reshape(xx.shape)

# One contour band per class index, centred on the integer value.
plt.contourf(xx, yy, Z, levels=np.arange(n_classes + 1) - 0.5,
             cmap=plt.cm.Paired, alpha=0.8)

# Plot also the samples, coloured by the index of their true class (the
# binarized target is collapsed back to one index per row) on the same
# colour scale as the contour bands.
plt.scatter(X[:, 0], X[:, 1], c=y.argmax(axis=1), cmap=plt.cm.Paired,
            vmin=-0.5, vmax=n_classes - 0.5)
# Column 0 of X is "fixed acidity" and column 1 is "alcohol".
plt.xlabel('fixed acidity')
plt.ylabel('alcohol')
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xticks(())
plt.yticks(())

plt.show()

In [69]:
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import auc
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

#### Log Loss

In [70]:
# Log loss of the classifier on the held-out split.
# NOTE(review): log_loss is defined on predicted probabilities, but it is fed
# the hard 0/1 output of `predict` here, so the value is presumably not a
# meaningful log loss -- confirm, and consider a probabilistic classifier.
loss = log_loss(y_test, classifier.predict(X_test))
print("El log loss del método es %f" % loss)

El log loss del método es 6.865827


#### Accuracy

In [71]:
# Accuracy on the held-out split. Because the target is a binarized indicator
# matrix, a sample counts as correct only when its whole label row matches.
accuracy = accuracy_score(y_test, classifier.predict(X_test))
print("La accuracy del método es %f" % accuracy)

La accuracy del método es 0.309375


#### Precision

In [72]:
# Support-weighted precision on the held-out split. The averaging mode is
# explicit because the default ('binary') is only valid for binary targets and
# the target here is a multi-column indicator matrix.
precision = precision_score(y_test, classifier.predict(X_test), average='weighted')
print("La precision del método es %f" % precision)

La precision del método es 0.261161


  sample_weight=sample_weight)
  'precision', 'predicted', average, warn_for)


#### F1 Score

In [73]:
# Support-weighted F1 on the held-out split. The result is stored in `f1` so
# that the imported `f1_score` function is not overwritten (re-running the cell
# would otherwise fail), and the averaging mode is explicit because the default
# ('binary') is only valid for binary targets.
f1 = f1_score(y_test, classifier.predict(X_test), average='weighted')
print("El f1_score del método es %f" % f1)

El f1_score del método es 0.283231


  sample_weight=sample_weight)
  'precision', 'predicted', average, warn_for)


#### Area Under the Curve

In [84]:
# Area under the ROC curve, as a percentage, computed from the decision scores.
# Stored in `auc_pct` so that the `auc` function imported from sklearn.metrics
# is not overwritten.
auc_pct = roc_auc_score(y_test, y_score) * 100
print("El area bajo la curva para el kernel linear en SVM es: %.2f%%" % auc_pct)

El area bajo la curva para el kernel linear en SVM es: 60.06%
