# Cross-validating predictive models for the Iris data set

In [139]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [140]:
iris = pd.read_csv('iris.data', names=['Sepal length (cm)', 'Sepal width (cm)', 'Petal length (cm)', 'Petal width (cm)', 'Species'])

In [141]:
from sklearn.model_selection import train_test_split

X = iris[['Sepal length (cm)', 'Sepal width (cm)', 'Petal length (cm)', 'Petal width (cm)']]
Y = iris['Species']

X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=0.2)

In [142]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [143]:
k_neighbors = KNeighborsClassifier(5)
logistic_regression = LogisticRegression(multi_class='auto', solver='lbfgs', max_iter=1000)

In [144]:
# 10-fold cross validation
knn_scores = cross_val_score(k_neighbors, X_train, Y_train, cv=10)
lr_scores = cross_val_score(logistic_regression, X_train, Y_train, cv=10)

In [145]:
(np.mean(knn_scores), 2 * np.std(knn_scores))

(0.974242424242424, 0.07878787878787882)

In [146]:
(np.mean(lr_scores), 2 * np.std(lr_scores))

(0.949242424242424, 0.1115567280860604)

In [148]:
k_neighbors_trained = k_neighbors.fit(X_train, Y_train)

In [149]:
logistic_regression_trained = logistic_regression.fit(X_train, Y_train)

In [150]:
from sklearn.metrics import confusion_matrix

# https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # Only use the labels that appear in the data
    classes = classes[unique_labels(y_true, y_pred)]
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

In [152]:
plot_confusion_matrix(Y_validation, k_neighbors_trained.predict(Y_validation), classes=Y_validation) 

ValueError: could not convert string to float: 'Iris-virginica'

In [156]:
iris_ = sklearn.datasets.load_iris()

In [157]:
iris_.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [158]:
iris_.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')