# Cambridge ML Commando Course
## Session 2.1 - Classification with Logistic Regression

In [None]:
%pylab inline
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import sklearn
import IPython
import platform
from sklearn import preprocessing
# Report the versions of the interpreter and of the main libraries so that
# results from this notebook can be reproduced later.
# matplotlib is imported explicitly here: the bare `matplotlib` name is
# otherwise only available as a side effect of the `%pylab inline` magic.
import matplotlib

print('Python version:', platform.python_version())
print('IPython version:', IPython.__version__)
print('numpy version:', np.__version__)
print('scikit-learn version:', sklearn.__version__)
print('matplotlib version:', matplotlib.__version__)

## Hours of study...
A super-simple example to start us off.  We have a dataset that matches hours of study to the observation of whether a student passed a test.  The test result is dichotomous (0 for fail, 1 for pass).  How do we fit a model that predicts pass or fail from the number of hours studied?

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
# Toy dataset: one (hours studied, passed) pair per student,
# where passed is 0 for fail and 1 for pass.
study_data = [
    (0.50, 0),
    (0.75, 0),
    (1.00, 0),
    (1.25, 0),
    (1.50, 0),
    (1.75, 0),
    (1.75, 1),
    (2.00, 0),
    (2.25, 1),
    (2.50, 0),
    (2.75, 1),
    (3.00, 0),
    (3.25, 1),
    (3.50, 0),
    (4.00, 1),
    (4.25, 1),
    (4.50, 1),
    (4.75, 1),
    (5.00, 1),
    (5.50, 1),
]

# Convert once, via the `np` alias imported at the top of the notebook (the
# bare `numpy` name only exists as a side effect of `%pylab inline`).
study_arr = np.array(study_data)
X = study_arr[:, 0].reshape(-1, 1)  # hours as a single-feature column: one row per student
y = study_arr[:, 1].ravel()         # pass/fail labels as a flat vector

# Fit on the full dataset and report accuracy on that same data (no hold-out here).
lr = LogisticRegression()
lr.fit(X, y)
print("Accuracy (training) is", lr.score(X, y))

In [None]:
# Iris is the main dataset for this book: load it and print the class labels,
# the class names and the feature names.
from sklearn import datasets

iris = datasets.load_iris()

for field in (iris.target, iris.target_names, iris.feature_names):
    print(field)

In [None]:
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Choose the data to classify. Set the flag to True for a synthetic
# two-feature, two-class problem; leave it False to use one of the datasets
# bundled with scikit-learn. Either way this cell defines X, y, X_names,
# y_names and dataset for the cells that follow.
generate_synthetic_dataset = False

if generate_synthetic_dataset:
    dataset = "synthetic data"
    n_f = 2  # number of features
    n_c = 2  # number of classes
    X, y = datasets.make_classification(n_features=n_f, n_informative=2, n_redundant=0, n_classes=n_c, n_clusters_per_class=2, random_state=42)
    y_names = ["Class {}".format(ix) for ix in range(n_c)]
    X_names = ["Feat {}".format(ix) for ix in range(n_f)]
else:
    # The list may be extended to ["iris", "wine", "breast_cancer"] to print a
    # summary of each one, but X, y and the name lists are overwritten on every
    # pass, so only the LAST entry is still loaded when the loop finishes.
    for dataset in ["iris"]:
        if dataset == "iris":
            remove_a_species = True  # drop one class to make the Iris problem binary
            petals_only = True  # keep two features only, so the data is 2D
            iris = datasets.load_iris()
            # print(iris.DESCR)  # prints a fairly long description of the dataset
            X_names = iris.feature_names
            y_names = iris.target_names

            if remove_a_species:
                remove_which = "setosa"
                class_to_exclude = list(y_names).index(remove_which)
                y_names = np.delete(y_names, class_to_exclude)
                print(y_names)

                # Boolean mask: True for every sample NOT in the excluded class.
                indices_2class = (iris.target != class_to_exclude)
                X = iris.data[indices_2class]
                y = iris.target[indices_2class]
            else:
                X = iris.data
                y = iris.target

            if petals_only:
                # Columns 0:2 hold the sepal values, so keep columns 2:4.
                X = X[:, 2:4]
                X_names = X_names[2:4]

        elif dataset == "wine":
            wine_data = datasets.load_wine()
            X = wine_data.data
            X_names = wine_data.feature_names
            y = wine_data.target
            y_names = wine_data.target_names
        elif dataset == "breast_cancer":
            bc_data = datasets.load_breast_cancer()
            X = bc_data.data
            X_names = bc_data.feature_names
            y = bc_data.target
            y_names = bc_data.target_names
        else:
            # Fail loudly instead of silently carrying on with whatever X and y
            # happen to be left over from an earlier cell.
            raise ValueError("Unknown dataset: {!r}".format(dataset))

        print(X_names)
        print(y_names)
        print(X.shape)
    
# Hold out a quarter of the samples for testing; the fixed seed keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=666
)

# Fit the scaler on the training portion only, then apply that same
# transform to both portions.
X_scaler = StandardScaler().fit(X_train)
X_train = X_scaler.transform(X_train)
X_test = X_scaler.transform(X_test)

from sklearn.decomposition import PCA

plt.set_cmap("Paired")
ax = plt.gca()

if X_train.shape[1] <= 2:
    # Already plottable as-is: no projection needed.
    reduce_to_2d = None
    print(X_train.shape)
    X_tr_vis, X_tt_vis = X_train, X_test
    axis_labels = (X_names[0], X_names[1])
else:
    # Flatten the dataset down to 2D so we can actually plot it; the projection
    # is fitted on the training data and reused for the test data.
    reduce_to_2d = PCA(n_components=2)
    X_tr_vis = reduce_to_2d.fit_transform(X_train)
    X_tt_vis = reduce_to_2d.transform(X_test)
    axis_labels = ("Principal Comp 1", "Principal Comp 2")

plt.xlabel(axis_labels[0])
plt.ylabel(axis_labels[1])

# Training points as translucent default markers; test points as larger
# black-edged squares (it's sneaky to look at your test data!).
ax.scatter(X_tr_vis[:, 0], X_tr_vis[:, 1], c=y_train, alpha=0.7)
ax.scatter(X_tt_vis[:, 0], X_tt_vis[:, 1], marker="s", s=50, c=y_test, edgecolor="k")
plt.title(dataset)
plt.show()


In [None]:
from sklearn.model_selection import cross_val_score, KFold
from scipy.stats import sem

def evaluate_cross_validation(estr, X, y, K):
    """Run shuffled K-fold cross-validation and print the resulting scores.

    Parameters
    ----------
    estr : estimator handed to ``cross_val_score``.
    X : feature matrix to cross-validate on.
    y : target labels matching the rows of ``X``.
    K : number of folds.

    Returns
    -------
    The array of per-fold scores produced by ``cross_val_score`` (the same
    values that are printed), so callers can reuse them.
    """
    print("underway...")
    # Create a K-fold cross-validation iterator; shuffling with a fixed seed
    # keeps the fold assignment the same from run to run.
    k_cv = KFold(n_splits=K, shuffle=True, random_state=0)
    # No `scoring` argument is given, so the estimator's own score method is
    # used (accuracy for classifiers).
    scores = cross_val_score(estr, X, y, cv=k_cv)
    print(scores)
    # Mean across folds, with the standard error of the mean as the spread.
    print("Mean score: {0:.3f} (+/-{1:.3f})".format(np.mean(scores), sem(scores)))
    return scores
    
    
from sklearn.preprocessing import minmax_scale
def plot_matrix(mx, title="Some Matrix", labels = None):
    """Show a 2-D matrix as a colour image with each cell's value written on it.

    Parameters
    ----------
    mx : 2-D array-like of numbers (e.g. a confusion matrix).
    title : title placed above the plot.
    labels : optional sequence of tick labels, applied to both axes.
    """
    # Accept plain nested lists as well as arrays; the code below needs
    # `.shape` and `[i, j]` indexing.
    mx = np.asarray(mx)

    plt.set_cmap("jet")
    fig, ax = plt.subplots()

    # Only the colours are rescaled; the annotations below show the raw values.
    # With its default axis=0, minmax_scale rescales each column separately.
    ax.imshow(minmax_scale(mx))

    if labels is not None:
        # Show one tick per label...
        ax.set_xticks(np.arange(len(labels)))
        ax.set_yticks(np.arange(len(labels)))

        # ... and label the ticks with the respective list entries.
        ax.set_xticklabels(labels)
        ax.set_yticklabels(labels)

        # Rotate the x tick labels and set their alignment.
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
                 rotation_mode="anchor")

    # Annotate every cell. Rows and columns are iterated separately so that
    # non-square matrices are handled correctly.
    n_rows, n_cols = mx.shape
    for i in range(n_rows):
        for j in range(n_cols):
            # Text is positioned as (x, y) = (column, row).
            ax.text(j, i, round(mx[i, j], 1),
                    ha="center", va="center", color="w")

    ax.set_title(title)
    # fig.tight_layout()
    fig.set_size_inches(6, 6)
    plt.show()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
# from sklearn import svm

# One-vs-rest logistic regression. The one-vs-rest scheme is requested through
# the OneVsRestClassifier wrapper rather than LogisticRegression's
# `multi_class="ovr"` argument, which recent scikit-learn releases deprecate
# in favour of this wrapper.
reg = OneVsRestClassifier(LogisticRegression(solver="lbfgs"))
# reg = svm.SVC(kernel="rbf", probability=True)
reg.fit(X_train, y_train)

In [None]:
# 2 decimal places and no scientific notation, for legibility.
np.set_printoptions(precision=2, suppress=True)

# 5-fold cross-validation on the training data only.
evaluate_cross_validation(reg, X_train, y_train, 5)

# Predicted class and per-class probabilities for every training sample.
y_hats_tr = reg.predict(X_train)
y_hats_proba = reg.predict_proba(X_train)
print("y yh[class probabilities]:")
# The loop variable is deliberately NOT called `y`: in a notebook that would
# overwrite the global label array `y` with a single label.
for y_true, yh, yhp in zip(y_train, y_hats_tr, y_hats_proba):
    print("True v pred: {},{} ~".format(y_true, yh), "Class Probs:", yhp)

In [None]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix
# Draw the decision regions: predict a class for every point of a fine mesh
# covering [x_min, x_max] x [y_min, y_max] and colour the mesh by prediction.
pad = .5  # margin left around the training points
h = .02  # step size in the mesh

first_axis, second_axis = X_tr_vis[:, 0], X_tr_vis[:, 1]
x_min, x_max = first_axis.min() - pad, first_axis.max() + pad
y_min, y_max = second_axis.min() - pad, second_axis.max() + pad
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# One row per mesh point, in the 2-D plotting coordinates.
mesh_points = np.column_stack((xx.ravel(), yy.ravel()))
if reduce_to_2d is None:
    Z = reg.predict(mesh_points)
else:
    # The model was fitted in the original feature space, so map the mesh
    # back out of the 2-D projection before predicting.
    Z = reg.predict(reduce_to_2d.inverse_transform(mesh_points))

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1, figsize=(6, 6))
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)

# Overlay the training points (small circles) and test points (larger squares)
plt.scatter(first_axis, second_axis, c=y_train, marker="o", s=20, edgecolors='k', cmap=plt.cm.Paired)
plt.scatter(X_tt_vis[:, 0], X_tt_vis[:, 1], c=y_test, marker="s", s=50, edgecolors='w', alpha=0.7, cmap=plt.cm.Paired)

# Axis captions: real feature names when plotting raw features, generic
# component names when the data went through the 2-D projection.
if reduce_to_2d is not None:
    x_caption, y_caption = 'Prin Comp 1', 'Prin Comp 2'
else:
    x_caption, y_caption = X_names[0], X_names[1]
plt.xlabel(x_caption)
plt.ylabel(y_caption)

plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xticks(())
plt.yticks(())
plt.show()

def _report_split(estr, X_part, y_part, names, banner, mx_caption, mx_title):
    """Print accuracy and a classification report for one data split, then plot its confusion matrix.

    Parameters
    ----------
    estr : fitted classifier used to predict ``X_part``.
    X_part, y_part : features and true labels of the split.
    names : class names used in the report and as confusion-matrix tick labels.
    banner : heading printed before the metrics.
    mx_caption : line printed just before the confusion matrix is plotted.
    mx_title : title of the confusion-matrix plot.

    Returns
    -------
    (predictions, confusion matrix) for the split.
    """
    print(banner)
    preds = estr.predict(X_part)
    print("Acc:", accuracy_score(y_part, preds))
    print("Classification Report:")
    print(classification_report(y_part, preds, target_names=names))
    print(mx_caption)
    split_mx = confusion_matrix(y_part, preds)
    # plot_matrix selects the "jet" colour map itself, so it is not set here.
    plot_matrix(split_mx, title=mx_title, labels=names)
    return preds, split_mx


# Same report for both splits; the training figures come first so they can be
# compared against the held-out test figures.
y_hats_tr, mx = _report_split(
    reg, X_train, y_train, y_names,
    "TRAINING:", "\nConfusion Mx (training data):", "Confusion (Train)",
)
y_hats, mx = _report_split(
    reg, X_test, y_test, y_names,
    "\nTEST", "\nConfusion Mx (test data):", "Confusion (Test)",
)


### Multinomial classification
Now reintroduce the setosa class, and try running the same routines again with a non-binary dataset.

## Summary
- We loaded the Iris dataset and split it into training and test sets with `train_test_split`
- We used Logistic Regression to classify the test data
- We checked out the decision boundary by plotting the class partitions
- We got P/R/F and Confusion Matrices on train and test
- We repeated with all 3 classes