# Machine Learning Examples

In [1]:
from sklearn import datasets
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import metrics
from sklearn import tree
from sklearn import neighbors
from sklearn import svm
from sklearn import ensemble
from sklearn import cluster

import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Seed NumPy's global random generator once, up front, so that anything
# drawing from that global state is repeatable on a fresh Restart & Run All.
np.random.seed(seed=555)

## Machine Learning - Classification
 First, load digits data from Scikit-Learn datasets:

In [2]:
from sklearn.datasets import load_digits
# Load the digits dataset bundled with scikit-learn, going through the
# `datasets` module that the first cell already imported.
digits = datasets.load_digits()

We will assign the features data to X and the target (labels) to y:  

In [3]:
# Feature matrix and label vector, followed by their shapes (one per line).
X, y = digits.data, digits.target
print(X.shape, y.shape, sep="\n")

(1797, 64)
(1797,)


Note that there are 1,797 samples and 64 features.

1) Split the data into a training set (with 70% of the data) and a testing set. Name the variables `X_train, X_test, y_train, y_test`.  Display the size of `X_train`.

In [4]:
# Hold out 30% of the samples for testing; the remaining 70% are for training.
# NOTE(review): no random_state is passed here; presumably the shuffle relies
# on the global NumPy seed set in the first cell - confirm.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, test_size=0.3)
print(X_train.shape)

(1257, 64)


We will apply different classification algorithms to the digits data.
- decision trees
- random forest method
- support-vector machine with linear kernel 

2) Classify the digits using the decision tree classifier.

In [5]:

# Decision tree with default hyper-parameters: fit on the training split,
# then predict labels for the held-out test samples.
classifier = tree.DecisionTreeClassifier().fit(X_train, y_train)
y_test_pred = classifier.predict(X_test)


3) Display the classification report for this classifier.

In [6]:
# Per-class precision / recall / F1 of the current test-set predictions.
print(metrics.classification_report(y_true=y_test, y_pred=y_test_pred))

              precision    recall  f1-score   support

           0       0.98      0.93      0.95        54
           1       0.75      0.84      0.79        45
           2       0.88      0.91      0.89        55
           3       0.82      0.79      0.80        52
           4       0.80      0.93      0.86        57
           5       0.83      0.89      0.86        45
           6       0.98      0.96      0.97        57
           7       0.93      0.77      0.85        53
           8       0.79      0.71      0.75        59
           9       0.83      0.84      0.83        63

    accuracy                           0.86       540
   macro avg       0.86      0.86      0.86       540
weighted avg       0.86      0.86      0.86       540



4) Compute the "accuracy score" (`metrics.accuracy_score`) of the prediction on the testing set.

In [7]:
from sklearn.metrics import accuracy_score
# Fraction of test samples labelled correctly. The original call passed
# normalize=True and sample_weight=None, which are the documented defaults.
metrics.accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.8574074074074074

5) Compute the confusion matrix for this classifier. 

In [8]:
# Confusion matrix of the current test-set predictions
# (rows: true digit, columns: predicted digit).
metrics.confusion_matrix(y_true=y_test, y_pred=y_test_pred)

array([[50,  0,  1,  0,  1,  1,  0,  0,  0,  1],
       [ 0, 38,  1,  2,  1,  0,  0,  1,  2,  0],
       [ 0,  1, 50,  0,  0,  1,  0,  0,  2,  1],
       [ 0,  0,  1, 41,  0,  1,  1,  0,  5,  3],
       [ 0,  1,  0,  0, 53,  1,  0,  0,  1,  1],
       [ 0,  0,  0,  3,  0, 40,  0,  0,  1,  1],
       [ 0,  2,  0,  0,  0,  0, 55,  0,  0,  0],
       [ 0,  1,  1,  0,  7,  0,  0, 41,  0,  3],
       [ 0,  6,  2,  3,  3,  1,  0,  1, 42,  1],
       [ 1,  2,  1,  1,  1,  3,  0,  1,  0, 53]])

Use the random forest classifier to perform the same task.


In [9]:
# Random forest with default hyper-parameters on the same split.
# `classifier` and `y_test_pred` are deliberately overwritten, so every
# metric cell below this point refers to the forest, not the decision tree.
classifier = ensemble.RandomForestClassifier().fit(X_train, y_train)
y_test_pred = classifier.predict(X_test)

# Test-set accuracy (displayed as the cell's result).
metrics.accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.9777777777777777

In [10]:
# Confusion matrix of the current test-set predictions
# (rows: true digit, columns: predicted digit).
metrics.confusion_matrix(y_true=y_test, y_pred=y_test_pred)

array([[53,  0,  0,  0,  1,  0,  0,  0,  0,  0],
       [ 0, 45,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0, 55,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0, 52,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0, 57,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0, 44,  0,  0,  0,  1],
       [ 0,  0,  0,  0,  1,  0, 56,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0, 53,  0,  0],
       [ 0,  2,  0,  1,  0,  1,  0,  0, 55,  0],
       [ 0,  0,  0,  1,  0,  1,  0,  1,  2, 58]])

6) Use the support-vector machine (with 'linear' kernel) as the classifier.


In [11]:
# Linear support-vector classifier on the same split, followed by its
# test-set confusion matrix (rows: true digit, columns: predicted digit).
# NOTE(review): the exercise asks for an SVM "with 'linear' kernel";
# svm.LinearSVC is a linear SVM but not the same estimator as
# svm.SVC(kernel='linear') - confirm which one was intended.
classifier = svm.LinearSVC().fit(X_train, y_train)
y_test_pred = classifier.predict(X_test)
metrics.confusion_matrix(y_true=y_test, y_pred=y_test_pred)



array([[53,  0,  0,  0,  0,  1,  0,  0,  0,  0],
       [ 0, 38,  0,  1,  0,  0,  1,  0,  4,  1],
       [ 0,  1, 54,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0, 49,  0,  1,  0,  0,  1,  1],
       [ 0,  0,  0,  0, 55,  0,  1,  0,  1,  0],
       [ 0,  0,  0,  1,  0, 44,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0, 56,  0,  1,  0],
       [ 0,  0,  0,  0,  1,  0,  0, 51,  0,  1],
       [ 0,  2,  1,  2,  0,  1,  0,  0, 53,  0],
       [ 1,  0,  0,  2,  0,  0,  0,  1,  4, 55]])

In [12]:
# Test-set accuracy of the current predictions. The original call passed
# normalize=True and sample_weight=None, which are the documented defaults.
metrics.accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.9407407407407408

## Machine Learning - Clustering


1) Perform K-means method on the data `X` and assign the prediction to the variable name `y_pred`

In [13]:
# Unsupervised step: group all samples into 10 clusters without looking at
# the labels, then record which cluster each sample was assigned to.
clustering = cluster.KMeans(n_clusters=10).fit(X)
y_pred = clustering.predict(X)

In [14]:
from scipy.stats import mode
# K-means cluster ids carry no meaning as digit labels, so translate each
# cluster into the most common true label among its members. That makes
# `y_pred_labels` directly comparable with `y` in the metric cells below.
#
# The loop runs over the cluster ids that actually occur in `y_pred` rather
# than a hard-coded range(10). It therefore stays correct if the number of
# clusters is changed, and never asks `mode` for the mode of an empty selection.
y_pred_labels = np.zeros_like(y_pred)
for cluster_id in np.unique(y_pred):
    mask = (y_pred == cluster_id)
    # mode(...)[0] is the modal value; depending on the SciPy version it is a
    # scalar or a length-1 array, and both broadcast over the masked slots.
    y_pred_labels[mask] = mode(y[mask])[0]

2) Print out the classification report and the accuracy score.

In [15]:
# Share of all samples whose majority-vote cluster label matches the true
# digit. The original call passed normalize=True and sample_weight=None,
# which are the documented defaults.
metrics.accuracy_score(y_true=y, y_pred=y_pred_labels)

0.7929883138564274

In [16]:
# Per-class precision / recall / F1 of the relabelled clustering, computed
# over the full dataset.
print(metrics.classification_report(y_true=y, y_pred=y_pred_labels))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       178
           1       0.60      0.30      0.40       182
           2       0.85      0.84      0.84       177
           3       0.86      0.84      0.85       183
           4       0.98      0.90      0.94       181
           5       0.92      0.75      0.82       182
           6       0.97      0.98      0.98       181
           7       0.85      0.98      0.91       179
           8       0.45      0.57      0.51       174
           9       0.56      0.78      0.65       180

    accuracy                           0.79      1797
   macro avg       0.80      0.79      0.79      1797
weighted avg       0.80      0.79      0.79      1797



3) Print out the confusion matrix

In [17]:
# Confusion matrix of the relabelled clustering over the full dataset
# (rows: true digit, columns: label assigned via the cluster majority vote).
metrics.confusion_matrix(y_true=y, y_pred=y_pred_labels)

array([[177,   0,   0,   0,   1,   0,   0,   0,   0,   0],
       [  0,  55,  24,   1,   0,   1,   2,   0,  99,   0],
       [  1,   2, 148,  13,   0,   0,   0,   3,   8,   2],
       [  0,   0,   0, 154,   0,   2,   0,   7,   7,  13],
       [  0,   7,   0,   0, 163,   0,   0,   8,   3,   0],
       [  0,   0,   0,   2,   2, 136,   1,   0,   0,  41],
       [  1,   1,   0,   0,   0,   0, 177,   0,   2,   0],
       [  0,   2,   0,   0,   0,   0,   0, 175,   2,   0],
       [  0,   5,   3,   2,   0,   4,   2,   5, 100,  53],
       [  0,  20,   0,   7,   0,   5,   0,   7,   1, 140]])