In [1]:
%matplotlib notebook

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits

In [2]:
dataset = load_digits()
X, y = dataset.data, dataset.target

In [3]:
for class_name, class_count in zip(dataset.target_names, np.bincount(dataset.target)):
    print(class_name, class_count)

0 178
1 182
2 177
3 183
4 181
5 182
6 181
7 179
8 174
9 180


In [4]:
y_binary_imbalanced = y.copy()
y_binary_imbalanced[y_binary_imbalanced != 1] = 0

print('original labels: ', y[1:30])
print('binary labels: ', y_binary_imbalanced[1:30])

original labels:  [1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9]
binary labels:  [1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]


In [5]:
np.bincount(y_binary_imbalanced)

array([1615,  182], dtype=int64)

In [6]:
from sklearn.svm import SVC

X_train, X_test, y_train, y_test = train_test_split(X, y_binary_imbalanced, random_state = 0)

svm = SVC(kernel = 'rbf', C=1).fit(X_train, y_train)
svm.score(X_test, y_test)

y_predict = svm.predict(X_test)

y_predict == y_test

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [7]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import r2_score

dummy_majority = DummyClassifier(strategy = 'most_frequent').fit(X_train, y_train)

y_dummy_predictions = dummy_majority.predict(X_test)

k = np.sum(y_test == y_dummy_predictions) # np.sum counts number of True in the boolean mask y_test == y_dummy_predictions
accuracy = k/len(y_test)

dummy_majority.score(X_test,y_test)

0.9044444444444445

In [8]:
from sklearn.metrics import confusion_matrix

dummy_majority = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)
y_majority_predicted = dummy_majority.predict(X_test)

confusion = confusion_matrix(y_test,y_majority_predicted)

print('most frequent class (dummy classifier)\n', confusion)

most frequent class (dummy classifier)
 [[407   0]
 [ 43   0]]


In [9]:
dummy_classprop = DummyClassifier(strategy='stratified').fit(X_train, y_train)

y_classprop_predicted = dummy_classprop.predict(X_test)

confusion = confusion_matrix(y_test,y_classprop_predicted)

print('most frequent class (dummy classifier)\n', confusion)

most frequent class (dummy classifier)
 [[357  50]
 [ 39   4]]


In [10]:
svm = SVC(kernel='linear',C=1).fit(X_train, y_train)

svm_predicted = svm.predict(X_test)
confusion = confusion_matrix(y_test,svm_predicted)

print('most frequent class (svm classifier)\n', confusion)

most frequent class (svm classifier)
 [[402   5]
 [  5  38]]


In [12]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

print('Accuracy: {:.2f}'.format(accuracy_score(y_test, y_classprop_predicted)))
print('Precision: {:.2f}'.format(precision_score(y_test, y_classprop_predicted)))
print('Recall: {:.2f}'.format(recall_score(y_test, y_classprop_predicted)))
print('F1 score: {:.2f}'.format(f1_score(y_test, y_classprop_predicted)))

Accuracy: 0.80
Precision: 0.07
Recall: 0.09
F1 score: 0.08


In [14]:
from sklearn.metrics import classification_report

print('svm:')
print(classification_report(y_test,svm_predicted,target_names=['not 1','1']))
print('dummy:')
print(classification_report(y_test,y_classprop_predicted,target_names=['not 1','1']))
print('dummy majority:')
print(classification_report(y_test,y_majority_predicted,target_names=['not 1','1']))

svm:
              precision    recall  f1-score   support

       not 1       0.99      0.99      0.99       407
           1       0.88      0.88      0.88        43

    accuracy                           0.98       450
   macro avg       0.94      0.94      0.94       450
weighted avg       0.98      0.98      0.98       450

dummy:
              precision    recall  f1-score   support

       not 1       0.90      0.88      0.89       407
           1       0.07      0.09      0.08        43

    accuracy                           0.80       450
   macro avg       0.49      0.49      0.49       450
weighted avg       0.82      0.80      0.81       450

dummy majority:
              precision    recall  f1-score   support

       not 1       0.90      1.00      0.95       407
           1       0.00      0.00      0.00        43

    accuracy                           0.90       450
   macro avg       0.45      0.50      0.47       450
weighted avg       0.82      0.90      0.86   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
from sklearn.linear_model import LogisticRegression as lr

y_scores_lr = lr().fit(X_train,y_train).decision_function(X_test)
y_scores_list = list(zip(y_test[0:20],y_scores_lr[0:20]))

y_scores_list

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[(0, -29.828770502888116),
 (0, -19.382789738053784),
 (0, -29.198670349715567),
 (0, -21.746404459790504),
 (0, -22.64235310102342),
 (0, -11.805842369075759),
 (1, 6.495997717246041),
 (0, -23.35467639948811),
 (0, -27.5442267850303),
 (0, -26.88820425754935),
 (0, -31.863267982040266),
 (0, -22.48602744407303),
 (0, -25.318060176866066),
 (0, -13.384469430324788),
 (0, -13.565681390836076),
 (0, -13.30829484317648),
 (1, 12.18111062994153),
 (0, -34.362361855034976),
 (0, -13.231587724821603),
 (0, -29.594035340820167)]

In [27]:
y_proba_lr = lr().fit(X_train,y_train).predict_proba(X_test)
y_proba_list = list(zip(y_test,y_proba_lr[:,1]))

y_proba_list

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[(0, 1.1105281422944705e-13),
 (0, 3.820862166051439e-09),
 (0, 2.0853482323256512e-13),
 (0, 3.594649427724256e-10),
 (0, 1.4674083663646533e-10),
 (0, 7.4607856827615285e-06),
 (1, 0.9984928066217791),
 (0, 7.197686272776271e-11),
 (0, 1.0906723390476288e-12),
 (0, 2.10184779111985e-12),
 (0, 1.451972998094963e-14),
 (0, 1.71570398915917e-10),
 (0, 1.0104298618070975e-11),
 (0, 1.538856470204047e-06),
 (0, 1.2838044335177512e-06),
 (0, 1.6606582403928e-06),
 (1, 0.9999948736454032),
 (0, 1.1929324742457014e-15),
 (0, 1.7930553067677074e-06),
 (0, 1.4043448120828546e-13),
 (0, 3.0489349713047584e-11),
 (0, 1.5103024072372098e-07),
 (0, 6.756693459140514e-14),
 (0, 4.125268816341987e-05),
 (0, 1.032705450640122e-17),
 (0, 1.772306087145626e-09),
 (0, 2.359495536221842e-09),
 (0, 1.0729197005818194e-19),
 (0, 1.1825156148715618e-17),
 (0, 8.926304754869694e-15),
 (0, 9.360633781830224e-10),
 (0, 2.5441086753176074e-13),
 (0, 7.561746748831151e-07),
 (0, 0.004455613538202336),
 (0, 7.977