# Dummy Classifier

In [2]:
%matplotlib notebook
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [1]:
from sklearn.datasets import load_digits

In [4]:
dataset = load_digits()
x, y = dataset.data, dataset.target

for class_name, class_count in zip(dataset.target_names, np.bincount(dataset.target)):
    print(class_name, class_count)
# 0이라는 class를 가진 data가 178개 있음

0 178
1 182
2 177
3 183
4 181
5 182
6 181
7 179
8 174
9 180


In [5]:
y_binary_imbalanced = y.copy()
y_binary_imbalanced[y_binary_imbalanced != 1] = 0

print(y[1:30])
print(y_binary_imbalanced[1:30])

[1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9]
[1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]


In [6]:
x[1:30]

array([[ 0.,  0.,  0., ..., 10.,  0.,  0.],
       [ 0.,  0.,  0., ..., 16.,  9.,  0.],
       [ 0.,  0.,  7., ...,  9.,  0.,  0.],
       ...,
       [ 0.,  0.,  0., ...,  1.,  0.,  0.],
       [ 0.,  0., 10., ...,  1.,  0.,  0.],
       [ 0.,  0.,  9., ..., 12., 11.,  0.]])

In [7]:
np.bincount(y_binary_imbalanced)
# 1615개의 data 중 182개의 data만 class 1을 가짐
# 나머지는 class 0 -> most frequent = 0

array([1615,  182], dtype=int64)

In [16]:
x_train, x_test, y_train, y_test = train_test_split(x, y_binary_imbalanced, random_state=0)

clf = LogisticRegression(max_iter=800).fit(x_train, y_train)
clf.score(x_test, y_test)

0.9755555555555555

In [17]:
from sklearn.dummy import DummyClassifier

In [18]:
dummy = DummyClassifier(strategy='most_frequent').fit(x_train, y_train)
# 현재 y_train에 0이 제일 많으므로 0 선택 -> most_frequent

y_dummy_predictions = dummy.predict(x_test)

In [19]:
y_dummy_predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [20]:
dummy.score(x_test, y_test)

0.9044444444444445

dummy classifier `strategy=most_frequent`이므로     
- y_train set에 0이 가장 많으므로 모두 0이라 예측
- 0을 0이라고 판단한 경우 = TN
- 1을 0이라고 판단한 경우 = FN
- 해당 classifier의 경우 TN, FN 외 FP, TP는 나올 수 없음

# Confusion Matrix

In [22]:
from sklearn.metrics import confusion_matrix

In [23]:
y_majority_predicted = dummy.predict(x_test)
confusion = confusion_matrix(y_test, y_majority_predicted)

In [24]:
print(confusion)
# TN = 407, FN = 43

[[407   0]
 [ 43   0]]


In [25]:
y_logreg_predicted = clf.predict(x_test)
confusion_logreg = confusion_matrix(y_test, y_logreg_predicted)

In [26]:
print(confusion_logreg)

[[403   4]
 [  7  36]]


In [27]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [28]:
print('Dummy Classifier')
print('Accuracy: {:.2f}'.format(accuracy_score(y_test, y_majority_predicted)))
print('Precision: {:.2f}'.format(precision_score(y_test, y_majority_predicted)))
print('Recall: {:.2f}'.format(recall_score(y_test, y_majority_predicted)))

Dummy Classifier
Accuracy: 0.90
Precision: 0.00
Recall: 0.00


  _warn_prf(average, modifier, msg_start, len(result))


In [29]:
print('Logistic Regression based Classifier')
print('Accuracy: {:.2f}'.format(accuracy_score(y_test, y_logreg_predicted)))
print('Precision: {:.2f}'.format(precision_score(y_test, y_logreg_predicted)))
print('Recall: {:.2f}'.format(recall_score(y_test, y_logreg_predicted)))

Logistic Regression based Classifier
Accuracy: 0.98
Precision: 0.90
Recall: 0.84
