Let's generate some fake prediction ($\hat{y}$, as ```y_``` in code) , and fake label($y$, as ```y``` in code)

In [None]:
import pandas as pd
import numpy as np

In [73]:
def fakeData(shift_from_center = 0.):    
    """
    Create fake y prediction as contineous data, and ground truth label y as discrete data
    """
    y_ = np.random.rand(1000)
    y = (y_+((np.random.rand(1000)-0.5)/5)+shift_from_center)>0.5
    return y_,y

In [66]:
y_,y = fakeData(shift_from_center = 0.)
y_[:5],y[:5]

(array([0.4742574 , 0.51897665, 0.08691137, 0.7796798 , 0.2330116 ]),
 array([ True,  True, False,  True, False]))

### When Accuracy Still works

In [67]:
# Let's start the treshold at 0.5
th = 0.5

In [68]:
def accuracy(y_,y,th):
    return ((y_>th)==y).astype(np.float).mean()

def recall(y_,y,th):
    return ((y_>th)[y]).astype(np.float).sum()/y.sum()

def precision(y_,y,th):
    return (y[y_>th]).astype(np.float).mean()

def f1score(y_,y,th):
    r = recall(y_,y,th)
    p = precision(y_,y,th)
    return 2*r*p/(r+p)

In [74]:
y_,y = fakeData(shift_from_center = 0.)
print("%.2f%% of the label is True"%(y.mean()*100))
print("When threshold is %s, the accuracy:%.3f\trecall:%.3f\tprecision:%.3f\tf1score:%.3f"%(
                                                                th,
                                                                accuracy(y_,y,th), 
                                                                 recall(y_,y,th),
                                                                 precision(y_,y,th),
                                                                 f1score(y_,y,th)))

50.30% of the label is True
When threshold is 0.5, the accuracy:0.949	recall:0.950	precision:0.948	f1score:0.949


### When accuracy stops working

When most the case will be positive

In [82]:
y_,y = fakeData(shift_from_center = .4)
print("%.2f%% of the label is True"%(y.mean()*100))
print("When threshold is %s, the accuracy:%.3f\trecall:%.3f\tprecision:%.3f\tf1score:%.3f"%(
                                                                th,
                                                                accuracy(y_,y,th), 
                                                                 recall(y_,y,th),
                                                                 precision(y_,y,th),
                                                                 f1score(y_,y,th)))

89.80% of the label is True
When threshold is 0.5, the accuracy:0.603	recall:0.558	precision:1.000	f1score:0.716


When most the case will be negative

In [81]:
y_,y = fakeData(shift_from_center = -.4)
print("%.2f%% of the label is True"%(y.mean()*100))
print("When threshold is %s, the accuracy:%.3f\trecall:%.3f\tprecision:%.3f\tf1score:%.3f"%(
                                                                th,
                                                                accuracy(y_,y,th), 
                                                                 recall(y_,y,th),
                                                                 precision(y_,y,th),
                                                                 f1score(y_,y,th)))

10.60% of the label is True
When threshold is 0.5, the accuracy:0.608	recall:1.000	precision:0.213	f1score:0.351


### Threshold Finding

Finding a threshold that can give the best F1 score

#### Case1

In [83]:
y_,y = fakeData(shift_from_center = .4)

print("%.2f%% of the label is True"%(y.mean()*100))
print("When threshold is %s, the accuracy:%.3f\trecall:%.3f\tprecision:%.3f\tf1score:%.3f"%(
                                                                th,
                                                                accuracy(y_,y,th), 
                                                                 recall(y_,y,th),
                                                                 precision(y_,y,th),
                                                                 f1score(y_,y,th)))

89.50% of the label is True
When threshold is 0.5, the accuracy:0.619	recall:0.574	precision:1.000	f1score:0.730


In [89]:
datalist = []
for i in range(99):
    th = (i+1)/100
    datalist.append({
        "Threshold":th,
        "Accuracy":accuracy(y_,y,th), 
        "Recall":recall(y_,y,th),
        "Precision":precision(y_,y,th),
        "F1 Score":f1score(y_,y,th),
    })
data_df = pd.DataFrame(datalist)
data_df

Unnamed: 0,Accuracy,F1 Score,Precision,Recall,Threshold
0,0.900,0.947034,0.900302,0.998883,0.01
1,0.910,0.952077,0.909461,0.998883,0.02
2,0.919,0.956661,0.917864,0.998883,0.03
3,0.929,0.961807,0.927386,0.998883,0.04
4,0.937,0.965854,0.937895,0.995531,0.05
5,0.942,0.968444,0.943796,0.994413,0.06
6,0.946,0.970362,0.953614,0.987709,0.07
7,0.954,0.974614,0.962923,0.986592,0.08
8,0.954,0.974444,0.969061,0.979888,0.09
9,0.955,0.974902,0.973274,0.976536,0.10


Find the best threshold

In [92]:
data_df.sort_values(by="F1 Score",ascending=False).head()

Unnamed: 0,Accuracy,F1 Score,Precision,Recall,Threshold
10,0.956,0.975336,0.978628,0.972067,0.11
9,0.955,0.974902,0.973274,0.976536,0.1
7,0.954,0.974614,0.962923,0.986592,0.08
8,0.954,0.974444,0.969061,0.979888,0.09
12,0.953,0.973371,0.987356,0.959777,0.13


#### Case 2

In [93]:
y_,y = fakeData(shift_from_center = -.4)

print("%.2f%% of the label is True"%(y.mean()*100))
print("When threshold is %s, the accuracy:%.3f\trecall:%.3f\tprecision:%.3f\tf1score:%.3f"%(
                                                                th,
                                                                accuracy(y_,y,th), 
                                                                 recall(y_,y,th),
                                                                 precision(y_,y,th),
                                                                 f1score(y_,y,th)))

10.60% of the label is True
When threshold is 0.99, the accuracy:0.906	recall:0.123	precision:0.929	f1score:0.217


In [94]:
datalist = []
for i in range(99):
    th = (i+1)/100
    datalist.append({
        "Threshold":th,
        "Accuracy":accuracy(y_,y,th), 
        "Recall":recall(y_,y,th),
        "Precision":precision(y_,y,th),
        "F1 Score":f1score(y_,y,th),
    })
data_df = pd.DataFrame(datalist)
data_df

Unnamed: 0,Accuracy,F1 Score,Precision,Recall,Threshold
0,0.117,0.193607,0.107179,1.000000,0.01
1,0.126,0.195212,0.108163,1.000000,0.02
2,0.135,0.196843,0.109166,1.000000,0.03
3,0.144,0.198502,0.110187,1.000000,0.04
4,0.158,0.201139,0.111814,1.000000,0.05
5,0.167,0.202871,0.112886,1.000000,0.06
6,0.177,0.204831,0.114101,1.000000,0.07
7,0.184,0.206226,0.114967,1.000000,0.08
8,0.191,0.207640,0.115847,1.000000,0.09
9,0.198,0.209073,0.116740,1.000000,0.10


In [95]:
data_df.sort_values(by="F1 Score",ascending=False).head()

Unnamed: 0,Accuracy,F1 Score,Precision,Recall,Threshold
87,0.947,0.772532,0.708661,0.849057,0.88
86,0.943,0.765432,0.678832,0.877358,0.87
90,0.95,0.761905,0.769231,0.754717,0.91
88,0.946,0.761062,0.716667,0.811321,0.89
89,0.946,0.75,0.736364,0.764151,0.9
