In [8]:
from sklearn.metrics import f1_score
import numpy as np
import pickle
import gzip

In [10]:
# Required adaptation to jupyter!
pred = np.load('stacked_predictions_15000_validation.npy')
with gzip.open('data/y_validation.pickle','rb') as fp:
    labels = pickle.load(fp)

#labels = labels[::,1::]

y_true = np.transpose(labels)
predictions = np.transpose(pred)

# Baseline threshold and F1
y_pred = (predictions > 0.20).astype(int)
f1 = f1_score(y_true, y_pred, average='micro')

### Using this adjusted threshold function actually improves the f1 on the validation set by ~0.03

In [11]:
def thresholds(y_true, predictions, num_labels=228):
    thresholds = []
    for i in range(num_labels):
        labels_i = y_true[i]
        activations_i = predictions[i]
        threshold = equidistant_threshold(labels_i, activations_i)
        thresholds.append(threshold)
    np.array(thresholds)
    return(thresholds)

# Input function for individual thresholds
def equidistant_threshold(labels_i,activations_i, distance=0.1, rng=(15,30)):
    best_threshold = 0.5
    best_f1 = 0.0
    for i in range(rng[0], rng[1], 1): 
        threshold = i/100.
        pred_i = (activations_i > threshold).astype(int)
        f1 = f1_score(labels_i, pred_i)
        #print('f1: {}'.format(f1))
        if(f1>best_f1):
            #print('!!newbest!!')
            best_f1 = f1
            best_threshold = threshold
    return(best_threshold)

In [12]:
t = thresholds(y_true, predictions)
pred_equi = np.empty((228,9897))
for i, threshold in enumerate(t):
    pred_equi[i,:] = (predictions[i,:] > threshold).astype(int)

  'precision', 'predicted', average, warn_for)


In [13]:
f1_score(y_true, pred_equi, average='micro')

0.5917886355984234

In [12]:
f1_score(y_true, (predictions > 0.2).astype(int), average='micro')

0.5732677871520627

### Run this to save optimized thresholds

In [14]:
np.save('optimize_thresholds', t)

In [5]:
# All functions used in cells bellow

# Make individual thresholds for all the labels output in one array. Uses 
# either the equidistant spacing or successive splitting method for searching
def thresholds(y_true, predictions, method='equidistant', num_labels=228):
    thresholds = []
    for i in range(num_labels):
        labels_i = y_true[i]
        activations_i = predictions[i]
        if(method=='equidistant'):
            threshold = equidistant_threshold(labels_i, activations_i)
            thresholds.append(threshold)
        if(method=='successive'):
            threshold = successive_threshold(labels_i, activations_i)
            thresholds.append(threshold)
    np.array(thresholds)
    return(thresholds)

# Input function for individual thresholds
def equidistant_threshold(labels_i,activations_i, distance=0.1):
    points = int(round(1/distance))+1
    best_threshold = 0.5
    best_f1 = 0.0
    for i in range(points): 
        threshold = i*distance
        #print('threshold: {}'.format(threshold))
        pred_i = (activations_i > threshold).astype(int)
        f1 = f1_score(labels_i, pred_i)
        #print('f1: {}'.format(f1))
        if(f1>best_f1):
            #print('!!newbest!!')
            best_f1 = f1
            best_threshold = threshold
    return(best_threshold)

# Input function for successive splitting
def successive_threshold(labels_i, activations_i, depth=10):
    best_threshold = 0.5
    best_f1 = 0.0
    min_threshold = 0.1
    max_threshold = 0.9
    min_pred = (activations_i > min_threshold).astype(int)
    max_pred = (activations_i > max_threshold).astype(int)
    min_f1 = f1_score(labels_i, min_pred)
    max_f1 = f1_score(labels_i, max_pred)
    if (max_f1>best_f1):
        best_f1 = max_f1
        best_threshold = max_threshold
    if (min_f1>best_f1):
        best_f1 = min_f1
        best_threshold = min_threshold    
    for i in range(depth):
        center = (max_threshold+min_threshold)/2
        if(min_f1>max_f1):
            max_threshold = center
            max_pred = (activations_i > max_threshold).astype(int)
            max_f1 = f1_score(labels_i, max_pred)
            if (max_f1>best_f1):
                best_f1 = max_f1
                best_threshold = max_threshold
        else:
            min_threshold = center
            min_pred = (activations_i > min_threshold).astype(int)
            min_f1 = f1_score(labels_i, min_pred)
            if (min_f1>best_f1):
                best_f1 = min_f1
                best_threshold = min_threshold
    return(best_threshold)

# Lazy function to find one global threshold better than 0.5 using equidistance    
def global_eq_threshold(y_true, predictions, distance):
    points = int(round(1/distance+1))
    best_f1 = 0.0
    best_threshold = 0.5
    for i in range(points): 
        threshold = i*distance
        y_pred = (predictions > threshold).astype(int)
        f1 = f1_score(y_true, y_pred, average='micro')
        
        if(f1>best_f1):
            best_f1=f1
            best_threshold = threshold
    return(best_threshold)

# Function to find global threshold better than 0.5 using successive splitting
def global_succ_threshold(y_true, predictions, depth):
    best_threshold = 0.5
    best_f1 = 0.0
    min_threshold = 0.1
    max_threshold = 0.9
    min_pred = (predictions > min_threshold).astype(int)
    max_pred = (predictions > max_threshold).astype(int)
    min_f1 = f1_score(y_true, min_pred, average='micro')
    max_f1 = f1_score(y_true, max_pred, average='micro')
    if (max_f1>best_f1):
        best_f1 = max_f1
        best_threshold = max_threshold
    if (min_f1>best_f1):
        best_f1 = min_f1
        best_threshold = min_threshold
    for i in range(depth):
        center = (max_threshold+min_threshold)/2
        if(min_f1>max_f1):
            max_threshold = center
            max_pred = (predictions > max_threshold).astype(int)
            max_f1 = f1_score(y_true, max_pred,average='micro')
            if (max_f1>best_f1):
                best_f1 = max_f1
                best_threshold = max_threshold
        else:
            min_threshold = center
            min_pred = (predictions > min_threshold).astype(int)
            min_f1 = f1_score(y_true, min_pred,average='micro')
            if (min_f1>best_f1):
                best_f1 = min_f1
                best_threshold = min_threshold
    return(best_threshold)

In [10]:
#Equidistant threshold finding and f1

eq = np.array(thresholds(y_true,y_pred))

y_pred = np.empty((predictions.shape))
for i, threshold in enumerate(eq):
    y_pred[i] = (predictions[i,:] > threshold).astype(int)

f1 = f1_score(y_true, y_pred, average='micro')
print(f1)

  'precision', 'predicted', average, warn_for)


0.09568756326617724


In [11]:
# Successive splitting thresholds and f1

sc = np.array(thresholds(y_true, y_pred,method='successive'))
y_pred = np.empty((predictions.shape))
for i, threshold in enumerate(sc):
    y_pred[i] = (predictions[i,:] > threshold).astype(int)

f1 = f1_score(y_true, y_pred, average='micro')
print (f1)

  'precision', 'predicted', average, warn_for)


0.21779322833413445


In [12]:
# Global equidistant search thresholds and f1

geq = np.array(global_eq_threshold(y_true, y_pred,0.01))
y_pred = (predictions > geq).astype(int)

f1 = f1_score(y_true, y_pred, average='micro')
print (f1)


  'precision', 'predicted', average, warn_for)


0.06798355079245075


In [13]:
#Global successive splitting thresholds and f1

gsc = np.array(global_succ_threshold(y_true, y_pred,100))
y_pred = (predictions > gsc).astype(int)

f1 = f1_score(y_true, y_pred, average='micro')
print (f1)

0.2178029667885129
