# Custom scorer

In [57]:
import pandas as pd

In [58]:
# Let's create a dummy "wine" dataset of 1000 wines, 10 features, and 3 classes (0=bad, 1=medium, 2=good wine)
from sklearn.datasets import make_classification

X, y = make_classification(
    n_samples=1000, n_features=10, n_classes=3, n_clusters_per_class=1, weights=[0.6, 0.3, 0.1], random_state=0
)
print("X.shape = ", X.shape)
pd.Series(y).value_counts()

X.shape =  (1000, 10)


0    599
1    299
2    102
dtype: int64

-----
❓ Our objective is to train a model which **maximizes prediction precision for the good wines (y=2) only**.  

We don't want any customers to be dissatisfied!

----

In [69]:
# We split train/test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 0)

In [70]:
# We want to gridsearch SVC classifiers (for instance)
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [71]:
# Let's CV the "best" SVC for "accurary" first (default perf metrics for SVC)
param_grid = {
    "kernel": ['rbf', 'linear'],
    "C": [0.1, 0.05, 1, 5, 10, 50, 100],
    "gamma": [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}

clf = GridSearchCV(SVC(), param_grid, cv=5)
clf = clf.fit(X_train, y_train)
clf.best_estimator_

SVC(C=0.05, gamma=0.0001, kernel='linear')

In [72]:
# Print results
from sklearn.metrics import classification_report
print(classification_report(y_test, clf.best_estimator_.predict(X_test)))

              precision    recall  f1-score   support

           0       0.89      0.97      0.93       183
           1       0.90      0.74      0.81        88
           2       0.70      0.66      0.68        29

    accuracy                           0.87       300
   macro avg       0.83      0.79      0.81       300
weighted avg       0.87      0.87      0.87       300



☝️ Not good enough, we want to focus **only** on class 2

## Custom scoring function

Let's make our own custom metric which returns the precision of class "2"

In [73]:

# TP2/(TP2+FN2.1+FN2.0)

def my_custom_metric(y_true, y_pred):
    
    # Initiate count of predicted 2s and actual 2s
    predicted_2 = 0
    actually_2 = 0
    
    # Go through predictions and true values
    for (idx, y_p) in enumerate(y_pred):
        if y_p ==2:
            predicted_2 +=1 #If prediction = 2, count +1
            if y_true[idx] == 2:
                actually_2 +=1 #If true value is also 2, count +1
                
    if predicted_2 == 0: # Incase there are no 2s predicted, return zero to avoid division / zero
        return 0 
    else:
        return actually_2/predicted_2 # Otherwise, return precision


_true = [0,0,1,2,2,1]
_pred = [0,1,1,2,1,0]

my_custom_metric(_true, _pred)

1.0

## Grid search with custom scoring function

In [74]:
# Let's try to plug that into sklearn (will crash)
GridSearchCV(SVC(), param_grid, cv=5, scoring=my_custom_metric).fit(X_train, y_train)

TypeError: my_custom_metric() takes 2 positional arguments but 3 were given

## `Make_scorer`

We need to transform our "metric" into a "sklearn scorer method"

In [75]:
from sklearn.metrics import make_scorer

my_custom_scorer = make_scorer(my_custom_metric)

## Grid search with custom scorer method

In [76]:
clf = GridSearchCV(SVC(), param_grid, cv=5, scoring=my_custom_scorer)

clf = clf.fit(X_train, y_train)

clf.best_estimator_

SVC(C=0.1, gamma=0.1)

In [77]:
from sklearn.metrics import classification_report

print(classification_report(y_test, clf.best_estimator_.predict(X_test)))

              precision    recall  f1-score   support

           0       0.84      1.00      0.91       183
           1       0.88      0.72      0.79        88
           2       0.90      0.31      0.46        29

    accuracy                           0.85       300
   macro avg       0.87      0.68      0.72       300
weighted avg       0.86      0.85      0.83       300



✅ We improved our precision for class 2 from 0.8 up to 0.94, but at the detriment of overall accuracy!

## Ben's magical one liner version

In [78]:
def my_custom_metric(y_true, y_pred):
    return precision_score(y_true,y_pred, average= None)[2].tolist()

_true = [0,0,1,2,2,1]
_pred = [0,1,1,2,1,0]

my_custom_metric(_true, _pred)

1.0