In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_path in (os.path.join(folder, name) for name in files):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Precision - Recall Trade-off

In this notebook I will be explaining the relationship and difference between precision and recall, and how changing one affects the other.

# Outline

* [1. Overview of the Data-set](#1.-Overview-of-the-Data-set)
* [2. Confusion Matrix](#2.-Confusion-Matrix)
* [3. Precision vs Recall](#3.-Precision-vs-Recall)
* [4. Conclusion](#Conclusion)


### Kindly upvote if you find the kernel helpful :) 

In [None]:
# Libraries

import scipy
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, precision_recall_curve

In [None]:
# Variables

# Column groups of the data-set, generated from their numeric suffixes.
binary_data = list(map('bin_{}'.format, range(5)))      # bin_0 .. bin_4
ordinal_data = list(map('ord_{}'.format, range(6)))     # ord_0 .. ord_5
norminal_data = list(map('nom_{}'.format, range(10)))   # nom_0 .. nom_9
day_n_month = ['day', 'month']

# Shared transformer instances; prep_data() calls fit_transform on each of them.
ordinal_scaler = StandardScaler()  # standardises the ord_* columns
ordinal_encoder = OrdinalEncoder(categories='auto')  # integer-encodes ord_5
ohe_encoder = OneHotEncoder()  # one-hot encodes the nom_* columns
dm_scaler = StandardScaler()  # standardises day and month

# Rank lookups for the ordinal columns: every category maps to its 0-based
# position in the ordered list it is enumerated from.
mapper_ord_1 = {
    level: rank
    for rank, level in enumerate(['Novice', 'Contributor', 'Expert', 'Master', 'Grandmaster'])
}

mapper_ord_2 = {
    level: rank
    for rank, level in enumerate(['Freezing', 'Cold', 'Warm', 'Hot', 'Boiling Hot', 'Lava Hot'])
}

# Lowercase letters a..o
mapper_ord_3 = {letter: rank for rank, letter in enumerate('abcdefghijklmno')}

# Uppercase letters A..Z
mapper_ord_4 = {letter: rank for rank, letter in enumerate('ABCDEFGHIJKLMNOPQRSTUVWXYZ')}

# Functions

def plot_conf_mx(mx, title=None, ax=None, fontsize=None):
    """
    Draw a confusion matrix as a grayscale image: the larger a cell's value,
    the brighter (closer to white) it is drawn.

    mx: 2-D array-like of counts, e.g. a confusion matrix.
    title: optional title of the plot.
    ax: optional matplotlib Axes to draw on. When given, the matrix is drawn on
        it and nothing is shown, so the caller can compose several matrices in
        one figure. When omitted, a new figure is created and shown.
    fontsize: font size used for the title.

    returns: None
    """

    if ax is not None:
        if title is not None:
            ax.set_title(title, fontsize=fontsize)

        ax.matshow(mx, cmap="gray")
        return

    # plt.matshow opens a new figure, so the title must be set afterwards;
    # setting it first would put it on a separate, empty figure.
    plt.matshow(mx, cmap="gray")

    if title is not None:
        plt.title(title, fontsize=fontsize)

    plt.show()
            
def compare_conf_mx(shape, mx_s, titles=None, fontsize=15, figsize=(12, 10)):
    """
    Draw several confusion matrices in one figure for easier comparison.

    shape: (rows, columns) of the subplot grid.
    mx_s: sequence of confusion matrices, placed in row-major grid order.
    titles: optional sequence of titles, matched to mx_s by position.
    fontsize: font size of the titles.
    figsize: size of the whole figure.

    returns: None (the figure is shown)
    """

    # squeeze=False always yields a 2-D array of Axes, so a (1, 1) grid works
    # as well instead of returning a bare Axes without .ravel().
    fig, axes = plt.subplots(*shape, figsize=figsize, squeeze=False)

    for i, ax in enumerate(axes.ravel()):
        if i >= len(mx_s):
            # More grid cells than matrices: blank the spare panel.
            ax.axis("off")
            continue

        has_title = titles is not None and i < len(titles)
        title = titles[i] if has_title else None

        plot_conf_mx(mx_s[i], title=title, ax=ax, fontsize=fontsize)

    plt.show()

def plot_precision_recall_vs_threshold(precisions, recalls, thresholds, perc=.8, on_perc=True):
    """
    Code credits: Aurélien Geron
    https://github.com/ageron/handson-ml2/blob/master/03_classification.ipynb

    Plot precision and recall as functions of the decision threshold.

    precisions, recalls: arrays holding one more element than thresholds;
        their last element has no matching threshold and is not plotted.
    thresholds: the decision thresholds, used as the x axis.
    perc: the precision level to highlight, between 0 and 1.
    on_perc: when True, mark the first threshold whose precision is at least
        perc, together with the recall obtained at that threshold. If no
        threshold reaches perc, the curves are drawn without a marker.
    """

    plt.plot(thresholds, precisions[:-1], "b--", label="Precision", linewidth=2)
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall", linewidth=2)
    plt.xlabel("Threshold")
    plt.grid(True)
    plt.legend()

    if on_perc:
        # Search only the precisions that have a matching threshold: a position
        # found in the full array could point one past the end of thresholds.
        reached = np.asarray(precisions)[:len(thresholds)] >= perc

        if reached.any():
            first_hit = np.argmax(reached)
            marked_recall = recalls[first_hit]
            marked_threshold = thresholds[first_hit]

            plt.plot([marked_threshold, marked_threshold], [0., perc], "r:")
            plt.axis([-4, 4, 0, 1])
            plt.plot([-4, marked_threshold], [perc, perc], "r:")
            plt.plot([-4, marked_threshold], [marked_recall, marked_recall], "r:")
            plt.plot([marked_threshold], [perc], "ro")
            plt.plot([marked_threshold], [marked_recall], "ro")
        else:
            # Nothing to mark; keep the same view as the marked plots.
            plt.axis([-4, 4, 0, 1])

    plt.show()

def plot_precision_vs_recall(precisions, recalls, perc=.8, on_perc=True):
    """
    Code credits: Aurélien Geron
    https://github.com/ageron/handson-ml2/blob/master/03_classification.ipynb

    Draw precision (y axis) against recall (x axis) on the unit square.

    perc: the precision level to highlight.
    on_perc: when True, mark the recall found at the first position where the
        precision is at least perc, with dotted guide lines to both axes.
    """

    plt.plot(recalls, precisions, "b-", linewidth=2)
    plt.xlabel("Recall", fontsize=16)
    plt.ylabel("Precision", fontsize=16)
    plt.axis([0, 1, 0, 1])
    plt.grid(True)

    if not on_perc:
        plt.show()
        return

    first_hit = np.argmax(precisions >= perc)
    marked_recall = recalls[first_hit]

    # Vertical guide, horizontal guide, then the marker itself.
    guides = (
        ([marked_recall, marked_recall], [0., perc], "r:"),
        ([0.0, marked_recall], [perc, perc], "r:"),
        ([marked_recall], [perc], "ro"),
    )
    for xs, ys, fmt in guides:
        plt.plot(xs, ys, fmt)

    plt.show()
    
def threshold_precision(predictions, precisions, thresholds, perc=.8):
    """
    Turn decision scores into boolean predictions, using the first threshold
    whose precision is at least perc.

    predictions: the decision scores, one per sample.
    precisions: precision per threshold; may hold one trailing element more
        than thresholds, which is ignored because it has no threshold.
    thresholds: the candidate decision thresholds.
    perc: the precision level to reach, between 0 and 1.

    returns: booleans, True where the score is at or above the chosen
        threshold. When no threshold reaches perc, every entry is False
        (nothing is predicted positive).
    """
    # Search only the precisions that have a matching threshold: a position
    # found in the full array could point one past the end of thresholds.
    reached = np.asarray(precisions)[:len(thresholds)] >= perc

    if not reached.any():
        # No usable threshold. Comparing against +inf is False everywhere and
        # keeps the container type of predictions.
        return (predictions > np.inf)

    t_precision = thresholds[np.argmax(reached)]

    return (predictions >= t_precision)
    
def prep_data(data, fit=True):
    """
    Encode the raw categorical frame into one sparse feature matrix.

    bin_3 and bin_4 are converted from letters to 0/1, ord_1..ord_4 are mapped
    to their rank, ord_5 is integer-encoded, all ordinal columns are then
    standardised, the nominal columns are one-hot encoded, and day and month
    are standardised.

    data: DataFrame holding the bin_*, ord_*, nom_*, day and month columns.
        It is left untouched; all the work happens on a copy.
    fit: when True (the default) the module-level encoders and scalers are
        fitted on data before transforming it. Pass False to reuse what an
        earlier call fitted, e.g. to encode a hold-out set exactly like the
        training set.

    returns: a scipy.sparse CSR matrix whose columns are, in order, the binary
        and ordinal features, the one-hot nominal features, then day and month.
    """
    first_half = binary_data + ordinal_data
    second_half = day_n_month

    # Work on a copy so that the caller's frame keeps its raw values.
    data = data.copy()

    def _encode(transformer, values):
        # Fit and transform by default; only transform when fit is False.
        if fit:
            return transformer.fit_transform(values)
        return transformer.transform(values)

    # Handling binary data
    data['bin_3'] = data['bin_3'].replace(to_replace=['F', 'T'], value=['0', '1']).astype(int)
    data['bin_4'] = data['bin_4'].replace(to_replace=['Y', 'N'], value=['1', '0']).astype(int)

    # Handling ordinal data
    data['ord_0'] = data['ord_0'] - 1

    for col, mapper in zip(
        ['ord_1', 'ord_2', 'ord_3', 'ord_4'],
        [mapper_ord_1, mapper_ord_2, mapper_ord_3, mapper_ord_4]
    ):
        data[col] = data[col].replace(mapper)

    # Handling ord_5 high cardinality data
    ord_5_matrix = data['ord_5'].values.reshape(-1, 1)
    # The encoder returns an (n, 1) array; flatten it to fit a single column.
    data['ord_5'] = _encode(ordinal_encoder, ord_5_matrix).ravel()

    # Scaling Ordinal Data
    data[ordinal_data] = _encode(ordinal_scaler, data[ordinal_data])

    # One Hot Encoding on norminal data nom_0 - nom_9
    nom_0_9_matrix = data[norminal_data].values
    ohe_trans = _encode(ohe_encoder, nom_0_9_matrix)

    # Scaling Day and Month Data
    data['day'] = data['day'] - 1
    data['month'] = data['month'] - 1

    data[day_n_month] = _encode(dm_scaler, data[day_n_month])

    part_one_matrix = scipy.sparse.coo_matrix(
        data.loc[:, first_half].to_numpy()
    ).astype('float64')

    part_two_matrix = scipy.sparse.coo_matrix(
        data.loc[:, second_half].to_numpy()
    ).astype('float64')

    # Column order: binary + ordinal, one-hot nominal, day + month.
    return scipy.sparse.hstack([
        part_one_matrix,
        ohe_trans,
        part_two_matrix
    ]).tocsr()

# 1. Overview of the Data-set

I will be making use of the data from the [Categorical Feature Encoding Challenge](https://www.kaggle.com/c/cat-in-the-dat). The data-set consists of only categorical data, and can be said to be broken into 3 distinct parts:

* **The Binary data** - labelled bin_: These are categorical variables consisting of binary data, i.e. True/False, 1/0, Yes/No, etc.

* **The Ordinal Data** - labelled ord_: These are categorical variables consisting of ordered data. Each is a finite list of categories with an order attached to it, like the days of the week: what comes next after Tuesday?

* **The Nominal Data** - labelled nom_: These are categorical variables, finite like the ordinal data but with no order.

**Check out** this [notebook](https://www.kaggle.com/ganiyuolalekan/eda-on-the-various-categorical-data-0-80464) for my work on this challenge. In this notebook I will be using the logistic model from that work to illustrate our objective.

To understand the logistic model better visit https://machinelearningmastery.com/logistic-regression-for-machine-learning

so let's start by loading the data!

In [None]:
# Load the training split, using the "id" column as the row index.
data = pd.read_csv("../input/cat-in-the-dat/train.csv", index_col="id")

In [None]:
# Split the label off the features. pop() removes the column in place, and the
# guard makes the cell safe to re-run: once "target" is gone a second run is a
# no-op instead of an AttributeError.
if 'target' in data.columns:
    target = data.pop('target')

I will be skipping the exploratory data analysis of this data and head forward to preparing the data for the model. The exploratory data analysis of the data can be found [here](https://www.kaggle.com/ganiyuolalekan/eda-on-the-various-categorical-data-0-80464#2.-Exploratory-Data-Analysis).

In [None]:
# Encode a copy of the raw frame so that `data` itself keeps its original
# values, then display the shape of the resulting feature matrix.
td = prep_data(data.copy())

td.shape

In [None]:
# Logistic regression used for every experiment below.
# NOTE(review): n_jobs presumably has no effect for a binary target with the
# default solver - confirm against the installed scikit-learn version.
log_clf = LogisticRegression(C=0.1, max_iter=1000, n_jobs=-2)

In [None]:
# Out-of-fold class predictions from 3-fold cross-validation: every row is
# predicted by a model that was not trained on it.
predictions = cross_val_predict(log_clf, td, target, cv=3)

In [None]:
# Headline metrics of the default-threshold predictions.
print(
    f"\nPrecision: {precision_score(target, predictions)},\n"
    f"Recall: {recall_score(target, predictions)}\n"
    f"Accuracy: {accuracy_score(target, predictions)}\n"
)

<img src="https://i.gifer.com/origin/96/96d4e0de7526f5343749be587062100e_w200.gif" width="50%" />

# 2. Confusion Matrix

The accuracy score is the ratio of correct predictions (**true-positives** plus **true-negatives**) to the total number of predictions. From the accuracy above we can tell approximately 76% of the data is predicted correctly, which tells us quite a lot about how good the model is. The confusion matrix goes a step further and shows what the model predicts well and what it predicts poorly.

The confusion matrix displays - in the order left-right top-bottom - the **true-negative**, **false-positive**, **false-negative** and **true-positive**.

- The **true-negative (TN)** is the number of negative predictions that were actually negative.
- The **false-positive (FP)** is the number of positive predictions that were actually negative.
- The **false-negative (FN)** is the number of negative predictions that were actually positive.
- The **true-positive (TP)** is the number of positive predictions that were actually positive.

![towardsdatascience.com](https://static.packt-cdn.com/products/9781838555078/graphics/C13314_06_05.jpg)

So let's see how the model truly performs with the confusion matrix


In [None]:
# Count matrix of the default-threshold predictions. Rows are the actual
# classes and columns the predicted ones, so it reads [[TN, FP], [FN, TP]].
mx = confusion_matrix(target, predictions)
mx

We can see that the model performs particularly well on the **true-negative** (top-left) with **186852** of the negative data being predicted negative. As good as this sounds, **50174** of the positive data are also predicted as negative (**false-negative**) and constitute the second largest share of the data in the confusion matrix. 

Let's use an image representation to view the confusion matrix

In [None]:
# Grayscale view of the matrix: brighter cells hold larger counts.
plot_conf_mx(mx)

This is a cleaner representation of what is going on in the confusion matrix. The brighter the **true-positive** and **true-negative** and darker the **false-positive** and **false-negative** in the confusion matrix, the more accurate the model. A perfect model should be something like this.

![](../input/perfect-conf-mx/conf_mx.png)

What we see from our model is that we've only achieved half that. If we wish to manipulate the confusion matrix we have to adjust the precision and recall rate of the model. 

# 3. Precision vs Recall

### What is precision?

Precision is the fraction of retrieved documents that are relevant to the query. 

Precision = $\frac{TP}{TP + FP}$

If you inspect the image of the confusion matrix above, you would notice the TP and FP are on the positive prediction half of the matrix (i.e on the RHS of the confusion matrix). 

The sum of these two (2) constitutes the total number of positive predictions, and from the precision equation above we can see that precision is the share of those positive predictions that are actually positive.

Simply put, precision focuses on how many of the positive predictions were actually positive.

### What is recall?

Recall (also referred to as sensitivity or True Positive Rate (TPR)) is the fraction of the relevant documents that are successfully retrieved.

Recall = $\frac{TP}{TP + FN}$

If you inspect the image of the confusion matrix above, you would notice the TP and FN are on the actual positive half of the matrix (i.e on the bottom of the confusion matrix). 

The sum of these two (2) constitutes the total number of actual positives, and from the recall equation above we can tell that recall is the share of those actual positives that the model predicted as positive.

The recall focuses on how much of the actual positive data were rightly predicted.

In [None]:
# Unpack the four cells row by row: the first row holds the actual negatives,
# the second row the actual positives.
(TN, FP), (FN, TP) = mx

In [None]:
# The same three metrics, computed by hand from the confusion-matrix cells.
print(
    f"\nPrecision: {TP / (TP + FP)},\n"
    f"Recall: {TP / (TP + FN)}\n"
    f"Accuracy: {(TP + TN) / (TP + FP + TN + FN)}\n"
)

Comparing this to our previous scores we see that the results are the same. 

Now lets plot the relationship between the precision and the recall as a function of their threshold value.

In [None]:
# Same 3-fold out-of-fold scheme as for `predictions`, but keeping the raw
# decision-function scores instead of class labels, so that the decision
# threshold can be varied afterwards.
prediction_scores = cross_val_predict(log_clf, td, target, cv=3, method="decision_function")

In [None]:
# Precision and recall per candidate threshold. The plotting helper pairs
# thresholds with precisions[:-1] and recalls[:-1], i.e. both arrays are
# expected to carry one trailing element more than thresholds.
precisions, recalls, thresholds = precision_recall_curve(target, prediction_scores)

In [None]:
# Mark where the precision first reaches 0.66 on the threshold axis.
plot_precision_recall_vs_threshold(precisions, recalls, thresholds, perc=.66)

Another way to select a good precision/recall trade-off is to plot precision directly against recall.

In [None]:
# The same 0.66 precision level, highlighted on the precision-vs-recall curve.
plot_precision_vs_recall(precisions, recalls, perc=.66)

Let's see what happens when we increase the precision by 10 percentage points (from 66% to 76%)

In [None]:
# Precision level targeted by the cells that follow.
perc = 0.76

In [None]:
# Threshold view with the marker placed at the current precision level.
plot_precision_recall_vs_threshold(precisions, recalls, thresholds, perc=perc)

In [None]:
# Precision-vs-recall curve with the current precision level highlighted.
plot_precision_vs_recall(precisions, recalls, perc=perc)

In [None]:
# Boolean predictions obtained by thresholding the decision scores at the
# first threshold whose precision is at least perc.
pred_76 = threshold_precision(prediction_scores, precisions, thresholds, perc=perc)

In [None]:
# Metrics of the predictions thresholded for roughly 76% precision.
print(
    f"\nPrecision: {precision_score(target, pred_76)},\n"
    f"Recall: {recall_score(target, pred_76)}\n"
    f"Accuracy: {accuracy_score(target, pred_76)}\n"
)

We see that as the precision rises to about 76% the recall rate drops to 25% and the accuracy drops slightly as well from 76% to 74%. 

But what impact does it have on the model's confusion matrix? 

In [None]:
# Confusion matrix of the re-thresholded predictions (same target, new labels).
mx_76 = confusion_matrix(target, pred_76)
mx_76

In [None]:
# Grayscale view of mx_76: brighter cells hold larger counts.
plot_conf_mx(mx_76)

It's almost as though nothing changed. Let's compare it with the precision at 66%

In [None]:
# The titles differ only in the precision level, so build them from a template.
titles = [f"Confusion matrix at precision approx {level}%" for level in (66, 76)]

compare_conf_mx((1, 2), [mx, mx_76], titles)

This way we can see that there's a slight change: the model actually gets slightly worse than before (i.e. in comparison to the perfect confusion matrix shown above).

**Note**: The darker the box in the confusion matrix the lower the value.

So it's only logical that when precision increases the number of **FP** should reduce while the **TP** should increase, but we notice here that the **TP** reduces slightly and the **FN** slightly increases as well. Raising the threshold turns some positive predictions into negative ones, and some of those were true positives. This is where the precision/recall trade-off comes into play. 

> **Recall and precision pull in opposite directions: raising one generally lowers the other**

So the condition for a rising recall is that as the **TP** increases the **FN** should reduce; conversely, as the **FN** increases the **TP** reduces. Here we have a recall of about 25%, which explains why the **TP** is the way it is.

let's increase the precision by another 10% and see how it affects the graph

In [None]:
# Raise the precision level targeted by the cells that follow.
perc = .86

In [None]:
# Precision-vs-recall curve with the current precision level highlighted.
plot_precision_vs_recall(precisions, recalls, perc=perc)

In [None]:
# Boolean predictions obtained by thresholding the decision scores at the
# first threshold whose precision is at least perc.
pred_86 = threshold_precision(prediction_scores, precisions, thresholds, perc=perc)

In [None]:
# Metrics of the predictions thresholded for roughly 86% precision.
print(
    f"\nPrecision: {precision_score(target, pred_86)},\n"
    f"Recall: {recall_score(target, pred_86)}\n"
    f"Accuracy: {accuracy_score(target, pred_86)}\n"
)

We can see how much the recall suffers and the accuracy is still good enough. This is a perfect model if your focus is on precision.

Let's also see how this affect the confusion matrix.

In [None]:
# Confusion matrix of the predictions thresholded for roughly 86% precision.
mx_86 = confusion_matrix(target, pred_86)
mx_86

In [None]:
# Grayscale view of mx_86: brighter cells hold larger counts.
plot_conf_mx(mx_86)

What we see here is not a very perfect model, but it is highly precise. (about 86% worth)

Let's see how we've progressed with the increase in precision.

In [None]:
# The titles differ only in the precision level, so build them from a template.
titles = [f"Confusion matrix at precision approx {level}%" for level in (66, 76, 86)]

compare_conf_mx((1, 3), [mx, mx_76, mx_86], titles, figsize=(20, 15))

By reducing the precision we will increase the recall. Using this we can tell how the model performs with high recall. So we start by reducing the precision from 66% to 56%.

In [None]:
# Lower the targeted precision level, to trade precision for recall.
perc = 0.56

In [None]:
# Threshold view with the marker placed at the current precision level.
plot_precision_recall_vs_threshold(precisions, recalls, thresholds, perc=perc)

In [None]:
# Precision-vs-recall curve with the current precision level highlighted.
plot_precision_vs_recall(precisions, recalls, perc=perc)

In [None]:
# Boolean predictions obtained by thresholding the decision scores at the
# first threshold whose precision is at least perc.
pred_56 = threshold_precision(prediction_scores, precisions, thresholds, perc=perc)

In [None]:
# Metrics of the predictions thresholded for roughly 56% precision.
print(
    f"\nPrecision: {precision_score(target, pred_56)},\n"
    f"Recall: {recall_score(target, pred_56)}\n"
    f"Accuracy: {accuracy_score(target, pred_56)}\n"
)

Well this looks like a better model, if not for the reduced accuracy. 

But according to the confusion matrix...

In [None]:
# Confusion matrix of the predictions thresholded for roughly 56% precision.
mx_56 = confusion_matrix(target, pred_56)
mx_56

In [None]:
# Grayscale view of mx_56: brighter cells hold larger counts.
plot_conf_mx(mx_56)

Just like in the precision experiments there is a trade-off, but in the opposite direction: the **FP** gets worse (rises) while the **FN** falls and the **TP** gets better.

The reason for this is that the model now becomes less focused on how precise its decisions are, and more focused on generating more positive outcomes. This way the **FN** reduces because more of the actual positives are predicted correctly, and the **FP** increases as well because more of the actual negatives are also predicted as positive.

Let's compare this model with the main model (where precision equals 66%).

In [None]:
# The titles differ only in the precision level, so build them from a template.
titles = [f"Confusion matrix at precision approx {level}%" for level in (66, 56)]

compare_conf_mx((1, 2), [mx, mx_56], titles)

Ok... what happens if we reduce the precision by 10% more? Let's see...

In [None]:
# Lower the targeted precision level once more.
perc = 0.46

In [None]:
# Threshold view with the marker placed at the current precision level.
plot_precision_recall_vs_threshold(precisions, recalls, thresholds, perc=perc)

In [None]:
# Precision-vs-recall curve with the current precision level highlighted.
plot_precision_vs_recall(precisions, recalls, perc=perc)

In [None]:
# Boolean predictions obtained by thresholding the decision scores at the
# first threshold whose precision is at least perc.
pred_46 = threshold_precision(prediction_scores, precisions, thresholds, perc=perc)

In [None]:
# Metrics of the predictions thresholded for roughly 46% precision.
print(
    f"\nPrecision: {precision_score(target, pred_46)},\n"
    f"Recall: {recall_score(target, pred_46)}\n"
    f"Accuracy: {accuracy_score(target, pred_46)}\n"
)

We can tell from the accuracy how important precision is to a good model. 

Let's see how the confusion matrix reacts to this.

In [None]:
# Confusion matrix of the predictions thresholded for roughly 46% precision.
mx_46 = confusion_matrix(target, pred_46)
mx_46

In [None]:
# Grayscale view of mx_46: brighter cells hold larger counts.
plot_conf_mx(mx_46)

Here we can see the effect of reduced precision. Although this model's confusion matrix is the closest to our target confusion matrix, it cares less about being very precise and more on predicting the positive data.

# Conclusion

You can't have an absolutely perfect model, but you can trade precision/recall to make the model attain your objectives. 

**Think of it this way...**

> The more **precise** the model, the fewer **False Positives**.

> The higher the **recall** of the model, the fewer **False Negatives**.

Just like in our experimentations.

In [None]:
# The titles differ only in the precision level, so build them from a template.
titles = [f"Confusion matrix at precision approx {level}%" for level in (46, 86)]

compare_conf_mx((1, 2), [mx_46, mx_86], titles)