In [None]:
from sklearn.metrics import roc_curve, auc, accuracy_score
from sklearn import preprocessing
from scipy import interp
import itertools

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# ROC曲线
ROC曲线及AUC系数主要用来检验模型对客户进行正确排序的能力。

ROC曲线描述了在一定累计好客户比例下的累计坏客户的比例，模型的区分能力越强，ROC曲线越往左上角靠近。
AUC系数表示ROC曲线下方的面积。AUC系数越高，模型的风险区分能力越强。

AUC值一般在0.5-1.0之间。值越大表示模型判断准确性越高，即越接近1越好。AUC=0.5表示模型的预测能力与随机结果没有差别。

In [None]:
def plot_auc(y, y_pred, n_classes=2, unique_classes=None):
    """Draw ROC curve(s) on a new figure, with AUC in the legend and accuracy in the title.

    Args:
        y: True labels. In the binary branch, label ``1`` is the positive class.
        y_pred: Predictions. Passed to ``roc_curve`` as the score and to
            ``accuracy_score`` as the predicted labels. In the multiclass
            branch it is binarized with ``label_binarize``, i.e. treated as
            class labels.
            NOTE(review): because ``accuracy_score`` receives the same
            ``y_pred``, it presumably has to contain hard class labels rather
            than probabilities -- confirm against callers.
        n_classes: ``2`` selects the single binary curve; any other value
            selects the multiclass branch, which draws one curve per class
            index ``0 .. n_classes - 1`` plus micro and macro averages.
        unique_classes: Class labels handed to ``label_binarize`` in the
            multiclass branch. When ``None``, the sorted unique values of
            ``y`` are used.

    Returns:
        None. The plot is drawn on a newly created matplotlib figure.
    """
    fig, ax = plt.subplots(1, 1, figsize=(30, 8))

    if n_classes == 2:
        fpr, tpr, _ = roc_curve(y, y_pred, pos_label=1)
        roc_auc = auc(fpr, tpr)
        ax.plot(fpr, tpr,
                label='ROC curve (auc = {0:0.2f})'.format(roc_auc),
                color='deeppink', linestyle=':', linewidth=4)
    else:
        # label_binarize cannot work without an explicit class list, so fall
        # back to the labels observed in y instead of failing on None.
        if unique_classes is None:
            unique_classes = np.unique(y)

        # Multiclass labels must be binarized (one column per class) before
        # roc_curve can be applied to each class separately.
        y_binarized = preprocessing.label_binarize(y, classes=unique_classes)
        y_pred_binarized = preprocessing.label_binarize(y_pred, classes=unique_classes)

        fpr, tpr, roc_auc = {}, {}, {}

        # Per-class ROC curve and AUC.
        for i in range(n_classes):
            fpr[i], tpr[i], _ = roc_curve(y_binarized[:, i], y_pred_binarized[:, i], pos_label=1)
            roc_auc[i] = auc(fpr[i], tpr[i])

        # Micro average: pool every (sample, class) cell into one binary problem.
        fpr['micro'], tpr['micro'], _ = roc_curve(y_binarized.ravel(), y_pred_binarized.ravel())
        roc_auc['micro'] = auc(fpr['micro'], tpr['micro'])

        # Macro average: interpolate every class curve onto the union of all
        # FPR points, then average the TPR values. np.interp replaces the
        # deprecated scipy.interp alias of the same function.
        all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
        mean_tpr = np.zeros_like(all_fpr)
        for i in range(n_classes):
            mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
        mean_tpr /= n_classes
        fpr['macro'] = all_fpr
        tpr['macro'] = mean_tpr
        roc_auc['macro'] = auc(fpr['macro'], tpr['macro'])

        # Averaged curves.
        ax.plot(fpr['micro'], tpr['micro'],
                label='micro-average ROC curve (auc = {0:0.2f})'.format(roc_auc['micro']),
                color='deeppink', linestyle=':', linewidth=4)
        ax.plot(fpr['macro'], tpr['macro'],
                label='macro-average ROC curve (auc = {0:0.2f})'.format(roc_auc['macro']),
                color='navy', linestyle=':', linewidth=4)

        # One thin curve per class; colors repeat after three classes.
        colors = itertools.cycle(['aqua', 'darkorange', 'cornflowerblue'])
        for i, color in zip(range(n_classes), colors):
            ax.plot(fpr[i], tpr[i],
                    color=color, lw=1,
                    label='ROC curve of class {0} (auc = {1:0.2f})'.format(i, roc_auc[i]))

    acc = accuracy_score(y, y_pred)

    # Chance diagonal and axis decoration shared by both branches.
    ax.plot([0, 1], [0, 1], 'k--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate', fontsize=14)
    ax.set_ylabel('True Positive Rate', fontsize=14)
    ax.set_title('ROC curve(accuracy = {:0.2f})'.format(acc), fontsize=14)
    ax.legend(loc='lower right')

# KS曲线用于考察二分类的区分度

KS（Kolmogorov-Smirnov）检验:K－S检验主要是验证模型对违约对象的区分能力，通常是在模型预测全体样本的信用评分后，将全体样本按违约与非违约分为两部分，然后用KS统计量来检验这两组样本信用评分的分布是否有显著差异。

KS值越大，表示模型能够将正、负客户区分开的程度越大。 通常来讲，KS>0.2即表示模型有较好的预测准确性。

柯尔莫哥洛夫-斯米尔诺夫检验（Колмогоров-Смирнов检验）基于累计分布函数，用以检验两个经验分布是否不同或一个经验分布与另一个理想分布是否不同。

  绘制方式与ROC曲线类似，都要计算TPR和FPR。不同的是，KS曲线把TPR和FPR都画在纵轴上，横轴为样本等分后的各分位点（累计样本占比）。 
步骤： 
1. 按照分类模型返回的概率降序排列 
2. 把0-1之间等分N份，等分点为阈值，计算TPR、FPR 
3. 对TPR、FPR描点画图即可

KS值即为Max(TPR-FPR)


In [2]:
def _ks_table(labels, preds, asc, n):
    """Return the cumulative good/bad rates and their gap at n quantile points.

    The result has columns ``tile``, ``cumsum_good``, ``cumsum_bad`` and
    ``ks``, starts with an all-zero origin row, and is indexed ``0 .. n``.
    """
    if asc == 1:
        pred_ascending = True
    elif asc == 0:
        pred_ascending = False
    else:
        # Previously this fell through and failed later with UnboundLocalError.
        raise ValueError(
            'asc must be 1 (sort preds ascending) or 0 (sort preds descending), got %r' % (asc,)
        )

    base = pd.DataFrame({'good': labels, 'pred': preds})
    base['bad'] = 1 - base['good']

    def cumulative_rates(bad_ascending):
        # Sort by prediction, breaking ties on the 'bad' flag, then accumulate
        # the share of all good rows and of all bad rows seen so far.
        ordered = base.sort_values(
            by=['pred', 'bad'], ascending=[pred_ascending, bad_ascending]
        ).reset_index(drop=True)
        good_rate = ordered['good'].cumsum() / ordered['good'].sum()
        bad_rate = ordered['bad'].cumsum() / ordered['bad'].sum()
        return good_rate, bad_rate

    # Rows with equal predictions are ordered once with good rows first and
    # once with bad rows first; averaging the two removes the dependence on
    # how ties happen to be ordered.
    good_a, bad_a = cumulative_rates(True)
    good_b, bad_b = cumulative_rates(False)

    curve = pd.DataFrame({
        'cumsum_good': (good_a + good_b) / 2,
        'cumsum_bad': (bad_a + bad_b) / 2,
    })
    curve['ks'] = curve['cumsum_bad'] - curve['cumsum_good']
    # Fraction of the sorted population covered up to and including each row.
    curve['tile'] = np.arange(1, len(curve) + 1) / len(curve)

    # Quantile levels 1/n, 2/n, ..., 1. linspace always yields exactly n levels
    # ending at 1.0, whereas arange with a float step can emit an extra
    # near-1 value for some n.
    levels = np.linspace(0, 1, int(n) + 1)[1:]
    positions = np.ceil(pd.Series(curve.index).quantile(q=levels)).astype(int)

    columns = ['tile', 'cumsum_good', 'cumsum_bad', 'ks']
    sampled = curve.iloc[list(positions)][columns]
    origin = pd.DataFrame([[0.0, 0.0, 0.0, 0.0]], columns=columns)
    return pd.concat([origin, sampled], ignore_index=True)


def PlotKS(labels, preds, asc, n=10):
    """Plot the KS chart: cumulative good rate, cumulative bad rate and their gap.

    Args:
        labels: Ground truth, with 1 marking a good (positive) row and 0 a bad
            (negative) row.
        preds: Model output for each row.
        asc: Sort direction for ``preds``. Use 1 to sort ascending (the
            original note uses this when ``preds`` is a score) and 0 to sort
            descending (used when ``preds`` is a probability).
        n: Number of equal population slices at which the curves are sampled.

    Returns:
        None. The chart is drawn on a newly created matplotlib figure whose
        title reports the maximum KS value and the population fraction at
        which it occurs.

    Raises:
        ValueError: If ``asc`` is neither 0 nor 1.
    """
    table = _ks_table(labels, preds, asc, n)

    peak = table['ks'].idxmax()
    ks_value = table['ks'].max()
    ks_pop = table.loc[peak, 'tile']

    plt.figure(figsize=(30, 8))
    plt.plot(table['tile'], table['cumsum_good'], label='cum_good',
             color='blue', linestyle='-', linewidth=2)
    plt.plot(table['tile'], table['cumsum_bad'], label='cum_bad',
             color='red', linestyle='-', linewidth=2)
    plt.plot(table['tile'], table['ks'], label='ks',
             color='green', linestyle='-', linewidth=2)

    # Guide lines through the point of maximum separation.
    plt.axvline(ks_pop, color='gray', linestyle='--')
    plt.axhline(ks_value, color='green', linestyle='--')
    plt.axhline(table.loc[peak, 'cumsum_good'], color='blue', linestyle='--')
    plt.axhline(table.loc[peak, 'cumsum_bad'], color='red', linestyle='--')

    plt.title('KS=%s ' % np.round(ks_value, 4) + 'at Pop=%s' % np.round(ks_pop, 4), fontsize=15)
    # The curves carry labels, so show them.
    plt.legend()
