In [1]:
import matplotlib.pyplot as plt  # pyplot, not the bare package: the plotting code calls plt.figure / plt.subplots
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import auc, roc_curve
from sklearn.preprocessing import KBinsDiscretizer


In [None]:
# Load the input dataset into `df` from the relative path '../data'.
# NOTE(review): the path has no file extension — confirm it points at a CSV file.
df = pd.read_csv('../data')

In [2]:
def model_report(df, query_test, target='target',prob='prob', time_id='dt_ref'):
    """Display a performance and stability report for a scored model.

    The per-score table, the probability-vs-target scatter, the strategy
    plot and the KS chart are computed on ``df.query(query_test)``.  The ROC
    curve and the stability section are computed on every row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'score'`` column plus the columns named by
        ``target``, ``prob`` and ``time_id``.
    query_test : str
        Expression passed to ``DataFrame.query`` to select the rows used for
        the per-score table and the first three charts.
    target : str
        Name of the outcome column.  It is averaged and summed per score and
        fed to ``roc_curve``, so it is presumably 0/1-coded — confirm.
    prob : str
        Name of the column holding the model's predicted probability.
    time_id : str
        Name of the column used as the time axis of the stability section.

    Returns
    -------
    None
        Everything is rendered with ``display`` / matplotlib.
    """
    def model_stats(x):
        # Base metrics for the rows of a single score bucket.
        n_rows = len(x)
        n_events = x[target].sum()
        return pd.Series({
            'prob': x[prob].mean(),
            target: x[target].mean(),
            'count': n_rows,
            'event': n_events,
            'non_event': n_rows - n_events,
        })

    temp = (
        df.query(query_test)
        # NOTE: the 2-decimal rounding happens before the derived columns, so
        # `lift` below is computed from the rounded mean target.
        .groupby('score').apply(model_stats).round(2)
        .assign(per_count=lambda x: x['count'] / x['count'].sum())
        .assign(per_event=lambda x: x['event'] / x['event'].sum())
        .assign(non_per_event=lambda x: x['non_event'] / x['non_event'].sum())
        .assign(odds=lambda x: x['per_event'] / x['non_per_event'])
        .assign(per_pop_acc=lambda x: x['count'].cumsum() / x['count'].sum())
        .assign(per_event_acc=lambda x: x['event'].cumsum() / x['event'].sum())
        .assign(lift=lambda x: x[target] / (x['event'].sum() / x['count'].sum()))
    )

    display(temp.reset_index().round(2))
    # Column-wise mean of the per-score table, shown as a one-row frame.
    display(temp.mean().to_frame().T.round(2))

    # ---- performance charts (2 x 2 grid) ----
    fig, axes = plt.subplots(2, 2, figsize=(12, 8))
    ax_calib, ax_gain, ax_ks, ax_roc = axes.ravel()

    # Mean predicted probability vs. mean observed target, one point per score.
    temp.plot.scatter(x='prob', y=target, ax=ax_calib)
    ax_calib.set_xlabel('Prob')
    ax_calib.set_ylabel('Mean target')

    # Cumulative share of population vs. cumulative share of events.
    temp[['per_pop_acc', 'per_event_acc']].plot(ax=ax_gain)
    ax_gain.set_title('Strategy plot')
    # Explicit True: a bare grid() call toggles and would hide an already-on grid.
    ax_gain.grid(True)

    # KS = largest gap between the cumulative event and non-event shares.
    ks = (temp['per_event'].cumsum() - temp['non_per_event'].cumsum()).abs().max()
    temp.reset_index().plot.bar(x='prob', y=['per_event', 'non_per_event'], ax=ax_ks)
    ax_ks.set_title('KS %0.2f' % ks)

    # NOTE(review): the ROC curve uses the full `df`, not the `query_test`
    # subset used by the other three charts — confirm this is intended.
    fpr, tpr, _ = roc_curve(df[target], df[prob])
    roc_auc = auc(fpr, tpr)
    ax_roc.plot(fpr, tpr, color='darkorange', label='ROC curve (area = %0.2f)' % roc_auc)
    ax_roc.plot([0, 1], [0, 1], color='navy', linestyle='--')
    ax_roc.set_xlim([0.0, 1.0])
    ax_roc.set_ylim([0.0, 1.05])
    ax_roc.set_xlabel('False Positive Rate')
    ax_roc.set_ylabel('True Positive Rate')
    ax_roc.set_title('ROC curve')
    ax_roc.legend(loc="lower right")

    fig.tight_layout()
    plt.show()

    # ---- stability over time (all rows of df) ----
    _, (ax_rate, ax_volume) = plt.subplots(1, 2, figsize=(12, 4), tight_layout=True)

    # Mean target per (time_id, score).
    df_score = df.pivot_table(index=time_id, columns='score', values=target)
    df_score.plot(ax=ax_rate)
    ax_rate.set_title("Score stability")

    # Row count per (time_id, score), stacked by score.
    df.pivot_table(
        index=time_id,
        columns='score',
        values=target,
        aggfunc='count',
    ).plot(ax=ax_volume, kind='bar', stacked=True)
    ax_volume.set_title("Score stability")
    plt.show()

    # ---- pairwise comparison of adjacent scores ----
    # Iterate over the pivot's own column labels so every lookup is by score
    # label (never by loop position) and cannot miss a column.
    scores = list(df_score.columns)
    test = [
        {
            'score_1': lower,
            'score_2': upper,
            # Paired t-test between the two scores' per-period mean target.
            'p_value_test': round(stats.ttest_rel(df_score[lower], df_score[upper]).pvalue, 5),
            # Coefficient of variation of the lower score's series.
            'cv_1': stats.variation(df_score[lower]),
            # Share of periods in which the lower score's mean target exceeds the next score's.
            'per_invertion': (df_score[lower] > df_score[upper]).mean(),
        }
        for lower, upper in zip(scores[:-1], scores[1:])
    ]

    display(pd.DataFrame(test))

In [3]:
def createCluster(df, n_bins=10, train_query='sample_id < 7'):
    """Bucket the ``'prob'`` column into ordinal k-means bins.

    The bin edges are learned only from the rows of ``df`` selected by
    ``train_query`` and are then applied to the ``'prob'`` column of every
    row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'prob'`` column and whatever columns
        ``train_query`` refers to.
    n_bins : int
        Number of bins requested from ``KBinsDiscretizer``.
    train_query : str
        Expression passed to ``DataFrame.query`` to pick the fitting rows.

    Returns
    -------
    The output of ``KBinsDiscretizer.transform`` for the single-column
    ``df[['prob']]`` input (ordinal bin codes, one per row of ``df``).
    """
    discretizer = KBinsDiscretizer(
        n_bins=n_bins,
        encode='ordinal',
        strategy='kmeans',
    )

    # Learn the bin edges on the training subset only.
    training_probs = df.query(train_query)[['prob']]
    discretizer.fit(training_probs)

    # Apply the learned edges to the whole frame.
    all_probs = df[['prob']]
    return discretizer.transform(all_probs)
