In [1]:
# %conda create -n FasterRisk python=3.9 # create a virtual environment
# %conda activate FasterRisk # activate the virtual environment
%pip install fasterrisk

Note: you may need to restart the kernel to use updated packages.


In [2]:
from fasterrisk.fasterrisk import RiskScoreOptimizer, RiskScoreClassifier
from fasterrisk.utils import download_file_from_google_drive,  compute_logisticLoss_from_X_y_beta0_betas, get_all_product_booleans, get_support_indices, isEqual_upTo_8decimal, isEqual_upTo_16decimal, get_all_product_booleans

import os.path

import numpy as np
import pandas as pd
import time
# Evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, roc_auc_score, log_loss, classification_report, confusion_matrix

In [3]:
def get_calculation_table(risk_score_model):
    assert risk_score_model.featureNames is not None, "please pass the featureNames to the model by using the function .reset_featureNames(featureNames)"

    nonzero_indices = get_support_indices(risk_score_model.coefficients)

    max_feature_length = max([len(featureName) for featureName in risk_score_model.featureNames])
    row_score_template = '{0}. {1:>%d}     {2:>2} point(s) | + ...' % (max_feature_length)

    print("The Risk Score is:")
    for count, feature_i in enumerate(nonzero_indices):
        row_score_str = row_score_template.format(count+1, risk_score_model.featureNames[feature_i], int(risk_score_model.coefficients[feature_i]))
        if count == 0:
            row_score_str = row_score_str.replace("+", " ")

        print(row_score_str)

    final_score_str = ' ' * (14+max_feature_length) + 'SCORE | =    '
    print(final_score_str)
    
    
    print("###")
    feature_names_list = []
    coefficients_list = []
    for count, feature_i in enumerate(nonzero_indices):
        feature_names_list.append(risk_score_model.featureNames[feature_i])
        coefficients_list.append(int(risk_score_model.coefficients[feature_i]))
    
    print("feature names: ", feature_names_list)
    print("coefficients: ", coefficients_list)
    print(len(feature_names_list) == len(coefficients_list))

def print_classification_metrics(risk_score_model, X, y):
    start = time.time()
    y_pred = risk_score_model.predict(X)
    stop = time.time()
    print(f"Predict time: {stop-start} s")
    
    # Compute the accuracy
    accuracy = accuracy_score(y, y_pred)
    print("Accuracy: {:.3f}".format(accuracy))
    # Compute the precision
    precision = precision_score(y, y_pred)
    print("Precision: {:.3f}".format(precision))
    # Compute the recall or sensitivity
    recall = recall_score(y, y_pred)
    print("Recall: {:.3f}".format(recall))
    # Compute the F1 score
    f1 = f1_score(y, y_pred)
    print("F1 score: {:.3f}".format(f1))
    # Compute the roc auc score
    auc = roc_auc_score(y,y_pred)
    print("AUC score: {:.3f}".format(auc))
    # Compute the log lossscore
    loss = log_loss(y,y_pred)
    print("Log loss: {:.3f}".format(loss))

    # Assume y and y_pred are the true and predicted labels for a binary classification problem
    tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()
    # Calculate TPR and TNR
    tpr = tp / (tp + fn) # Sensitivity
    tnr = tn / (tn + fp) # Specificity
    # Calculate G-mean
    gmean = np.sqrt(tpr * tnr)
    print("G-mean: {:.3f}".format(gmean))

    print("Specificity: {:.3f}".format(tnr))

    # Print classification report and G-mean
    print(classification_report(y, y_pred))

    print("{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}".format(accuracy,precision,recall,f1,auc,loss,tnr))

    print(confusion_matrix(y,y_pred))

In [4]:
dataset_name ='data_f'
train_data_file_path = "../dataset/"+ dataset_name + "_train.csv"
test_data_file_path = "../dataset/"+ dataset_name + "_test.csv"
val_data_file_path = "../dataset/"+ dataset_name + "_val.csv"
test_imbalanced_data_file_path = "../dataset/"+ dataset_name + "_test_imbalanced.csv"

In [5]:
target_col = 'label'

# Train Data
train_df = pd.read_csv(train_data_file_path)
train_df[target_col] = train_df[target_col].map({1: 1, 0: -1})  
# Identify columns with boolean data type
bool_columns = train_df.select_dtypes(include=['bool']).columns
# Convert boolean columns to integer
train_df[bool_columns] = train_df[bool_columns].astype(int)

# Test Data 
test_df = pd.read_csv(test_data_file_path)
test_df[target_col] = test_df[target_col].map({1: 1, 0: -1})
# Identify columns with boolean data type
bool_columns = test_df.select_dtypes(include=['bool']).columns
# Convert boolean columns to integer
test_df[bool_columns] = test_df[bool_columns].astype(int)


# Val Data 
val_df = pd.read_csv(val_data_file_path)
val_df[target_col] = val_df[target_col].map({1: 1, 0: -1})
# Identify columns with boolean data type
bool_columns = val_df.select_dtypes(include=['bool']).columns
# Convert boolean columns to integer
val_df[bool_columns] = val_df[bool_columns].astype(int)

# Test Data  Imbalanced
test_imbalanced_df = pd.read_csv(test_imbalanced_data_file_path)
test_imbalanced_df[target_col] = test_imbalanced_df[target_col].map({1: 1, 0: -1})
# Identify columns with boolean data type
bool_columns = test_imbalanced_df.select_dtypes(include=['bool']).columns
# Convert boolean columns to integer
test_imbalanced_df[bool_columns] = test_imbalanced_df[bool_columns].astype(int)

In [6]:
train_data = np.asarray(train_df)
X_train, y_train = train_data[:, :-1], train_data[:, -1]

test_data = np.asarray(test_df)
X_test, y_test = test_data[:, :-1], test_data[:, -1]

val_data = np.asarray(val_df)
X_val, y_val = val_data[:, :-1], val_data[:, -1]

test_imbalanced_data = np.asarray(test_df)
X_test_imbalanced, y_test_imbalanced = test_imbalanced_data[:, :-1], test_imbalanced_data[:, -1]

In [7]:
print("Number of examples in the training set: ", len(X_train))
print("Number of examples in the validation set: ", len(X_val))
print("Number of examples in the test set: ", len(X_test))
print("Number of examples in the test imbalanced set: ", len(X_test_imbalanced))

Number of examples in the training set:  3753
Number of examples in the validation set:  1251
Number of examples in the test set:  1252
Number of examples in the test imbalanced set:  1252


In [8]:
train_df.head()

Unnamed: 0,order_count_with_promo,price_amount,promo_amount,category_f_order_count_with_promo,category_f_promo_amount,similar_device_count,similar_email_count,label
0,0.0,4.9e-05,0.0,0.0,0.0,0.0,0.0,-1
1,0.0,3e-06,0.0,0.0,0.0,0.0,0.0,-1
2,0.103448,0.004819,0.051818,0.0,0.0,0.0,0.0,1
3,0.0,9.3e-05,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.001837,0.0,0.0,0.0,0.0,0.0,-1


## Train Risk Score Models

### Create RiskScoreOptimizer and Perform Optimization

In [9]:
y_train

array([-1., -1.,  1., ..., -1.,  1.,  1.])

In [20]:
sparsity = 5
parent_size = 10
RiskScoreOptimizer_m = RiskScoreOptimizer(X = X_train, y = y_train, k = sparsity, parent_size = parent_size, gap_tolerance=0.3)

In [21]:
start_time = time.time()
RiskScoreOptimizer_m.optimize()
print("Optimization takes {:.2f} seconds.".format(time.time() - start_time))

Optimization takes 0.46 seconds.


## Get Risk Score Models

In [22]:
multipliers, sparseDiversePool_beta0_integer, sparseDiversePool_betas_integer = RiskScoreOptimizer_m.get_models()
print("We generate {} risk score models from the sparse diverse pool".format(len(multipliers)))

(3753, 11)
We generate 11 risk score models from the sparse diverse pool


### Access the first risk score model

In [23]:
model_index = 0 # first model
multiplier = multipliers[model_index]
intercept = sparseDiversePool_beta0_integer[model_index]
coefficients = sparseDiversePool_betas_integer[model_index]

### Use the first risk score model to do prediction

In [24]:
RiskScoreClassifier_m = RiskScoreClassifier(multiplier, intercept, coefficients)

In [25]:
y_test_pred = RiskScoreClassifier_m.predict(X_test)
print("y_test are predicted to be {}".format(y_test_pred))

y_test are predicted to be [ 1 -1 -1 ... -1 -1 -1]


In [26]:
y_test_pred_prob = RiskScoreClassifier_m.predict_prob(X_test)
print("The risk probabilities of having y_test to be +1 are {}".format(y_test_pred_prob))

The risk probabilities of having y_test to be +1 are [0.65198535 0.5        0.5        ... 0.5        0.5        0.5       ]


### Print the first model card

In [27]:
X_featureNames = list(train_df.columns[:-1])

RiskScoreClassifier_m.reset_featureNames(X_featureNames)
RiskScoreClassifier_m.print_model_card()

# get_calculation_table(RiskScoreClassifier_m)

The Risk Score is:
1.            order_count_with_promo      3 point(s) |   ...
2.                      promo_amount      2 point(s) | + ...
3. category_f_order_count_with_promo      3 point(s) | + ...
4.           category_f_promo_amount      3 point(s) | + ...
5.               similar_email_count      2 point(s) | + ...
                                               SCORE | =    
SCORE |   0.0  |   2.0  |   3.0  |   4.0  |   5.0  |   6.0  |
RISK  |  50.0% |  97.8% |  99.7% |  99.9% | 100.0% | 100.0% |
SCORE |   7.0  |   8.0  |   9.0  |  10.0  |  11.0  |  13.0  |
RISK  | 100.0% | 100.0% | 100.0% | 100.0% | 100.0% | 100.0% |


### Print Top N Model Cards from the Pool and their performance metrics

In [28]:
N = 5
num_models = min(N, len(multipliers))
print("Num models: ", num_models)
print("Number of multipliers: ", len(multipliers))
train_loss_list = []
test_auc_list = []
auc_list = []
for model_index in range(num_models):
    print("---------- Model {} ----------".format(model_index+1))
    multiplier = multipliers[model_index]
    intercept = sparseDiversePool_beta0_integer[model_index]
    coefficients = sparseDiversePool_betas_integer[model_index]

    RiskScoreClassifier_m = RiskScoreClassifier(multiplier, intercept, coefficients)
    RiskScoreClassifier_m.reset_featureNames(X_featureNames)
    RiskScoreClassifier_m.print_model_card()

    train_loss = RiskScoreClassifier_m.compute_logisticLoss(X_train, y_train)
    train_acc, train_auc = RiskScoreClassifier_m.get_acc_and_auc(X_train, y_train)
    test_acc, test_auc = RiskScoreClassifier_m.get_acc_and_auc(X_test, y_test)

    print("The logistic loss on the training set is {}".format(train_loss))
    print("The training accuracy and AUC are {:.3f}% and {:.3f}".format(train_acc*100, train_auc))
    print("The test accuracy and AUC are are {:.3f}% and {:.3f}\n".format(test_acc*100, test_auc))

    print("### CLASSIFICATION REPORT - VAL ###")
    print_classification_metrics(RiskScoreClassifier_m,X_val,y_val)
    
    print("### CLASSIFICATION REPORT - TEST ###")
    print_classification_metrics(RiskScoreClassifier_m,X_test,y_test)

    print("### CLASSIFICATION REPORT - TEST (IMBALANCED) ###")
    print_classification_metrics(RiskScoreClassifier_m,X_test_imbalanced,y_test_imbalanced)
    
    # TEST AUC
    y_pred = RiskScoreClassifier_m.predict(X_test)
    auc = roc_auc_score(y_test,y_pred)

    train_loss_list.append(round(train_loss,2))
    test_auc_list.append(round(test_auc,4))
    auc_list.append(round(auc,4))

avg_train_loss = sum(train_loss_list)/len(train_loss_list)
avg_test_auc = sum(test_auc_list)/len(test_auc_list)
avg_auc = sum(auc_list)/len(auc_list)

print(train_loss_list)
print(test_auc_list)

print("avg test auc: ", avg_auc)

Num models:  5
Number of multipliers:  11
---------- Model 1 ----------
The Risk Score is:
1.            order_count_with_promo      3 point(s) |   ...
2.                      promo_amount      2 point(s) | + ...
3. category_f_order_count_with_promo      3 point(s) | + ...
4.           category_f_promo_amount      3 point(s) | + ...
5.               similar_email_count      2 point(s) | + ...
                                               SCORE | =    
SCORE |   0.0  |   2.0  |   3.0  |   4.0  |   5.0  |   6.0  |
RISK  |  50.0% |  97.8% |  99.7% |  99.9% | 100.0% | 100.0% |
SCORE |   7.0  |   8.0  |   9.0  |  10.0  |  11.0  |  13.0  |
RISK  | 100.0% | 100.0% | 100.0% | 100.0% | 100.0% | 100.0% |
The logistic loss on the training set is 2440.2542665393544
The training accuracy and AUC are 66.960% and 0.671
The test accuracy and AUC are are 65.415% and 0.666

### CLASSIFICATION REPORT - VAL ###
Predict time: 0.000331878662109375 s
Accuracy: 0.651
Precision: 0.831
Recall: 0.388
F1 score: 

Log loss: 12.293
G-mean: 0.612
Specificity: 0.926
              precision    recall  f1-score   support

        -1.0       0.60      0.93      0.73       610
         1.0       0.85      0.40      0.55       642

    accuracy                           0.66      1252
   macro avg       0.72      0.67      0.64      1252
weighted avg       0.73      0.66      0.64      1252

0.659	0.852	0.405	0.549	0.666	12.293	0.926
[[565  45]
 [382 260]]
### CLASSIFICATION REPORT - TEST (IMBALANCED) ###
Predict time: 0.0003693103790283203 s
Accuracy: 0.659
Precision: 0.852
Recall: 0.405
F1 score: 0.549
AUC score: 0.666
Log loss: 12.293
G-mean: 0.612
Specificity: 0.926
              precision    recall  f1-score   support

        -1.0       0.60      0.93      0.73       610
         1.0       0.85      0.40      0.55       642

    accuracy                           0.66      1252
   macro avg       0.72      0.67      0.64      1252
weighted avg       0.73      0.66      0.64      1252

0.659	0.852	

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
