In [1]:
# %conda create -n FasterRisk python=3.9 # create a virtual environment
# %conda activate FasterRisk # activate the virtual environment
%pip install fasterrisk

Note: you may need to restart the kernel to use updated packages.


In [2]:
from fasterrisk.fasterrisk import RiskScoreOptimizer, RiskScoreClassifier
from fasterrisk.utils import download_file_from_google_drive,  compute_logisticLoss_from_X_y_beta0_betas, get_all_product_booleans, get_support_indices, isEqual_upTo_8decimal, isEqual_upTo_16decimal, get_all_product_booleans

import os.path

import numpy as np
import pandas as pd
import time
# Evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, roc_auc_score, log_loss, classification_report, confusion_matrix

In [3]:
def get_calculation_table(risk_score_model):
    assert risk_score_model.featureNames is not None, "please pass the featureNames to the model by using the function .reset_featureNames(featureNames)"

    nonzero_indices = get_support_indices(risk_score_model.coefficients)

    max_feature_length = max([len(featureName) for featureName in risk_score_model.featureNames])
    row_score_template = '{0}. {1:>%d}     {2:>2} point(s) | + ...' % (max_feature_length)

    print("The Risk Score is:")
    for count, feature_i in enumerate(nonzero_indices):
        row_score_str = row_score_template.format(count+1, risk_score_model.featureNames[feature_i], int(risk_score_model.coefficients[feature_i]))
        if count == 0:
            row_score_str = row_score_str.replace("+", " ")

        print(row_score_str)

    final_score_str = ' ' * (14+max_feature_length) + 'SCORE | =    '
    print(final_score_str)
    
    
    print("###")
    feature_names_list = []
    coefficients_list = []
    for count, feature_i in enumerate(nonzero_indices):
        feature_names_list.append(risk_score_model.featureNames[feature_i])
        coefficients_list.append(int(risk_score_model.coefficients[feature_i]))
    
    print("feature names: ", feature_names_list)
    print("coefficients: ", coefficients_list)
    print(len(feature_names_list) == len(coefficients_list))

def print_classification_metrics(risk_score_model, X, y):
    start = time.time()
    y_pred = risk_score_model.predict(X)
    stop = time.time()
    print(f"Predict time: {stop  start} s")
    
    # Compute the accuracy
    accuracy = accuracy_score(y, y_pred)
    print("Accuracy: {:.3f}".format(accuracy))
    # Compute the precision
    precision = precision_score(y, y_pred)
    print("Precision: {:.3f}".format(precision))
    # Compute the recall or sensitivity
    recall = recall_score(y, y_pred)
    print("Recall: {:.3f}".format(recall))
    # Compute the F1 score
    f1 = f1_score(y, y_pred)
    print("F1 score: {:.3f}".format(f1))
    # Compute the roc auc score
    auc = roc_auc_score(y,y_pred)
    print("AUC score: {:.3f}".format(auc))
    # Compute the log lossscore
    loss = log_loss(y,y_pred)
    print("Log loss: {:.3f}".format(loss))

    # Assume y and y_pred are the true and predicted labels for a binary classification problem
    tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()
    # Calculate TPR and TNR
    tpr = tp / (tp + fn) # Sensitivity
    tnr = tn / (tn + fp) # Specificity
    # Calculate G-mean
    gmean = np.sqrt(tpr * tnr)
    print("G-mean: {:.3f}".format(gmean))

    print("Specificity: {:.3f}".format(tnr))

    # Print classification report and G-mean
    print(classification_report(y, y_pred))

    print("{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}".format(accuracy,precision,recall,f1,auc,loss,tnr))

    print(confusion_matrix(y,y_pred))

In [4]:
# train_data_file_path = "../tests/adult_train_data.csv"
# test_data_file_path = "../tests/adult_test_data.csv"

dataset_name ='data_f'
train_data_file_path = "../dataset/"+ dataset_name + "_train.csv"
test_data_file_path = "../dataset/"+ dataset_name + "_test.csv"
val_data_file_path = "../dataset/"+ dataset_name + "_val.csv"
test_imbalanced_data_file_path = "../dataset/"+ dataset_name + "_test_imbalanced.csv"

# if not os.path.isfile(train_data_file_path):
#     download_file_from_google_drive('1nuWn0QVG8tk3AN4I4f3abWLcFEP3WPec', train_data_file_path)
# if not os.path.isfile(test_data_file_path):
#     download_file_from_google_drive('1TyBO02LiGfHbatPWU4nzc8AndtIF-7WH', test_data_file_path)

In [5]:
# ## 
# train_df.head()

In [6]:
# train_df.columns

In [7]:
# train_df['target']

In [8]:
target_col = 'label'

# Train Data
train_df = pd.read_csv(train_data_file_path)
train_df[target_col] = train_df[target_col].map({1: 1, 0: -1})  
# Identify columns with boolean data type
bool_columns = train_df.select_dtypes(include=['bool']).columns
# Convert boolean columns to integer
train_df[bool_columns] = train_df[bool_columns].astype(int)

# Test Data 
test_df = pd.read_csv(test_data_file_path)
test_df[target_col] = test_df[target_col].map({1: 1, 0: -1})
# Identify columns with boolean data type
bool_columns = test_df.select_dtypes(include=['bool']).columns
# Convert boolean columns to integer
test_df[bool_columns] = test_df[bool_columns].astype(int)


# Val Data 
val_df = pd.read_csv(val_data_file_path)
val_df[target_col] = val_df[target_col].map({1: 1, 0: -1})
# Identify columns with boolean data type
bool_columns = val_df.select_dtypes(include=['bool']).columns
# Convert boolean columns to integer
val_df[bool_columns] = val_df[bool_columns].astype(int)

# Test Data  Imbalanced
test_imbalanced_df = pd.read_csv(test_imbalanced_data_file_path)
test_imbalanced_df[target_col] = test_imbalanced_df[target_col].map({1: 1, 0: -1})
# Identify columns with boolean data type
bool_columns = test_imbalanced_df.select_dtypes(include=['bool']).columns
# Convert boolean columns to integer
test_imbalanced_df[bool_columns] = test_imbalanced_df[bool_columns].astype(int)

In [9]:
train_data = np.asarray(train_df)
X_train, y_train = train_data[:, :-1], train_data[:, -1]

test_data = np.asarray(test_df)
X_test, y_test = test_data[:, :-1], test_data[:, -1]

val_data = np.asarray(val_df)
X_val, y_val = val_data[:, :-1], val_data[:, -1]

test_imbalanced_data = np.asarray(test_df)
X_test_imbalanced, y_test_imbalanced = test_imbalanced_data[:, :-1], test_imbalanced_data[:, -1]

In [10]:
# train_df = pd.read_csv(train_data_file_path)
# # convert target
# train_df[target_col] = train_df[target_col].replace({0:-1, 1:1})
# train_data = np.asarray(train_df)

# X_train, y_train = train_data[:, 1:], train_data[:, 0]
# # Convert 0 to -1 and 1 to +1
# y_train = np.where(y_train != 0, -1, 1)

# test_df = pd.read_csv(test_data_file_path)
# # convert target
# test_df[target_col] = test_df[target_col].replace({0:-1, 1:1})

# test_data = np.asarray(test_df)
# X_test, y_test = test_data[:, 1:], test_data[:, 0]
# # Convert 0 to -1 and 1 to +1
# y_test = np.where(y_test != 0, -1, 1)

In [11]:
train_df[target_col].value_counts()

-1    1899
 1    1854
Name: label, dtype: int64

In [12]:
# Get unique values and their counts
unique_values, counts = np.unique(y_test, return_counts=True)

# Create a dictionary with unique values as keys and counts as values
value_counts = dict(zip(unique_values, counts))

# Access the value counts
for value, count in value_counts.items():
    print(f'{value}: {count}')

-1.0: 610
1.0: 642


In [13]:
train_df.dtypes

order_count_with_promo               float64
price_amount                         float64
promo_amount                         float64
category_f_order_count_with_promo    float64
category_f_promo_amount              float64
similar_device_count                 float64
similar_email_count                  float64
label                                  int64
dtype: object

In [14]:
train_df.head()

Unnamed: 0,order_count_with_promo,price_amount,promo_amount,category_f_order_count_with_promo,category_f_promo_amount,similar_device_count,similar_email_count,label
0,0.0,4.9e-05,0.0,0.0,0.0,0.0,0.0,-1
1,0.0,3e-06,0.0,0.0,0.0,0.0,0.0,-1
2,0.103448,0.004819,0.051818,0.0,0.0,0.0,0.0,1
3,0.0,9.3e-05,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.001837,0.0,0.0,0.0,0.0,0.0,-1


In [15]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3753 entries, 0 to 3752
Data columns (total 8 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   order_count_with_promo             3753 non-null   float64
 1   price_amount                       3753 non-null   float64
 2   promo_amount                       3753 non-null   float64
 3   category_f_order_count_with_promo  3753 non-null   float64
 4   category_f_promo_amount            3753 non-null   float64
 5   similar_device_count               3753 non-null   float64
 6   similar_email_count                3753 non-null   float64
 7   label                              3753 non-null   int64  
dtypes: float64(7), int64(1)
memory usage: 234.7 KB


In [16]:
train_df.columns

Index(['order_count_with_promo', 'price_amount', 'promo_amount',
       'category_f_order_count_with_promo', 'category_f_promo_amount',
       'similar_device_count', 'similar_email_count', 'label'],
      dtype='object')

In [17]:
y_test

array([ 1., -1., -1., ...,  1., -1., -1.])

In [18]:
train_df.head()

Unnamed: 0,order_count_with_promo,price_amount,promo_amount,category_f_order_count_with_promo,category_f_promo_amount,similar_device_count,similar_email_count,label
0,0.0,4.9e-05,0.0,0.0,0.0,0.0,0.0,-1
1,0.0,3e-06,0.0,0.0,0.0,0.0,0.0,-1
2,0.103448,0.004819,0.051818,0.0,0.0,0.0,0.0,1
3,0.0,9.3e-05,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.001837,0.0,0.0,0.0,0.0,0.0,-1


In [19]:
test_df.head()

Unnamed: 0,order_count_with_promo,price_amount,promo_amount,category_f_order_count_with_promo,category_f_promo_amount,similar_device_count,similar_email_count,label
0,0.034483,6.1e-05,0.010098,0.058824,0.010098,0.0,0.0,1
1,0.0,0.000492,0.0,0.0,0.0,0.0,0.0,-1
2,0.0,0.000318,0.0,0.0,0.0,0.0,0.0,-1
3,0.0,0.018275,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.002064,0.0,0.0,0.0,0.0,0.0,-1


## Train Risk Score Models

### Create RiskScoreOptimizer and Perform Optimization

In [20]:
y_train

array([-1., -1.,  1., ..., -1.,  1.,  1.])

In [21]:
sparsity = 5
parent_size = 10
RiskScoreOptimizer_m = RiskScoreOptimizer(X = X_train, y = y_train, k = sparsity, parent_size = parent_size)

In [22]:
start_time = time.time()
RiskScoreOptimizer_m.optimize()
print("Optimization takes {:.2f} seconds.".format(time.time() - start_time))

Optimization takes 0.39 seconds.


## Get Risk Score Models

In [23]:
multipliers, sparseDiversePool_beta0_integer, sparseDiversePool_betas_integer = RiskScoreOptimizer_m.get_models()
print("We generate {} risk score models from the sparse diverse pool".format(len(multipliers)))

(3753, 11)
We generate 11 risk score models from the sparse diverse pool


### Access the first risk score model

In [24]:
model_index = 0 # first model
multiplier = multipliers[model_index]
intercept = sparseDiversePool_beta0_integer[model_index]
coefficients = sparseDiversePool_betas_integer[model_index]

### Use the first risk score model to do prediction

In [25]:
RiskScoreClassifier_m = RiskScoreClassifier(multiplier, intercept, coefficients)

In [26]:
y_test_pred = RiskScoreClassifier_m.predict(X_test)
print("y_test are predicted to be {}".format(y_test_pred))

y_test are predicted to be [ 1 -1 -1 ... -1 -1 -1]


In [27]:
y_test_pred_prob = RiskScoreClassifier_m.predict_prob(X_test)
print("The risk probabilities of having y_test to be +1 are {}".format(y_test_pred_prob))

The risk probabilities of having y_test to be +1 are [0.65198535 0.5        0.5        ... 0.5        0.5        0.5       ]


### Print the first model card

In [28]:
X_featureNames = list(train_df.columns[:-1])

RiskScoreClassifier_m.reset_featureNames(X_featureNames)
RiskScoreClassifier_m.print_model_card()

get_calculation_table(RiskScoreClassifier_m)

The Risk Score is:
1.            order_count_with_promo      3 point(s) |   ...
2.                      promo_amount      2 point(s) | + ...
3. category_f_order_count_with_promo      3 point(s) | + ...
4.           category_f_promo_amount      3 point(s) | + ...
5.               similar_email_count      2 point(s) | + ...
                                               SCORE | =    
SCORE |   0.0  |   2.0  |   3.0  |   4.0  |   5.0  |   6.0  |
RISK  |  50.0% |  97.8% |  99.7% |  99.9% | 100.0% | 100.0% |
SCORE |   7.0  |   8.0  |   9.0  |  10.0  |  11.0  |  13.0  |
RISK  | 100.0% | 100.0% | 100.0% | 100.0% | 100.0% | 100.0% |
The Risk Score is:
1.            order_count_with_promo      3 point(s) |   ...
2.                      promo_amount      2 point(s) | + ...
3. category_f_order_count_with_promo      3 point(s) | + ...
4.           category_f_promo_amount      3 point(s) | + ...
5.               similar_email_count      2 point(s) | + ...
                                           

### Print Top N Model Cards from the Pool and their performance metrics

In [29]:
N = 5
num_models = min(N, len(multipliers))
print("Num models: ", num_models)
print("Number of multipliers: ", len(multipliers))
train_loss_list = []
test_auc_list = []
auc_list = []
for model_index in range(num_models):
    print("---------- Model {} ----------".format(model_index+1))
    multiplier = multipliers[model_index]
    intercept = sparseDiversePool_beta0_integer[model_index]
    coefficients = sparseDiversePool_betas_integer[model_index]

    RiskScoreClassifier_m = RiskScoreClassifier(multiplier, intercept, coefficients)
    RiskScoreClassifier_m.reset_featureNames(X_featureNames)
    RiskScoreClassifier_m.print_model_card()

    train_loss = RiskScoreClassifier_m.compute_logisticLoss(X_train, y_train)
    train_acc, train_auc = RiskScoreClassifier_m.get_acc_and_auc(X_train, y_train)
    test_acc, test_auc = RiskScoreClassifier_m.get_acc_and_auc(X_test, y_test)

    print("The logistic loss on the training set is {}".format(train_loss))
    print("The training accuracy and AUC are {:.3f}% and {:.3f}".format(train_acc*100, train_auc))
    print("The test accuracy and AUC are are {:.3f}% and {:.3f}\n".format(test_acc*100, test_auc))
    
#     print("### CLASSIFICATION REPORT - TRAIN ###")
#     print_classification_metrics(RiskScoreClassifier_m,X_train,y_train)

    print("### CLASSIFICATION REPORT - VAL ###")
    print_classification_metrics(RiskScoreClassifier_m,X_val,y_val)
    
    print("### CLASSIFICATION REPORT - TEST ###")
    print_classification_metrics(RiskScoreClassifier_m,X_test,y_test)

    print("### CLASSIFICATION REPORT - TEST (IMBALANCED) ###")
    print_classification_metrics(RiskScoreClassifier_m,X_test_imbalanced,y_test_imbalanced)
    
    # TEST AUC
    y_pred = RiskScoreClassifier_m.predict(X_test)
    auc = roc_auc_score(y_test,y_pred)

    train_loss_list.append(round(train_loss,2))
    test_auc_list.append(round(test_auc,4))
    auc_list.append(round(auc,4))

avg_train_loss = sum(train_loss_list)/len(train_loss_list)
avg_test_auc = sum(test_auc_list)/len(test_auc_list)
avg_auc = sum(auc_list)/len(auc_list)

print(train_loss_list)
print(test_auc_list)

# print("avg train loss: ", avg_train_loss)
print("avg test auc: ", avg_auc)

Num models:  5
Number of multipliers:  11
---------- Model 1 ----------
The Risk Score is:
1.            order_count_with_promo      3 point(s) |   ...
2.                      promo_amount      2 point(s) | + ...
3. category_f_order_count_with_promo      3 point(s) | + ...
4.           category_f_promo_amount      3 point(s) | + ...
5.               similar_email_count      2 point(s) | + ...
                                               SCORE | =    
SCORE |   0.0  |   2.0  |   3.0  |   4.0  |   5.0  |   6.0  |
RISK  |  50.0% |  97.8% |  99.7% |  99.9% | 100.0% | 100.0% |
SCORE |   7.0  |   8.0  |   9.0  |  10.0  |  11.0  |  13.0  |
RISK  | 100.0% | 100.0% | 100.0% | 100.0% | 100.0% | 100.0% |
The logistic loss on the training set is 2440.2542665393544
The training accuracy and AUC are 66.960% and 0.671
The test accuracy and AUC are are 65.415% and 0.666

### CLASSIFICATION REPORT - VAL ###
Accuracy: 0.651
Precision: 0.831
Recall: 0.388
F1 score: 0.529
AUC score: 0.653
Log loss: 12.5

[[  1 618]
 [  2 630]]
### CLASSIFICATION REPORT - TEST ###
Accuracy: 0.513
Precision: 0.513
Recall: 1.000
F1 score: 0.678
AUC score: 0.500
Log loss: 17.561
G-mean: 0.000
Specificity: 0.000
              precision    recall  f1-score   support

        -1.0       0.00      0.00      0.00       610
         1.0       0.51      1.00      0.68       642

    accuracy                           0.51      1252
   macro avg       0.26      0.50      0.34      1252
weighted avg       0.26      0.51      0.35      1252

0.513	0.513	1.000	0.678	0.500	17.561	0.000
[[  0 610]
 [  0 642]]
### CLASSIFICATION REPORT - TEST (IMBALANCED) ###
Accuracy: 0.513
Precision: 0.513
Recall: 1.000
F1 score: 0.678
AUC score: 0.500
Log loss: 17.561
G-mean: 0.000
Specificity: 0.000
              precision    recall  f1-score   support

        -1.0       0.00      0.00      0.00       610
         1.0       0.51      1.00      0.68       642

    accuracy                           0.51      1252
   macro avg       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Additional Tutorial on Binarizing Continuous Features

If your data has continuous features, we recommend converting the continuous features to binary features as a preprocessing step to make the final model more interpretable. We use the public PIMA dataset to show how to do this as a preprocessing step.

### Download the PIMA dataset

In [30]:
# pima_original_data_file_path = "../tests/pima_original_data.csv"
# if not os.path.isfile(pima_original_data_file_path):
#     download_file_from_google_drive('184JhmJiSEUiBCo8ySAD8adDn_S9rjmjM', pima_original_data_file_path)

# pima_original_data_df = pd.read_csv(pima_original_data_file_path)
# X_original_df = pima_original_data_df.drop(columns="Outcome") # drop the Outcome column, which stores the y label for this binary classification problem

# X_original_df

### Convert the dataframe with continuous features to a new dataframe with binary features

In [31]:
# f5

You can then use X_binarized_df as your new design matrix and input to the FasterRisk algorithm!