In [1]:
# %conda create -n FasterRisk python=3.9 # create a virtual environment
# %conda activate FasterRisk # activate the virtual environment
%pip install fasterrisk

Note: you may need to restart the kernel to use updated packages.


In [2]:
# %pip install imblearn

In [3]:
from fasterrisk.fasterrisk import RiskScoreOptimizer, RiskScoreClassifier
from fasterrisk.utils import download_file_from_google_drive,  compute_logisticLoss_from_X_y_beta0_betas, get_all_product_booleans, get_support_indices, isEqual_upTo_8decimal, isEqual_upTo_16decimal, get_all_product_booleans

import os.path

import numpy as np
import pandas as pd
import time
# Evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, roc_auc_score, log_loss, classification_report, confusion_matrix

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import re

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# For resampling
from collections import Counter
from sklearn.datasets import make_classification
# from imblearn.over_sampling import SMOTE, SMOTENC, SMOTEN, ADASYN, RandomOverSampler
# from imblearn.under_sampling import NearMiss, RandomUnderSampler
# from imblearn.combine import SMOTEENN
from sklearn.utils import resample

# Ensemble Classifiers
from sklearn.ensemble import VotingClassifier, StackingClassifier

# Hyperparameter Tuning
from sklearn.model_selection import cross_val_predict, GridSearchCV, cross_val_score, train_test_split, KFold, cross_validate

# Evaluation
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, roc_auc_score, log_loss, classification_report, confusion_matrix

# from xgboost import plot_importance, to_graphviz

# from google.colab import files

In [5]:
def get_calculation_table(risk_score_model):
    """Print the scoring table of a risk score model plus its raw support.

    Two things are written to stdout:

    1. A human-readable table with one row per selected feature
       ("<n>. <feature name>  <points> point(s) | + ..."), closed by a
       "SCORE | =" line.
    2. After a "###" separator, the selected feature names and their integer
       coefficients as plain Python lists (handy for copy/paste), followed by
       a sanity check that both lists have the same length.

    Parameters
    ----------
    risk_score_model
        Object exposing ``featureNames`` (sequence of str, must not be None)
        and ``coefficients`` (indexable by the values that
        ``get_support_indices`` returns).

    Raises
    ------
    AssertionError
        If ``featureNames`` has not been set on the model.
    """
    assert risk_score_model.featureNames is not None, "please pass the featureNames to the model by using the function .reset_featureNames(featureNames)"

    # Indices of the features that take part in the score
    # (presumably the nonzero coefficients -- semantics live in fasterrisk.utils).
    nonzero_indices = get_support_indices(risk_score_model.coefficients)

    # Collect names and integer points once; both the table and the raw dump use them.
    feature_names_list = [risk_score_model.featureNames[feature_i] for feature_i in nonzero_indices]
    coefficients_list = [int(risk_score_model.coefficients[feature_i]) for feature_i in nonzero_indices]

    # Column width is driven by the longest name among *all* features; default=0
    # keeps an empty feature list from raising inside max().
    max_feature_length = max((len(featureName) for featureName in risk_score_model.featureNames), default=0)
    row_score_template = '{0}. {1:>%d}     {2:>2} point(s) | {3} ...' % (max_feature_length)

    print("The Risk Score is:")
    for count, (feature_name, points) in enumerate(zip(feature_names_list, coefficients_list)):
        # The first row carries no leading "+". The sign is chosen here instead of
        # str.replace("+", " ") on the finished row, which would also erase any
        # "+" that is part of the feature name itself.
        sign = " " if count == 0 else "+"
        print(row_score_template.format(count + 1, feature_name, points, sign))

    final_score_str = ' ' * (14+max_feature_length) + 'SCORE | =    '
    print(final_score_str)

    print("###")
    print("feature names: ", feature_names_list)
    print("coefficients: ", coefficients_list)
    print(len(feature_names_list) == len(coefficients_list))

def print_classification_metrics(risk_score_model, X, y):
    """Print timing and classification metrics of a risk score model on (X, y).

    Threshold metrics (accuracy, precision, recall, F1, specificity, G-mean,
    classification report, confusion matrix) are computed from the hard labels
    returned by ``risk_score_model.predict``. AUC and log loss are computed
    from ``risk_score_model.predict_prob``: both metrics are defined on scores
    or probabilities, and feeding them hard labels reduces the ROC curve to a
    single operating point and makes the log loss meaningless.

    Parameters
    ----------
    risk_score_model
        Fitted classifier exposing ``predict(X)`` and ``predict_prob(X)``
        (the FasterRisk ``RiskScoreClassifier`` API).
    X
        Feature matrix accepted by the model.
    y
        True labels. Assumed binary: the confusion matrix is unpacked into
        exactly four cells.

    Returns
    -------
    None
        Everything is written to stdout. The last lines are a tab-separated
        row (accuracy, precision, recall, F1, AUC, log loss, specificity)
        followed by the confusion matrix.
    """
    # Only the hard-label prediction is timed, matching the reported "Predict time".
    start = time.time()
    y_pred = risk_score_model.predict(X)
    stop = time.time()
    print(f"Predict time: {stop-start} s")

    # Probability of the positive class, used for the probabilistic metrics below.
    y_prob = risk_score_model.predict_prob(X)

    accuracy = accuracy_score(y, y_pred)
    print("Accuracy: {:.3f}".format(accuracy))
    precision = precision_score(y, y_pred)
    print("Precision: {:.3f}".format(precision))
    # Recall is the sensitivity / true positive rate.
    recall = recall_score(y, y_pred)
    print("Recall: {:.3f}".format(recall))
    f1 = f1_score(y, y_pred)
    print("F1 score: {:.3f}".format(f1))
    # Named auc_value so the imported sklearn.metrics.auc function is not shadowed.
    auc_value = roc_auc_score(y, y_prob)
    print("AUC score: {:.3f}".format(auc_value))
    loss = log_loss(y, y_prob)
    print("Log loss: {:.3f}".format(loss))

    # Binary confusion matrix, flattened row by row: tn, fp, fn, tp.
    conf_matrix = confusion_matrix(y, y_pred)
    tn, fp, fn, tp = conf_matrix.ravel()
    tpr = tp / (tp + fn) # Sensitivity
    tnr = tn / (tn + fp) # Specificity
    # G-mean balances sensitivity and specificity.
    gmean = np.sqrt(tpr * tnr)
    print("G-mean: {:.3f}".format(gmean))

    print("Specificity: {:.3f}".format(tnr))

    print(classification_report(y, y_pred))

    # Tab-separated summary row for pasting into a spreadsheet.
    print("{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}".format(accuracy,precision,recall,f1,auc_value,loss,tnr))

    print(conf_matrix)

In [6]:
def _evaluate_split(model, X, y):
    """Score `model` on one split; return (metrics, predict_seconds, confusion).

    `metrics` is the list [accuracy, precision, recall, f1, auc, log_loss,
    specificity]. Threshold metrics use the hard labels from `model.predict`;
    AUC and log loss use `model.predict_prob`. Only the `predict` call is timed.
    """
    start = time.time()
    y_pred = model.predict(X)
    predict_seconds = time.time() - start

    # Probabilities are required for a meaningful AUC / log loss.
    y_prob = model.predict_prob(X)

    # Binary confusion matrix, flattened row by row: tn, fp, fn, tp.
    confusion = confusion_matrix(y, y_pred)
    tn, fp, fn, tp = confusion.ravel()
    specificity = tn / (tn + fp)

    metrics = [
        accuracy_score(y, y_pred),
        precision_score(y, y_pred),
        recall_score(y, y_pred),
        f1_score(y, y_pred),
        roc_auc_score(y, y_prob),
        log_loss(y, y_prob),
        specificity,
    ]
    return metrics, predict_seconds, confusion


def train_and_evaluate_model(dataframe, sparsity=5, parent_size=10, model_index=0, random_state=0):
    """Train a FasterRisk model on `dataframe` and evaluate it on three splits.

    Steps:
      1. Copy the frame, recode 'label' to +1 (label == 1) / -1 (anything else)
         and cast boolean columns to int.
      2. Split 60/20/20 into train / validation / test using `random_state`.
      3. Build a 90% / 10% (negative / positive) bootstrap sample of the *test
         rows only*, with as many rows as the test split. Sampling from the
         held-out rows keeps training examples out of this evaluation set.
      4. Fit a RiskScoreOptimizer on the training split and pick model number
         `model_index` from the pool it returns.
      5. Evaluate that model on validation, test and the resampled test set.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns 'label' and 'account_id'; every other column
        is used as a feature.
    sparsity : int
        Passed to RiskScoreOptimizer as `k`.
    parent_size : int
        Passed to RiskScoreOptimizer as `parent_size`.
    model_index : int
        Position of the model to evaluate within the optimizer's pool.
    random_state : int
        Seed for both train/test splits and for the test-set resampling.

    Returns
    -------
    tuple of four lists
        (val_results, test_results, test_imbalanced_results, time_results).
        Each *_results list is [accuracy, precision, recall, f1, auc,
        log_loss, specificity]; time_results is [training_time, pred_val_time,
        pred_test_time, pred_test_imbalanced_time] in seconds.

    Raises
    ------
    IndexError
        If `model_index` does not address a model in the generated pool.
    """
    ### DATAFRAME FORMATTING ###
    target_col = 'label'
    df = dataframe.copy()
    print(df[target_col].value_counts())
    # Recode to the {-1, +1} labels the model is trained on; anything that is not 1 becomes -1.
    df[target_col] = df[target_col].apply(lambda x: 1 if x==1 else -1)
    print(df[target_col].value_counts())
    # Cast boolean feature columns to integers.
    bool_columns = df.select_dtypes(include=['bool']).columns
    df[bool_columns] = df[bool_columns].astype(int)

    ### SPLITTING THE DATA ###
    X = df.drop([target_col, 'account_id'], axis=1)
    y = df[target_col]
    # Class sizes of the full dataset: negatives first, positives second.
    print(int((y == -1).sum()), int((y == 1).sum()))

    # 80/20 then 75/25 of the remainder -> 60% train, 20% validation, 20% test.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=random_state)

    # Target class counts for the resampled test set (90% negative / 10% positive).
    desired_majority_samples = int(len(X_test) * 0.90)
    desired_minority_samples = int(len(X_test) * 0.10)
    print("desired",desired_majority_samples, desired_minority_samples)

    # Resample from the held-out test rows only. Drawing from the full dataset
    # would put training rows into this "test" set and inflate its metrics.
    test_frame = X_test.copy()
    test_frame[target_col] = y_test.to_numpy()
    test_majority = test_frame[test_frame[target_col] == -1]
    test_minority = test_frame[test_frame[target_col] == 1]

    resampled_majority_class = resample(test_majority,
                                        replace=True,
                                        n_samples=desired_majority_samples,
                                        random_state=random_state)
    sampled_minority_class = resample(test_minority,
                                      replace=True,
                                      n_samples=desired_minority_samples,
                                      random_state=random_state)

    test_imbalanced = pd.concat([resampled_majority_class, sampled_minority_class])
    X_test_imbalanced = test_imbalanced.drop(columns=[target_col])
    y_test_imbalanced = test_imbalanced[target_col]

    ### CONVERT TO NUMPY ###
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    X_val = np.asarray(X_val)
    y_val = np.asarray(y_val)
    X_test = np.asarray(X_test)
    y_test = np.asarray(y_test)
    X_test_imbalanced = np.asarray(X_test_imbalanced)
    y_test_imbalanced = np.asarray(y_test_imbalanced)

    ### MODELLING ###
    RiskScoreOptimizer_m = RiskScoreOptimizer(X = X_train, y = y_train, k = sparsity, parent_size = parent_size)
    start_training_time = time.time()
    RiskScoreOptimizer_m.optimize()
    training_time = time.time() - start_training_time

    multipliers, sparseDiversePool_beta0_integer, sparseDiversePool_betas_integer = RiskScoreOptimizer_m.get_models()
    print("We generate {} risk score models from the sparse diverse pool".format(len(multipliers)))

    # Fail with a readable message instead of a bare indexing error.
    if not -len(multipliers) <= model_index < len(multipliers):
        raise IndexError(
            "model_index {} is out of range for a pool of {} models".format(model_index, len(multipliers))
        )

    model = RiskScoreClassifier(
        multipliers[model_index],
        sparseDiversePool_beta0_integer[model_index],
        sparseDiversePool_betas_integer[model_index],
    )
    model.reset_featureNames(list(X.columns))
    model.print_model_card()

    ### RESULTS ###
    val_results, pred_val_time, _ = _evaluate_split(model, X_val, y_val)
    test_results, pred_test_time, _ = _evaluate_split(model, X_test, y_test)
    test_imbalanced_results, pred_test_imbalanced_time, confusion_imbalanced = _evaluate_split(
        model, X_test_imbalanced, y_test_imbalanced
    )
    print(confusion_imbalanced)

    time_results = [training_time, pred_val_time, pred_test_time, pred_test_imbalanced_time]
    return val_results, test_results, test_imbalanced_results, time_results

In [25]:
def run_iterations(df,sparsity,parent_size,model_index, iterations=5):
    """Repeat train_and_evaluate_model over several seeds and average the results.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset forwarded unchanged to ``train_and_evaluate_model``.
    sparsity, parent_size, model_index
        Forwarded unchanged to ``train_and_evaluate_model``.
    iterations : int, default 5
        Number of repetitions. Repetition ``i`` uses the seed ``12 + 11 * i``,
        so the default reproduces the seeds 12, 23, 34, 45, 56.

    Returns
    -------
    str
        Tab-separated CSV text holding the column-wise mean of every metric
        and timing over all repetitions.

    Raises
    ------
    ValueError
        If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        raise ValueError("iterations must be at least 1, got {}".format(iterations))

    # Deterministic seed schedule; honours `iterations` instead of a fixed list of five.
    seeds = [12 + 11 * i for i in range(iterations)]

    results = []
    for seed in seeds:
        val_results, test_results, test_imbalanced_results, time_results = train_and_evaluate_model(
            df, sparsity, parent_size, model_index, seed
        )
        # One flat row per repetition, in the same order as `columns` below.
        results.append(val_results + test_results + test_imbalanced_results + time_results)

    metric_names = ['Accuracy', 'Precision', 'Recall', 'F1', 'AUC', 'Loss', 'Specificity']
    columns = [
        '{} {}'.format(split_name, metric_name)
        for split_name in ('Val', 'Test', 'Test Imbalanced')
        for metric_name in metric_names
    ] + ['Training Time', 'Pred Val Time', 'Pred Test Time', 'Pred Test Imbalanced Time']

    # Separate name so the input `df` parameter is not overwritten.
    results_df = pd.DataFrame(results, columns=columns)
    print(results_df)
    stats = results_df.describe().loc[['mean']]
    print(stats)
    return stats.to_csv(sep='\t')

In [26]:
# Base name of the dataset; the CSV is read from the sibling ../dataset directory.
dataset_name = 'data_a'
dataset_file_path = f'../dataset/{dataset_name}.csv'
# train_data_file_path = "../dataset/"+ dataset_name + "_train.csv"
# test_data_file_path = "../dataset/"+ dataset_name + "_test.csv"
# val_data_file_path = "../dataset/"+ dataset_name + "_val.csv"
# test_imbalanced_data_file_path = "../dataset/"+ dataset_name + "_test_imbalanced.csv"

In [27]:
target_col = 'label'

# Load the CSV and recode the target in one step: 1 stays 1 and 0 becomes -1
# (values outside {0, 1} are turned into NaN by `map`).
df = pd.read_csv(dataset_file_path).assign(
    **{target_col: lambda frame: frame[target_col].map({1: 1, 0: -1})}
)

# Boolean columns are cast to integers; all other columns keep their dtype.
bool_columns = df.select_dtypes(include=['bool']).columns
df = df.astype({column: int for column in bool_columns})


# data = np.asarray(df)
# X, y = data[:, :-1], data[:, -1]

In [28]:
# train_and_evaluate_model(df,5,10,0)

In [29]:
# Run the repeated experiment and print the tab-separated mean row it returns.
print(run_iterations(df, sparsity=5, parent_size=10, model_index=0, iterations=5))

-1    68416
 1     3128
Name: label, dtype: int64
-1    68416
 1     3128
Name: label, dtype: int64
-1    68416
 1     3128
Name: label, dtype: int64
68416 3128
desired 12878 1430
(42926, 50)
We generate 50 risk score models from the sparse diverse pool
The Risk Score is:
1.                              is_email_verified     -1 point(s) |   ...
2.                               is_reseller_flag      2 point(s) | + ...
3.              order_count_with_promo_category_0     -5 point(s) | + ...
4.                        similar_name_category_0      3 point(s) | + ...
5.                        similar_name_category_1      2 point(s) | + ...
                                                            SCORE | =    
SCORE |  -6.0  |  -5.0  |  -4.0  |  -3.0  |  -2.0  |  -1.0  |   0.0  |
RISK  |   0.3% |   0.6% |   1.1% |   2.1% |   3.9% |   7.1% |  12.7% |
SCORE |   1.0  |   2.0  |   3.0  |   4.0  |   5.0  |   6.0  |   7.0  |
RISK  |  21.7% |  34.5% |  50.0% |  65.5% |  78.3% |  87.3% |  92.9% |




In [30]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,account_id,label,is_phone_number_verified,is_email_verified,is_reseller_flag,email_category_gmail,email_category_hotmail,email_category_icloud,email_category_live,email_category_others,...,similar_device_category_0,similar_device_category_1,similar_device_category_2,similar_device_category_> 2,similar_birth_date_category_0,similar_birth_date_category_1,similar_birth_date_category_10-13,similar_birth_date_category_2,similar_birth_date_category_3-9,similar_birth_date_category_> 13
0,34634632,1,1,0,0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,27536039,-1,1,0,0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,29164748,-1,0,0,0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,28115239,-1,1,0,0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,33491857,-1,1,0,0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
