In [None]:
# Make the notebook full screen: inject a CSS rule that forces elements with
# the `container` class to 100% width.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
import os
import pandas as pd
import importlib
import sys 

# Resolve the folders used by this notebook relative to the current working
# directory: code_dir, project_dir, data_path (input/output data) and
# functions_path (project modules imported further down).
if sys.version_info[:3] < (3,4):
    # pathlib is not available before Python 3.4, so only os.getcwd() is used here.
    working_dir = os.getcwd()
    code_dir = os.path.dirname(working_dir)
    project_dir = os.path.dirname(os.path.dirname(working_dir))
    data_path = os.path.join(code_dir, "data")
    functions_path = os.path.join(project_dir, "functions")
else:
    from pathlib import Path
    # str() keeps os.path.dirname working on interpreters that predate
    # os.PathLike support (Python 3.4 / 3.5).
    current_directory = os.path.dirname(str(Path.cwd()))
    code_dir = os.path.dirname(os.path.dirname(current_directory))
    # Pass every path component separately so os.path.join inserts the
    # separator of the host OS instead of a hard-coded Windows backslash.
    project_dir = os.path.join(code_dir, "2_Supervised_Modeling", "Logistic_Regression")
    data_path = os.path.join(project_dir, "data")
    functions_path = os.path.join(code_dir, 'functions')

# Echo the resolved locations so a wrong working directory is spotted early.
print(code_dir)
print(project_dir)
print(data_path)
print(functions_path)

In [None]:
# General Python modules
import time
import json

In [None]:
# Set the path for the library
# `sys` is imported again here although an earlier cell already imports it; the
# repeat is harmless.
import sys
# Put functions_path first on the module search path before importing the
# project modules below (presumably they live in that folder - confirm).
sys.path.insert(0, functions_path)
import data_transformation as dtran
import variable_reduction as vr
import feature_elimination as fe
import machine_learning as ml
import reports as rp
import useful_functions as ufun
from load_data import load_data

In [None]:
# Raise the pandas 'display.max_columns' option to 100 for DataFrame output.
pd.set_option('display.max_columns', 100)

# Initialize the solution variables

In [None]:
# Read the solution inputs from the JSON file under project_dir; later cells
# look the individual parameters up by key in `inputs`.
inputs_file = os.path.join(project_dir, 'data/input/Supervised_Modeling_Solution_Input.json')
with open(inputs_file) as f:
    inputs = json.load(f)

In [None]:
# Display the loaded solution inputs for review.
inputs

## Essential parameters

In [None]:
# String. Specify how to load the data. Options: csv, parq.
Load_from = inputs["Load_from"]
# String. Specify the data location: this is the folder where the data for this project are saved.
data_location = inputs["data_location"]
# String. Set the input data file.
table_name = inputs["table_name"]
# Float. Number between 0-1 determining what percent of data to subsample.
sample = float(inputs["sample"])
# String. Set the target variable name in the original dataset.
target_variable_name = inputs["target_variable_name"]
# String. Set the weight variable name in the original dataset. If not available, then provide "None" with quotes.
weight_variable_name = inputs["weight_variable_name"]
# String. Set the sample column that has sample information, e.g. train/test/OOT or segment information, and will be used to split the data in different samples.
# If this column does not exist, then provide "None" with quotes.
sample_variable_name = inputs["sample_variable_name"]
# String. Set the monetary loss associated with a delinquent case, if available. If this information does not exist, then provide "None" with quotes.
amount_variable_name = inputs["amount_variable_name"]
# List of strings. Set the sub-sample values that are in the sample_variable_name field, e.g. for train/test data split and/or for different segments.
# All samples defined in this parameter will be picked up by the solution and results will be created for these samples.
# The first sample in the list will be used to train models.
# If sample column does not exist, then provide '[None]' (without quotes).
sample_values = inputs["sample_values"]
# List. Provide the feature names for the numeric variables that will be used for modeling.
original_candidate_variables_numeric = inputs["numeric_variables_modeling"]
# List. Provide the feature names for the character variables that will be used for modeling.
original_candidate_variables_character = inputs["character_variables_modeling"]

## Advanced parameters

In [None]:
# Float. Takes values between 0 and 1. Used in 'select_missing_variables_to_drop' function. Variables with percentage missing values above this threshold will be
# dropped from the rest of the process.
select_missing_variables_to_drop_threshold = inputs["select_missing_variables_to_drop_threshold"]
# Integer. Used in 'character_classification' function. Character variables with more levels than this threshold will be dropped from the rest of the process.
character_classification_threshold = inputs["character_classification_threshold"]
# Float. Used in the 'replace_outliers' function in the outlier removal section. This is the coefficient for the interquartile range.
# It can be used to adjust how many outliers to replace; the higher the value, the fewer outliers are replaced.
iqr_coef = inputs["iqr_coef"]
# String. Used in 'impute_missing' class. Select the strategy to impute the missing values. Current options are "median", "mean",
# or a specific value without quotes, e.g. 0.
impute_missing_imputation_strategy = inputs["impute_missing_imputation_strategy"]
# Float. Variables with Gini coefficient below this threshold will be dropped from the remainder of the analysis.
gini_threshold = inputs["gini_threshold"]
# Float. Used in 'corr_eliminator' function in the initial correlations calculations. Variables with correlation greater than this threshold will be dropped.
corr_threshold = inputs["corr_threshold"]
# Int. Used in the 'corr_eliminator' function in the initial correlations calculations. After highly correlated features are dropped, this is the number of the next highest correlations.
# NOTE(review): eval() executes whatever expression is stored under "top_n" in the inputs file.
# If the value is always a plain literal (e.g. "10"), int() or ast.literal_eval() would be safer -
# confirm the accepted values before changing.
top_n = eval(inputs["top_n"])
# Float. Used in the 'vif_eliminator' function in the initial VIF calculations. Variables with VIF greater than this threshold will be dropped.
# This parameter is only applicable if VIF_reduction=true.
first_vif_threshold = inputs["first_vif_threshold"]
# Float. Used in the 'vif_eliminator' function in the Lasso Logistic Regression. Variables with VIF greater than this threshold will be dropped.
second_vif_threshold = inputs["second_vif_threshold"]
# String. User selects which criterion to optimize for feature selection. Options are: "AIC", "BIC".
lasso_criterion = inputs["lasso_criterion"]
# Boolean. Used to determine whether VIF is run after the correlation feature elimination step.
VIF_reduction = inputs["VIF_reduction"]
# String. This is the solver argument in sklearn.LogisticRegression. Use 'saga' to reproduce the results, but there might be convergence warnings.
# Use 'liblinear' to avoid convergence warnings, but the results will not be reproducible.
LogisticRegression_solver = inputs["LogisticRegression_solver"]

# Load the data

In [None]:
# Load the modeling table: `Load_from` selects the loading method, `sample` is
# the sub-sampling fraction taken from the inputs.
data_full = load_data(
    method=Load_from,
    data_path=data_location,
    table_name=table_name,
    sample=sample,
)

In [None]:
# Summary of the loaded frame (DataFrame.info()).
data_full.info()

In [None]:
# Preview the first rows of the loaded data.
data_full.head()

# Replace column name characters that are not compatible with the solution

In [None]:
# Swap "," for "/" in the column names and apply the same substitution to both
# candidate-variable lists so they keep matching the renamed columns.
data_full.columns = data_full.columns.str.replace(",", "/")
original_candidate_variables_numeric = [
    name.replace(",", "/") for name in original_candidate_variables_numeric
]
original_candidate_variables_character = [
    name.replace(",", "/") for name in original_candidate_variables_character
]

# Create the Weight, Sample and Amount variables, if not available in the input dataset

In [None]:
# Each helper below returns the frame together with the column name(s) that the
# remaining cells use (the *_solution variables).

# Weight variable: created if it doesn't exist.
data_full, weight_variable_name_solution = dtran.weight_var_assignment(
    input_data=data_full,
    weight_variable=weight_variable_name,
)

# Sample variable: created if it doesn't exist.
data_full, sample_values_solution, sample_variable_name_solution = dtran.sample_var_assignment(
    input_data=data_full,
    sample_variable=sample_variable_name,
    sample_values=sample_values,
)

# Amount variable: created if it doesn't exist.
data_full, amount_variable_name_solution = dtran.amount_var_assignment(
    input_data=data_full,
    amount_variable=amount_variable_name,
)

# Subset the dataset to use only the samples selected by 'sample values'

In [None]:
# Keep only the rows whose sample value is one of the selected sample values.
in_selected_samples = data_full[sample_variable_name_solution].isin(sample_values_solution)
data_full = data_full[in_selected_samples]

# Convert variable data types based on user information

In [None]:
# Character candidates: convert them and collect the resulting variable list.
data_full, character_variables_list = dtran.convert_character_var(
    input_data=data_full,
    character_variables=original_candidate_variables_character,
    sample_variable=sample_variable_name_solution,
)

# Numeric candidates: the weight, amount and target columns are handed to the
# helper alongside the numeric candidate list.
data_full, numeric_variables_list = dtran.convert_numeric_var(
    input_data=data_full,
    numeric_variables=original_candidate_variables_numeric,
    weight_variable=weight_variable_name_solution,
    amount_variable=amount_variable_name_solution,
    target_variable=target_variable_name,
)

# Data quality report

In [None]:
# Make sure the 'output' folder exists under data_path (created if missing).
ufun.create_folder(data_path=data_path, folder_name='output')

In [None]:
# Data quality report over all converted character and numeric variables.
# dq_report_file names the report file (presumably written under data_path -
# confirm against rp.dq_report).
dq = rp.dq_report(
    input_data=data_full,
    data_path=data_path,
    variables=character_variables_list + numeric_variables_list,
    weight_variable=weight_variable_name_solution,
    dq_report_file='data_quality_report.csv',
)

# Split sample data

In [None]:
# Split data_full by sample. Later cells index `data` with keys of the form
# 'data_<sample value>' and iterate over sample_values_dict.
data, sample_values_dict = dtran.split_sample_data(
    input_data=data_full,
    sample_values_solution=sample_values_solution,
    sample_variable_name_solution=sample_variable_name_solution,
)

# Set the original candidate variables

In [None]:
# Full candidate list: character variables first, then numeric variables.
original_candidate_variables = (
    original_candidate_variables_character
    + original_candidate_variables_numeric
)
print(
    ufun.color.BLUE
    + 'Original candidate variables: '
    + ufun.color.END
    + str(original_candidate_variables)
)

# Remove variables with high missing values percentage

In [None]:
# Non-predictive columns (keys, target, sample, etc.): every column of the first
# sample's frame that is not a candidate variable.
first_sample_columns = data['data_{}'.format(sample_values_solution[0])].columns
excluded_variables = [col for col in first_sample_columns if col not in original_candidate_variables]
print(ufun.color.BLUE + 'Variables to be excluded: ' + ufun.color.END + str(excluded_variables))
print()
# Produce and save the missing values table to review
missing_variables_table, missing_variables = vr.missing_values_vars(
    sample_values_dict=sample_values_dict,
    data_path=data_path,
    input_data=data,
    weight_variable_name_solution=weight_variable_name_solution,
    select_missing_variables_to_drop_threshold=select_missing_variables_to_drop_threshold,
)
# Variables to remove: non-predictors plus the variables flagged for too many missing values
excluded_variables = excluded_variables + missing_variables
print(
    ufun.color.BLUE
    + 'Variables to remove from the remainder of the analysis: '
    + ufun.color.END
    + str(excluded_variables)
)

# Remove character variables with many levels

In [None]:
# Screen the character variables against character_classification_threshold.
# Two lists come back: the variables kept and the ones excluded (judging by the
# names - confirm against vr.character_var_levels).
keep_char_vars_levels, excl_char_vars = vr.character_var_levels(
    input_data=data,
    data_path=data_path,
    sample_values_solution=sample_values_solution,
    excluded_variables=excluded_variables,
    character_classification_threshold=character_classification_threshold,
)

# Outlier replacement for numeric variables

In [None]:
# Only numeric candidates that have not been excluded so far get outlier treatment.
outlier_variables = [
    variable
    for variable in original_candidate_variables_numeric
    if variable not in excluded_variables
]
# NOTE(review): data_full is overwritten with the treated frame, so re-running
# this cell feeds already-treated data back into replace_outliers - confirm that
# is harmless.
data_full, outlier_info = dtran.replace_outliers(
    input_data=data_full,
    variables=outlier_variables,
    weight_variable=weight_variable_name_solution,
    data_path=data_path,
    outlier_info_file='outlier_info.csv',
    iqr_coef=iqr_coef,
)

In [None]:
# Re-split so `data` is rebuilt from the outlier-treated data_full. The second
# return value goes to temp_dict, leaving sample_values_dict untouched.
data, temp_dict = dtran.split_sample_data(
    input_data=data_full,
    sample_values_solution=sample_values_solution,
    sample_variable_name_solution=sample_variable_name_solution,
)

# Convert categorical variables to binary variables

In [None]:
# One-hot encode the retained character variables.
data_full = dtran.character_to_binary(
    input_data=data_full,
    input_variable_list=keep_char_vars_levels,
    # Specifies which value to drop from the one hot encoder. None will return
    # binary variables for all categories. 'first' will drop the most populated
    # category. 'last' will drop the least populated category.
    drop='last',
    # Specifies accepted values for the protected class column. For
    # non-protected class conversions use 'None'.
    protected_class_valid_values=None,
)

In [None]:
# Re-split so `data` is rebuilt from the encoded data_full. The second return
# value goes to temp_dict, leaving sample_values_dict untouched.
data, temp_dict = dtran.split_sample_data(
    input_data=data_full,
    sample_values_solution=sample_values_solution,
    sample_variable_name_solution=sample_variable_name_solution,
)

In [None]:
# Numeric variables of the first sample's frame (this includes the one-hot
# encoded columns), minus everything excluded so far.
numeric_in_first_sample = ufun.identify_numeric_variables(
    input_data=data['data_{}'.format(sample_values_solution[0])]
)
keep_num_vars = [
    variable for variable in numeric_in_first_sample
    if variable not in excluded_variables
]
print('Keeping the following variables: ', keep_num_vars)
print(len(keep_num_vars))

# Impute missing values

In [None]:
# Mapping whose values are lists of variable names; the following cell flattens
# them into the list of variables with missing values.
variables_with_missing_dict = vr.select_missing_variables_to_drop_dict(
    sample_values_dict=sample_values_dict,
    data_path=data_path,
)

In [None]:
# Select numeric features with missing values. Imputation will be applied to only these features, in order to improve the performance of the code.
# Flatten the lists in variables_with_missing_dict in one pass (sum(..., []) copies the
# accumulated list on every addition); dict.fromkeys de-duplicates while keeping
# first-seen order.
variables_with_missing = list(dict.fromkeys(
    variable
    for sample_variables in variables_with_missing_dict.values()
    for variable in sample_variables
))
# Set lookup avoids rescanning the whole list for every candidate variable.
variables_with_missing_lookup = set(variables_with_missing)
num_variables_with_missing = [i for i in keep_num_vars if i in variables_with_missing_lookup]
num_variables_with_missing

In [None]:
# Impute missing values: fit on the first sample, then transform every sample.
start_time = time.time()
impute_missing = dtran.impute_missing(
    variables=num_variables_with_missing,
    imputation_strategy=impute_missing_imputation_strategy,
)
impute_missing.imputation_fit_weight(
    input_data=data['data_{}'.format(sample_values_solution[0])],
    weight_variable=weight_variable_name_solution,
)

# NOTE(review): the return value of imputation_transform is not captured, so
# this relies on the helper modifying the frames held in `data` in place - confirm.
for i in sample_values_dict.keys():
    impute_missing.imputation_transform(input_data=data['data_{}'.format(i)])

print('This code took %.2fs. to run' % (time.time() - start_time))

In [None]:
# Check missing values for imputed variables: per sample, print the weighted
# percentage of rows still missing for each imputed variable, highest first.
for i in sample_values_dict.keys():
    start_time = time.time()
    print(ufun.color.BOLD + ufun.color.PURPLE + ufun.color.UNDERLINE + 'SAMPLE ' + i + ufun.color.END)

    if num_variables_with_missing:
        sample_frame = data['data_{}'.format(i)]
        total_weight = sum(sample_frame[weight_variable_name_solution])
        missing_percent = sample_frame[num_variables_with_missing].apply(
            lambda column: (
                sum(sample_frame[column.isnull()][weight_variable_name_solution]) / total_weight
            ) * 100,
            axis=0,
        )
        print(missing_percent.sort_values(ascending=False))
    else:
        print('There are no variables with missing values to impute')

    print('This code took %.2fs. to run' % (time.time() - start_time))

# Drop numeric variables with only one value

In [None]:
# Filter keep_num_vars using the data quality report file named below; per the
# section title, numeric variables with only one value are dropped.
keep_num_vars_one_v = vr.keep_num_variables_one_value(
    keep_num_vars=keep_num_vars,
    data_path=data_path,
    dq_report='data_quality_report.csv',
)

# Drop variables based on low Gini

In [None]:
# Gini table for the remaining numeric variables, computed on the first sample
# (the one used to train models).
gini_table = fe.gini_values_weight(
    feats=keep_num_vars_one_v,
    input_data=data['data_{}'.format(sample_values_solution[0])],
    target_variable=target_variable_name,
    weight_variable=weight_variable_name_solution,
    data_path=data_path,
    gini_info_file='gini_info.csv',
    n_bands=10,
)
# Keep variables whose Gini coefficient reaches the threshold (inclusive).
keep_num_vars_gini = list(gini_table.loc[gini_table['Gini coefficient'] >= gini_threshold, 'variable'].values)
# The message states ">=" so it matches the inclusive filter above.
print(ufun.color.PURPLE + 'Keeping the following variables with Gini >= ' + str(gini_threshold) + ': ' + ufun.color.END + str(keep_num_vars_gini))
print(len(keep_num_vars_gini))

# Remove highly correlated features

In [None]:
# Correlations among the Gini-filtered variables on the first sample; the result
# is passed to the elimination step in the following cell.
corrs = fe.calculate_correlations(
    input_data=data['data_{}'.format(sample_values_solution[0])],
    features=keep_num_vars_gini,
    corr_threshold=corr_threshold,
    weight_variable_name=weight_variable_name_solution,
)

In [None]:
# Correlation-based elimination using the precomputed correlations; returns the
# eliminated variables and the predictors that remain.
eliminated, remaining_predictors = fe.correlation_elimination(
    method='correlation',
    features=keep_num_vars_gini,
    input_data=data['data_{}'.format(sample_values_solution[0])],
    data_path=data_path,
    corr_threshold=corr_threshold,
    top_n=top_n,
    weight_variable_name=weight_variable_name_solution,
    correlations=corrs,
)

# Optional: VIF elimination

In [None]:
# Optional VIF step, controlled by the VIF_reduction flag.
# NOTE(review): this overwrites `eliminated` and `remaining_predictors` from the
# correlation step, so re-running the cell feeds its own output back in - confirm
# that is acceptable.
eliminated, remaining_predictors = fe.run_VIF(
    VIF_reduction=VIF_reduction,
    features=remaining_predictors,
    input_data=data['data_{}'.format(sample_values_solution[0])],
    data_path=data_path,
    vif_threshold=first_vif_threshold,
    corr_threshold=corr_threshold,
    weight_variable_name=weight_variable_name_solution,
)

# Lasso Logistic Regression for feature selection

In [None]:
# Lasso logistic regression for feature selection over the remaining predictors.
# The result is stored in bic_dict whichever lasso_criterion is selected.
bic_dict = fe.perform_lasso(
    sample_values_dict=sample_values_dict,
    sample_values_solution=sample_values_solution,
    data=data,
    target_variable_name=target_variable_name,
    predictor_variables=remaining_predictors,
    data_path=data_path,
    LogisticRegression_solver=LogisticRegression_solver,
    early_stop=True,
    weight_variable_name=weight_variable_name_solution,
    standardization=False,
    c_min=1e-4,
    c_max=0.5,
    num=10,
    vif_threshold=second_vif_threshold,
    random_state=42,
    lasso_criterion=lasso_criterion,
)

In [None]:
# Take the entry stored under the first key of bic_dict.
first_lasso_key = next(iter(bic_dict))
lasso = bic_dict[first_lasso_key]
# Obtain the best C value based on the criterion selected by the user
lasso.best_vars()
# Second VIF run, on the lasso_features produced by best_vars
vifs = lasso.calculate_vifs(
    lasso.lasso_features,
    weight_variable_name=weight_variable_name_solution,
    silent=False,
)

In [None]:
# Final feature list once the second VIF threshold has been applied.
final_vars = lasso.remaining_predictors()

# Logistic Regression

## Remove features based on p-value information

In [None]:
# Logistic regression helper built on the post-lasso features; used for the
# stepwise selection in the following cell.
logistic_regression_pre = ml.logistic_regression(
    input_data=data,
    final_feats=final_vars,
    target_variable=target_variable_name,
    weight_variable_name=weight_variable_name_solution,
    data_path=data_path,
)

In [None]:
# Stepwise feature selection driven by p-values.
stepwise_features = logistic_regression_pre.stepwise_fun(
    sample_values_solution=sample_values_solution,
    # Possible values: 'backward', 'forward', 'combined'
    method='backward',
    # Set to None to allow for feature selection using the p-value
    number_of_features=None,
    # Features with p-value greater than this threshold will not be included in the selected features
    significance_level=0.05,
)

## Execute Logistic regression based on the remaining features

In [None]:
# Logistic regression helper built on the features kept by the stepwise step.
logistic_regression = ml.logistic_regression(
    input_data=data,
    final_feats=stepwise_features,
    target_variable=target_variable_name,
    weight_variable_name=weight_variable_name_solution,
    data_path=data_path,
)

In [None]:
# Run glm_bin; it returns two objects, stored as lreg_glm and lreg_summary.
lreg_glm, lreg_summary = logistic_regression.glm_bin(
    sample_values_solution=sample_values_solution,
)

## Produce reports

In [None]:
# Produce the GLM report for the model fitted above and keep the result in lr_output.
lr_output = logistic_regression.glm_report()

In [None]:
# Dictionary of dataframes with the predicted variables; it is the input to the
# reports that follow.
predictions_dict = logistic_regression.create_predictions(
    sample_values_dict=sample_values_dict,
    amount_variable_name=amount_variable_name_solution,
)

In [None]:
# Report object shared by the evaluation, lift-table and plotting cells below.
binary_regression_report_class = rp.binary_regression_report(
    predictions_dictionary=predictions_dict,
    target_variable=target_variable_name,
    predicted_score_numeric='predicted_score_numeric',
    amount_variable_name=amount_variable_name_solution,
    weight_variable_name=weight_variable_name_solution,
    sample_values_dict=sample_values_dict,
    select_top_percent=100,
    n_bands=10,
    rows=10,
    data_path=data_path,
)

In [None]:
# Evaluation metrics based on the binary predicted score; `filename` names the
# metrics file.
lr_eval = binary_regression_report_class.get_evaluation(
    predicted_score_binary='predicted_score_binary',
    filename='evaluation_metrics.csv',
)

In [None]:
# Lift table(s); 'lift_table_' looks like a file-name prefix - confirm against
# create_lift_table.
lift_table_dict = binary_regression_report_class.create_lift_table(
    filename='lift_table_',
)

In [None]:
# Folder for the graphs, nested under 'output'; created if it doesn't exist.
folder_name = 'graphs_LR'
ufun.create_folder(
    data_path=data_path,
    folder_name='output/{}'.format(folder_name),
)

In [None]:
# ADR-by-quantile plot; xlim/ylim are passed as None (presumably the default
# axis limits - confirm).
binary_regression_report_class.plot_ADR_Quantile(
    folder_name=folder_name,
    xlim=None,
    ylim=None,
)

In [None]:
# cADR-by-quantile plot; xlim/ylim are passed as None (presumably the default
# axis limits - confirm).
binary_regression_report_class.plot_cADR_Quantile(
    folder_name=folder_name,
    xlim=None,
    ylim=None,
)

In [None]:
# FPR-by-quantile plot; xlim/ylim are passed as None (presumably the default
# axis limits - confirm).
binary_regression_report_class.plot_FPR_Quantile(
    folder_name=folder_name,
    xlim=None,
    ylim=None,
)

In [None]:
# cFPR-by-quantile plot; xlim/ylim are passed as None (presumably the default
# axis limits - confirm).
binary_regression_report_class.plot_cFPR_Quantile(
    folder_name=folder_name,
    xlim=None,
    ylim=None,
)

In [None]:
# ROC curve plot, using the graphs folder name set above.
binary_regression_report_class.plot_ROC_curve(folder_name = folder_name)

In [None]:
# Precision-recall curve plot, using the graphs folder name set above.
binary_regression_report_class.plot_precision_recall_curve(folder_name = folder_name)

In [None]:
# Cut-off plot with the false-positive / false-negative costs set below.
binary_regression_report_class.plot_cutoffs(
    folder_name=folder_name,
    # Number of bands between 0 and 1
    n_bands=100,
    # Cost of blocking a legitimate customer
    cost_fp=500,
    # Cost of missing a fraud/credit risk customer
    cost_fn=10000,
    # Set to True in order to return the table that produced the graph, otherwise set to False
    return_table=True,
)

In [None]:
# Re-import the `reports` module (rp) so edits made to its source are picked up
# without restarting the kernel.
# NOTE(review): this looks like a development leftover at the end of the
# notebook - confirm whether it should stay in the final version.
from importlib import reload
reload(rp)