In [None]:
# Stretch the notebook container so that wide tables can use the full browser width
from IPython.display import HTML, display

display(
    HTML("<style>.container { width:100% !important; }</style>")
)

In [None]:
import ast
import importlib
import os
import sys

import pandas as pd

# Resolve the folders used by the rest of the notebook from the working directory.
# In the modern branch the folder two levels above the working directory is treated
# as the root that contains both "2_Supervised_Modeling" and "functions".
if sys.version_info[:3] < (3, 4):
    # Interpreters older than 3.4 do not ship pathlib, so only os.path is used here.
    code_dir = os.path.dirname(os.getcwd())
    project_dir = os.path.dirname(os.path.dirname(os.getcwd()))
    data_path = os.path.join(code_dir, "data")
    functions_path = os.path.join(project_dir, "functions")
    # NOTE(review): in this branch project_dir is the folder two levels above the
    # working directory, whereas the modern branch points it at
    # <two levels up>/2_Supervised_Modeling. The settings file is later opened
    # relative to project_dir - confirm which of the two is intended here.
else:
    from pathlib import Path
    current_directory = os.path.dirname(Path.cwd())
    code_dir = os.path.dirname(current_directory)
    project_dir = os.path.join(code_dir, "2_Supervised_Modeling")
    # Each path component is passed separately so the right separator is used on
    # every operating system (a literal backslash is only a separator on Windows).
    data_path = os.path.join(code_dir, "2_Supervised_Modeling", "data")
    functions_path = os.path.join(code_dir, 'functions')

# Echo the resolved locations so a wrong working directory is spotted immediately
print(project_dir)
print(data_path)
print(functions_path)

In [None]:
# General Python modules
import time
import pandas as pd
import numpy as np
import importlib
import json

In [None]:
# Put the shared functions folder at the front of the module search path so that
# the project modules imported below are resolved from it first
import sys
sys.path.insert(0, functions_path)
# Project-local modules (presumably located in functions_path - confirm)
import variable_reduction as vr
from solution_steps import color
from data_quality_report import dq_report
import preprocessing as pp
import data_processing as cpd
import lasso_feature_selection as lfs
import logistic_regression as lreg 
from load_data import load_data
import solution_steps as ss

In [None]:
# Let pandas show up to 100 columns before it truncates wide DataFrames
pd.options.display.max_columns = 100

# Initialize the solution variables

In [None]:
# Load the user-supplied solution settings from the project's input folder.
# The encoding is pinned to UTF-8 (the standard JSON encoding) so the file is decoded
# the same way on every platform instead of with the locale-dependent default.
with open(os.path.join(project_dir, 'data/input/Supervised_Modeling_Solution_Input.json'), encoding='utf-8') as f:
    inputs = json.load(f)

In [None]:
# Display the loaded settings so they can be checked before the run
inputs

## Essential parameters

In [None]:
# String. Specifies how to load the data. Options: csv, parq.
Load_from = inputs["Load_from"]
# String. The data location: the folder where the data for this project are saved.
data_location = inputs["data_location"]
# String. The input data file.
table_name = inputs["table_name"]
# Float. Number between 0 and 1 that determines what fraction of the data to subsample.
# The setting is converted with float() before use.
sample = float(inputs["sample"])
# String. The name of the target variable in the original dataset.
target_variable_name = inputs["target_variable_name"]
# String. The name of the weight variable in the original dataset. If not available, provide "None".
weight_variable_name = inputs["weight_variable_name"]
# String. The column that holds the sample information, e.g. train/test/OOT or segment information; it is used to
# split the data into different samples. If this column does not exist, provide "None".
sample_variable_name = inputs["sample_variable_name"]
# List of strings. The sub-sample values found in the sample_variable_name field, e.g. for a train/test data split
# and/or for different segments. All samples defined in this parameter are picked up by the solution and results are
# created for each of them.
# If the sample column does not exist, provide '[None]' (without quotes).
sample_values = inputs["sample_values"]
# List. The feature names of the numeric variables that will be used for modeling.
original_candidate_variables_numeric = inputs["numeric_variables_modeling"]
# List. The feature names of the character variables that will be used for modeling.
original_candidate_variables_character = inputs["character_variables_modeling"]

## Advanced parameters

In [None]:
# Float between 0 and 1. Used in the 'select_missing_variables_to_drop' function. Variables whose percentage of
# missing values is above this threshold will be dropped from the rest of the process.
select_missing_variables_to_drop_threshold = inputs["select_missing_variables_to_drop_threshold"]
# Integer. Used in the 'character_classification' function. Character variables with more levels than this threshold
# will be dropped from the rest of the process.
character_classification_threshold = inputs["character_classification_threshold"]
# Float. Used in the 'replace_outliers' function in the outlier removal section. This is the coefficient for the
# interquartile range. It adjusts how many outliers are replaced; the higher the value, the fewer outliers are replaced.
iqr_coef = inputs["iqr_coef"]
# String. Used in the 'impute_missing' class. Selects the strategy used to impute the missing values. Current options
# are "median", "mean", or a specific value without quotes, e.g. 0.
impute_missing_imputation_strategy = inputs["impute_missing_imputation_strategy"]
# Float. Variables with a Gini coefficient below this threshold will be dropped from the remainder of the analysis.
gini_threshold = inputs["gini_threshold"]
# Float. Used in the 'corr_eliminator' function in the initial correlation calculations. Variables with correlation
# greater than this threshold will be dropped.
corr_threshold = inputs["corr_threshold"]
# Int. Used in the 'corr_eliminator' function in the initial correlation calculations. After highly correlated
# features are dropped, this is the number of the next highest correlations.
# The setting is parsed with ast.literal_eval rather than eval: only Python literals are accepted, so text placed in
# the settings file can never be executed as code. A value that is already a number is used as is.
top_n = inputs["top_n"]
if isinstance(top_n, str):
    top_n = ast.literal_eval(top_n)
# Float. Used in the 'vif_eliminator' function in the initial VIF calculations. Variables with VIF greater than this
# threshold will be dropped.
first_vif_threshold = inputs["first_vif_threshold"]
# Float. Used in the 'vif_eliminator' function in the Lasso Logistic Regression. Variables with VIF greater than this
# threshold will be dropped.
second_vif_threshold = inputs["second_vif_threshold"]
# String. The criterion to optimize for feature selection. Options are: "AIC", "BIC".
lasso_criterion = inputs["lasso_criterion"]
# Boolean. Determines whether VIF is run after the correlation feature elimination step.
VIF_reduction = inputs["VIF_reduction"]

# Load the data

In [None]:
# Read the modeling table with the loader type, location, file name and
# sub-sampling fraction configured in the essential parameters
data_full = load_data(
    method=Load_from,
    data_path=data_location,
    table_name=table_name,
    sample=sample,
)

In [None]:
# Print a structural summary of the loaded table
data_full.info()

In [None]:
# Preview the first rows of the loaded table
data_full.head()

# Create the Weight and Sample variables, if not available in the input dataset

In [None]:
# Weight variable: the helper hands back the table together with the weight
# column name that the rest of the solution uses
data_full, weight_variable_name_solution = ss.weight_var_assignment(
    data_full=data_full,
    weight_variable_name=weight_variable_name,
)

# Sample variable: the helper hands back the table, the sample values and the
# sample column name that the rest of the solution uses
data_full, sample_values_solution, sample_variable_name_solution = ss.sample_var_assignment(
    data_full=data_full,
    sample_variable_name=sample_variable_name,
    sample_values=sample_values,
)

# Convert variable data types based on user information

In [None]:
# Character variables: convert the user-declared character candidates and
# collect the resulting list of character variables
data_full, character_variables_list = ss.convert_character_var(
    data_full=data_full,
    original_candidate_variables_character=original_candidate_variables_character,
    sample_variable_name_solution=sample_variable_name_solution,
)

# Numeric variables: convert the user-declared numeric candidates (the weight and
# target names are passed along) and collect the resulting list of numeric variables
data_full, numeric_variables_list = ss.convert_numeric_var(
    data_full=data_full,
    original_candidate_variables_numeric=original_candidate_variables_numeric,
    weight_variable_name_solution=weight_variable_name_solution,
    target_variable_name=target_variable_name,
)

# Data quality report

In [None]:
# Data quality report over every converted character and numeric variable.
# The same report file name is passed again to the single-value check further down.
dq = dq_report(
    df=data_full,
    data_path=data_path,
    variables=character_variables_list + numeric_variables_list,
    weight_variable=weight_variable_name_solution,
    dq_report_file='data_quality_report.csv',
)

# Split sample data

In [None]:
# Split the full table by sample. `data` is indexed below with 'data_<sample value>'
# keys, and `sample_values_dict` is iterated as (sample value, printable label) pairs.
data, sample_values_dict = ss.split_sample_data(
    data_full=data_full,
    sample_values_solution=sample_values_solution,
    sample_variable_name_solution=sample_variable_name_solution,
)

# Set the original candidate variables

In [None]:
# Candidate predictors: the character candidates followed by the numeric candidates
original_candidate_variables = (
    original_candidate_variables_character
    + original_candidate_variables_numeric
)
print(
    color.BLUE + 'Original candidate variables: ' + color.END
    + str(original_candidate_variables)
)

# Remove variables with high missing values percentage

In [None]:
# Non-predictive columns (keys, target, sample, etc.): every column of the first
# sample's table that is not one of the original candidate variables
excluded_variables = [
    column
    for column in data['data_{}'.format(sample_values_solution[0])].columns
    if column not in original_candidate_variables
]
print(color.BLUE + 'Variables to be excluded: ' + color.END + str(excluded_variables))
print()

# Produce and save the missing values table for review, and obtain the variables
# selected for removal using the configured missing-values threshold
missing_variables_table, missing_variables = ss.missing_values_vars(
    sample_values_dict=sample_values_dict,
    data_path=data_path,
    data=data,
    weight_variable_name_solution=weight_variable_name_solution,
    select_missing_variables_to_drop_threshold=select_missing_variables_to_drop_threshold,
)

# Final removal list: non-predictors plus variables with too much missing information
excluded_variables = excluded_variables + missing_variables
print(
    color.BLUE + 'Variables to remove from the remainder of the analysis: ' + color.END
    + str(excluded_variables)
)

# Remove character variables with many levels

In [None]:
# Character variables to keep after the check on the number of levels, using the
# configured character classification threshold
keep_char_vars_levels = ss.character_var_levels(
    data=data,
    data_path=data_path,
    sample_values_solution=sample_values_solution,
    excluded_variables=excluded_variables,
    character_classification_threshold=character_classification_threshold,
)

# Outlier replacement for numeric variables

In [None]:
# Outliers are only treated for numeric candidates that have not been excluded
outlier_variables = [
    variable
    for variable in original_candidate_variables_numeric
    if variable not in excluded_variables
]

# data_full is replaced by the helper's output, so re-running this cell applies
# the replacement to already-treated data
data_full = cpd.replace_outliers(
    input_data=data_full,
    variables=outlier_variables,
    weight_variable=weight_variable_name_solution,
    data_path=data_path,
    outlier_info_file='outlier_info.csv',
    iqr_coef=iqr_coef,
)

In [None]:
# Rebuild the per-sample tables from data_full so that they reflect the
# outlier replacement applied above
data = {}
for sample_value, sample_label in sample_values_dict.items():
    start_time = time.time()
    print(color.BOLD + color.PURPLE + color.UNDERLINE + sample_label + color.END)

    # Rows whose sample column equals this sample value
    sample_key = 'data_{}'.format(sample_value)
    sample_rows = data_full[sample_variable_name_solution] == sample_value
    data[sample_key] = data_full[sample_rows]
    print('The shape is: ', data[sample_key].shape)

    print('This code took %.2fs. to run'%(time.time() - start_time))

# Convert categorical variables to binary variables

In [None]:
# Convert the retained character variables to binary (one-hot) variables.
#   drop: specifies which value to drop from the one hot encoder. None returns binary
#         variables for all categories, 'first' drops the most populated category and
#         'last' drops the least populated category.
#   protected_class_valid_values: accepted values for the protected class column; use
#         None for non-protected class conversions.
# NOTE(review): the return value is not assigned, so this step relies on
# character_to_binary modifying data_full in place - confirm against the helper.
cpd.character_to_binary(
    input_data=data_full,
    input_variable_list=keep_char_vars_levels,
    drop='last',
    protected_class_valid_values=None,
)

In [None]:
# Rebuild the per-sample tables from data_full once more, this time after the
# character-to-binary conversion step
data = {}
for sample_value, sample_label in sample_values_dict.items():
    start_time = time.time()
    print(color.BOLD + color.PURPLE + color.UNDERLINE + sample_label + color.END)

    # Rows whose sample column equals this sample value
    sample_key = 'data_{}'.format(sample_value)
    sample_rows = data_full[sample_variable_name_solution] == sample_value
    data[sample_key] = data_full[sample_rows]
    print('The shape is: ', data[sample_key].shape)

    print('This code took %.2fs. to run'%(time.time() - start_time))

In [None]:
# Numeric variables of the first sample's table (including the one-hot encoded
# ones), minus everything on the exclusion list
keep_num_vars = [
    variable
    for variable in cpd.identify_numeric_variables(
        input_data=data['data_{}'.format(sample_values_solution[0])]
    )
    if variable not in excluded_variables
]
print('Keeping the following variables: ', keep_num_vars)
print(len(keep_num_vars))

# Impute missing values

In [None]:
# For every sample, ask the helper for the variables selected with a threshold of 0.
# The helper receives only data_path and the sample label, so it presumably reads the
# missing values table saved earlier - confirm against the helper.
variables_with_missing_dict = {}
for sample_value, sample_label in sample_values_dict.items():
    start_time = time.time()
    print(color.BOLD + color.PURPLE + color.UNDERLINE + sample_label + color.END)

    result_key = 'variables_with_missing_dict_{}'.format(sample_value)
    variables_with_missing_dict[result_key] = cpd.select_missing_variables_to_drop(
        data_path=data_path,
        sample_name=sample_label,
        threshold=0,
    )

    print('This code took %.2fs. to run'%(time.time() - start_time))

In [None]:
# Numeric features with missing values. Imputation is applied only to these
# features in order to keep the imputation step fast.
# Merge the per-sample lists, keeping the first occurrence of each variable.
variables_with_missing = list(dict.fromkeys(
    variable
    for sample_variables in variables_with_missing_dict.values()
    for variable in sample_variables
))
num_variables_with_missing = [
    variable for variable in keep_num_vars if variable in variables_with_missing
]
num_variables_with_missing

In [None]:
# Fit the imputer on the first sample, then apply it to every sample
start_time = time.time()

impute_missing = cpd.impute_missing(
    variables=num_variables_with_missing,
    imputation_strategy=impute_missing_imputation_strategy,
)
impute_missing.imputation_fit_weight(
    input_data=data['data_{}'.format(sample_values_solution[0])],
    weight_variable=weight_variable_name_solution,
)

# NOTE(review): the result of imputation_transform is not assigned, so this relies
# on the per-sample tables being modified in place - confirm against the class.
for sample_value in sample_values_dict:
    impute_missing.imputation_transform(input_data=data['data_{}'.format(sample_value)])

print('This code took %.2fs. to run'%(time.time() - start_time))

In [None]:
# For every sample, print the weighted percentage of values still missing in each
# imputed variable, sorted from highest to lowest
for sample_value, sample_label in sample_values_dict.items():
    start_time = time.time()
    print(color.BOLD + color.PURPLE + color.UNDERLINE + sample_label + color.END)

    if num_variables_with_missing == []:
        print('There are no variables with missing values to impute')
    else:
        sample_data = data['data_{}'.format(sample_value)]
        # Total weight of the sample: the denominator shared by every variable
        total_weight = sum(sample_data[weight_variable_name_solution])
        missing_percentage = sample_data[num_variables_with_missing].apply(
            lambda column: (
                sum(sample_data[column.isnull()][weight_variable_name_solution])
                / total_weight
            ) * 100,
            axis=0,
        )
        print(missing_percentage.sort_values(ascending=False))

    print('This code took %.2fs. to run'%(time.time() - start_time))

# Drop numeric variables with only one value

In [None]:
# Numeric variables to keep after the single-value check; the helper is pointed at
# the data quality report file name used earlier in the notebook
keep_num_vars_one_v = ss.keep_num_variables_one_value(
    keep_num_vars=keep_num_vars,
    data_path=data_path,
    dq_report='data_quality_report.csv',
)

# Drop variables based on low Gini

In [None]:
# Gini table for the remaining numeric variables, computed on the first sample
gini_table = pp.gini_values_weight(
    feats=keep_num_vars_one_v,
    input_data=data['data_{}'.format(sample_values_solution[0])],
    target_variable=target_variable_name,
    weight_variable=weight_variable_name_solution,
    data_path=data_path,
    gini_info_file='gini_info.csv',
    n_bands=10,
)

# Keep variables whose Gini coefficient is at or above the threshold. The message
# below states ">=" so that it matches the comparison actually applied.
keep_num_vars_gini = list(
    gini_table.loc[gini_table['Gini coefficient'] >= gini_threshold, 'variable'].values
)
print(
    color.PURPLE + 'Keeping the following variables with Gini >= ' + str(gini_threshold) + ': '
    + color.END + str(keep_num_vars_gini)
)
print(len(keep_num_vars_gini))

# Remove highly correlated features

In [None]:
# Correlations between the Gini-selected variables, computed on the first sample;
# the result is passed to the elimination step that follows
corrs = vr.calculate_correlations(
    train_df=data['data_{}'.format(sample_values_solution[0])],
    features=keep_num_vars_gini,
    corr_threshold=corr_threshold,
    weight_variable_name=weight_variable_name_solution,
)

In [None]:
# Correlation-based elimination using the precomputed correlations; returns the
# eliminated variables and the predictors that remain
eliminated, remaining_predictors = vr.correlation_elimination(
    method='correlation',
    features=keep_num_vars_gini,
    train_df=data['data_{}'.format(sample_values_solution[0])],
    data_path=data_path,
    corr_threshold=corr_threshold,
    top_n=top_n,
    weight_variable_name=weight_variable_name_solution,
    correlations=corrs,
)

# Optional: VIF elimination

In [None]:
# VIF step, controlled by the VIF_reduction setting and the first VIF threshold.
# Both outputs overwrite the results of the correlation step, and the input is
# remaining_predictors itself, so re-running this cell feeds it its own output.
eliminated, remaining_predictors = vr.run_VIF(
    VIF_reduction=VIF_reduction,
    features=remaining_predictors,
    train_df=data['data_{}'.format(sample_values_solution[0])],
    data_path=data_path,
    vif_threshold=first_vif_threshold,
    corr_threshold=corr_threshold,
    weight_variable_name=weight_variable_name_solution,
)

# Lasso Logistic Regression for feature selection

In [None]:
# Lasso logistic regression over the remaining predictors. The search settings
# (c_min, c_max, num), early stopping, standardization and the random state are
# fixed here; the second VIF threshold is taken from the advanced parameters.
bic_dict = ss.perform_lasso(
    sample_values_dict=sample_values_dict,
    sample_values_solution=sample_values_solution,
    data=data,
    target_variable_name=target_variable_name,
    predictor_variables=remaining_predictors,
    data_path=data_path,
    early_stop=True,
    weight_variable_name=weight_variable_name_solution,
    standardization=False,
    c_min=1e-4,
    c_max=0.5,
    num=10,
    vif_threshold=second_vif_threshold,
    random_state=42,
)

In [None]:
# NOTE(review): the 'training' key is hard-coded - confirm that it matches the
# keys produced by perform_lasso for the configured samples.
lasso = bic_dict['training']

# Obtain the best C value based on the criterion selected by the user
lasso.best_vars(lasso_criterion)

# Run the second VIF using the lasso_features set by the best_vars call
vifs = lasso.calculate_vifs(
    lasso.lasso_features,
    weight_variable_name=weight_variable_name_solution,
    silent=False,
)

In [None]:
# Obtain the final list of features after the second VIF threshold calculation;
# this list is passed to the logistic regression and its evaluation below
final_vars = lasso.remaining_predictors()

# Logistic Regression

In [None]:
# Fit the logistic regression on the final features; returns the fitted model
# object and its summary, both used by the cells that follow
lreg_glm, lreg_summary = lreg.glm_bin(
    sample_values_solution=sample_values_solution,
    data=data,
    final_feats=final_vars,
    target_variable_name=target_variable_name,
    weight_variable_name_solution=weight_variable_name_solution,
)

In [None]:
# Build the regression report from the model summary
output = lreg.glm_report(
    data_path=data_path,
    glm_bin_summary=lreg_summary,
)

In [None]:
# Evaluate the fitted model on the samples listed in sample_values_dict
eval_ = lreg.get_evaluation(
    model=lreg_glm,
    sample_values_dict=sample_values_dict,
    final_feats=final_vars,
    target_variable_name=target_variable_name,
    data=data,
    data_path=data_path,
)