# LIMITATIONS: Where weights are currently not implemented

In [None]:
#- Standardization
#- Unsupervised metrics like silhouette score
#- Factor Analysis
#- PCA

In [None]:
# Stretch the notebook container to the full browser width
from IPython.display import display, HTML

full_width_css = HTML("<style>.container { width:100% !important; }</style>")
display(full_width_css)

In [None]:
import os
import pandas as pd
import importlib

# Root folder of the Sotiris_Solutions checkout. Every other location is
# derived from it, so moving the project only requires changing one value.
# Set the SOTIRIS_SOLUTIONS_ROOT environment variable to run the notebook on
# another machine; when it is unset (or empty) the original workstation
# location is used, which reproduces the previous hard-coded paths.
solutions_root = (
    os.environ.get('SOTIRIS_SOLUTIONS_ROOT')
    or r'C:\Users\creep\Anaconda\Sotiris_Solutions'
)

# Folder of this solution, its source code and its data
project_dir = os.path.join(solutions_root, '3_Unsupervised_Modeling')
code_dir = os.path.join(project_dir, 'src')
data_path = os.path.join(project_dir, 'data')
# Shared function library, added to sys.path further down
functions_path = os.path.join(solutions_root, 'functions')

print(code_dir)
print(project_dir)
print(data_path)
print(functions_path)

In [None]:
# General Python modules
import time
import pandas as pd
import numpy as np
import importlib

In [None]:
# Set the path for the library
import sys
sys.path.insert(0, functions_path)
import variable_reduction as vr
from solution_steps import color
from data_quality_report import dq_report
import data_processing as cpd
import dimensionality_reduction as dr
import profiling as pro
import factor_analysis
import select_model as sm
import feature_importance as fi
from load_data import load_data
import solution_steps as ss
import json
from sklearn.cluster import KMeans, DBSCAN

In [None]:
# Show up to 100 columns when a DataFrame is displayed
pd.set_option(
    'display.max_columns',
    100,
)

# Initialize the solution variables

In [None]:
# Read the solution configuration from the project's input JSON file.
# JSON is UTF-8 by specification, so the encoding is stated explicitly rather
# than left to the platform default (cp1252 on Windows), which can mis-decode
# non-ASCII values.
inputs_file = os.path.join(project_dir, 'data/input/Unsupervised_Modeling_Solution_Input.json')
with open(inputs_file, encoding='utf-8') as f:
    inputs = json.load(f)

In [None]:
# Display the configuration dictionary loaded from the input JSON for review
inputs

## Essential parameters

In [None]:
# String. Specify how to load the data. Options: csv, parq.
Load_from = inputs["Load_from"]
# String. Specify the data location: this is the folder where the data for this project are saved.
data_location = inputs["data_location"]
# String. Set the input data file.
table_name = inputs["table_name"]
# Float. Number between 0-1 determining what percent of data to subsample.
sample = float(inputs["sample"])
# String. Set the weight variable name in the original dataset. If not available, then provide "None".
weight_variable_name = inputs["weight_variable_name"]
# String. Set the sample column that has sample information, e.g. train/test/OOT or segment information, and will be used to split the data in different samples.
# If this column does not exist, then provide "None".
sample_variable_name = inputs["sample_variable_name"]
# List of strings. Set the sub-sample values that are in the sample_variable_name field, e.g. for train/test data split and/or for different segments.
# All samples defined in this parameter will be picked up by the solution and results will be created for these samples.
# If the sample column does not exist, then provide '[None]' (without quotes).
sample_values = inputs["sample_values"]
# List. Provide the feature names for the numeric variables that will be used for clustering.
numeric_variables_for_clustering = inputs["numeric_variables_clustering"]
# List. Provide the feature names for the character variables that will be used for clustering.
character_variables_for_clustering = inputs["character_variables_clustering"]
# List. Provide the feature names for the numeric variables that will be used for profiling/overlaying.
numeric_variables_for_profiling = inputs["numeric_variables_profiling"]
# List. Provide the feature names for the character variables that will be used for profiling/overlaying.
character_variables_for_profiling = inputs["character_variables_profiling"]
# Int. Used in factor_analysis.remove_features function. Determines the number of factors to be used in Factor Analysis.
number_factors = inputs["number_factors"]
# Int. Used in dimensionality_reduction.fit_transform. Determines the number of principal components to be used in the final PCA model.
number_pcs = inputs["number_pcs"]
# Int. Used in dimensionality_reduction.fit_transform. Determines the number of principal components to be used in the final PCA model during the second iteration, after dropping features with low feature importance.
number_pcs_2 = inputs["number_pcs_2"]
# NOTE(review): the three eval() calls below execute arbitrary Python taken from the JSON input file.
# They are presumably needed so that model classes imported above (e.g. KMeans, DBSCAN) can be named
# in the configuration - confirm, and only run this notebook with a trusted input file.
# List. Models to test. Must include the model function, default arguments, the test argument name, and a list of values to test.
models_to_test = eval(inputs["models_to_test"])
# Dictionary. Includes the model function and arguments for the final model to be used in clustering.
final_model = eval(inputs["final_model"])
# Dictionary. Includes the model function and arguments for the final model to be used in clustering, after features were dropped due to low feature importance.
final_model_2 = eval(inputs["final_model_2"])

## Advanced parameters

In [None]:
# Float. Takes values between 0 and 1. Used in 'select_missing_variables_to_drop' function. Variables with percentage missing values above this threshold will be
# dropped from the rest of the process.
select_missing_variables_to_drop_threshold = inputs["select_missing_variables_to_drop_threshold"]
# Integer. Used in 'character_classification' function. Character variables with more levels than this threshold will be dropped from the rest of the process.
character_classification_threshold = inputs["character_classification_threshold"]
# Float. Used in the 'replace_outliers' function in the outlier removal section. This is the coefficient for the interquartile range.
# It can be used to adjust how many outliers to replace; the higher the value, the fewer outliers are replaced.
iqr_coef = inputs["iqr_coef"]
# String. Used in 'impute_missing' class. Select the strategy to impute the missing values. Current options are "median", "mean",
# or a specific value without quotes, e.g. 0.
impute_missing_imputation_strategy = inputs["impute_missing_imputation_strategy"]
# Float. Used in 'corr_eliminator' function in the initial correlations calculations. Variables with correlation greater than this threshold will be dropped.
corr_threshold = inputs["corr_threshold"]
# Int. Used in the 'corr_eliminator' function in the initial correlations calculations. After highly correlated features are dropped, this is the number of the next highest correlations.
# NOTE(review): eval() executes arbitrary Python taken from the JSON input file; if the value is always
# a plain integer string, int(...) would be the safer conversion - confirm the accepted values first.
top_n = eval(inputs["top_n"])
# Float. Used in factor_analysis.setup function. Variables with KMO above this threshold will be tested in Factor Analysis.
kmo_threshold = inputs["kmo_threshold"]
# Float. Used in factor_analysis.remove_features function. Variables with factor loadings above this threshold will be dropped.
loadings_threshold = inputs["loadings_threshold"]
# Float. Used in FeatureImportance.feature_importance_keep_vars. Variables that have feature importance less than this threshold will be dropped from clustering.
feature_importance_threshold = inputs["feature_importance_threshold"]

# Load the data

In [None]:
# Read the input table with the configured loader, location and sampling rate
data_full = load_data(
    method=Load_from,
    data_path=data_location,
    table_name=table_name,
    sample=sample,
)

In [None]:
# Print the column names, dtypes and non-null counts of the loaded data
data_full.info()

In [None]:
# Preview the first rows of the loaded data
data_full.head()

# Create the Weight and Sample variables, if not available in the input dataset

In [None]:
# Weight variable: the helper hands back the data together with the weight
# column name that the rest of the solution uses.
data_full, weight_variable_name_solution = ss.weight_var_assignment(
    data_full=data_full,
    weight_variable_name=weight_variable_name,
)

# Sample variable: the helper hands back the data, the sample values and the
# sample column name that the rest of the solution uses.
data_full, sample_values_solution, sample_variable_name_solution = ss.sample_var_assignment(
    data_full=data_full,
    sample_variable_name=sample_variable_name,
    sample_values=sample_values,
)

# Convert variable data types based on user information

In [None]:
# Convert the character variables listed for clustering and for profiling
data_full, character_variables_list_clustering = ss.convert_character_var(
    data_full=data_full,
    original_candidate_variables_character=character_variables_for_clustering,
    sample_variable_name_solution=sample_variable_name_solution,
)
data_full, character_variables_list_profiling = ss.convert_character_var(
    data_full=data_full,
    original_candidate_variables_character=character_variables_for_profiling,
    sample_variable_name_solution=sample_variable_name_solution,
)
# De-duplicate while keeping first-seen order. list(set(...)) returned the
# names in an order that changes between interpreter runs (string hashing is
# randomized), which made downstream reports and variable lists
# non-reproducible.
character_variables_list = list(dict.fromkeys(
    character_variables_list_clustering + character_variables_list_profiling
))

# Convert the numeric variables listed for clustering and for profiling
data_full, numeric_variables_list_clustering = ss.convert_numeric_var(
    data_full=data_full,
    original_candidate_variables_numeric=numeric_variables_for_clustering,
    weight_variable_name_solution=weight_variable_name_solution,
    target_variable_name='',
)
data_full, numeric_variables_list_profiling = ss.convert_numeric_var(
    data_full=data_full,
    original_candidate_variables_numeric=numeric_variables_for_profiling,
    weight_variable_name_solution=weight_variable_name_solution,
    target_variable_name='',
)
# Same order-preserving de-duplication as for the character variables
numeric_variables_list = list(dict.fromkeys(
    numeric_variables_list_clustering + numeric_variables_list_profiling
))

# Data quality report

In [None]:
# Data quality report over every character and numeric variable in use;
# the report file name is reused later when single-valued variables are dropped.
dq = dq_report(
    df=data_full,
    data_path=data_path,
    variables=character_variables_list + numeric_variables_list,
    weight_variable=weight_variable_name_solution,
    dq_report_file='data_quality_report.csv',
)

# Split sample data

In [None]:
# Split the full data into one entry per sample value
data, sample_values_dict = ss.split_sample_data(
    data_full=data_full,
    sample_values_solution=sample_values_solution,
    sample_variable_name_solution=sample_variable_name_solution,
)

# Set the original candidate variables

In [None]:
# Candidate variables as supplied by the user: character names first, then numeric
original_variables_clustering = [
    *character_variables_for_clustering,
    *numeric_variables_for_clustering,
]
original_variables_profiling = [
    *character_variables_for_profiling,
    *numeric_variables_for_profiling,
]
print(color.BLUE + 'Original variables for clustering: ' + color.END + str(original_variables_clustering))
print(color.BLUE + 'Original variables for profiling: ' + color.END + str(original_variables_profiling))

# Remove variables with high missing values percentage

In [None]:
# Columns of the first sample that are not clustering candidates
# (keys, sample column, weight, profiling-only fields, ...)
first_sample_columns = data['data_{}'.format(sample_values_solution[0])].columns
excluded_variables = [
    column
    for column in first_sample_columns
    if column not in original_variables_clustering
]
print(color.BLUE + 'Variables to be excluded: ' + color.END + str(excluded_variables))
print()

# Build (and save for review) the missing-values table and collect the
# variables whose missing percentage exceeds the configured threshold
missing_variables_table, missing_variables = ss.missing_values_vars(
    sample_values_dict=sample_values_dict,
    data_path=data_path,
    data=data,
    weight_variable_name_solution=weight_variable_name_solution,
    select_missing_variables_to_drop_threshold=select_missing_variables_to_drop_threshold,
)

# Final removal list: non-candidates plus variables with too many missing values
excluded_variables = excluded_variables + missing_variables
print(color.BLUE + 'Variables to remove from the remainder of the analysis: ' + color.END + str(excluded_variables))

# Remove character variables with many levels

In [None]:
# Character variables to keep after applying the level-count threshold
keep_char_vars_levels = ss.character_var_levels(
    data=data,
    data_path=data_path,
    sample_values_solution=sample_values_solution,
    excluded_variables=excluded_variables,
    character_classification_threshold=character_classification_threshold,
)

# Outlier replacement for numeric variables

In [None]:
# Outlier replacement applies to the numeric variables that were not excluded
outlier_variables = [
    variable
    for variable in numeric_variables_list
    if variable not in excluded_variables
]
data_full = cpd.replace_outliers(
    input_data=data_full,
    variables=outlier_variables,
    weight_variable=weight_variable_name_solution,
    data_path=data_path,
    outlier_info_file='outlier_info.csv',
    iqr_coef=iqr_coef,
)

In [None]:
# Rebuild the per-sample dictionary from the updated full data
data = {}
for sample_value, sample_label in sample_values_dict.items():
    start_time = time.time()
    print(color.BOLD + color.PURPLE + color.UNDERLINE + sample_label + color.END)

    sample_key = 'data_{}'.format(sample_value)
    in_sample = data_full[sample_variable_name_solution] == sample_value
    data[sample_key] = data_full[in_sample]
    print('The shape is: ', data[sample_key].shape)

    print('This code took %.2fs. to run' % (time.time() - start_time))

# Convert categorical variables to binary variables

In [None]:
# Convert the retained character variables of data_full to binary (one-hot) variables.
# NOTE(review): the return value is not assigned, whereas the replace_outliers call earlier
# reassigns its result to data_full. Confirm that character_to_binary adds the binary
# columns to data_full in place; if it returns a new DataFrame, the result must be
# assigned back to data_full for the re-split that follows to see the new columns.
cpd.character_to_binary(
    input_data = data_full, 
    input_variable_list = keep_char_vars_levels, 
    # Which category the one hot encoder leaves out: None returns binary variables for all
    # categories, 'first' drops the most populated category, 'last' drops the least
    # populated category.
    drop = 'last',
    # Accepted values for the protected class column; use None for non-protected class conversions.
    protected_class_valid_values = None
    )

In [None]:
# Rebuild the per-sample dictionary from the full data once more
data = {}
for sample_value, sample_label in sample_values_dict.items():
    start_time = time.time()
    print(color.BOLD + color.PURPLE + color.UNDERLINE + sample_label + color.END)

    sample_key = 'data_{}'.format(sample_value)
    row_filter = data_full[sample_variable_name_solution] == sample_value
    data[sample_key] = data_full[row_filter]
    print('The shape is: ', data[sample_key].shape)

    print('This code took %.2fs. to run' % (time.time() - start_time))

In [None]:
# Numeric variables of the first sample (one-hot encoded columns included),
# minus everything on the exclusion list
first_sample = data['data_{}'.format(sample_values_solution[0])]
keep_num_vars = [
    variable
    for variable in cpd.identify_numeric_variables(input_data=first_sample)
    if variable not in excluded_variables
]
print('Keeping the following variables: ', keep_num_vars)
print(len(keep_num_vars))

# Impute missing values

In [None]:
# For every sample, collect the variables selected by
# select_missing_variables_to_drop with a threshold of 0
variables_with_missing_dict = {}
for sample_value, sample_label in sample_values_dict.items():
    start_time = time.time()
    print(color.BOLD + color.PURPLE + color.UNDERLINE + sample_label + color.END)

    result_key = 'variables_with_missing_dict_{}'.format(sample_value)
    variables_with_missing_dict[result_key] = cpd.select_missing_variables_to_drop(
        data_path=data_path,
        sample_name=sample_label,
        threshold=0,
    )

    print('This code took %.2fs. to run' % (time.time() - start_time))

In [None]:
# Restrict imputation to the numeric features that actually have missing
# values, so the imputer does not process every column.
all_flagged_variables = sum(variables_with_missing_dict.values(), [])
# dict.fromkeys removes duplicates across samples while keeping first-seen order
variables_with_missing = list(dict.fromkeys(all_flagged_variables))
num_variables_with_missing = [
    variable
    for variable in keep_num_vars
    if variable in variables_with_missing
]
num_variables_with_missing

In [None]:
# Impute missing values
start_time = time.time()
# Configure the imputer for the numeric variables that have missing values,
# using the strategy chosen in the advanced parameters
impute_missing = cpd.impute_missing(
        variables = num_variables_with_missing, 
        imputation_strategy = impute_missing_imputation_strategy)
# Fit on the first sample in sample_values_solution only, passing the weight variable
impute_missing.imputation_fit_weight(
        input_data = data['data_{}'.format(sample_values_solution[0])], 
        weight_variable = weight_variable_name_solution)

# Apply the fitted imputation to every sample.
# NOTE(review): the return value of imputation_transform is discarded, so this relies on
# the method modifying its input in place - confirm. Each data[...] entry is a
# boolean-filtered slice of data_full, so in-place writes may also raise pandas'
# SettingWithCopyWarning.
for i, j in sample_values_dict.items():
    impute_missing.imputation_transform(input_data = data['data_{}'.format(i)])

print('This code took %.2fs. to run'%(time.time() - start_time))

In [None]:
# After imputation, report the weighted percentage of missing values that
# remains in each imputed variable, per sample (largest first)
for sample_value, sample_label in sample_values_dict.items():
    start_time = time.time()
    print(color.BOLD + color.PURPLE + color.UNDERLINE + sample_label + color.END)

    if num_variables_with_missing == []:
        print('There are no variables with missing values to impute')
    else:
        sample_df = data['data_{}'.format(sample_value)]
        total_weight = sum(sample_df[weight_variable_name_solution])
        # Weight of the rows where the column is null, as a share of the total weight
        missing_pct = sample_df[num_variables_with_missing].apply(
            lambda column: (
                sum(sample_df[column.isnull()][weight_variable_name_solution]) / total_weight
            ) * 100,
            axis=0,
        )
        print(missing_pct.sort_values(ascending=False))

    print('This code took %.2fs. to run' % (time.time() - start_time))

# Drop numeric variables with only one value

In [None]:
# Filter the numeric variables using the saved data quality report
# (section goal: drop numeric variables with only one value)
keep_num_vars_one_v = ss.keep_num_variables_one_value(
    keep_num_vars=keep_num_vars,
    data_path=data_path,
    dq_report='data_quality_report.csv',
)

# Select features for clustering

In [None]:
# Columns derived from a clustering character variable are found by name prefix.
# NOTE(review): prefix matching also picks up any unrelated column whose name
# merely starts with a character variable's name - confirm the naming is unambiguous.
keep_one_hot_list = [
    column
    for character_variable in character_variables_for_clustering
    for column in keep_num_vars_one_v
    if column.startswith(character_variable)
]
# Original clustering candidates that survived the previous filters
keep_numeric_vars_list = [
    variable
    for variable in original_variables_clustering
    if variable in keep_num_vars_one_v
]
keep_vars_for_clustering = keep_numeric_vars_list + keep_one_hot_list
print(keep_vars_for_clustering)
print(len(keep_vars_for_clustering))

# Remove highly correlated features

In [None]:
# Correlations among the clustering features, computed on the first sample
train_key = 'data_{}'.format(sample_values_solution[0])
corrs = vr.calculate_correlations(
    train_df=data[train_key],
    features=keep_vars_for_clustering,
    corr_threshold=corr_threshold,
    weight_variable_name=weight_variable_name_solution,
)

In [None]:
# Eliminate features using the precomputed correlations; the call returns the
# eliminated features and the remaining predictors
train_key = 'data_{}'.format(sample_values_solution[0])
eliminated, remaining_predictors = vr.correlation_elimination(
    method='correlation',
    features=keep_vars_for_clustering,
    train_df=data[train_key],
    data_path=data_path,
    corr_threshold=corr_threshold,
    top_n=top_n,
    weight_variable_name=weight_variable_name_solution,
    correlations=corrs,
)

# Standardize the data

In [None]:
# Standardize the remaining predictors; the first sample is passed as the
# training sample and 'standard_scaler.pkl' as the scaler file name
train_key = 'data_{}'.format(sample_values_solution[0])
data_standardized = cpd.standardize_data(
    input_data=data,
    variables=remaining_predictors,
    training_sample=train_key,
    data_path=data_path,
    filename='standard_scaler.pkl',
)

# Remove features using Factor Analysis

In [None]:
# Set up Factor Analysis on the standardized data, with the first sample as
# the training sample, then run its setup step with the KMO threshold
train_key = 'data_{}'.format(sample_values_solution[0])
fa = factor_analysis.FactorAnalysis(
    data=data_standardized,
    training_sample=train_key,
    datapath=data_path,
    filename='FactorAnalysis',
)
fa.setup(kmo_threshold=kmo_threshold)

In [None]:
# Remove features with Factor Analysis using the configured number of factors
# and loadings threshold; the result replaces the standardized data
data_standardized = fa.remove_features(
    n_factors=number_factors,
    loadings_threshold=loadings_threshold,
)

# PCA transformation

In [None]:
# Create the dimension-reduction object on the standardized data (first sample
# as the training sample) and run its exploration step
train_key = 'data_{}'.format(sample_values_solution[0])
dimension_reduction = dr.dimension_reduction(
    dic_of_dfs=data_standardized,
    data_path=data_path,
    training_sample=train_key,
)
dimension_reduction.explore()

In [None]:
# Fit the PCA model with the configured number of components and transform the data
pca_data = dimension_reduction.fit_transform(
    pca_components=number_pcs,
    solver='full',
    filename='pca_model.pkl',
)

In [None]:
# Preview the PCA-transformed data of the first sample
pca_data['data_{}'.format(sample_values_solution[0])].head()

# Clustering methodology

## Select the best parameters

In [None]:
# Model-selection helper on the PCA data; weights come from the first sample
first_sample_weights = data['data_{}'.format(sample_values_solution[0])][weight_variable_name_solution]
sm_object = sm.SelectModel(
    df=pca_data,
    sample_values_solution=sample_values_solution,
    weights=first_sample_weights,
    data_path=data_path,
    filename='ClusterProfile_',
)

In [None]:
# Profile every candidate model configuration and display the result
for candidate_model in models_to_test:
    sm_object.set_test_model(candidate_model)
    candidate_profile = sm_object.get_profile(bootstraps=10, sample_size=0.1)
    display(candidate_profile)

# Develop the Clustering model

In [None]:
# Build the clustering model from the final_model configuration
model = sm_object.create_model(
    model_inputs=final_model,
    filename='model.pkl',
)

# Test the Clustering model on validation data

In [None]:
# Run the SelectModel validation step for the model created above
sm_object.validate_data()

# Feature contribution to clusters

In [None]:
# Clustering variables that were dropped along the way
character_variables_dropped = [
    variable
    for variable in character_variables_for_clustering
    if variable not in keep_char_vars_levels
]
numeric_variables_dropped = [
    variable
    for variable in numeric_variables_for_clustering
    if variable not in keep_vars_for_clustering
]
# Dropped variables plus the sample and weight columns are left out of feature importance
non_feature_imp = (
    character_variables_dropped
    + numeric_variables_dropped
    + [sample_variable_name_solution, weight_variable_name_solution]
)
profiling_candidates = numeric_variables_list_profiling + character_variables_list_profiling
variables_list_profiling = [
    column
    for column in profiling_candidates
    if column not in non_feature_imp
]

In [None]:
# Feature importance of the profiling variables against the model's cluster
# labels, on the first sample and with its weights
first_sample = data['data_{}'.format(sample_values_solution[0])]
fi_object = fi.FeatureImportance(
    X=first_sample[variables_list_profiling],
    labels=model.labels_,
    weights=first_sample[weight_variable_name_solution],
    data_path=data_path,
    filename='FeatureImportance',
)

In [None]:
# Get the feature importance report and show it as the cell output
imps = fi_object.get_report()

imps

# Keep features based on Feature Importance

In [None]:
# Variables selected by the feature importance threshold ...
keep_fi_vars = fi_object.feature_importance_keep_vars(
    feature_importance_threshold=feature_importance_threshold,
)
# ... restricted to the predictors that survived correlation elimination,
# in the order of remaining_predictors
keep_fi_vars = [
    predictor
    for predictor in remaining_predictors
    if predictor in keep_fi_vars
]
print(keep_fi_vars)
print(len(keep_fi_vars))

# Standardize the data

### The steps below are optional, but recommended since there is an opportunity to remove the least important features from clustering as they add noise to the clusters. 

In [None]:
# Second pass: standardize only the features kept by feature importance,
# with 'standard_scaler_2.pkl' as the scaler file name
train_key = 'data_{}'.format(sample_values_solution[0])
data_standardized = cpd.standardize_data(
    input_data=data,
    variables=keep_fi_vars,
    training_sample=train_key,
    data_path=data_path,
    filename='standard_scaler_2.pkl',
)

# PCA transformation

In [None]:
# Second pass: dimension-reduction object on the re-standardized data
# (first sample as the training sample), followed by its exploration step
train_key = 'data_{}'.format(sample_values_solution[0])
dimension_reduction = dr.dimension_reduction(
    dic_of_dfs=data_standardized,
    data_path=data_path,
    training_sample=train_key,
)
dimension_reduction.explore()

In [None]:
# The number of components is capped at the column count of the standardized first sample
n_available_columns = data_standardized['data_{}'.format(sample_values_solution[0])].shape[1]
pca_data = dimension_reduction.fit_transform(
    pca_components=min(number_pcs_2, n_available_columns),
    solver='full',
    filename='pca_model_2.pkl',
)

In [None]:
# Preview the second-pass PCA-transformed data of the first sample
pca_data['data_{}'.format(sample_values_solution[0])].head()

# Clustering methodology

## Select the best parameters

In [None]:
# Second pass: model-selection helper on the new PCA data; weights come from the first sample
first_sample_weights = data['data_{}'.format(sample_values_solution[0])][weight_variable_name_solution]
sm_object = sm.SelectModel(
    df=pca_data,
    sample_values_solution=sample_values_solution,
    weights=first_sample_weights,
    data_path=data_path,
    filename='ClusterProfile_2_',
)

In [None]:
# Second pass: profile every candidate model configuration and display the result
for candidate_model in models_to_test:
    sm_object.set_test_model(candidate_model)
    candidate_profile = sm_object.get_profile(bootstraps=10, sample_size=0.1)
    display(candidate_profile)

# Develop the Clustering model

In [None]:
# Build the second-pass clustering model from the final_model_2 configuration
model = sm_object.create_model(
    model_inputs=final_model_2,
    filename='model_2.pkl',
)

# Test the Clustering model on validation data

In [None]:
# Run the SelectModel validation step for the second-pass model
sm_object.validate_data()

# Feature contribution to clusters


In [None]:
# Feature importance of the profiling variables against the second-pass
# model's cluster labels, on the first sample and with its weights
first_sample = data['data_{}'.format(sample_values_solution[0])]
fi_object = fi.FeatureImportance(
    X=first_sample[variables_list_profiling],
    labels=model.labels_,
    weights=first_sample[weight_variable_name_solution],
    data_path=data_path,
    filename='FeatureImportance_2',
)

In [None]:
# Get the second-pass feature importance report and show it as the cell output
imps = fi_object.get_report()

imps

# Profiling

In [None]:
# Add cluster labels to the input data.
# Work on an explicit copy: the first-sample entry of `data` is a
# boolean-filtered slice of data_full, and assigning a column to such a slice
# triggers pandas' SettingWithCopyWarning and silently alters the entry held
# in the `data` dictionary.
data_cluster = data['data_{}'.format(sample_values_solution[0])].copy()
data_cluster['cluster_labels'] = model.labels_
data_cluster.head()

## Profile categorical attributes

In [None]:
# Summary statistics by cluster for the character profiling variables,
# leaving out the sample column
character_profile_variables = [
    variable
    for variable in character_variables_list_profiling
    if variable not in [sample_variable_name_solution]
]
pro.character_summary_statistics(
    table_name=data_cluster,
    variable_list=character_profile_variables,
    cluster_variable_name='cluster_labels',
    weight_variable_name=weight_variable_name_solution,
    data_path=data_path,
)

## Profile numeric attributes

In [None]:
# Summary statistics by cluster for the numeric profiling variables,
# leaving out the weight column
numeric_profile_variables = [
    variable
    for variable in numeric_variables_list_profiling
    if variable not in [weight_variable_name_solution]
]
pro.numeric_summary_statistics(
    table_name=data_cluster,
    variable_list=numeric_profile_variables,
    cluster_variable_name='cluster_labels',
    weight_variable_name=weight_variable_name_solution,
    data_path=data_path,
)