## Mandatory Reading before the notebook
        -- https://medium.com/@guptaguptalokesh2002/function-and-power-transformer-in-ml-c5a10e011f4d#:~:text=It%20is%20a%20feature%20transformation,data%20more%20suitable%20for%20modeling.

        -- https://www.geeksforgeeks.org/data-pre-processing-wit-sklearn-using-standard-and-minmax-scaler/

        -- https://proclusacademy.com/blog/robust-scaler-outliers/
        

# Initial Setup

In [None]:
from pathlib import Path
import yaml
import os

In [None]:
# Read config.yaml from the directory the notebook is run from.
current_directory = Path.cwd()
config_file = current_directory / "config.yaml"
# NOTE(review): yaml.safe_load is the more conservative loader if the config
# only contains plain YAML types - confirm before switching.
with config_file.open() as config_stream:
    documents = yaml.full_load(config_stream)

In [None]:
# General run settings.
general = documents.get("general")
verbose = general.get("verbose")

# Environment settings: inputs come from the 01 folder, outputs go to the 02 folder.
env = documents.get("environment")
output_folder = env.get("output_folder")
input_path = Path(output_folder, "01_Initial_Data_Prep")
output_path = Path(output_folder, "02_Feature_Selection")

# Feature-selection settings, unpacked into notebook-level names.
feature_selection = documents.get("feature_selection")
(
    transformations,
    transformation_evaluation,
    transformation_sig,
    vif_na_method,
    vif_threshold,
    kbins_params,
) = (
    feature_selection.get(setting)
    for setting in (
        "transformations",
        "transformation_evaluation",
        "transformation_significance",
        "vif_na_method",
        "vif_threshold",
        "kbins_params",
    )
)
feature_selection

In [None]:
%load_ext autoreload

In [None]:
%autoreload 2

In [None]:
import pandas as pd
from Utils import low_variance

# Load the sampled training / performance sets from the data-prep folder.
# `input_path` (built in the config cell) already points at that folder, so it
# is reused here instead of re-assembling the same path by hand.
train_data = pd.read_csv(input_path / "training_sampled_data.csv")
perf_data = pd.read_csv(input_path / "performance_sampled_data.csv")

# Utils.low_variance is called with a 0.01 threshold and returns both frames
# plus `cols_removed` (presumably the names of the dropped columns - it is
# written to low_variance_columns.txt further down).
train_data, perf_data, cols_removed = low_variance(train_data, perf_data, 0.01)

In [None]:
# This is the first cell that writes to the output folder; neither to_csv nor
# open() creates missing directories, so make sure the folder exists.
Path(output_path).mkdir(parents=True, exist_ok=True)

train_data.to_csv(f"{output_path}/training_sampled_data_low_variance.csv", index=False)
perf_data.to_csv(
    f"{output_path}/performance_sampled_data_low_variance.csv", index=False
)

# Record the removed column names, one per line.
with open(f"{output_path}/low_variance_columns.txt", "w") as f:
    f.writelines(f"{col}\n" for col in cols_removed)

In [None]:
# Separate features from the label by column name. The label was already read
# by name (`.target`); dropping it by name as well means the feature frames no
# longer depend on `target` being the last column of the CSV.
X_train = train_data.drop(columns=["target"])
X_test = perf_data.drop(columns=["target"])
y_train = train_data["target"]
y_test = perf_data["target"]
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Persist the label series for later stages.
# NOTE(review): unlike the other to_csv calls in this notebook, index=False is
# not passed here, so the row index is written as the first CSV column.
y_train.to_csv(f"{output_path}/y_train.csv")
y_test.to_csv(f"{output_path}/y_test.csv")

# Transformations

        -- Removing the low variance cols

In [None]:
# Read the numeric column names saved by the data-prep stage (one per line).
with open(f"{input_path}/numeric_columns.txt", "r") as f:
    lines = f.readlines()

numeric_cols = [x.replace("\n", "") for x in lines]

# Drop the low-variance columns while keeping the file order (and removing
# duplicates). The previous set difference made the order depend on string
# hashing, so every column list derived from it could change between runs.
removed_lookup = set(cols_removed)
numeric_cols = [
    col for col in dict.fromkeys(numeric_cols) if col not in removed_lookup
]

In [None]:
from Utils import test_normality

# Project helper; presumably reports normality-test results for each of the
# numeric columns of the training features - confirm against Utils.
test_normality(X_train, numeric_cols)

## The normality tests show that none of the columns is normally distributed, so we can proceed.

In [None]:
%%time
import warnings
original_warning_state = warnings.filters[:]
warnings.simplefilter("ignore", category=RuntimeWarning)

from GenerateTransformations import DataTransformer

X_train_transformed, X_test_transformed, trans_objects, trans_details = DataTransformer(
    train=X_train,
    valid=X_test,
    cols=numeric_cols,
    transformations=transformations
).run()

warnings.filters = original_warning_state
X_train_transformed.shape, X_test_transformed.shape

In [None]:
trans_objects

In [None]:
trans_details

In [None]:
X_train_transformed

In [None]:
# Persist the transformed train / test feature frames as Parquet (no index column).
X_train_transformed.to_parquet(
    f"{output_path}/X_train_transformed.parquet", index=False
)
X_test_transformed.to_parquet(f"{output_path}/X_test_transformed.parquet", index=False)

In [None]:
import pickle

# Write each object returned in `trans_objects` to its own pickle file,
# named after its dictionary key.
for trans_name, trans_obj in trans_objects.items():
    with open(f"{output_path}/transformer_{trans_name}.pkl", "wb") as pkl_file:
        pickle.dump(trans_obj, pkl_file)

In [None]:
import json
import numpy as np


def convert_ndarray(obj):
    """Recursively convert NumPy values inside *obj* to JSON-friendly types.

    Dicts, lists and tuples are walked recursively (dict -> dict,
    list -> list, tuple -> tuple). ``np.ndarray`` values become nested lists
    and NumPy scalars (e.g. ``np.int64``, ``np.float32``) become plain Python
    scalars. Any other object is returned unchanged.
    """
    if isinstance(obj, dict):
        return {key: convert_ndarray(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [convert_ndarray(element) for element in obj]
    if isinstance(obj, tuple):
        # Arrays nested inside tuples must be converted as well.
        return tuple(convert_ndarray(element) for element in obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        # json cannot serialise NumPy scalars such as np.int64 directly.
        return obj.item()
    return obj


# Make trans_details JSON-safe first, then write it to disk.
trans_details_serializable = convert_ndarray(trans_details)

details_path = f"{output_path}/transformer_details.json"
with open(details_path, "w") as details_file:
    json.dump(trans_details_serializable, details_file)

# Evaluate Transformations

In [None]:
import numpy as np

# Empty score table: one row per numeric feature, one NaN-filled column for
# "nominal" and for every configured transformation.
score_columns = list(dict.fromkeys(["nominal", *transformations]))
trans_score_df = pd.DataFrame(
    np.nan,
    index=pd.Index(numeric_cols, name="features"),
    columns=score_columns,
)

trans_score_df.head()

In [None]:
import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning
from EvaluateTransformation import TransformationEvaluator

In [None]:
%%time
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

for col in numeric_cols:
    test_features = [col + "_" + x for x in transformations]
    test_features = [col + "_nominal"] + test_features
    for f in test_features:
        try:
            score = TransformationEvaluator(X_train_transformed.loc[:,[f]], y_train, transformation_evaluation).run()
        except np.linalg.LinAlgError:
            score = -1
        feat_name = col
        transformation_name = f.replace(feat_name + "_", "")
        trans_score_df.loc[feat_name, transformation_name] = score
        
warnings.filterwarnings("default", category=ConvergenceWarning)
warnings.filterwarnings("default", category=RuntimeWarning)
trans_score_df

In [None]:
# Compare transformations by score magnitude, ignoring the sign.
# NOTE(review): the -1 written when an evaluation raised LinAlgError becomes 1
# here - confirm that cannot outrank genuine scores.
trans_score_df = trans_score_df.abs()
trans_score_df

In [None]:
# Pick the best-scoring transformation for every feature. Only the score
# columns are passed to idxmax, so re-running this cell (or running it after
# more columns were added to the table) does not feed string-valued columns
# back into the comparison.
score_columns = list(dict.fromkeys(["nominal", *transformations]))
trans_score_df["transformation"] = trans_score_df[score_columns].idxmax(axis=1)
trans_score_df

In [None]:
trans_score_df.transformation.value_counts().sort_index()

# ANOVA

In [None]:
trans_score_df

In [None]:
%%time
# Project helper from Utils; called with the transformed training frame, the
# score table, the configured significance setting and the verbose flag.
# Presumably adds the `transformation_override` column that the following
# cells read - confirm against Utils.
from Utils import transformation_significance
trans_score_df = transformation_significance(X_train_transformed, trans_score_df, transformation_sig, verbose)
trans_score_df

In [None]:
trans_score_df.transformation_override.value_counts().sort_index()

In [None]:
# Every generated "<col>_<transformation>" column name (including "nominal").
all_cols = [
    col + "_" + t for col in numeric_cols for t in transformations + ["nominal"]
]

# Columns of the transformed frame that were not generated from a numeric
# column. Membership is tested against a set: scanning the `all_cols` list for
# every column was O(len(columns) * len(all_cols)).
generated_lookup = set(all_cols)
non_transformed_cols = [
    col for col in X_train_transformed.columns if col not in generated_lookup
]
non_transformed_cols

In [None]:
# Build the "<feature>_<transformation_override>" column name for every feature.
override_pairs = trans_score_df.reset_index()[["features", "transformation_override"]]
selected_transformations = override_pairs.agg("_".join, axis=1)
selected_transformations

In [None]:
# Untransformed columns first, then the chosen transformed column per feature.
anova_features = [*non_transformed_cols, *selected_transformations]
len(anova_features)

## 2,462: matching with the original shape of the columns

In [None]:
# One column name per line.
with open(f"{output_path}/anova_columns.txt", "w") as columns_file:
    columns_file.writelines("%s\n" % item for item in anova_features)

In [None]:
# Keep only the columns listed in `anova_features`, in that order, for both sets.
X_train_scaled = X_train_transformed.loc[:, anova_features]
X_test_scaled = X_test_transformed.loc[:, anova_features]

In [None]:
# Persist the selected train / test frames (no index column).
X_train_scaled.to_csv(f"{output_path}/X_train_scaled_anova.csv", index=False)
X_test_scaled.to_csv(f"{output_path}/X_test_scaled_anova.csv", index=False)

# Free Up Memory

In [None]:
# free up memory
import sys


def sizeof_fmt(num, suffix="B"):
    """Format *num* with a binary (1024-based) prefix, e.g. ``1536`` -> ``"1.5 KiB"``.

    The value is divided by 1024 until its magnitude drops below 1024 or the
    "Zi" prefix has been passed, in which case "Yi" is used. *suffix* is
    appended after the prefix.
    """
    magnitude = num
    for prefix in ("", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"):
        if not abs(magnitude) < 1024.0:
            magnitude = magnitude / 1024.0
            continue
        return "%3.1f %s%s" % (magnitude, prefix, suffix)
    return "%.1f %s%s" % (magnitude, "Yi", suffix)


# Print the 15 largest objects in the current namespace, biggest first.
# sys.getsizeof is shallow: it does not follow references into containers.
namespace_sizes = [
    (name, sys.getsizeof(value)) for name, value in list(locals().items())
]
largest_first = sorted(namespace_sizes, key=lambda entry: entry[1], reverse=True)
for name, size in largest_first[:15]:
    print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))

In [None]:
del train_data, perf_data, X_train, X_test, X_train_transformed, X_test_transformed

# VIF

## Compute

In [None]:
# Work on independent copies so the scaled frames stay untouched
# (DataFrame.copy is deep by default).
X_train_vif = X_train_scaled.copy()
X_test_vif = X_test_scaled.copy()

In [None]:
X_train_vif.shape, X_test_vif.shape

## VIF
-- NOTE: the VIF computation is extremely slow on this dataset, so the cell below is left commented out.

In [None]:
# %%time
# from ReduceVif import ReduceVIF
# X_train_vif, final_vif = ReduceVIF(data=X_train_vif.iloc[:,1:], threshold=vif_threshold, verbose=verbose).run()

# ## not letting the lg-seq-index to be part of this
# ## to get the custloc: we will use the lg-seq-index to get from the data from EDA dataset

## Correlation Matrix

In [None]:
%%time
from Utils import remove_correlated

# The first column is excluded from the input (iloc[:, 1:]) and the helper is
# called with a 0.90 threshold. It returns the reduced frame plus two
# collections; presumably `to_drop` holds the correlated columns that were
# removed and `one_unique_feature` the single-valued ones - confirm in Utils.
X_train_vif, one_unique_feature, to_drop = remove_correlated(X_train_vif.iloc[:,1:], threshold = 0.90)
len(to_drop),len(one_unique_feature), X_train_vif.shape

In [None]:
# The reduction step was given the frame without its first column, so
# "lg_seq_index" (assumed to be that first column - confirm) is put back at
# the front of the kept-column list.
vif_features = ["lg_seq_index", *X_train_vif.columns]
vif_features

In [None]:
# Kept columns, one name per line.
with open(f"{output_path}/vif_columns.txt", "w") as vif_file:
    vif_file.writelines("%s\n" % item for item in vif_features)

# Entries of `one_unique_feature`, one per line.
with open(f"{output_path}/one_unique_value_feature.txt", "w") as unique_file:
    unique_file.writelines("%s\n" % item for item in one_unique_feature)

In [None]:
# Restrict both sets to the kept columns and replace missing values with -1.
X_train_vif_selected = X_train_scaled.loc[:, vif_features].fillna(-1)
X_test_vif_selected = X_test_scaled.loc[:, vif_features].fillna(-1)

# Sanity check: both counts of remaining NaNs should be 0.
X_train_vif_selected.isna().sum().sum(), X_test_vif_selected.isna().sum().sum()

In [None]:
# Persist the reduced, NaN-filled train / test frames (no index column).
X_train_vif_selected.to_csv(f"{output_path}/X_train_vif.csv", index=False)
X_test_vif_selected.to_csv(f"{output_path}/X_test_vif.csv", index=False)

# Variable Selection

This part is more manual and relies on the developer's judgement. The default methods are listed below; they are only a starting point and can be changed. Depending on the outputs, keep the features that are selected by at least n of the methods:
- boruta
- anova (via kbest)
- recursive
- extratree
- lasso (via logistic regression)

## Boruta

In [None]:
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier

# Random forest handed to Boruta as its estimator.
clf = RandomForestClassifier(
    n_estimators=50,
    max_depth=5,
    n_jobs=-1,
    random_state=0,
)
boruta_feature_selector = BorutaPy(
    clf,
    n_estimators=100,
    perc=50,
    max_iter=50,
    random_state=42,
    verbose=2,
)

# Fit on every column except the first one.
boruta_inputs = X_train_vif_selected.iloc[:, 1:].values
boruta_feature_selector.fit(boruta_inputs, y_train.values)

## `lg_seq_index` (the first column) is excluded from the fit here as well, since it is used as a row key rather than as a model feature.

In [None]:
# Names of the columns Boruta marked as supported. The mask lines up with the
# fit input, i.e. every column except the first.
candidate_columns = X_train_vif_selected.columns[1:]
boruta_features = candidate_columns[boruta_feature_selector.support_]
len(boruta_features)

## ANOVA

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# Univariate selection: keep the 100 columns with the highest f_classif score.
selector = SelectKBest(score_func=f_classif, k=100)
# selector.fit(X_train_kbins.iloc[:,3:].values, y_train)
# Fit on every column except the first one.
selector.fit(X_train_vif_selected.iloc[:, 1:].values, y_train)

In [None]:
# Names of the columns kept by SelectKBest.
# NOTE: this rebinds `anova_features`, which earlier in the notebook held the
# column list written to anova_columns.txt.
candidate_columns = X_train_vif_selected.columns[1:]
anova_features = candidate_columns[selector.get_support(indices=True)]
len(anova_features)

In [None]:
# Ten highest-scoring columns according to the fitted selector.
anova_scores = pd.Series(
    selector.scores_, index=X_train_vif_selected.columns[1:], name="ANOVA"
)
anova_scores.sort_values(ascending=False).head(10).to_frame()

## Recursive

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

# Recursive feature elimination down to 100 columns, removing 10 per step.
lg = LogisticRegression(solver="sag", n_jobs=-1)
rfe = RFE(lg, n_features_to_select=100, step=10, verbose=1)

# Silence warnings only for the duration of the fit. catch_warnings restores
# the previous filters on exit (also when fit raises); the earlier
# filterwarnings("default") call instead installed a catch-all "default"
# filter that overrode every filter configured before this cell.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    # Fit on every column except the first one.
    rfe.fit(X_train_vif_selected.iloc[:, 1:].values, y_train)

In [None]:
# Names of the columns RFE kept. The mask lines up with the fit input, i.e.
# every column except the first.
candidate_columns = X_train_vif_selected.columns[1:]
recursive_features = candidate_columns[rfe.support_]
len(recursive_features)

## Extra Tree

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# NOTE: this rebinds `clf` (previously the random forest given to Boruta).
# NOTE(review): no random_state is passed, so the fitted trees - and the
# features derived from them - can differ between runs.
clf = ExtraTreesClassifier(n_estimators=50, n_jobs=-1, verbose=1)
# clf.fit(X_train_kbins.iloc[:,3:].values, y_train)
# Fit on every column except the first one.
clf.fit(X_train_vif_selected.iloc[:, 1:].values, y_train)

In [None]:
# Standard deviation of each feature's importance across the individual trees.
per_tree_importances = [tree.feature_importances_ for tree in clf.estimators_]
feat_importance_norm = np.std(per_tree_importances, axis=0)

# Keep the columns whose value is above the mean of those values.
# NOTE(review): this selects on the spread of the importances, not on
# clf.feature_importances_ itself - confirm that is intended.
above_average = feat_importance_norm > np.mean(feat_importance_norm)
extra_tree_features = X_train_vif_selected.iloc[:, 1:].iloc[:, above_average]

## Lasso

In [None]:
# from sklearn.linear_model import Lasso
# L1-penalised logistic regression used as the "lasso" selector.
# NOTE(review): saga is a stochastic solver and no random_state is passed, so
# the fitted coefficients may vary slightly between runs.
lasso = LogisticRegression(penalty="l1", C=0.5, solver="saga")
# lasso.fit(X_train_kbins.iloc[:,3:].values, y_train)
# Fit on every column except the first one.
lasso.fit(X_train_vif_selected.iloc[:, 1:].values, y_train)

In [None]:
# An L1 penalty discards a feature by shrinking its coefficient to exactly
# zero, so every non-zero coefficient - negative as well as positive - marks a
# kept feature. The previous `> 0` test dropped all negatively-weighted ones.
# `[0]` takes the first row of coef_.
lasso_features = X_train_vif_selected.iloc[:, 1:].iloc[:, (lasso.coef_ != 0)[0]]

## Aggregated

In [None]:
# One row per column of the modelling frame, one boolean flag per selection
# method, and the number of methods that picked the column.
df_selected_features = pd.DataFrame({"features": X_train_vif_selected.columns})

method_picks = {
    "boruta": boruta_features,
    "anova": anova_features,
    "recursive": recursive_features,
    "extra_tree": extra_tree_features,
    "lasso": lasso_features,
}
for method_name, picked in method_picks.items():
    df_selected_features[method_name] = df_selected_features.features.isin(picked)

df_selected_features["num_models"] = df_selected_features[list(method_picks)].sum(
    axis=1
)
df_selected_features

In [None]:
df_selected_features.num_models.value_counts().sort_index()

In [None]:
df_selected_features[df_selected_features.num_models == 4].features.values

In [None]:
# Keep the first column of the frame plus every feature that was picked by at
# least `min_model_count` of the selection methods.
min_model_count = 3
leading_columns = X_train_vif_selected.columns[:1].tolist()
enough_votes = df_selected_features.num_models >= min_model_count
selected_features = (
    leading_columns + df_selected_features.loc[enough_votes, "features"].tolist()
)
selected_features

In [None]:
# One selected column name per line.
with open(f"{output_path}/selected_columns.txt", "w") as selected_file:
    selected_file.writelines("%s\n" % item for item in selected_features)

In [None]:
# X_train_selected = X_train_kbins.loc[:, selected_features]
# X_valid_selected = X_valid_kbins.loc[:, selected_features]
# Restrict both sets to the selected columns, in `selected_features` order.
X_train_selected = X_train_vif_selected.loc[:, selected_features]
X_test_selected = X_test_vif_selected.loc[:, selected_features]
X_train_selected.shape, X_test_selected.shape

In [None]:
# Persist the final selected train / test frames (no index column).
X_train_selected.to_csv(f"{output_path}/X_train_selected.csv", index=False)
X_test_selected.to_csv(f"{output_path}/X_test_selected.csv", index=False)
# TODO: add step for y_test and train to be saved
# NOTE(review): y_train / y_test are already written to y_train.csv and
# y_test.csv earlier in this notebook - confirm whether this TODO is still open.

# Bin Data

optional step to use odds ratio...add more commentary

In [None]:
X_train_selected.shape, X_test_selected.shape

In [None]:
# transformations
# Split the selected features into the chosen transformed columns (to be
# binned) and the untransformed columns (passed to BinData as non-binning).
binning_cols = [x for x in selected_transformations if x in selected_features]
non_binning_cols = [x for x in non_transformed_cols if x in selected_features]

# X_train_binning = X_train_selected.loc[:, binning_cols]
# X_train_non_binning = X_train_selected.loc[:, non_binning_cols]
# X_test_binning = X_test_selected.loc[:, binning_cols]
# X_test_non_binning = X_test_selected.loc[:, non_binning_cols]

In [None]:
len(binning_cols), len(non_binning_cols), X_train_selected.shape

In [None]:
# Keep only the columns that survived feature selection. The lists are rebuilt
# instead of being pruned with .remove() inside a loop over themselves, which
# skips the element that follows each removed one.
selected_lookup = set(selected_features)
binning_cols = [col for col in binning_cols if col in selected_lookup]
non_binning_cols = [col for col in non_binning_cols if col in selected_lookup]

len(binning_cols), len(non_binning_cols)

In [None]:
# remove all valid
%time
from BinData import BinData

X_train_kbins, X_test_kbins, kbins_est = BinData(
    X_train=X_train_selected,
    X_valid=X_test_selected,
    binning_cols=binning_cols,
    non_binning_cols=non_binning_cols,
    n_bins=kbins_params["n_bins"],
    bin_encoding=kbins_params["bin_encoding"],
    bin_strategy=kbins_params["bin_strategy"],
    output_path=output_path,
).run()

X_train_kbins

In [None]:
# Persist the estimator returned by BinData.
with open(f"{output_path}/kbins_discretizer.pkl", "wb") as kbins_file:
    pickle.dump(kbins_est, kbins_file)

In [None]:
# Persist the binned train / test frames (no index column) and show their shapes.
X_train_kbins.to_csv(f"{output_path}/X_train_kbins.csv", index=False)
X_test_kbins.to_csv(f"{output_path}/X_test_kbins.csv", index=False)
X_train_kbins.shape, X_test_kbins.shape

# PCA

optional step to try data reductions...add more commentary

In [None]:
from PCAData import ReduceData

# Reduce every column except the first one; var=0.95 is passed to the project
# helper (presumably the share of variance to retain - confirm in PCAData).
# The third return value is not used.
X_train_reduced, pca, _ = ReduceData(
    data=X_train_selected.iloc[:, 1:].values, var=0.95
).run()

In [None]:
# Persist the `pca` object returned by ReduceData so it can be reused later.
with open(f"{output_path}/pca.pkl", "wb") as pca_file:
    pickle.dump(pca, pca_file)

In [None]:
# Re-attach the first column to the reduced components. The component frames
# are given the index of the frame they are joined to: pd.concat(axis=1)
# aligns on index labels, so a fresh 0..n-1 index would produce extra,
# NaN-padded rows whenever the selected frames do not carry a default index.
train_components = pd.DataFrame(
    np.asarray(X_train_reduced), index=X_train_selected.index
)
X_train_reduced = pd.concat(
    [X_train_selected.iloc[:, :1], train_components], axis=1
)

test_components = pd.DataFrame(
    np.asarray(pca.transform(X_test_selected.iloc[:, 1:].values)),
    index=X_test_selected.index,
)
X_test_reduced = pd.concat([X_test_selected.iloc[:, :1], test_components], axis=1)

# TODO(review): an earlier note asked why the two shapes differ; index
# misalignment in the concat was one possible cause - re-check with this change.
X_train_reduced.shape, X_test_reduced.shape

In [None]:
# Persist the reduced train / test frames (no index column).
X_train_reduced.to_csv(f"{output_path}/X_train_reduced.csv", index=False)
X_test_reduced.to_csv(f"{output_path}/X_test_reduced.csv", index=False)

# Performance Data Prep

In [None]:
# X_perf = pd.read_csv(f"{Path(output_folder)}/01_Initial_Data_Prep/X_perf.csv")
# y_perf = pd.read_csv(f"{Path(output_folder)}/01_Initial_Data_Prep/y_perf.csv")
# X_perf.shape, y_perf.shape

In [None]:
# from build_performance import BuildPerformance

# X_perf_selected = BuildPerformance(
#     X=X_perf,
#     transformations=selected_transformations,
#     scalers=scalers,
#     vif_cols=vif_features,
#     nan_replacements=null_dict,
#     bin_cols=binning_cols,
#     kbins_est=kbins_est,
#     kbins_df_cols=X_train_kbins.columns.tolist(),
#     selected_features=selected_features,
#     X_train=X_train, # temp
# ).run()
# X_perf_selected

In [None]:
# X_perf_selected.to_csv(f"{output_path}/X_perf_selected.csv", index=False)

In [None]:
# X_perf_reduced = pd.concat(
#     [X_valid_selected.iloc[:, :3], pd.DataFrame(pca.transform(X_perf_selected.iloc[:, 3:].values))], axis=1
# )
# X_perf_reduced.shape

In [None]:
# X_perf_reduced.to_csv(f"{output_path}/X_perf_reduced.csv", index=False)