# Clarificaciones
In the challenge, there is no investigation into the data due to a lack of knowledge about their behavior and meaning. For a proper Exploratory Data Analysis (EDA), a meeting should be held with the dataset owners.

In [1]:
%load_ext autoreload
%autoreload 2

# Libraries

In [2]:
import os
import pandas as pd
import yaml
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from feature_engine.imputation import CategoricalImputer, MeanMedianImputer
from src.data.make_dataset import get_Xs_ys, ReduceMemoryUsageTransformer
from src.preprocess.encoding import one_hot_encoder, freq_encoder
from src.preprocess.feature_selection import (
    pipe_feature_selection,
    save_selected_columns,
)
from src.utils.utils import correlation_matrix, check_first_buy, class_weight

## Data Extraction

### Params

In [3]:
url_dest = os.getcwd() + "/../data/sample_HVA_DS_Beauty.zip"
get_Xs_ys_dict = {
    "y_col": "OBJETIVO",
    "url": "https://github.com/sebakirill/meli_challenge/raw/develop/data/sample_HVA_DS_Beauty.zip",
    "dst": url_dest,
    "member_name": "raw/sample_HVA_DS_Beauty.csv",
}

Extract data with a request from a GitHub url

In [4]:
X_train, X_test, y_train, y_test = get_Xs_ys(**get_Xs_ys_dict)

### Check Fist Buy

Check if the customers buy beauty products for the first time.

In [5]:
check_first_buy(pd.concat([X_train, y_train], axis=1))

Unnamed: 0_level_0,OBJETIVO
CUST_ID,Unnamed: 1_level_1


In [6]:
check_first_buy(pd.concat([X_test, y_test], axis=1))

Unnamed: 0_level_0,OBJETIVO
CUST_ID,Unnamed: 1_level_1


## Feature Selection
Filter the important features using a simplified XGBoost model and visualization with a corr matrix.

### params


In [7]:
pipe_feature_selection_dict = {
    "objective": "binary:logistic",
    "enable_categorical": True,
    "imputation_num": "mean",
    "imputation_cat": "frequent",
    "col": ["PHOTO_DATE", "SIT_SITE_ID"],
}

## pipe

In [8]:
pipe_feature_selection = pipe_feature_selection(**pipe_feature_selection_dict)
pipe_feature_selection.fit(X_train, y_train)

## Columns with feature importance equal to zero

In [9]:
selected_columns = [
    col
    for col, importance in zip(
        pipe_feature_selection.named_steps["xgb_class"].get_booster().feature_names,
        pipe_feature_selection.named_steps["xgb_class"].feature_importances_,
    )
    if importance <= 0
]

The "selected_columns" are the columns that should be removed from the dataset as they do not enrich the model

In [36]:
reduce_memory_usage_transformer_dict = {
    "feature_selection": True,
    "col_selec": selected_columns,
}

In [37]:
memory_step = ReduceMemoryUsageTransformer(**reduce_memory_usage_transformer_dict)

In [38]:
memory_step.fit(X_train, y_train)

# Pipeline

## params extractor

In [40]:
def extract_best_params(url_file):
    with open(url_file, "r") as yaml_file:
        config = yaml.safe_load(yaml_file)
        num_imputer = config.pop("imp_num")
        cat_imputer = config.pop("imp_cat")
    return config, num_imputer, cat_imputer

## Frequency Encoder

In [41]:
def preprocess_freq_encoder_pipe(imp_cat, imp_num):
    preprocess_freq_encoder_pipe = ColumnTransformer(
        transformers=[
            (
                "cat",
                Pipeline(
                    [
                        ("imputing", CategoricalImputer(imputation_method=imp_cat)),
                        ("encoding", freq_encoder()),
                    ]
                ),
                memory_step.fit_transform(X_train).select_dtypes("object").columns,
            ),
            (
                "num",
                Pipeline(
                    [
                        ("imputing", MeanMedianImputer(imputation_method=imp_num)),
                    ]
                ),
                memory_step.fit_transform(X_train).select_dtypes("number").columns,
            ),
        ]
    )
    return preprocess_freq_encoder_pipe

## One Hot Encoder

In [42]:
def preprocess_one_hot_encoder_pipe(imp_cat, imp_num):
    preprocess_one_hot_encoder_pipe = ColumnTransformer(
        transformers=[
            (
                "cat",
                Pipeline(
                    [
                        ("imputing", CategoricalImputer(imputation_method=imp_cat)),
                        ("encoding", freq_encoder()),
                    ]
                ),
                memory_step.fit_transform(X_train).select_dtypes("object").columns,
            ),
            (
                "num",
                Pipeline(
                    [
                        ("imputing", MeanMedianImputer(imputation_method=imp_num)),
                    ]
                ),
                memory_step.fit_transform(X_train).select_dtypes("number").columns,
            ),
        ]
    )
    return preprocess_one_hot_encoder_pipe

## XGBoost

### Pipeline XGBoost Frequency Enconding

#### params

In [43]:
url_xgb_fe = (
    os.getcwd() + "/../conf/best_hiperparameters/xgboost_frequency_encoder.yaml"
)
cfg_xgb_fe, cfg_xgb_fe_imp_num, cfg_xgb_fe_imp_cat  = extract_best_params(url_xgb_fe)

#### pipe

In [45]:
model_xgb_fe_pipe = Pipeline(
    [
        ("pipe_prep", memory_step),
        (
            "pipe_end",
            preprocess_freq_encoder_pipe(
                imp_cat=cfg_xgb_fe_imp_cat, imp_num=cfg_xgb_fe_imp_num
            ),
        ),
        ("model", XGBClassifier(**cfg_xgb_fe)),
    ]
)

### Pipeline XGBoost One Hot Encoding

#### params

In [48]:
url_xgb_ohe = os.getcwd() + "/../conf/best_hiperparameters/xgboost_one_hot_encoder.yaml"
cfg_xgb_ohe, cfg_xgb_ohe_imp_num, cfg_xgb_ohe_imp_cat  = extract_best_params(url_xgb_ohe)

AttributeError: 'NoneType' object has no attribute 'pop'

#### pipe

In [46]:
model_xgb_ohe_pipe = Pipeline(
    [
        ("pipe_prep", memory_step),
        (
            "pipe_end",
            preprocess_one_hot_encoder_pipe(
                imp_cat=cfg_xgb_ohe_imp_cat, imp_num=cfg_xgb_ohe_imp_num
            ),
        ),
        ("model", XGBClassifier(**cfg_xgb_ohe)),
    ]
)

NameError: name 'cfg_xgb_ohe_imp_cat' is not defined

## LightGBM

### Pipeline LightGBM Frequency Encoding

#### params

In [None]:
url_lgbm_fe = (
    os.getcwd() + "/../conf/best_hiperparameters/lightgbm_frequency_encoder.yaml"
)
cfg_lgbm_fe, cfg_lgbm_fe_imp_num, cfg_lgbm_fe_imp_cat = extract_best_params(url_lgbm_fe)

#### pipe

In [None]:
model_lgbm_fe_pipe = Pipeline(
    [
        ("pipe_prep", memory_step),
        (
            "pipe_end",
            preprocess_freq_encoder_pipe(
                imp_cat=cfg_lgbm_fe_imp_cat, cfg_lgbm_fe_imp_num
            ),
        ),
        ("model", LGBMClassifier(**cfg_lgbm_fe)),
    ]
)

### Pipeline LightGBM One Hot Encoding

#### params

In [None]:
url_lgbm_ohe = (
    os.getcwd() + "/../conf/best_hiperparameters/lightgbm_one_hot_encoder.yaml"
)
cfg_lgbm_ohe, cfg_lgbm_ohe_imp_num, cfg_lgbm_ohe_imp_cat = extract_best_params(url_lgbm_ohe)

#### pipe

In [None]:
model_lgbm_ohe_pipe = Pipeline(
    [
        ("pipe_prep", memory_step),
        ("pipe_end", preprocess_one_hot_encoder_pipe(
            imp_cat=cfg_lgbm_ohe_imp_cat, imp_num=cfg_lgbm_ohe_imp_num
        )),
        ("model", LGBMClassifier(**cfg_lgbm_ohe)),
    ]
)

## Logistic Regression

### Pipeline Logistic Regression Frequency Encoding

#### params

In [None]:
url_lgr_fe = (
    os.getcwd()
    + "/../conf/best_hiperparameters/logidtic_regression_frequency_encoder.yaml"
)
cfg_lgr_fe, cfg_lgr_fe_imp_num, cfg_lgr_fe_imp_cat = extract_best_params(url_lgr_fe)

#### pipe

In [None]:
model_lgr_fe_pipe = Pipeline(
    [
        ("pipe_prep", memory_step),
        ("pipe_end", preprocess_freq_encoder_pipe(
            imp_cat=cfg_lgr_fe_imp_cat, imp_num=cfg_lgr_fe_imp_num
        )),
        ("model", LogisticRegression(**cfg_lgr_fe)),
    ]
)

### Pipeline Logistic Regression One Hot Encoding

#### params

In [None]:
url_lgr_ohe = (
    os.getcwd()
    + "/../conf/best_hiperparameters/logidtic_regression_one_hot_encoder.yaml"
)
cfg_lgr_ohe, cfg_lgr_ohe_imp_num, cfg_lgr_ohe_imp_cat = extract_best_params(url_lgr_ohe)

In [None]:
model_lgr_ohe_pipe = Pipeline(
    [
        ("pipe_prep", memory_step),
        ("pipe_end", preprocess_one_hot_encoder_pipe(
            imp_cat=cfg_lgr_ohe_imp_cat, imp_num=cfg_lgr_ohe_imp_num
        )),
        ("model", LogisticRegression(**cfg_lgr_ohe)),
    ]
)

## Random Forest

### Pipeline Random Forest Frequency Encoding

#### params

In [None]:
url_rf_fe = (
    os.getcwd() + "/../conf/best_hiperparameters/random_forest_frequency_encoder.yaml"
)
cfg_rf_fe, cfg_rf_fe_imp_num, cfg_rf_fe_imp_cat = extract_best_params(url_rf_fe)

#### pipe

In [None]:
model_rf_fe_pipe = Pipeline(
    [
        ("pipe_prep", memory_step),
        ("pipe_end", preprocess_freq_encoder_pipe(
            imp_cat=cfg_rf_fe_imp_cat, imp_num=cfg_rf_fe_imp_num
        )),
        ("model", RandomForestClassifier(**cfg_rf_fe)),
    ]
)

### Pipeline Random Forest One Hot Encoding

#### params

In [None]:
url_rf_ohe = (
    os.getcwd() + "/../conf/best_hiperparameters/random_forest_one_hot_encoder.yaml"
)
cfg_rf_ohe, cfg_rf_ohe_imp_num, cfg_rf_ohe_imp_cat = extract_best_params(url_rf_ohe)

In [None]:
model_rf_ohe_pipe = Pipeline(
    [
        ("pipe_prep", memory_step),
        ("pipe_end", preprocess_one_hot_encoder_pipe(
            imp_cat=cfg_rf_ohe_imp_cat, imp_num=cfg_rf_ohe_imp_num
        )),
        ("model", RandomForestClassifier(**cfg_rf_ohe)),
    ]
)

# Fit Model