# Clarifications
This challenge does not include an in-depth exploration of the data, because the behavior and meaning of the variables are unknown. A proper Exploratory Data Analysis (EDA) would first require a meeting with the dataset owners.

In [1]:
# Load IPython's autoreload extension and switch it to mode 2.
%load_ext autoreload
%autoreload 2

# Libraries

In [72]:
import os
import pandas as pd
import yaml
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from feature_engine.imputation import CategoricalImputer, MeanMedianImputer
from src.data.make_dataset import get_Xs_ys, ReduceMemoryUsageTransformer
from src.preprocess.encoding import one_hot_encoder, freq_encoder
from src.preprocess.feature_selection import (
    pipe_feature_selection,
    save_selected_columns,
)
from src.utils.utils import correlation_matrix, check_first_buy, class_weight
from src.pipeline.pipeline import main_pipe

## Data Extraction

### Params

In [2]:
# Destination path handed to get_Xs_ys as `dst`, built from the working directory.
url_dest = f"{os.getcwd()}/../data/sample_HVA_DS_Beauty.zip"

# Keyword arguments unpacked into get_Xs_ys.
get_Xs_ys_dict = dict(
    y_col="OBJETIVO",
    url="https://github.com/sebakirill/meli_challenge/raw/develop/data/sample_HVA_DS_Beauty.zip",
    dst=url_dest,
    member_name="raw/sample_HVA_DS_Beauty.csv",
)

Download the data from a GitHub URL and extract it.

In [3]:
# Unpack the four values returned by get_Xs_ys into train/test features and targets.
X_train, X_test, y_train, y_test = get_Xs_ys(**get_Xs_ys_dict)

### Check First Buy

Check whether the customers are buying beauty products for the first time.

In [4]:
# Join the training features and target column-wise and pass the result to check_first_buy.
check_first_buy(pd.concat([X_train, y_train], axis=1))

Unnamed: 0_level_0,OBJETIVO
CUST_ID,Unnamed: 1_level_1


In [5]:
# Join the test features and target column-wise and pass the result to check_first_buy.
check_first_buy(pd.concat([X_test, y_test], axis=1))

Unnamed: 0_level_0,OBJETIVO
CUST_ID,Unnamed: 1_level_1


## Feature Selection
Filter the important features using a simplified XGBoost model, and visualize them with a correlation matrix.

### params


In [6]:
# Keyword arguments unpacked into pipe_feature_selection.
pipe_feature_selection_dict = dict(
    objective="binary:logistic",
    enable_categorical=True,
    imputation_num="mean",
    imputation_cat="frequent",
    col=["PHOTO_DATE", "SIT_SITE_ID"],
)

### pipe

In [7]:
# Import the builder under an alias: this cell rebinds `pipe_feature_selection`
# to the fitted pipeline, so calling the original name again when the cell is
# re-run would hit the pipeline object instead of the builder function.
from src.preprocess.feature_selection import (
    pipe_feature_selection as build_feature_selection_pipe,
)

pipe_feature_selection = build_feature_selection_pipe(**pipe_feature_selection_dict)
pipe_feature_selection.fit(X_train, y_train)

## Columns with feature importance equal to zero

In [8]:
# Pair each booster feature name with its importance and keep the names whose
# importance is not positive.
xgb_feature_names = (
    pipe_feature_selection.named_steps["xgb_class"].get_booster().feature_names
)
xgb_importances = pipe_feature_selection.named_steps["xgb_class"].feature_importances_
selected_columns = [
    feature_name
    for feature_name, score in zip(xgb_feature_names, xgb_importances)
    if score <= 0
]

Despite its name, `selected_columns` holds the columns that should be removed from the dataset, since they do not enrich the model.

In [9]:
# Keyword arguments unpacked into ReduceMemoryUsageTransformer; individual
# entries are also read when the model pipelines are built.
reduce_memory_usage_transformer_dict = dict(
    feature_selection=True,
    col_selec=selected_columns,
)

In [10]:
# Instantiate the transformer with the settings in reduce_memory_usage_transformer_dict.
memory_step = ReduceMemoryUsageTransformer(**reduce_memory_usage_transformer_dict)

In [11]:
# Fit memory_step on the training split and keep the transformed features.
X_mem_red = memory_step.fit_transform(X_train, y_train)
# NOTE(review): select_dtypes returns DataFrames, not lists of column names.
# Confirm that main_pipe expects frames here rather than `.columns` — TODO verify.
cat_col = X_mem_red.select_dtypes('object')
num_col = X_mem_red.select_dtypes('number')

# Pipeline

## params extractor

In [49]:
def extract_best_params(url_file):
    """Load a YAML hyperparameter file and split off the imputer settings.

    Args:
        url_file: Path of the YAML file to read.

    Returns:
        A tuple ``(config, num_imputer, cat_imputer)``: the parsed mapping
        without its ``"imp_num"`` and ``"imp_cat"`` entries, followed by the
        values that were stored under those two keys.

    Raises:
        KeyError: If the parsed mapping lacks ``"imp_num"`` or ``"imp_cat"``.
    """
    with open(url_file, "r") as handle:
        params = yaml.safe_load(handle)
    # The imputer entries are removed so only the remaining settings are returned first.
    num_imputer = params.pop("imp_num")
    cat_imputer = params.pop("imp_cat")
    return params, num_imputer, cat_imputer

## XGBoost

### Pipeline XGBoost Frequency Encoding

#### params

In [50]:
# Path of the stored settings for the XGBoost + frequency-encoder variant.
url_xgb_fe = f"{os.getcwd()}/../conf/best_hiperparameters/xgboost_frequency_encoder.yaml"
# Model settings plus the numeric and categorical imputer settings.
cfg_xgb_fe, cfg_xgb_fe_imp_num, cfg_xgb_fe_imp_cat = extract_best_params(url_xgb_fe)

#### pipe

In [65]:
# XGBoost pipeline built with freq_encoder as the encoder.
model_xgb_fe_pipe = main_pipe(
    model=XGBClassifier,
    cfg_model=cfg_xgb_fe,
    encoder_type=freq_encoder,
    imp_num=cfg_xgb_fe_imp_num,
    imp_cat=cfg_xgb_fe_imp_cat,
    num_col=num_col,
    cat_col=cat_col,
    # Supplies the `feature_selection` and `col_selec` keyword arguments.
    **reduce_memory_usage_transformer_dict,
)

### Pipeline XGBoost One Hot Encoding

#### params

In [53]:
# Path of the stored settings for the XGBoost + one-hot-encoder variant.
url_xgb_ohe = f"{os.getcwd()}/../conf/best_hiperparameters/xgboost_one_hot_encoder.yaml"
# Model settings plus the numeric and categorical imputer settings.
cfg_xgb_ohe, cfg_xgb_ohe_imp_num, cfg_xgb_ohe_imp_cat = extract_best_params(url_xgb_ohe)

#### pipe

In [54]:
# XGBoost pipeline built with one_hot_encoder as the encoder.
# The model settings come from the one-hot-encoder file (cfg_xgb_ohe), matching
# the imputer settings used below, not from the frequency-encoder file.
model_xgb_ohe_pipe = main_pipe(
    imp_cat=cfg_xgb_ohe_imp_cat,
    imp_num=cfg_xgb_ohe_imp_num,
    cat_col=cat_col,
    num_col=num_col,
    cfg_model=cfg_xgb_ohe,
    model=XGBClassifier,
    encoder_type=one_hot_encoder,
    feature_selection=reduce_memory_usage_transformer_dict["feature_selection"],
    col_selec=reduce_memory_usage_transformer_dict["col_selec"],
)

## LightGBM

### Pipeline LightGBM Frequency Encoding

#### params

In [18]:
# Path of the stored settings for the LightGBM + frequency-encoder variant.
url_lgbm_fe = f"{os.getcwd()}/../conf/best_hiperparameters/lightgbm_frequency_encoder.yaml"
# Model settings plus the numeric and categorical imputer settings.
cfg_lgbm_fe, cfg_lgbm_fe_imp_num, cfg_lgbm_fe_imp_cat = extract_best_params(url_lgbm_fe)

#### pipe

In [19]:
# LightGBM pipeline built with freq_encoder as the encoder.
# Only model_lgbm_fe_pipe is bound here; the one-hot variant is defined in its
# own cell and must not alias this frequency-encoder pipeline.
model_lgbm_fe_pipe = main_pipe(
    imp_cat=cfg_lgbm_fe_imp_cat,
    imp_num=cfg_lgbm_fe_imp_num,
    cat_col=cat_col,
    num_col=num_col,
    cfg_model=cfg_lgbm_fe,
    model=LGBMClassifier,
    encoder_type=freq_encoder,
    feature_selection=reduce_memory_usage_transformer_dict["feature_selection"],
    col_selec=reduce_memory_usage_transformer_dict["col_selec"],
)

### Pipeline LightGBM One Hot Encoding

#### params

In [20]:
# Path of the stored settings for the LightGBM + one-hot-encoder variant.
url_lgbm_ohe = f"{os.getcwd()}/../conf/best_hiperparameters/lightgbm_one_hot_encoder.yaml"
# Model settings plus the numeric and categorical imputer settings.
cfg_lgbm_ohe, cfg_lgbm_ohe_imp_num, cfg_lgbm_ohe_imp_cat = extract_best_params(url_lgbm_ohe)

#### pipe

In [22]:
# LightGBM pipeline built with one_hot_encoder as the encoder.
model_lgbm_ohe_pipe = main_pipe(
    model=LGBMClassifier,
    cfg_model=cfg_lgbm_ohe,
    encoder_type=one_hot_encoder,
    imp_num=cfg_lgbm_ohe_imp_num,
    imp_cat=cfg_lgbm_ohe_imp_cat,
    num_col=num_col,
    cat_col=cat_col,
    # Supplies the `feature_selection` and `col_selec` keyword arguments.
    **reduce_memory_usage_transformer_dict,
)

## Logistic Regression

### Pipeline Logistic Regression Frequency Encoding

#### params

In [24]:
# Path of the stored settings for the logistic-regression + frequency-encoder variant.
url_lgr_fe = (
    f"{os.getcwd()}/../conf/best_hiperparameters/logistic_regression_frequency_encoder.yaml"
)
# Model settings plus the numeric and categorical imputer settings.
cfg_lgr_fe, cfg_lgr_fe_imp_num, cfg_lgr_fe_imp_cat = extract_best_params(url_lgr_fe)

#### pipe

In [26]:
# Logistic-regression pipeline built with freq_encoder as the encoder.
# Only model_lgr_fe_pipe is bound here: the previous chained assignment also
# rebound model_lgbm_ohe_pipe, replacing the LightGBM one-hot pipeline with this one.
model_lgr_fe_pipe = main_pipe(
    imp_cat=cfg_lgr_fe_imp_cat,
    imp_num=cfg_lgr_fe_imp_num,
    cat_col=cat_col,
    num_col=num_col,
    cfg_model=cfg_lgr_fe,
    model=LogisticRegression,
    encoder_type=freq_encoder,
    feature_selection=reduce_memory_usage_transformer_dict["feature_selection"],
    col_selec=reduce_memory_usage_transformer_dict["col_selec"],
)

### Pipeline Logistic Regression One Hot Encoding

#### params

In [28]:
# Path of the stored settings for the logistic-regression + one-hot-encoder variant.
url_lgr_ohe = (
    f"{os.getcwd()}/../conf/best_hiperparameters/logistic_regression_one_hot_encoder.yaml"
)
# Model settings plus the numeric and categorical imputer settings.
cfg_lgr_ohe, cfg_lgr_ohe_imp_num, cfg_lgr_ohe_imp_cat = extract_best_params(url_lgr_ohe)

In [30]:
# Logistic-regression pipeline built with one_hot_encoder as the encoder.
# The estimator class (not an instance) is passed, as in the other pipeline
# cells, with its settings supplied separately through cfg_model.
model_lgr_ohe_pipe = main_pipe(
    imp_cat=cfg_lgr_ohe_imp_cat,
    imp_num=cfg_lgr_ohe_imp_num,
    cat_col=cat_col,
    num_col=num_col,
    cfg_model=cfg_lgr_ohe,
    model=LogisticRegression,
    encoder_type=one_hot_encoder,
    feature_selection=reduce_memory_usage_transformer_dict["feature_selection"],
    col_selec=reduce_memory_usage_transformer_dict["col_selec"],
)

## Random Forest

### Pipeline Random Forest Frequency Encoding

#### params

In [31]:
# Path of the stored settings for the random-forest + frequency-encoder variant.
url_rf_fe = f"{os.getcwd()}/../conf/best_hiperparameters/random_forest_frequency_encoder.yaml"
# Model settings plus the numeric and categorical imputer settings.
cfg_rf_fe, cfg_rf_fe_imp_num, cfg_rf_fe_imp_cat = extract_best_params(url_rf_fe)

#### pipe

In [32]:
# Random-forest pipeline built with freq_encoder as the encoder.
# The estimator class (not an instance) is passed, as in the other pipeline
# cells, with its settings supplied separately through cfg_model.
model_rf_fe_pipe = main_pipe(
    imp_cat=cfg_rf_fe_imp_cat,
    imp_num=cfg_rf_fe_imp_num,
    cat_col=cat_col,
    num_col=num_col,
    cfg_model=cfg_rf_fe,
    model=RandomForestClassifier,
    encoder_type=freq_encoder,
    feature_selection=reduce_memory_usage_transformer_dict["feature_selection"],
    col_selec=reduce_memory_usage_transformer_dict["col_selec"],
)

### Pipeline Random Forest One Hot Encoding

#### params

In [33]:
# Path of the stored settings for the random-forest + one-hot-encoder variant.
url_rf_ohe = f"{os.getcwd()}/../conf/best_hiperparameters/random_forest_one_hot_encoder.yaml"
# Model settings plus the numeric and categorical imputer settings.
cfg_rf_ohe, cfg_rf_ohe_imp_num, cfg_rf_ohe_imp_cat = extract_best_params(url_rf_ohe)

In [34]:
# Random-forest pipeline built with one_hot_encoder as the encoder.
model_rf_ohe_pipe = main_pipe(
    model=RandomForestClassifier,
    cfg_model=cfg_rf_ohe,
    encoder_type=one_hot_encoder,
    imp_num=cfg_rf_ohe_imp_num,
    imp_cat=cfg_rf_ohe_imp_cat,
    num_col=num_col,
    cat_col=cat_col,
    # Supplies the `feature_selection` and `col_selec` keyword arguments.
    **reduce_memory_usage_transformer_dict,
)

# Fit Model

In [39]:
def fit_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and score it with ROC AUC on the test split.

    ROC AUC ranks observations by a continuous score, so the positive-class
    probability (or the decision function when the model exposes no
    probabilities) is used instead of the hard labels returned by ``predict``.

    Args:
        model: Estimator or pipeline exposing ``fit`` and either
            ``predict_proba`` or ``decision_function``.
        X_train: Training features.
        y_train: Training target.
        X_test: Test features.
        y_test: Test target (binary).

    Returns:
        tuple: ``(roc_auc, model)`` with the test ROC AUC and the fitted model.
    """
    model.fit(X_train, y_train)
    if hasattr(model, "predict_proba"):
        # Column 1 holds the probability of the second (positive) class.
        y_score = model.predict_proba(X_test)[:, 1]
    else:
        y_score = model.decision_function(X_test)
    return roc_auc_score(y_test, y_score), model

In [36]:
# One entry per model/encoder combination; the last entry is the random-forest
# one-hot pipeline (the frequency-encoder one was previously listed twice).
models_pipe = [
    model_xgb_fe_pipe,
    model_xgb_ohe_pipe,
    model_lgbm_fe_pipe,
    model_lgbm_ohe_pipe,
    model_lgr_fe_pipe,
    model_lgr_ohe_pipe,
    model_rf_fe_pipe,
    model_rf_ohe_pipe,
]

In [37]:
# Two independent accumulators: ROC AUC values and the corresponding fitted models.
roc_results, models_fited = [], []

In [64]:
# Empty the accumulators first so re-running this cell does not duplicate entries.
roc_results.clear()
models_fited.clear()
for model in models_pipe:
    roc, model_fited = fit_model(
        model=model, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test
    )
    # Lists grow with append (they have no `add` method); the fitted model goes
    # into the `models_fited` list, not onto the model object itself.
    roc_results.append(roc)
    models_fited.append(model_fited)

ValueError: Need to specify at least one of 'labels', 'index' or 'columns'