# Pré-processamento

Nesta etapa do projeto iremos utilizar das informações coletadas na etapa de EDA para tratarmos nossas features visando uma melhor aplicação ao modelo. Trazendo nossos resultados do EDA, temos:

- Exclusão das variáveis `score_8` e `produto`
- Há variáveis discretas dentre as numéricas
- Podemos aplicar a transformação log nas variávels `score_3` e `valor_compra`
- Podemos aplicar a transformação cúbica ao `score_6`
- A variável `pais` pode ser agrupada em continentes
- Podemos transformar a variável `data_compra` em hora do dia e dia da semana.

In [73]:
import pandas as pd
import numpy as np

from sklearn.model_selection import (
    train_test_split,
    KFold,
    cross_val_score,
    GridSearchCV,
    RandomizedSearchCV,
)

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import classification_report, roc_auc_score

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, TargetEncoder
from sklearn.compose import ColumnTransformer
import pycountry_convert as pc

import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)

In [2]:
df = pd.read_csv("../data/raw/dados.csv")
df.head()

Unnamed: 0,score_1,score_2,score_3,score_4,score_5,score_6,pais,score_7,produto,categoria_produto,score_8,score_9,score_10,entrega_doc_1,entrega_doc_2,entrega_doc_3,data_compra,valor_compra,score_fraude_modelo,fraude
0,4,0.7685,94436.24,20.0,0.444828,1.0,BR,5,Máquininha Corta Barba Cabelo Peito Perna Pelo...,cat_8d714cd,0.883598,240.0,102.0,1,,N,2020-03-27 11:51:16,5.64,66,0
1,4,0.755,9258.5,1.0,0.0,33.0,BR,0,Avental Descartavel Manga Longa - 50 Un. Tnt ...,cat_64b574b,0.376019,4008.0,0.0,1,Y,N,2020-04-15 19:58:08,124.71,72,0
2,4,0.7455,242549.09,3.0,0.0,19.0,AR,23,Bicicleta Mountain Fire Bird Rodado 29 Alumini...,cat_e9110c5,0.516368,1779.0,77.0,1,,N,2020-03-25 18:13:38,339.32,95,0
3,4,0.7631,18923.9,50.0,0.482385,18.0,BR,23,Caneta Delineador Carimbo Olho Gatinho Longo 2...,cat_d06e653,0.154036,1704.0,1147.0,1,,Y,2020-04-16 16:03:10,3.54,2,0
4,2,0.7315,5728.68,15.0,0.0,1.0,BR,2,Resident Evil Operation Raccoon City Ps3,cat_6c4cfdc,0.855798,1025.0,150.0,1,,N,2020-04-02 10:24:45,3.53,76,0


Devemos dividir os dados da amostra em treino e teste antes de qualquer processamento.

In [95]:
X = df.drop("fraude", axis=1)
y = df["fraude"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

Iremos construir as etapas de pré processamento como classes para serem aplicadas a um Pipeline

In [4]:
class CustomProcessor(BaseEstimator, TransformerMixin):
    def __init__(self, cols=[]):
        self.cols = cols

    def fit(self, X, y=None):
        return self

In [5]:
class DropColumn(CustomProcessor):
    def transform(self, X):
        return X.drop(self.cols, axis=1)

In [6]:
class DocumentsProcessor(CustomProcessor):
    def transform(self, X):
        X_new = X.copy()
        X_new[self.cols] = X_new[self.cols].fillna("N")
        X_new[self.cols] = (X_new[self.cols] == "Y").astype(int)

        return X_new

In [7]:
class DateProcessor(CustomProcessor):
    def transform(self, X):
        X_new = X.copy()
        date = pd.to_datetime(X_new["data_compra"])

        X_new["hora_compra"] = date.dt.hour
        X_new["dia_compra"] = date.dt.dayofweek

        X_new = X_new.drop("data_compra", axis=1)

        return X_new

In [8]:
class OneHotEncoderProcessor(CustomProcessor):
    def transform(self, X):
        X_encoded = pd.get_dummies(X, columns=self.cols, drop_first=True, dtype=int)
        return X_encoded

In [9]:
class ImputeValuesProcessor(BaseEstimator, TransformerMixin):
    def __init__(self, discrete_cols=[], continuous_cols=[]):
        self.discrete_cols = discrete_cols
        self.continuous_cols = continuous_cols
        self.numerical_imputer = ColumnTransformer(
            transformers=[
                (
                    "discrete",
                    SimpleImputer(strategy="most_frequent"),
                    self.discrete_cols,
                ),
                ("continuous", SimpleImputer(strategy="mean"), self.continuous_cols),
            ],
            remainder="passthrough",
        )

    def fit(self, X, y=None):
        self.numerical_imputer.fit(X)
        return self

    def transform(self, X):

        X_transformed = self.numerical_imputer.transform(X)
        X_transformed = pd.DataFrame(X_transformed, columns=self._get_column_names(X))
        return X_transformed

    def _get_column_names(self, X):

        transformed_columns = (
            self.discrete_cols
            + self.continuous_cols
            + [
                col
                for col in X.columns
                if col not in self.discrete_cols + self.continuous_cols
            ]
        )
        return transformed_columns

In [10]:
class TransformColumns(CustomProcessor):
    def transform(self, X):
        X_new = X.copy()

        print(X_new["valor_compra"].dtype)
        X_new["log_score_3"] = np.log1p(X_new["score_3"].astype(float))
        X_new["log_valor_compra"] = np.log1p(X_new["valor_compra"].astype(float))
        X_new["cbrt_score_6"] = np.cbrt(X_new["score_6"].astype(float))

        X_new = X_new.drop(["score_3", "valor_compra", "score_6"], axis=1)

        return X_new

In [11]:
class CountryProcessor(CustomProcessor):
    def transform(self, X):
        X_new = X.copy()
        X_new["pais"] = X_new["pais"].fillna(X_new["pais"].mode()[0])

        X_new["continente"] = X_new["pais"].apply(
            lambda x: pc.country_alpha2_to_continent_code(x)
        )

        X_new = X_new.drop("pais", axis=1)

        return X_new

In [18]:
discrete_columns = ["score_4", "score_7"]
continuous_columns = [
    "score_2",
    "score_3",
    "score_5",
    "score_6",
    "score_9",
    "score_10",
    "valor_compra",
]

to_drop_columns = ["score_fraude_modelo", "produto", "score_8"]

documents_columns = ["entrega_doc_1", "entrega_doc_2", "entrega_doc_3"]


pipeline = Pipeline(
    [
        ("dropper", DropColumn(to_drop_columns)),
        (
            "imputer",
            ImputeValuesProcessor(
                discrete_cols=discrete_columns, continuous_cols=continuous_columns
            ),
        ),
        ("docs", DocumentsProcessor(documents_columns)),
        ("country", CountryProcessor()),
        ("date", DateProcessor()),
        ("encoder", OneHotEncoderProcessor(["score_1", "continente"])),
        ("transform", TransformColumns()),
    ]
)

In [19]:
X_train_t = pipeline.fit_transform(X_train)
X_test_t = pipeline.transform(X_test)

category_train_df = X_train_t["categoria_produto"].to_frame(name="categoria_produto")
category_test_df = X_test_t["categoria_produto"].to_frame(name="categoria_produto")


target_encoder = TargetEncoder()

X_train_t["categoria_produto"] = target_encoder.fit_transform(
    category_train_df, y_train
)
X_test_t["categoria_produto"] = target_encoder.transform(category_test_df)

object
object


Para o preenchimento de valores vazios iremos utilizar a média para variáveis contínuas e a moda para variáveis discretas.

In [20]:
def convert_to_numeric(df):
    new_df = df.copy()
    print(new_df.select_dtypes(include=["object"]).columns)
    for col in new_df.select_dtypes(include=["object"]).columns:
        try:
            new_df[col] = pd.to_numeric(new_df[col], errors="coerce")
        except ValueError as e:
            print(e)
            pass

    return new_df


X_train_t = convert_to_numeric(X_train_t)
X_test_t = convert_to_numeric(X_test_t)

Index(['score_4', 'score_7', 'score_2', 'score_5', 'score_9', 'score_10'], dtype='object')
Index(['score_4', 'score_7', 'score_2', 'score_5', 'score_9', 'score_10'], dtype='object')


In [21]:
X_train_t.dtypes

score_4              float64
score_7              float64
score_2              float64
score_5              float64
score_9              float64
score_10             float64
categoria_produto    float64
entrega_doc_1          int32
entrega_doc_2          int32
entrega_doc_3          int32
hora_compra            int32
dia_compra             int32
score_1_2              int32
score_1_3              int32
score_1_4              int32
continente_AS          int32
continente_EU          int32
continente_NA          int32
continente_OC          int32
continente_SA          int32
log_score_3          float64
log_valor_compra     float64
cbrt_score_6         float64
dtype: object

In [33]:
models = [
    ("Random Forest", RandomForestClassifier(n_estimators=100, random_state=42)),
    ("XGBoost", XGBClassifier(scale_pos_weight=20, random_state=42)),
    ("LightGBM", LGBMClassifier(class_weight="balanced", random_state=42)),
    ("Decision Tree", DecisionTreeClassifier(class_weight="balanced", random_state=42)),
]

results = []
for name, model in models:
    kfold = KFold(n_splits=4, random_state=42, shuffle=True)
    cv_results = cross_val_score(model, X_train_t, y_train, cv=kfold, scoring="roc_auc")

    recall_results = cross_val_score(
        model, X_train_t, y_train, cv=kfold, scoring="recall"
    )

    results.append(cv_results)
    msg = "%s: %f (%f)" % (
        name,
        cv_results.mean(),
        cv_results.std(),
    )
    print(msg)
    print(recall_results)

Random Forest: 0.770848 (0.003251)
[0.06894288 0.08440243 0.08024275 0.0704607 ]
XGBoost: 0.754982 (0.010100)
[0.51411687 0.5172181  0.51449764 0.49728997]
[LightGBM] [Info] Number of positive: 4440, number of negative: 85560
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004990 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2197
[LightGBM] [Info] Number of data points in the train set: 90000, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 4482, number of negative: 85518
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003814 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=

In [81]:
lgbm = LGBMClassifier(is_unbalance=True, random_state=42)

# Define Hyperparameter Grid
param_distributions = {
    "num_leaves": [31, 50, 70],
    "max_depth": [-1, 10, 20],
    "learning_rate": [0.01, 0.05, 0.1],
    "n_estimators": [100, 200, 300],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "reg_alpha": [0, 1, 10],
    "reg_lambda": [0, 1, 10],
    # "scale_pos_weight": [1, 10, 50],
}

random_search = RandomizedSearchCV(
    estimator=lgbm,
    param_distributions=param_distributions,
    n_iter=20,
    scoring="roc_auc",  # Use ROC-AUC as the evaluation metric
    cv=3,  # 3-fold cross-validation
    verbose=2,
    n_jobs=-1,  # Use all available CPU cores
    random_state=42,
)

best_model = random_search.fit(X_train_t, y_train)

print(random_search.best_score_)
print("Best Parameters:", random_search.best_params_)
best_model = random_search.best_estimator_

y_pred = best_model.predict_proba(X_test_t)[:, 1]
auc_score = roc_auc_score(y_test, y_pred)
print(f"AUC Score: {auc_score}")

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[LightGBM] [Info] Number of positive: 5963, number of negative: 114037
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006742 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2197
[LightGBM] [Info] Number of data points in the train set: 120000, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.049692 -> initscore=-2.950949
[LightGBM] [Info] Start training from score -2.950949
0.7890954478930023
Best Parameters: {'subsample': 1.0, 'reg_lambda': 10, 'reg_alpha': 1, 'num_leaves': 50, 'n_estimators': 100, 'max_depth': 10, 'learning_rate': 0.05, 'colsample_bytree': 1.0}
AUC Score: 0.7940221951675509


In [82]:
y_pred = best_model.predict(X_test_t)

In [83]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.76      0.86     28463
           1       0.13      0.67      0.22      1537

    accuracy                           0.76     30000
   macro avg       0.55      0.72      0.54     30000
weighted avg       0.93      0.76      0.82     30000



Fazer a diferença de métricas pro modelo antigo vs novo

In [89]:
y_pred = best_model.predict_proba(X_test_t)[:, 1]

In [92]:
y_pred

array([0.33869662, 0.44186255, 0.71116987, ..., 0.16637931, 0.10491473,
       0.21172617])

In [None]:
X_test_new = X_test
X_test_new["score_new_model"] = y_pred * 100
X_test_new["fraude"] = y_test

X_test_new.head()

Unnamed: 0,score_1,score_2,score_3,score_4,score_5,score_6,pais,score_7,produto,categoria_produto,...,score_9,score_10,entrega_doc_1,entrega_doc_2,entrega_doc_3,data_compra,valor_compra,score_fraude_modelo,score_new_model,fraude
59770,4,0.7643,224172.84,1.0,0.0,39.0,BR,35,Prateleira Aço Refrigerador Continental Rfct47...,cat_dcfa25e,...,4705.0,0.0,1,N,N,2020-03-15 22:24:55,30.11,35,33.869662,0
21362,4,0.6197,575.78,2.0,0.0,0.0,BR,0,100 Máscaras Descartáveis Prevenção Epidemia C...,cat_604df77,...,1526.0,0.0,0,N,N,2020-03-19 11:36:11,38.47,90,44.186255,0
127324,4,0.8011,56806.3,2.0,0.561304,0.0,BR,2,Capinha Capa Silicone C/ Logo iPhone 6s 7 8 Pl...,cat_0820fab,...,1006.0,129.0,1,Y,Y,2020-04-12 4:47:44,6.02,58,71.116987,0
140509,4,0.8588,2163.71,3.0,0.59708,0.0,BR,3,Azulejo Decorativo Vintage 3 Kits De 9 Peças D...,cat_583f57b,...,2986.0,324.0,1,,Y,2020-03-29 14:19:12,13.63,72,16.382614,0
144297,4,0.8511,3550.93,16.0,0.390681,23.0,BR,12,Patch Troféu Copa Libertadores Da América 2019...,cat_1f924ff,...,1990.0,131.0,1,,Y,2020-03-18 22:27:39,5.98,53,25.541478,0


In [101]:
%pwd
%cd ..

c:\Users\renne\Documents\Estudos Dados\Repositórios\ML\fraud-detection


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [107]:
%pwd
from src.features.base_metrics import BaseMetrics


bm_1 = BaseMetrics(X_test_new, "score_fraude_modelo", "fraude", "valor_compra")

bm_1.find_best_threshold()
bm_1.show_all_metrics()

O limiar ótimo encontrado para a amostra é de: 72
Ganhos por transações aprovadas: R$ 80329.99
Prejuízos com transações fraudulentas aprovadas: R$ 25353.32
Receita gerada com limiar ótimo: R$ 54976.67
Taxa de pressão de entrada é de 5.123333333333333%
Taxa de aprovação total é de 73.37%
Taxa de declínio total é de 25.66%
A precisão do modelo é de 13.43%
A precisão do modelo é de 68.21%
A taxa de falsos positivos é de 23.64%


In [106]:
bm_1 = BaseMetrics(X_test_new, "score_new_model", "fraude", "valor_compra")

bm_1.find_best_threshold()
bm_1.show_all_metrics()

O limiar ótimo encontrado para a amostra é de: 63
Ganhos por transações aprovadas: R$ 93952.96
Prejuízos com transações fraudulentas aprovadas: R$ 32723.68
Receita gerada com limiar ótimo: R$ 61229.28
Taxa de pressão de entrada é de 5.123333333333333%
Taxa de aprovação total é de 64.43%
Taxa de declínio total é de 34.60%
A precisão do modelo é de 19.79%
A precisão do modelo é de 49.77%
A taxa de falsos positivos é de 10.89%
