## **Data Wrangling**

- Para criar `requirements.txt`:
> poetry export -f requirements.txt --output requirements.txt --without-hashes

### **IMPORTS**

In [5]:
# Enable the autoreload extension so edits to imported project modules are
# picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload modules before each cell execution.
%autoreload 2

In [6]:
import os

# Work from the project root so the relative `data/...` paths and the `src`
# package used by later cells resolve.
# The guard keeps this cell idempotent: an unconditional os.chdir("..") climbs
# one more directory every time the cell is re-run.
# NOTE(review): assumes the notebook sits one level below the project root and
# that only the root contains a `src/` directory — confirm for this repo layout.
if not os.path.isdir("src"):
    os.chdir("..")

In [7]:
import pandas as pd

### **RAW**

In [4]:
# Load the three raw JSON sources: offers, customer profiles and transactions.
offers, customers, transactions = (
    pd.read_json(path)
    for path in (
        "data/raw/offers.json",
        "data/raw/profile.json",
        "data/raw/transactions.json",
    )
)

In [5]:
# For each raw table, report whether its key column holds repeated values
# (offers.id, customers.id, transactions.account_id — in that order).
key_columns = ((offers, "id"), (customers, "id"), (transactions, "account_id"))
tuple(frame[column].duplicated().any() for frame, column in key_columns)

(False, False, True)

### **PREPROCESS**

In [6]:
from src.preprocess import (
    join_campaigns_info,
    preprocess_customers_info,
    preprocess_transactions_info,
)

# Preprocess each raw table separately; the trailing underscore marks the
# preprocessed version so the raw frames stay available.
customers_ = preprocess_customers_info(data=customers)
transactions_ = preprocess_transactions_info(data=transactions)

In [7]:
# Totals: 10 offers, 17000 customers, 306534 logged events; every customer has at least one event
(
    offers.shape,
    customers_.shape,
    transactions_.shape,
    transactions_["account_id"].nunique(),
)

((10, 6), (17000, 5), (306534, 6), 17000)

In [8]:
# Combine the offers table with the preprocessed customers and transactions
# into a single table; the shape is displayed as a quick size check.
data = join_campaigns_info(offers=offers, customers=customers_, transactions=transactions_)
data.shape

(306534, 15)

In [9]:
# Sanity check: wherever an offer reward is present, the discount value that
# was generated must equal it. Finding: no inconsistencies (result is False).
has_reward = data["reward"].notna()
discount_differs = data["discount_value"].ne(data["reward"])
(has_reward & discount_differs).any()

False

In [10]:
# Persist the joined table (without the index) so the enrichment step can
# start from disk.
data.to_csv("data/processed/campaigns_data.csv", index=False)

### **ENRICH**

In [11]:
import ast

# Reload the joined table from disk and restore the column types lost in the
# CSV round-trip. Not needed when `data` was produced earlier in this session.
data = pd.read_csv("data/processed/campaigns_data.csv")
data = data.assign(
    registered_on=pd.to_datetime(data["registered_on"]),
    # `channels` is read back as text: parse it as a Python literal and leave
    # missing values untouched.
    channels=data["channels"].apply(
        lambda raw: raw if pd.isnull(raw) else ast.literal_eval(raw)
    ),
)

data.shape

(306534, 15)

In [12]:
from src.preprocess import enricher_transactions_information
# TODO: works, but look for a way to optimize it later
# Takes about 35s for 306534 rows [ok]
dataf = enricher_transactions_information(data)
dataf.shape

(306534, 16)

In [13]:
# Persist the enriched table (without the index) so the feature engineering
# step can start from disk.
dataf.to_csv("data/processed/enriched_campaigns_data.csv", index=False)

### **FEATURE ENGINEERING**

In [18]:
import ast

# Reload the enriched table from disk and restore the column types lost in the
# CSV round-trip. Not needed when `dataf` was produced earlier in this session.
dataf = pd.read_csv("data/processed/enriched_campaigns_data.csv")
dataf = dataf.assign(
    registered_on=pd.to_datetime(dataf["registered_on"]),
    # `channels` is read back as text: parse it as a Python literal and leave
    # missing values untouched.
    channels=dataf["channels"].apply(
        lambda raw: raw if pd.isnull(raw) else ast.literal_eval(raw)
    ),
)

dataf.shape

(306534, 16)

In [19]:
from src.utils import (
    built_target,
    calculate_days_between_receiving_viewing,
    calculate_days_between_same_events,
)

# Run the event-level steps in order; each one receives the previous result,
# starting from the enriched table `dataf`.
event_level_steps = (
    built_target,
    calculate_days_between_same_events,
    calculate_days_between_receiving_viewing,
)
data_model = dataf
for step in event_level_steps:
    data_model = step(data=data_model)

data_model.shape

(306534, 21)

In [20]:
from src.utils import build_customer_features, build_engagement_features, build_offer_features

# Grouping keys, from coarsest to finest; each level extends the previous one.
groupby_customer = ["account_id"]
groupby_target = groupby_customer + ["offer_id"]
groupby_offers = groupby_target + ["duration", "min_value", "discount_value"]

customer_feats = build_customer_features(data=data_model, agg_columns=groupby_customer)
offers_feats = build_offer_features(data=data_model, agg_columns=groupby_offers)
# Engagement features are built from the offer-level features, not from the
# event-level table.
engagement_feats = build_engagement_features(data=offers_feats, agg_columns=groupby_customer)

# Target at account_id + offer_id level: the maximum `target` within each pair.
target_by_pair = data_model.groupby(groupby_target)["target"].max()
profile_offer_target = target_by_pair.reset_index()

(
    customer_feats.shape,
    offers_feats.shape,
    engagement_feats.shape,
    profile_offer_target.shape,
)

((17000, 14), (63288, 18), (16994, 10), (63288, 3))

In [21]:
from src.utils import unify_modeling_dataset

# NOTE: this rebinds `data_model` from the event-level table to the unified
# modeling table. The feature-building cell takes `data_model` as input, so it
# cannot be re-run after this cell without rebuilding the event-level table.
feature_tables = dict(
    offer_feats=offers_feats,
    engagement_feats=engagement_feats,
    customer_feats=customer_feats,
    profile_offer_target=profile_offer_target,
)
data_model = unify_modeling_dataset(**feature_tables)

# Replace spaces in column names with underscores.
data_model.columns = [name.replace(" ", "_") for name in data_model.columns]
data_model.shape

(63288, 41)

In [22]:
# The dataset must hold one row per ("account_id", "offer_id") pair, so no
# duplicated pair is expected (result is False).
data_model.duplicated(subset=["account_id", "offer_id"]).any()

False

In [24]:
# Persist the final modeling dataset (without the index).
data_model.to_csv("data/processed/data_model.csv", index=False)