In [1]:
import numpy as np 
import pandas as pd

import random

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import average_precision_score

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
rng = np.random.RandomState(123)

In [3]:
filepath = "/home/28ed2e24-6dc0-4639-9c20-93842c733782/puboost/notebooks/input/marketing_campaign.csv"
df = pd.read_csv(filepath , sep = '\t')

df.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,04-09-2012,58,635,...,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,08-03-2014,38,11,...,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,21-08-2013,26,426,...,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,10-02-2014,26,11,...,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,19-01-2014,94,173,...,5,0,0,0,0,0,0,3,11,0


In [4]:
df = df.set_index('Dt_Customer')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2240 entries, 04-09-2012 to 15-10-2012
Data columns (total 28 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   2240 non-null   int64  
 1   Year_Birth           2240 non-null   int64  
 2   Education            2240 non-null   object 
 3   Marital_Status       2240 non-null   object 
 4   Income               2216 non-null   float64
 5   Kidhome              2240 non-null   int64  
 6   Teenhome             2240 non-null   int64  
 7   Recency              2240 non-null   int64  
 8   MntWines             2240 non-null   int64  
 9   MntFruits            2240 non-null   int64  
 10  MntMeatProducts      2240 non-null   int64  
 11  MntFishProducts      2240 non-null   int64  
 12  MntSweetProducts     2240 non-null   int64  
 13  MntGoldProds         2240 non-null   int64  
 14  NumDealsPurchases    2240 non-null   int64  
 15  NumWebPurchases      2240 no

In [6]:
df.describe()

Unnamed: 0,ID,Year_Birth,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
count,2240.0,2240.0,2216.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,...,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0
mean,5592.159821,1968.805804,52247.251354,0.444196,0.50625,49.109375,303.935714,26.302232,166.95,37.525446,...,5.316518,0.072768,0.074554,0.072768,0.064286,0.013393,0.009375,3.0,11.0,0.149107
std,3246.662198,11.984069,25173.076661,0.538398,0.544538,28.962453,336.597393,39.773434,225.715373,54.628979,...,2.426645,0.259813,0.262728,0.259813,0.245316,0.114976,0.096391,0.0,0.0,0.356274
min,0.0,1893.0,1730.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,11.0,0.0
25%,2828.25,1959.0,35303.0,0.0,0.0,24.0,23.75,1.0,16.0,3.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,11.0,0.0
50%,5458.5,1970.0,51381.5,0.0,0.0,49.0,173.5,8.0,67.0,12.0,...,6.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,11.0,0.0
75%,8427.75,1977.0,68522.0,1.0,1.0,74.0,504.25,33.0,232.0,50.0,...,7.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,11.0,0.0
max,11191.0,1996.0,666666.0,2.0,2.0,99.0,1493.0,199.0,1725.0,259.0,...,20.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,11.0,1.0


In [7]:
df['Response'].value_counts()

0    1906
1     334
Name: Response, dtype: int64

In [8]:
# Data process
df['Age'] = df.apply(lambda row: 2023- int(row.Year_Birth), axis=1)

df = pd.get_dummies(df, columns=['Education', 'Marital_Status'], prefix=['Education', 'Marital'])

df = df.dropna()

In [9]:
X = df.drop(columns=['Response'])
y = df['Response']

# Tasa de contaminación (convertir la mitad de los 1s en 0s)
contamination = 0.5

# Obtener los índices de los 1s en la Serie
indices_ones = y[y == 1].index

# Determinar cuántos 1s convertir a 0s
num_to_convert = int(len(indices_ones) * contamination)

# Usar una semilla aleatoria para garantizar reproducibilidad
random.seed(42)

# Seleccionar aleatoriamente los índices de 1s a convertir
idx_hidden = random.sample(indices_ones.tolist(), num_to_convert)

# Modificar el DataFrame y convirtiendo los 1s seleccionados a 0s
y_true = y.copy()
y = y.copy()
y.loc[idx_hidden] = 0

y_true.loc[idx_hidden] = 1
y_true = y_true[y == 0]

### Modelos base

In [10]:
### Classification Model and predictions

# classifier
clf = 'tree'
# classifier param
kwargs_clf = {'min_samples_leaf': 0.1}

# train
if clf=='rf':
    mod = RandomForestClassifier(random_state=rng, **kwargs_clf)
elif clf=='logistic':
    mod = LogisticRegression(random_state=rng, **kwargs_clf)
elif clf=='tree':
    mod = DecisionTreeClassifier(random_state=rng, **kwargs_clf)

# fit    
mod.fit(X, y)

# predict
predh = mod.predict_proba(X.loc[y==0,:])[:, mod.classes_ == 1].ravel()

# average precision
average_precision_score(y_true, predh)

0.34102636833572075

In [11]:
def smartbagged_clf(X_seed, X_poblacion, random_state, T=50, clf='logistic', 
                    l1=1, l2=1, e1=1, e2=1, **kwargs_clf):
    """
    Returns avg of oob predictions of classifier para la poblacion
    Param:
        - T number of baggint iteractions 
        - clf: base estimator (one of rg, logistic)
    """
    # K: size of boostrap sample (= size of seed)
    K = X_seed.shape[0]
    # U: size of poblation
    U = X_poblacion.shape[0]
    # se entrena con una muestra balanceada
    # vector target: primero seed - luego poblacion
    y_poblacion = np.zeros(U)
#     y_train = np.concatenate([np.ones(K), np.zeros(K)])
    # initialize numerador de predicciones
    pred = np.zeros(U)
    # initialize denominador de predicciones
    n = np.zeros(U)
    # iniialize weight vectors
    w_poblacion = np.ones(U)
    w_seed = np.ones(K)

    # bagging
    for t in range(T):
        # get sample
        idx_train = np.random.choice(U, K, replace=True)
        X_train = np.concatenate([X_seed, X_poblacion.iloc[idx_train,:]])
        # y_train vector
        y_train = np.concatenate([np.ones(K), y_poblacion[idx_train]])
        # weights
        # print(w_poblacion[idx_train], "/n")
        weights = np.concatenate([w_seed, w_poblacion[idx_train]])      
        # train
        if clf=='rf':
            clf = RandomForestClassifier(**kwargs_clf)
        if clf=='logistic':
            clf = LogisticRegression(**kwargs_clf)
        if clf=='tree':
            clf = DecisionTreeClassifier(**kwargs_clf)
        if clf=='knn':
            clf = KNeighborsClassifier(**kwargs_clf)
        clf.fit(X_train, y_train, sample_weight = weights)
        # predict OOB
        idx_oob = np.full(U, True)
        idx_oob[idx_train] = False
        _pred = clf.predict_proba(X_poblacion.iloc[idx_oob,:])[:,clf.classes_ == 1].ravel()
        pred[idx_oob] += _pred
        n[idx_oob] += 1
        # update weight vector
        if t > (T*l1):
            _wupdate = np.zeros(U)
            _wupdate[idx_oob] = _pred
            w_poblacion += (-_wupdate/T*l2) 
        if t > (T*e1):
            y_poblacion[(pred/n)>e2] = 1
    scores = pred / n
    return scores

In [12]:
pclf='tree'
pkwargs_clf = {'min_samples_leaf': 5}

import warnings
warnings.filterwarnings("ignore", category=UserWarning)


# predict bagged
pred_bagged = smartbagged_clf(pd.DataFrame(X.loc[y==1, :]), pd.DataFrame(X.loc[y==0, :]),
                         random_state = 42, T=100, clf=pclf, l1=1, l2=1, **pkwargs_clf)

# average precision
print(average_precision_score(y_true, pred_bagged))

# Crea el gráfico de densidad
plt.figure(figsize=(8, 6))
sns.kdeplot(data=pred_sbagged[y_true==0], label='y_true = 0')
sns.kdeplot(data=pred_sbagged[y_true==1], label='y_true = 1')

# Agrega etiquetas y leyenda
plt.xlabel('pred_sbagged')
plt.ylabel('Densidad')
plt.legend(title='y_true')

# Muestra el gráfico
plt.show()

0.47520326464288076


NameError: name 'pred_sbagged' is not defined

<Figure size 800x600 with 0 Axes>

In [None]:
print(y_true)
print(pred_bagged)

In [None]:
# predict bagged
pred_sbagged = smartbagged_clf(pd.DataFrame(X.loc[y==1, :]), pd.DataFrame(X.loc[y==0, :]),
                               random_state = 42, T=100, clf=pclf, 
                               l1=0.5, l2=0.6, e1=0.9, e2=1, **pkwargs_clf)

# average precision
print(average_precision_score(y_true, pred_sbagged))

# Crea el gráfico de densidad
plt.figure(figsize=(8, 6))
sns.kdeplot(data=pred_sbagged[y_true==0], label='y_true = 0')
sns.kdeplot(data=pred_sbagged[y_true==1], label='y_true = 1')

# Agrega etiquetas y leyenda
plt.xlabel('pred_sbagged')
plt.ylabel('Densidad')
plt.legend(title='y_true')

# Muestra el gráfico
plt.show()