In [9]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import average_precision_score

from matplotlib.colors import ListedColormap

In [100]:
class PUBoostClassifier():
    def __init__(self):
        '''
        Description
        '''
        self.model_list = []
        
    def fit(self, X_seed, X_poblacion, random_state, T=50, clf='logistic', 
            l1=1, l2=1, e1=1, e2=1, **kwargs_clf):
        """
        Returns avg of oob predictions of classifier para la poblacion
        Param:
            - T number of baggint iteractions 
            - clf: base estimator (one of rg, logistic)
        """
        self.T = T
        
        # K: size of boostrap sample (= size of seed)
        K = X_seed.shape[0]
        # U: size of poblation
        U = X_poblacion.shape[0]
        # se entrena con una muestra balanceada
        # vector target: primero seed - luego poblacion
        y_poblacion = np.zeros(U)
        # y_train = np.concatenate([np.ones(K), np.zeros(K)])
        # initialize numerador de predicciones
        pred = np.zeros(U)
        # initialize denominador de predicciones
        n = np.zeros(U)
        # iniialize weight vectors
        w_poblacion = np.ones(U)
        w_seed = np.ones(K)

        # bagging
        for t in range(T):
            # get sample
            idx_train = np.random.choice(U, K, replace=True)
            X_train = np.concatenate([X_seed, X_poblacion.iloc[idx_train,:]])
            # y_train vector
            y_train = np.concatenate([np.ones(K), y_poblacion[idx_train]])
            # weights
            # print(w_poblacion[idx_train], "/n")
            weights = np.concatenate([w_seed, w_poblacion[idx_train]])      
            # train
            if clf=='rf':
                clf = RandomForestClassifier(**kwargs_clf)
            if clf=='logistic':
                clf = LogisticRegression(**kwargs_clf)
            if clf=='tree':
                clf = DecisionTreeClassifier(**kwargs_clf)
            if clf=='knn':
                clf = KNeighborsClassifier(**kwargs_clf)
            clf.fit(X_train, y_train, sample_weight = weights)
            
            self.model_list.append(clf)
            
            # predict OOB
            idx_oob = np.full(U, True)
            idx_oob[idx_train] = False
            _pred = clf.predict_proba(X_poblacion.iloc[idx_oob,:])[:,clf.classes_ == 1].ravel()
            pred[idx_oob] += _pred
            n[idx_oob] += 1
            # update weight vector
            if t > (T*l1):
                _wupdate = np.zeros(U)
                _wupdate[idx_oob] = _pred
                w_poblacion += (-_wupdate/T*l2) 
            if t > (T*e1):
                y_poblacion[(pred/n)>e2] = 1
        scores = pred / n
        return scores
        
    def predict(self, df):
        
        predic = np.zeros(df.shape[0])
        
        for t in range(self.T):
            _predic = self.model_list[t].predict_proba(df)[:,self.model_list[t].classes_ == 1].ravel()
            predic += _predic
        
        return predic / self.T


In [62]:
## creo dataset
rng = np.random.RandomState(123)
X, y = make_classification(n_samples=5000
                       ,n_features=2
                       ,n_informative=2
                       ,n_redundant=0
                       ,n_repeated=0 
                       ,n_classes=2
                       ,n_clusters_per_class=1
                       ,weights=[0.80, 0.20]
                       ,flip_y=0
                       ,class_sep=0.4
                       ,random_state=rng)
print(X.shape)
print(y.shape)
# sns.scatterplot(x=X[:,0], y=X[:,1], hue=y, size=1, palette=['#440154FF','#FDE725FF'])

X_seed = X[y==1,:]
X_poblacion = X[y==0,:]
contamination = 0.5
idx_hidden = rng.choice(np.argwhere(y == 1).ravel()
                        ,size=int(X_seed.shape[0]*contamination)
                        ,replace=False)
y[idx_hidden]=0
# sns.scatterplot(x=X[:,0], y=X[:,1], hue=y, size=1, palette=['#440154FF','#FDE725FF'])

(5000, 2)
(5000,)


In [118]:
pclf='tree'
pkwargs_clf = {'min_samples_leaf': 5}

# predict bagged
puboost_clf = PUBoostClassifier()

pred_sbagged = puboost_clf.fit(X_seed = pd.DataFrame(X[y==1, :]), X_poblacion = pd.DataFrame(X[y==0, :]),
                               random_state = 42, T=100, clf=pclf, 
                               l1=0.5, l2=0.5, e1=0.5, e2=0.5, **pkwargs_clf)

print(pred_sbagged)
# # predict poblacion
# plt.scatter(X[y==0,0], X[y==0,1], c=pred_sbagged, cmap=cm, s=4)

# # average precision
# average_precision_score(y_true, pred_sbagged)

print(len(puboost_clf.predict(X_seed)))
print(puboost_clf.predict(X_seed).mean())

print(len(puboost_clf.predict(X_poblacion)))
print(puboost_clf.predict(X_poblacion).mean())

print(len(puboost_clf.predict(X)))
print(puboost_clf.predict(X).mean())

[0.         0.90704031 0.         ... 0.76203421 0.         0.83403541]
1000
0.9720912810150073
4000
0.2567882821238018
5000
0.3998488819020429
