## MetaCost

[Machine Learning with Imbalanced Data - Course](https://www.trainindata.com/p/machine-learning-with-imbalanced-data)


In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from metacost import MetaCost

In [2]:
# load data
# only a few observations to speed up the computation;
# the fixed random_state makes the subsample repeatable across kernel restarts

data = pd.read_csv('../kdd2004.csv').sample(10000, random_state=0)

# remap target class from {-1, 1} to {0, 1}
data['target'] = data['target'].map({-1: 0, 1: 1})

data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,65,66,67,68,69,70,71,72,73,target
8006,49.58,28.33,-0.42,18.0,80.5,4655.8,-0.72,0.25,-13.5,-101.5,...,5504.9,0.29,0.58,-18.0,-109.0,1020.0,1.0,0.19,0.12,0
31947,66.2,22.04,1.09,31.0,128.0,4352.3,0.68,1.29,3.5,-90.0,...,6144.8,-0.35,1.17,-4.0,-139.0,759.0,0.28,0.34,0.81,0
132826,33.54,29.09,-1.2,-26.0,39.0,1711.6,-0.39,-0.08,3.0,-62.5,...,1413.7,1.45,1.35,-9.0,-38.0,150.8,0.69,0.16,0.48,0
116503,66.29,24.29,1.13,10.0,-14.0,2728.3,-0.86,0.52,20.5,-98.0,...,3121.3,-0.73,-1.1,2.0,-64.0,545.9,1.22,0.23,-0.18,0
90210,47.0,25.81,-0.43,-17.0,93.0,4490.9,-0.92,-0.1,-3.0,-94.0,...,5261.1,-2.2,-2.67,-5.0,-58.0,738.1,0.51,0.47,0.42,0


In [3]:
# share of each class in the sample -- the target is heavily imbalanced

data['target'].value_counts().div(data.shape[0])

target
0    0.9925
1    0.0075
Name: count, dtype: float64

In [4]:
# hold out 30% of the rows as a test set

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['target']),  # predictors only
    data.target,                    # labels only
    random_state=0,
    test_size=0.3,
)

(X_train.shape, X_test.shape)

((7000, 74), (3000, 74))

In [5]:
# peek at the first rows of the training predictors (the target column was dropped in the split)
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,64,65,66,67,68,69,70,71,72,73
111157,76.39,26.06,0.27,25.0,-39.5,1435.6,0.6,0.9,25.0,-108.0,...,93.0,916.2,2.09,3.6,4.0,-39.0,125.9,0.34,0.53,0.69
19751,86.76,25.42,0.13,27.5,-12.0,1302.5,1.14,-0.15,-5.0,-88.5,...,-55.0,1870.0,-0.87,0.7,14.0,-96.0,740.0,0.25,0.42,0.7
23652,46.3,28.0,-2.57,-80.5,67.5,2762.4,-2.0,-1.0,-12.0,-68.0,...,-65.0,1379.7,1.32,1.65,15.0,-115.0,844.9,0.55,0.38,0.11
127291,68.7,23.75,-0.85,-22.0,-4.5,734.3,-0.24,-0.47,-7.5,-50.5,...,78.0,256.3,1.72,1.13,-5.0,-31.0,132.2,0.71,0.49,0.22
92335,17.71,29.03,-1.57,-4.0,56.5,1511.3,0.35,-1.06,-14.5,-45.5,...,-11.0,1819.1,0.64,0.52,3.0,-54.0,561.6,0.79,0.31,-0.41


## Set up Logistic regression

In [6]:
# base learner that will be handed to MetaCost as its estimator

logit = LogisticRegression(
    solver='newton-cg',
    penalty='l2',
    max_iter=10,
    n_jobs=4,
    random_state=0,
)

## MetaCost

With no extra cost: both kinds of misclassification cost 1.

In [7]:
# uniform costs: correct predictions cost 0, either kind of error costs 1
cost_matrix = np.ones((2, 2), dtype=int)
np.fill_diagonal(cost_matrix, 0)
cost_matrix

array([[0, 1],
       [1, 0]])

In [8]:
# wrap the logistic regression in MetaCost with the uniform cost matrix
# NOTE(review): p and q are passed through unchanged; presumably they are the
# MetaCost paper's flags -- confirm against the metacost module
metacost_ = MetaCost(
    estimator=logit,
    cost_matrix=cost_matrix,
    n_estimators=50,
    n_samples=None,
    p=True,
    q=True,
)

In [9]:
# fit MetaCost on the training data; it prints progress messages while it runs
metacost_.fit(X_train, y_train)

resampling data and training ensemble
Finished training ensemble
evaluating optimal class per observation
Finished re-assigning labels
Training model on new data
Finished training model on data with new labels


In [10]:
# predicted probabilities for the training rows
# (two columns -- presumably class 0 then class 1; confirm against MetaCost)
metacost_.predict_proba(X_train)



array([[9.99997913e-01, 2.08735355e-06],
       [9.99998795e-01, 1.20488327e-06],
       [9.99999999e-01, 1.24888003e-09],
       ...,
       [9.97729720e-01, 2.27028034e-03],
       [1.00000000e+00, 4.95571103e-11],
       [9.99999998e-01, 1.80389541e-09]])

In [11]:
# report roc-auc on both partitions, scoring with the second predict_proba column
for split_name, X_split, y_split in (
    ('Train set', X_train, y_train),
    ('Test set', X_test, y_test),
):
    print(split_name)
    pred = metacost_.predict_proba(X_split)
    auc = roc_auc_score(y_split, pred[:, 1])
    print('MetaCost roc-auc: {}'.format(auc))

Train set
MetaCost roc-auc: 0.9105563147295742
Test set
MetaCost roc-auc: 0.9059311582015921




## MetaCost

With costs

TN | FN
 
FP | TP

In [12]:
# start from zero cost everywhere, then fill in the two off-diagonal error costs
cost_matrix = np.zeros((2, 2), dtype=int)
cost_matrix[0, 1] = 100  # FN cell of the TN | FN / FP | TP layout
cost_matrix[1, 0] = 1    # FP cell
cost_matrix

array([[  0, 100],
       [  1,   0]])

In [13]:
# second MetaCost wrapper: same settings as before, but it receives the
# cost matrix with the heavier false-negative cost
metacost2 = MetaCost(
    estimator=logit,
    cost_matrix=cost_matrix,
    n_estimators=50,
    n_samples=None,
    p=True,
    q=True,
)

In [14]:
# train the cost-sensitive wrapper
metacost2.fit(X_train, y_train)

# report roc-auc on both partitions, scoring with the second predict_proba column
for split_name, X_split, y_split in (
    ('Train set', X_train, y_train),
    ('Test set', X_test, y_test),
):
    print(split_name)
    pred = metacost2.predict_proba(X_split)
    auc = roc_auc_score(y_split, pred[:, 1])
    print('MetaCost roc-auc: {}'.format(auc))

resampling data and training ensemble
Finished training ensemble
evaluating optimal class per observation
Finished re-assigning labels
Training model on new data
Finished training model on data with new labels
Train set
MetaCost roc-auc: 0.9387196729957805
Test set
MetaCost roc-auc: 0.9139539808897361




In [15]:
# discard the original row labels so the index runs from 0 to n-1
y_train.reset_index(drop=True)

0       0
1       0
2       0
3       0
4       0
       ..
6995    0
6996    0
6997    0
6998    0
6999    0
Name: target, Length: 7000, dtype: int64

In [16]:
# put the labels stored in metacost2.y_ (column 0) next to the original
# training labels (column 'target'), matched by position
tmp = pd.concat(
    [
        metacost2.y_,
        y_train.reset_index(drop=True),
    ],
    axis='columns',
)

tmp.head()

Unnamed: 0,0,target
0,0,0
1,0,0
2,0,0
3,1,0
4,0,0


In [17]:
# rows where the label stored by MetaCost differs from the original target
tmp.loc[tmp[0] != tmp['target'], ['target', 0]]

Unnamed: 0,target,0
3,0,1
38,0,1
41,0,1
60,0,1
61,0,1
...,...,...
6939,0,1
6951,0,1
6969,0,1
6995,0,1


In theory, we should only be re-labeling observations from class 0 to class 1, but in practice that does not happen.

In [18]:
# number of training rows whose label was changed
(metacost2.y_ != y_train.reset_index(drop=True)).sum()

987

In [19]:
# number of training rows whose label was left as it was
(metacost2.y_ == y_train.reset_index(drop=True)).sum()

6013

## Conclusion

We can wrap a model with MetaCost to make it cost-sensitive.

### Important

The code I provided does not give reproducible results because, at the moment, the MetaCost class does not incorporate a seed when re-sampling the data.

**HOMEWORK**

Go ahead and compare how many observations are relabeled if we apply no extra cost to the minority class, or a cost other than 100.

Also interesting: compare the performance of MetaCost with that of bagging combined with cost-sensitive learning (adding the parameter `class_weight`).