In [13]:
import numpy as np
import scipy.sparse as sp

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

from imblearn.datasets import fetch_datasets
from imblearn.under_sampling import NeighbourhoodCleaningRule
from imblearn.over_sampling import SMOTE
from imblearn.combine import SPIDER


name = 'isolet'
dataset = fetch_datasets()[name]
X, y = dataset.data, dataset.target
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, stratify=y)

logreg = LogisticRegression(solver='lbfgs', random_state=0, n_jobs=-1)

ncr = NeighbourhoodCleaningRule(random_state=0, n_jobs=-1)
smote = SMOTE(random_state=0, n_jobs=-1)

# TODO: change to 3
nn = 5
spider_weak = SPIDER(kind='weak', n_neighbors=nn, n_jobs=-1)
spider_relabel = SPIDER(kind='relabel', n_neighbors=nn, n_jobs=-1)
spider_strong = SPIDER(kind='strong', n_neighbors=nn, n_jobs=-1)


def pipeline(sampler=None):
    if sampler:
        X_r, y_r = sampler.fit_resample(X_tr, y_tr)
    else:
        X_r, y_r = X_tr, y_tr
    logreg.fit(X_r, y_r)
    y_pred = logreg.predict(X_te)
    print(confusion_matrix(y_te, y_pred))
    print(f"Accuracy : {accuracy_score(y_te, y_pred)}")
    print(f"Precision : {precision_score(y_te, y_pred)}")
    print(f"Recall : {recall_score(y_te, y_pred)}")
    
    if isinstance(sampler, SPIDER):
        print(f"Resampled: {X_r.shape} {y_r.shape} -- Train: {X_tr.shape} {y_tr.shape}")
        print(f"Discarded: {sampler.discarded_}")
        print(f"Relabeled: {sampler.relabeled_}")    

In [None]:
tp / (tp + fp)

## No Sampling

In [14]:
pipeline()

[[1417   23]
 [  18  102]]
Accuracy : 0.9737179487179487
Precision : 0.816
Recall : 0.85


In [16]:
102 / (102 + 23)

0.816

## SMOTE

In [7]:
pipeline(smote)

[[1395   45]
 [  11  109]]


In [17]:
109 / (109 + 45)

0.7077922077922078

## NCR

In [11]:
pipeline(ncr)

[[1427   13]
 [  26   94]]


In [18]:
94 / (94 + 13)

0.8785046728971962

## Weak

In [8]:
pipeline(spider_weak)

[[1428   12]
 [  29   91]]
Resampled: (6336, 617) (6336,) -- Train: (6237, 617) (6237,)
Discarded: [  34  207  273  386  419  467  472  718  730 1024 1047 1109 1333 1398
 1475 1491 1525 1771 1823 1852 1854 1856 1944 1983 2032 2071 2292 2333
 2656 2773 2782 3031 3146 3182 3215 3491 3532 3570 3665 3688 3691 3898
 4143 4296 4323 4367 4465 4548 4631 4756 4765 4898 4957 5036 5054 5301
 5308 5351 5471 5499 5547 5619 5714 5734 5875 5963 6006 6008 6074 6154]
Relabeled: []


In [19]:
91 / (91 + 12)

0.883495145631068

## Relabel

In [9]:
pipeline(spider_relabel)

[[1430   10]
 [  28   92]]
Resampled: (6361, 617) (6361,) -- Train: (6237, 617) (6237,)
Discarded: [  34  207  273  386  467  472  730 1024 1047 1398 1491 1771 1852 1856
 1944 1983 2071 2292 2333 2656 2782 3146 3182 3532 3570 4143 4323 4367
 4465 4548 4631 4765 4957 5054 5301 5308 5351 5471 5499 5714 5734 5963
 6006 6074 6154]
Relabeled: [ 419  718 1109 1333 1475 1525 1823 1854 2032 2773 3031 3215 3491 3665
 3688 3691 3898 4296 4756 4898 5036 5547 5619 5875 6008]


In [20]:
92 / (92 + 10)

0.9019607843137255

## Strong

In [10]:
pipeline(spider_strong)

[[1431    9]
 [  32   88]]
Resampled: (6701, 617) (6701,) -- Train: (6237, 617) (6237,)
Discarded: [  34   95  207  273  386  467  472  523  730  749 1024 1047 1289 1376
 1398 1491 1670 1771 1852 1856 1944 1983 2071 2292 2333 2401 2656 2782
 3146 3182 3342 3532 3544 3570 3634 3647 4143 4323 4367 4373 4465 4473
 4548 4631 4634 4765 4923 4957 5054 5301 5308 5351 5471 5499 5714 5734
 5795 5840 5963 6006 6074 6154]
Relabeled: []


In [21]:
88 / (88 + 9)

0.9072164948453608