# Manually combine over- and under-sampling

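In this notebook, I demonstrate how to manually combine an over-sampling technique (ADASYN) with an under-sampling technique (Neighbourhood Cleaning Rule) inside a single pipeline, in case you want to build such a combination yourself.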

In [1]:
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# imblearn's pipeline accepts resamplers as steps and applies them only while fitting
from imblearn.pipeline import make_pipeline
from imblearn.under_sampling import NeighbourhoodCleaningRule
from imblearn.over_sampling import ADASYN

In [2]:
# Load the data, keeping only a random sample of rows to speed things up.
# The cleaning procedure relies on KNN, so running it on the whole dataset
# takes quite a while on a laptop...
#
# ...which already hints at one of the limitations of these techniques ;)

data = (
    pd.read_csv('../kdd2004.csv')
    .sample(n=50000, random_state=0)  # fixed seed -> same subset on every run
)

data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,65,66,67,68,69,70,71,72,73,target
46233,51.02,22.08,0.92,31.5,10.5,1910.7,-1.47,-0.74,-8.0,-52.0,...,879.5,1.58,-0.45,-5.0,-30.0,291.7,-0.12,0.47,0.96,-1
58625,64.17,24.6,-0.21,-35.5,26.0,4585.3,-1.1,1.17,-27.5,-121.5,...,4815.7,-1.09,5.09,25.0,-220.0,475.4,2.32,0.42,0.46,-1
5231,86.09,29.63,3.24,78.5,-89.0,453.2,1.87,4.58,63.0,-119.5,...,144.9,1.25,2.5,3.0,-24.0,64.8,-0.85,0.59,0.94,1
58042,78.57,21.37,0.36,-7.0,38.5,1779.1,-0.25,-0.03,-3.5,-62.5,...,1471.3,-0.12,1.48,-5.0,-62.0,406.9,0.18,0.41,0.68,-1
128067,79.13,24.18,0.78,-3.0,-16.0,844.1,0.48,-0.56,-6.5,-52.0,...,633.8,0.43,1.3,5.0,-29.0,165.0,-0.1,0.09,-0.41,-1


In [3]:
# Split the sampled data into a train set (70 %) and a test set (30 %).
# NOTE(review): the split is not stratified on the target; with a rare
# positive class the class ratio may differ between the two sets.

features = data.drop(columns=['target'])  # every column except the target
target = data['target']

X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=0,
)

X_train.shape, X_test.shape

((35000, 74), (15000, 74))

## ADASYN + NCR 

In [4]:
# ADASYN over-sampler.
#
# `n_jobs` is deliberately not passed: imbalanced-learn deprecated that
# argument on its over-samplers in version 0.10 and later releases drop it,
# so passing it makes this cell warn or fail on an up-to-date install.
# The argument only controlled how the nearest-neighbour search was
# parallelised, so the resampled data is the same without it.

adasyn = ADASYN(
    sampling_strategy='auto',  # over-sample every class except the majority (here: the minority class)
    random_state=0,  # for reproducibility
    n_neighbors=5,  # size of the neighbourhood used when generating synthetic samples
)

In [5]:
# ------------------------------------------------------------------
# IMPORTANT
# ------------------------------------------------------------------
# ADASYN runs before this step, so by the time NCR sees the data our
# original minority class is no longer the minority. We therefore tell
# NCR explicitly which classes to clean instead of relying on the
# default strategy: either 'all', or a list with the class labels to
# target (imbalanced-learn's cleaning methods take a list, not a dict).
#
# NOTE(review): recent imbalanced-learn releases reportedly deprecate
# `kind_sel`; confirm against the installed version.

ncr = NeighbourhoodCleaningRule(
    n_neighbors=3,
    kind_sel='mode',
    threshold_cleaning=0.1,  # threshold to evaluate a class for cleaning (used only in the cleaning step)
    sampling_strategy='all',  # under-sample (clean) all classes
)

In [6]:
# Final estimator: a shallow random forest with a fixed seed.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=3,
    random_state=39,
    n_jobs=4,
)

# Scale -> over-sample with ADASYN -> clean with NCR -> classify.
model = make_pipeline(MinMaxScaler(), adasyn, ncr, rf)

In [7]:
# Fit the whole pipeline (scaler, both resamplers and the forest) on the train set only.
model.fit(X=X_train, y=y_train)

Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                ('adasyn', ADASYN(n_jobs=4, random_state=0)),
                ('neighbourhoodcleaningrule',
                 NeighbourhoodCleaningRule(kind_sel='mode',
                                           sampling_strategy='all',
                                           threshold_cleaning=0.1)),
                ('randomforestclassifier',
                 RandomForestClassifier(max_depth=3, n_jobs=4,
                                        random_state=39))])

In [8]:
# predict_proba returns one column per class; the targets in the preview
# are -1 and 1, so the second column is presumably the probability of
# class 1 -- confirm with model.classes_.
proba_train = model.predict_proba(X_train)
proba_test = model.predict_proba(X_test)

pred_train = proba_train[:, 1]
pred_test = proba_test[:, 1]

In [9]:
# ROC-AUC on the data the pipeline was fitted on.
train_auc = roc_auc_score(y_train, pred_train)
print('Train roc-auc: {}'.format(train_auc))

Train roc-auc: 0.9917936513257711


In [10]:
# ROC-AUC on the held-out test set.
test_auc = roc_auc_score(y_test, pred_test)
print('Test roc-auc: {}'.format(test_auc))

Test roc-auc: 0.9891783766984125
