## SMOTE-N

In this notebook, we will cover the essentials of SMOTE-N and the Value Difference Metric.

- First, we will calculate the distance between values, and between observations, using the Value Difference Metric (VDM).
- Second, we will implement SMOTE-N with imbalanced-learn.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import make_blobs
from sklearn.preprocessing import OrdinalEncoder

from imblearn.metrics.pairwise import ValueDifferenceMetric
from imblearn.over_sampling import SMOTEN

## Distance between values

In [2]:
# Build a toy dataset with a single categorical feature:
# 10 observations of each colour

X = np.repeat(["green", "red", "blue"], 10).reshape(-1, 1)
y = [1] * 8 + [0] * 5 + [1] * 7 + [0] * 9 + [1]

# ValueDifferenceMetric expects integer-encoded categories,
# so the strings are mapped to numbers beforehand

encoder = OrdinalEncoder(dtype=np.int32)
X_enc = encoder.fit_transform(X)

# Learn the distances from the encoded feature and the target.
# r=1 reproduces the numbers shown previously in the slides,
# for comparison
vdm = ValueDifferenceMetric(r=1)
vdm.fit(X_enc, y)

# For every value of the feature (rows), the fitted metric stores
# the conditional probability of each class (columns) given that value

vdm.proba_per_class_

[array([[0.9, 0.1],
        [0.2, 0.8],
        [0.3, 0.7]])]

In [3]:
# and if you are wondering which value each row of the
# probability array corresponds to: the rows follow the order
# stored in the categories_ attribute of the encoder

encoder.categories_

[array(['blue', 'green', 'red'], dtype='<U5')]

In [4]:
# Compute the distance between each pair of 3 different values

# one observation per colour, as a single-column array
X_test = np.array([["red"], ["green"], ["blue"]])

# map the strings to the integer codes learned by the encoder
X_test_enc = encoder.transform(X_test)

# pairwise distances; rows and columns follow the order of X_test
vdm.pairwise(X_test_enc)

array([[0. , 0.2, 1.2],
       [0.2, 0. , 1.4],
       [1.2, 1.4, 0. ]])

## Distance between vectors

Now, instead of a single feature, each observation is a vector of several categorical features, and we will determine the distances between those vectors.

In [5]:
# Build a dataframe with 2 categorical features

X = pd.DataFrame({
    # 10 consecutive observations of each colour
    'colour': np.repeat(["green", "red", "blue"], 10),

    # runs of "used" / "new": each value is repeated
    # as many times as the matching count
    'condition': np.repeat(
        ["used", "new", "used", "new", "used", "new", "used", "new", "used", "new"],
        [1, 1, 1, 2, 2, 3, 4, 6, 6, 4],
    ),
})

# target
y = [1] * 8 + [0] * 5 + [1] * 7 + [0] * 9 + [1]

X.head()

Unnamed: 0,colour,condition
0,green,used
1,green,new
2,green,used
3,green,new
4,green,new


In [6]:
# "ValueDifferenceMetric" works only with encoded variables,
# so we need to transform the strings into numbers first.
# The encoder is re-fitted here on the 2-feature dataframe.

encoder = OrdinalEncoder(dtype=np.int32)
X_enc = encoder.fit_transform(X)

In [7]:
# Now, we can learn the distances
# Note that this time r=2 (the single-feature example used r=1),
# so the per-feature distances are squared before being added up

vdm = ValueDifferenceMetric(r=2).fit(X_enc, y)


# the conditional probabilities of each class given a value
# are stored, for each value and for each variable

vdm.proba_per_class_

[array([[0.9, 0.1],
        [0.2, 0.8],
        [0.3, 0.7]]),
 array([[0.3125    , 0.6875    ],
        [0.64285714, 0.35714286]])]

In [8]:
# New observations covering the combinations of
# 2 colours and 2 conditions, one row per vector

X_test = pd.DataFrame(
    [
        ["green", "used"],
        ["green", "new"],
        ["red", "used"],
        ["red", "new"],
    ],
    columns=['colour', 'condition'],
)

X_test

Unnamed: 0,colour,condition
0,green,used
1,green,new
2,red,used
3,red,new


In [9]:
# we encode them first into numbers
X_test_enc = encoder.transform(X_test)


# and now, we determine the distances, which outputs
# the distance between every pair of vectors
# (rows and columns follow the order of X_test)

vdm.pairwise(X_test_enc)

array([[0.        , 0.43654337, 0.04      , 0.47654337],
       [0.43654337, 0.        , 0.47654337, 0.04      ],
       [0.04      , 0.47654337, 0.        , 0.43654337],
       [0.47654337, 0.04      , 0.43654337, 0.        ]])

## SMOTE-N

[SMOTE-N](https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.SMOTEN.html)

In [10]:
# Simulate a dataset with 3 categorical features and a binary target

rng = np.random.RandomState(42)
num_samples = 1600

# the features are drawn in the order they are listed, so the
# seeded generator always produces the same data
X = pd.DataFrame({
    'Colour': rng.choice(['Blue', 'Green', 'Red'], size=num_samples).astype(object),
    'Condition': rng.choice(['New', 'Used'], size=num_samples).astype(object),
    'Model': rng.choice(['Classic', 'Luxus', 'Smart', 'Small'], size=num_samples).astype(object),
})

# each observation belongs to class 1 with probability 0.1
y = pd.Series(rng.binomial(p=0.1, n=1, size=num_samples))

# display size
X.shape, y.shape

((1600, 3), (1600,))

In [11]:
# y is imbalanced: count the observations in each class

y.value_counts()

0    1443
1     157
dtype: int64

In [12]:
# X is categorical: preview the first rows

X.head()

Unnamed: 0,Colour,Condition,Model
0,Red,Used,Luxus
1,Blue,New,Small
2,Red,Used,Luxus
3,Red,New,Small
4,Blue,Used,Luxus


In [13]:
# relative frequency of every category, one feature at a time,
# before resampling
for var in X.columns:
    frequencies = X[var].value_counts(normalize=True)
    print(frequencies, end="\n\n")

Blue     0.344375
Red      0.328750
Green    0.326875
Name: Colour, dtype: float64

Used    0.51125
New     0.48875
Name: Condition, dtype: float64

Small      0.256250
Classic    0.255625
Smart      0.251875
Luxus      0.236250
Name: Model, dtype: float64



In [14]:
# set up SMOTE-N

# NOTE: the `n_jobs` argument is intentionally not passed. It was
# deprecated in imbalanced-learn 0.10 and removed in 0.12, so passing
# it makes this cell fail on current releases. It only controlled
# the parallelism of the neighbours search, not the samples generated.
sampler = SMOTEN(
    sampling_strategy='auto',  # resamples every class but the majority (here, the minority class)
    random_state=0,  # for reproducibility
    k_neighbors=5,  # number of nearest neighbours used to create the new samples
)

# returns the original observations plus the synthetic ones
X_res, y_res = sampler.fit_resample(X, y)

In [15]:
# relative frequency of every category, one feature at a time,
# after resampling
for var in X.columns:
    frequencies = X_res[var].value_counts(normalize=True)
    print(frequencies, end="\n\n")

Blue     0.343728
Green    0.331254
Red      0.325017
Name: Colour, dtype: float64

Used    0.517325
New     0.482675
Name: Condition, dtype: float64

Smart      0.273042
Classic    0.258143
Small      0.241511
Luxus      0.227304
Name: Model, dtype: float64



In [16]:
# y_res is balanced: count the observations in each class after resampling

y_res.value_counts()

1    1443
0    1443
dtype: int64