<a href="https://colab.research.google.com/github/scottspurlock/mlfairness/blob/main/sampling_smote.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Playing with sampling using SMOTE
## Scott Spurlock
### 6/18/2021


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras import layers
from matplotlib import pyplot as plt
import sklearn.preprocessing as preprocessing
from sklearn.model_selection import train_test_split

# Reporting granularity: show at most 10 DataFrame rows and print floats
# with a single decimal place.
pd.set_option("display.max_rows", 10)
pd.set_option("display.float_format", "{:.1f}".format)
# tf.keras.backend.set_floatx('float32')

# Last expression of the cell: echoes the installed TensorFlow version.
tf.__version__

'2.5.0'

In [None]:
# imbalanced-learn is currently available on the PyPi's repositories and you can install it via pip:
# Note that installation seems to result in an error message suggesting "restart and run all" which fixes things.
!pip install -U imbalanced-learn

# The package is release also in Anaconda Cloud platform:
# conda install -c conda-forge imbalanced-learn

Requirement already up-to-date: imbalanced-learn in /usr/local/lib/python3.7/dist-packages (0.8.0)


## Load data

In [None]:
# Source: https://fairmlbook.org/code/adult.html
# Column names for the header-less CSV files; "Target" is the income label.
features = ["Age", "Workclass", "fnlwgt", "Education", "Education-Num", "Martial Status",
        "Occupation", "Relationship", "Race", "Sex", "Capital Gain", "Capital Loss",
        "Hours per week", "Country", "Target"]

# Change these to local file if available
train_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
test_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test'

# The separator regex swallows the whitespace around each comma, and "?" is
# read as a missing value.
# This will download 3.8M
original_train = pd.read_csv(train_url, names=features, sep=r'\s*,\s*',
                             engine='python', na_values="?")
# This will download 1.9M (the first line of the test file is skipped)
original_test = pd.read_csv(test_url, names=features, sep=r'\s*,\s*',
                            engine='python', na_values="?", skiprows=1)

# Drop Na values in train and test sets
original_train = original_train.dropna()
original_test = original_test.dropna()

num_train = len(original_train)
num_test = len(original_test)

# Train rows first, then test rows; the per-file row index is kept as-is.
original = pd.concat([original_train, original_test])

# Keep an independent snapshot of the full frame. A plain assignment would
# only alias `original`, so any later in-place edit of `original` would
# silently change `roc_original` as well.
roc_original = original.copy()

# Encode the label as 0 (<=50K) / 1 (>50K). Labels from the test file carry a
# trailing "." (e.g. "<=50K."), so strip it before mapping.
labels = original['Target'].str.rstrip('.').map({'<=50K': 0, '>50K': 1})
assert labels.notna().all(), "unexpected value in the Target column"
original

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-Num,Martial Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours per week,Country,Target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16275,33,Private,245211,Bachelors,13,Never-married,Prof-specialty,Own-child,White,Male,0,0,40,United-States,<=50K.
16276,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K.
16278,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K.
16279,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K.


In [None]:
# Remove the label column together with Education and fnlwgt from the
# feature frame. The drop mutates `original` in place, so re-running this cell
# without reloading the data raises a KeyError (the columns are already gone).
original.drop(columns=['Target', 'Education', 'fnlwgt'], inplace=True)
original


Unnamed: 0,Age,Workclass,Education-Num,Martial Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours per week,Country
0,39,State-gov,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba
...,...,...,...,...,...,...,...,...,...,...,...,...
16275,33,Private,13,Never-married,Prof-specialty,Own-child,White,Male,0,0,40,United-States
16276,39,Private,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States
16278,38,Private,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States
16279,44,Private,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States


## Preprocessing
Make the data numeric (one-hot encoding) and scale every feature to the [0, 1] range (min-max normalization).

In [None]:
# One-hot encode the non-numeric columns; numeric columns are passed through.
# The dummy dtype is pinned to uint8 (0/1): newer pandas versions default to
# bool dummies, which would turn `numeric.values` into an object array when
# mixed with the integer columns.
numeric = pd.get_dummies(original, dtype=np.uint8)
numeric.head()

Unnamed: 0,Age,Education-Num,Capital Gain,Capital Loss,Hours per week,Workclass_Federal-gov,Workclass_Local-gov,Workclass_Private,Workclass_Self-emp-inc,Workclass_Self-emp-not-inc,Workclass_State-gov,Workclass_Without-pay,Martial Status_Divorced,Martial Status_Married-AF-spouse,Martial Status_Married-civ-spouse,Martial Status_Married-spouse-absent,Martial Status_Never-married,Martial Status_Separated,Martial Status_Widowed,Occupation_Adm-clerical,Occupation_Armed-Forces,Occupation_Craft-repair,Occupation_Exec-managerial,Occupation_Farming-fishing,Occupation_Handlers-cleaners,Occupation_Machine-op-inspct,Occupation_Other-service,Occupation_Priv-house-serv,Occupation_Prof-specialty,Occupation_Protective-serv,Occupation_Sales,Occupation_Tech-support,Occupation_Transport-moving,Relationship_Husband,Relationship_Not-in-family,Relationship_Other-relative,Relationship_Own-child,Relationship_Unmarried,Relationship_Wife,Race_Amer-Indian-Eskimo,...,Country_Canada,Country_China,Country_Columbia,Country_Cuba,Country_Dominican-Republic,Country_Ecuador,Country_El-Salvador,Country_England,Country_France,Country_Germany,Country_Greece,Country_Guatemala,Country_Haiti,Country_Holand-Netherlands,Country_Honduras,Country_Hong,Country_Hungary,Country_India,Country_Iran,Country_Ireland,Country_Italy,Country_Jamaica,Country_Japan,Country_Laos,Country_Mexico,Country_Nicaragua,Country_Outlying-US(Guam-USVI-etc),Country_Peru,Country_Philippines,Country_Poland,Country_Portugal,Country_Puerto-Rico,Country_Scotland,Country_South,Country_Taiwan,Country_Thailand,Country_Trinadad&Tobago,Country_United-States,Country_Vietnam,Country_Yugoslavia
0,39,13,2174,0,40,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,50,13,0,0,13,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,38,9,0,0,40,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,53,7,0,0,40,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,28,13,0,0,40,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Feature matrix and label vector as plain NumPy arrays.
X, y = numeric.values, labels.values

# Hold out 30% of the rows for testing (fixed seed for a repeatable split).
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Split first, then normalize: the scaler learns its min/max from the training
# rows only and the same transform is applied to both sets.
scaler = preprocessing.MinMaxScaler().fit(Xtrain)
Xtrain, Xtest = scaler.transform(Xtrain), scaler.transform(Xtest)

# Peek at the top-left corner of the scaled test matrix.
Xtest[:10, :10]

array([[0.08219178, 0.73333333, 0.        , 0.        , 0.44897959,
        0.        , 0.        , 1.        , 0.        , 0.        ],
       [0.21917808, 0.53333333, 0.        , 0.        , 0.55102041,
        0.        , 0.        , 1.        , 0.        , 0.        ],
       [0.30136986, 0.6       , 0.        , 0.        , 0.37755102,
        0.        , 0.        , 1.        , 0.        , 0.        ],
       [0.36986301, 0.4       , 0.        , 0.        , 0.39795918,
        0.        , 0.        , 1.        , 0.        , 0.        ],
       [0.19178082, 0.53333333, 0.        , 0.        , 0.39795918,
        0.        , 0.        , 1.        , 0.        , 0.        ],
       [0.63013699, 0.53333333, 0.        , 0.        , 0.39795918,
        0.        , 0.        , 1.        , 0.        , 0.        ],
       [0.12328767, 0.53333333, 0.        , 0.        , 0.39795918,
        0.        , 0.        , 1.        , 0.        , 0.        ],
       [0.10958904, 0.66666667, 0.2782827

## SMOTENC
Use SMOTENC, the variant of the SMOTE algorithm for mixed Numeric and Categorical data, to oversample so that the minority class has as many examples as the majority class.

In [None]:
# Counter is used to tally how many samples fall in each class.
from collections import Counter
# imbalanced-learn provides the SMOTENC over-sampler; it is imported here, after
# the pip install cell above has run.
import imblearn
from imblearn.over_sampling import SMOTENC
# Show which imbalanced-learn version is active in this kernel.
print(imblearn.__version__)


0.8.0


In [None]:
# Adapted from https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.SMOTENC.html


def _subsample_rows(X, y, n_rows, rng):
    """Randomly keep at most `n_rows` aligned rows of `X` and `y`.

    Rows are drawn without replacement using `rng`. A group that already has
    `n_rows` rows or fewer is returned unchanged.
    """
    if len(y) <= n_rows:
        return X, y
    keep = rng.choice(len(y), size=n_rows, replace=False)
    return X[keep, :], y[keep]


male_col_index = numeric.columns.to_list().index('Sex_Male')
female_col_index = numeric.columns.to_list().index('Sex_Female')

# Split the training set by sex: rows where the Sex_Male column equals 1 are
# male, all remaining rows are treated as female.
male_mask = Xtrain[:, male_col_index] == 1
Xtrain_male = Xtrain[male_mask]
ytrain_male = ytrain[male_mask]
Xtrain_female = Xtrain[~male_mask]
ytrain_female = ytrain[~male_mask]

print(f'Original male dataset samples per class {Counter(ytrain_male)}')
print(f'Original female dataset samples per class {Counter(ytrain_female)}')

mcnt = Counter(ytrain_male)
fcnt = Counter(ytrain_female)

# Within each sex, separate low income (label 0) from high income (label 1).
male_low_mask = ytrain_male == 0
female_low_mask = ytrain_female == 0

Xtrain_male_low = Xtrain_male[male_low_mask]
ytrain_male_low = ytrain_male[male_low_mask]
Xtrain_male_high = Xtrain_male[~male_low_mask]
ytrain_male_high = ytrain_male[~male_low_mask]

Xtrain_female_low = Xtrain_female[female_low_mask]
ytrain_female_low = ytrain_female[female_low_mask]
Xtrain_female_high = Xtrain_female[~female_low_mask]
ytrain_female_high = ytrain_female[~female_low_mask]

# Subsample so we have the same number of male/female examples after SMOTE.
# The over-sampling below grows every class of a group to that group's largest
# class, so both sexes end up equal in size only if their largest classes match.
# Cap every sex/income group at the smaller of the two per-sex maxima; groups
# that are already at or below the cap are left untouched.
target_rows = min(max(mcnt[0], mcnt[1]), max(fcnt[0], fcnt[1]))

# Seeded generator so the subsample is repeatable across runs.
subsample_rng = np.random.default_rng(0)

Xtrain_male_low, ytrain_male_low = _subsample_rows(
    Xtrain_male_low, ytrain_male_low, target_rows, subsample_rng)
Xtrain_male_high, ytrain_male_high = _subsample_rows(
    Xtrain_male_high, ytrain_male_high, target_rows, subsample_rng)
Xtrain_female_low, ytrain_female_low = _subsample_rows(
    Xtrain_female_low, ytrain_female_low, target_rows, subsample_rng)
Xtrain_female_high, ytrain_female_high = _subsample_rows(
    Xtrain_female_high, ytrain_female_high, target_rows, subsample_rng)

# Re-assemble each sex as low-income rows followed by high-income rows.
Xtrain_male = np.concatenate((Xtrain_male_low, Xtrain_male_high), axis=0)
ytrain_male = np.concatenate((ytrain_male_low, ytrain_male_high), axis=0)

Xtrain_female = np.concatenate((Xtrain_female_low, Xtrain_female_high), axis=0)
ytrain_female = np.concatenate((ytrain_female_low, ytrain_female_high), axis=0)

print(f'Subsampled male dataset samples per class {Counter(ytrain_male)}')
print(f'Subsampled female dataset samples per class {Counter(ytrain_female)}')

# Tell SMOTENC which columns of the one-hot matrix are categorical: every
# column of `numeric` that is not one of the numeric columns of `original`
# is a dummy column. Deriving the indices from the column names keeps them in
# sync with the actual feature layout.
# NOTE(review): each dummy column is treated as its own two-level feature, so a
# synthetic row is not guaranteed to have exactly one active dummy per original
# attribute - confirm this is acceptable or switch to ordinal codes.
continuous_cols = set(original.select_dtypes(include='number').columns)
categorical_idx = [i for i, col in enumerate(numeric.columns)
                   if col not in continuous_cols]

# Resampling to equalize number of low/high income examples
sm = SMOTENC(random_state=0, categorical_features=categorical_idx)
Xtrain_male_res, ytrain_male_res = sm.fit_resample(Xtrain_male, ytrain_male)
print(f'Resampled male dataset samples per class {Counter(ytrain_male_res)}')

Xtrain_female_res, ytrain_female_res = sm.fit_resample(Xtrain_female, ytrain_female)
print(f'Resampled female dataset samples per class {Counter(ytrain_female_res)}')


Original male dataset samples per class Counter({0: 14630, 1: 6718})
Original female dataset samples per class Counter({0: 9126, 1: 1181})
Subsampled male dataset samples per class Counter({0: 9126, 1: 6718})
Subsampled female dataset samples per class Counter({0: 9126, 1: 1181})
Resampled male dataset samples per class Counter({0: 9126, 1: 9126})
Resampled female dataset samples per class Counter({0: 9126, 1: 9126})


In [None]:
# Stack the resampled male rows on top of the resampled female rows to form
# the final training set; labels are joined in the same order.
Xtrain = np.vstack((Xtrain_male_res, Xtrain_female_res))
ytrain = np.concatenate([ytrain_male_res, ytrain_female_res])
print(Xtrain.shape, ytrain.shape)

(36504, 12) (36504,)
