# Imbalanced Learning Techniques

### Import Libraries


In [1]:
import pandas as pd
import numpy as np
import scipy
import scipy.stats as stats
from scipy.stats import normaltest

import math
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.linear_model  import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.metrics import roc_auc_score
from sklearn.neighbors import KNeighborsRegressor

# imblearn
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import (
    RandomUnderSampler,
    CondensedNearestNeighbour,
    TomekLinks,
    OneSidedSelection,
    EditedNearestNeighbours,
    RepeatedEditedNearestNeighbours,
    AllKNN,
    NeighbourhoodCleaningRule,
    NearMiss,
    InstanceHardnessThreshold
)

from imblearn.over_sampling import (
    RandomOverSampler,
    SMOTE,
    ADASYN,
    BorderlineSMOTE,
    SVMSMOTE,
    SMOTENC,
    KMeansSMOTE
)

from imblearn.combine import SMOTEENN, SMOTETomek


from imblearn.ensemble import (
    BalancedBaggingClassifier,
    BalancedRandomForestClassifier,
    RUSBoostClassifier,
    EasyEnsembleClassifier,
)

from sklearn.ensemble import (
    RandomForestClassifier,
    BaggingClassifier,
    AdaBoostClassifier,
)
from sklearn.svm import SVC

# adding common folder location to sys.path
import sys
sys.path.append('../common')

from helper import get_config

### Loading Config

In [2]:
#config = get_config()

### Load Dataset

In [3]:
# Path to the dataset; it can be a CSV or an XLSX file.
dataset_path = "../dataset/Bank_Personal_Loan_Modelling_transformed.xlsx"

# Use the reader that matches the type of the data source.

# To read the data from a CSV file instead, use:
#   df = pd.read_csv(dataset_path)
df = pd.read_excel(dataset_path, index_col=0)  # index_col=0: first column of the sheet becomes the row index

In [5]:
df.head()

Unnamed: 0_level_0,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Securities Account,CD Account,Online,CreditCard,Personal Loan
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,-1.0,-0.95,-0.254237,-0.863923,1.0,0.055556,-0.5,0.0,1,0,-1,0,0
2,0.0,-0.05,-0.508475,-1.241379,0.5,0.0,-0.5,0.0,1,0,-1,0,0
3,-0.3,-0.25,-0.898305,0.475714,-0.5,-0.277778,-0.5,0.0,0,0,-1,0,0
4,-0.5,-0.55,0.610169,0.250278,-0.5,0.666667,0.0,0.0,0,0,-1,0,0
5,-0.5,-0.6,-0.322034,-0.781238,1.0,-0.277778,0.0,0.0,0,0,-1,1,0


In [6]:
# Name of the label column; the sampling helpers in this notebook read this module-level variable.
target = 'Personal Loan'
# df_x = df.drop(columns=[target])
# df_y = df[target]

### 5. Handling Class Imbalance

In [34]:
# method for testing
def run_method(method, df):
    """Print the class sizes of ``df`` before and after applying ``method``.

    Parameters
    ----------
    method : callable
        Takes a DataFrame that contains the module-level ``target`` column
        and returns a resampled DataFrame that also contains it.
    df : pandas.DataFrame
        Data to resample. Label ``0`` is reported as the majority class and
        label ``1`` as the minority class.

    Returns
    -------
    None
        The counts are only printed.
    """
    def _report(stage, frame):
        # one line per stage, e.g. "Before run size majority:17, minority:3"
        counts = frame[target].value_counts()
        # .get(): a class that is absent from the frame is reported as 0
        # instead of raising KeyError
        print('{0} run size majority:{1}, minority:{2}'.format(
            stage, counts.get(0, 0), counts.get(1, 0)))

    _report('Before', df)
    _report('After', method(df))

1. Under Sampling
* refer notebook example [here](https://github.com/solegalli/machine-learning-imbalanced-data/blob/master/Section-04-Undersampling/04-01-Random-Undersampling.ipynb)
* documentation [here]

In [35]:
# imblearn random under sampling
# params: strategy: 'majority'(it specifies to undersample majority class to have 1:1 ratio)
#         strategy: 0.5 (resultant ratio will be 1:0.5)

def apply_random_undersampling(df, strategy='auto'):
    """Randomly under-sample ``df`` with imblearn's ``RandomUnderSampler``.

    ``strategy`` is forwarded as ``sampling_strategy``. Rows are drawn with
    replacement, and ``random_state=0`` keeps the draw reproducible.

    Returns the resampled features with the ``target`` column appended.
    """
    features = df.drop(columns=[target])
    labels = df[target]

    sampler = RandomUnderSampler(
        replacement=True,
        random_state=0,
        sampling_strategy=strategy,
    )
    resampled, resampled_labels = sampler.fit_resample(features, labels)

    # put the label column back so the caller gets a single frame
    resampled[target] = resampled_labels
    return resampled

In [36]:
run_method(apply_random_undersampling, df[:20])

Before run size majority:17, minority:3
After run size majority:3, minority:3


2. Condensed Nearest Neighbour
* refer to the example notebook [here](https://github.com/solegalli/machine-learning-imbalanced-data/blob/master/Section-04-Undersampling/04-02-Condensed-Nearest-Neighbours.ipynb)
* refer doc here

In [37]:
def apply_cnn_undersampling(df, strategy='auto'):
    """Under-sample ``df`` with imblearn's ``CondensedNearestNeighbour``.

    ``strategy`` is forwarded as ``sampling_strategy``. The sampler runs with
    ``n_neighbors=1``, ``random_state=0`` and ``n_jobs=4``.

    Returns the resampled features with the ``target`` column appended.
    """
    features, labels = df.drop(columns=[target]), df[target]

    sampler_kwargs = dict(
        sampling_strategy=strategy,
        random_state=0,   # reproducible selection
        n_neighbors=1,
        n_jobs=4,
    )
    condensed, condensed_labels = CondensedNearestNeighbour(
        **sampler_kwargs).fit_resample(features, labels)

    # re-attach the label column to the resampled features
    condensed[target] = condensed_labels
    return condensed

In [38]:
run_method(apply_cnn_undersampling, df[:20])

Before run size majority:17, minority:3
After run size majority:2, minority:3


3. Tomek Links
* refer to the notebook example [here](https://github.com/solegalli/machine-learning-imbalanced-data/blob/master/Section-04-Undersampling/04-03-Tomek-Links.ipynb)
* refer doc here

In [39]:
def apply_tometlink_undersampling(df, strategy='auto'):
    """Under-sample ``df`` by removing Tomek links (imblearn ``TomekLinks``).

    ``strategy`` is forwarded as ``sampling_strategy``; the sampler runs with
    ``n_jobs=4``.

    Returns the resampled features with the ``target`` column appended.
    """
    labels = df[target]
    features = df.drop(columns=[target])

    link_remover = TomekLinks(n_jobs=4, sampling_strategy=strategy)
    cleaned, cleaned_labels = link_remover.fit_resample(features, labels)

    # re-attach the label column to the resampled features
    cleaned[target] = cleaned_labels
    return cleaned

In [41]:
run_method(apply_tometlink_undersampling, df[:20])

Before run size majority:17, minority:3
After run size majority:17, minority:3


4. One Sided Selection
* refer notebook example [here](https://github.com/solegalli/machine-learning-imbalanced-data/blob/master/Section-04-Undersampling/04-04-One-Sised-Selection.ipynb)
* refer doc here.

In [43]:
def apply_onesidedselection_undersampling(df, strategy='auto'):
    """Under-sample ``df`` with imblearn's ``OneSidedSelection``.

    ``strategy`` is forwarded as ``sampling_strategy``. The sampler runs with
    ``n_neighbors=1``, ``random_state=0`` and ``n_jobs=4``.

    Returns the resampled features with the ``target`` column appended.
    """
    features, labels = df.drop(columns=[target]), df[target]

    selector = OneSidedSelection(
        n_jobs=4,
        n_neighbors=1,
        random_state=0,   # reproducible selection
        sampling_strategy=strategy,
    )
    selected, selected_labels = selector.fit_resample(features, labels)

    # re-attach the label column to the resampled features
    selected[target] = selected_labels
    return selected

In [44]:
run_method(apply_onesidedselection_undersampling, df[:20])

Before run size majority:17, minority:3
After run size majority:3, minority:3


5. EditedNearestNeighbours
* refer notebook [here](https://github.com/solegalli/machine-learning-imbalanced-data/blob/master/Section-04-Undersampling/04-05-Edited-Nearest-Neighbours.ipynb)
* refer doc [here]

In [45]:
def apply_editednearestneighbour_undersampling(df, strategy='majority'):
    """Under-sample ``df`` with imblearn's ``EditedNearestNeighbours``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the module-level ``target`` column.
    strategy : str, default 'majority'
        Forwarded as ``sampling_strategy``. It used to be ignored in favour
        of a hard-coded ``'auto'``.
        NOTE(review): on a two-class target ``'majority'`` and ``'auto'``
        should select the same class, so default results should not change;
        confirm against the imblearn docs if the target ever has more
        than two classes.

    Returns
    -------
    pandas.DataFrame
        The resampled features with the ``target`` column appended.
    """
    enn = EditedNearestNeighbours(
        sampling_strategy=strategy,  # honour the caller's choice
        n_neighbors=3,
        kind_sel='all',              # all neighbours need to have the same label as the observation examined
        n_jobs=4
    )

    # split features / label, then fit and apply the transform
    x_train = df.drop(columns=[target])
    y_train = df[target]
    x_train_resampled, y_train_resampled = enn.fit_resample(x_train, y_train)

    # merging y to x
    x_train_resampled[target] = y_train_resampled
    return x_train_resampled

In [46]:
run_method(apply_editednearestneighbour_undersampling, df[:20])

Before run size majority:17, minority:3
After run size majority:17, minority:3


6. Repeated Edited Nearest Neighbours
* refer notebook [here](https://github.com/solegalli/machine-learning-imbalanced-data/blob/master/Section-04-Undersampling/04-05-Edited-Nearest-Neighbours.ipynb)
* refer doc [here]

In [47]:
def apply_repeated_enn_undersampling(df, strategy='auto'):
    """Under-sample ``df`` with imblearn's ``RepeatedEditedNearestNeighbours``.

    ``strategy`` is forwarded as ``sampling_strategy``. The sampler uses 3
    neighbours, ``kind_sel='all'`` (all neighbouring observations must show
    the same class), at most 100 iterations and ``n_jobs=4``.

    Returns the resampled features with the ``target`` column appended.
    """
    features, labels = df.drop(columns=[target]), df[target]

    sampler_kwargs = dict(
        sampling_strategy=strategy,
        n_neighbors=3,
        kind_sel='all',
        n_jobs=4,
        max_iter=100,
    )
    edited, edited_labels = RepeatedEditedNearestNeighbours(
        **sampler_kwargs).fit_resample(features, labels)

    # re-attach the label column to the resampled features
    edited[target] = edited_labels
    return edited

In [48]:
run_method(apply_repeated_enn_undersampling, df[:20])

Before run size majority:17, minority:3
After run size majority:17, minority:3


7. All K Nearest Neighbours
* refer notebook [here](https://github.com/solegalli/machine-learning-imbalanced-data/blob/master/Section-04-Undersampling/04-07-All-KNN.ipynb)
* refer doc [here]

In [49]:
def apply_allknn_undersampling(df, strategy='auto'):
    """Under-sample ``df`` with imblearn's ``AllKNN``.

    ``strategy`` is forwarded as ``sampling_strategy``. The sampler uses 3
    neighbours, ``kind_sel='all'`` (all neighbours need to have the same
    label as the observation examined) and ``n_jobs=4``.

    Returns the resampled features with the ``target`` column appended.
    """
    labels = df[target]
    features = df.drop(columns=[target])

    sampler = AllKNN(
        n_jobs=4,
        kind_sel='all',
        n_neighbors=3,
        sampling_strategy=strategy,
    )
    kept, kept_labels = sampler.fit_resample(features, labels)

    # re-attach the label column to the resampled features
    kept[target] = kept_labels
    return kept

In [50]:
run_method(apply_allknn_undersampling, df[:20])

Before run size majority:17, minority:3
After run size majority:17, minority:3


8. Neighbourhood Cleaning Rule
* refer notebook [here](https://github.com/solegalli/machine-learning-imbalanced-data/blob/master/Section-04-Undersampling/04-08-Neighbourhood-Cleaning-Rule.ipynb)
* refer doc [here]

In [51]:
def apply_neighbourhood_cleaning_rule_undersampling(df, strategy='auto'):
    """Under-sample ``df`` with imblearn's ``NeighbourhoodCleaningRule``.

    ``strategy`` is forwarded as ``sampling_strategy``. The sampler uses 3
    neighbours, ``kind_sel='all'``, ``threshold_cleaning=0.5`` and
    ``n_jobs=4``.

    Returns the resampled features with the ``target`` column appended.
    """
    features, labels = df.drop(columns=[target]), df[target]

    cleaner = NeighbourhoodCleaningRule(
        threshold_cleaning=0.5,
        n_jobs=4,
        kind_sel='all',
        n_neighbors=3,
        sampling_strategy=strategy,
    )
    cleaned, cleaned_labels = cleaner.fit_resample(features, labels)

    # re-attach the label column to the resampled features
    cleaned[target] = cleaned_labels
    return cleaned

In [52]:
run_method(apply_neighbourhood_cleaning_rule_undersampling, df[:20])

Before run size majority:17, minority:3
After run size majority:14, minority:3


9. NearMiss [v1|v2|v3]
* refer notebook [here](https://github.com/solegalli/machine-learning-imbalanced-data/blob/master/Section-04-Undersampling/04-09-NearMiss.ipynb)
* refer doc [here]

In [54]:
def apply_nearmiss_v1_undersampling(df, strategy='auto'):
    """Under-sample ``df`` with imblearn's ``NearMiss``, version 1.

    ``strategy`` is forwarded as ``sampling_strategy``; the sampler uses 3
    neighbours and ``n_jobs=4``.

    Returns the resampled features with the ``target`` column appended.
    """
    features, labels = df.drop(columns=[target]), df[target]

    sampler_kwargs = dict(sampling_strategy=strategy, version=1, n_neighbors=3, n_jobs=4)
    picked, picked_labels = NearMiss(**sampler_kwargs).fit_resample(features, labels)

    # re-attach the label column to the resampled features
    picked[target] = picked_labels
    return picked

In [55]:
run_method(apply_nearmiss_v1_undersampling, df[:20])

Before run size majority:17, minority:3
After run size majority:3, minority:3


In [56]:
def apply_nearmiss_v2_undersampling(df, strategy='auto'):
    """Under-sample ``df`` with imblearn's ``NearMiss``, version 2.

    ``strategy`` is forwarded as ``sampling_strategy``; the sampler uses 3
    neighbours and ``n_jobs=4``.

    Returns the resampled features with the ``target`` column appended.
    """
    features, labels = df.drop(columns=[target]), df[target]

    sampler_kwargs = dict(sampling_strategy=strategy, version=2, n_neighbors=3, n_jobs=4)
    picked, picked_labels = NearMiss(**sampler_kwargs).fit_resample(features, labels)

    # re-attach the label column to the resampled features
    picked[target] = picked_labels
    return picked

In [57]:
run_method(apply_nearmiss_v2_undersampling, df[:20])

Before run size majority:17, minority:3
After run size majority:3, minority:3


In [58]:
def apply_nearmiss_v3_undersampling(df, strategy='auto'):
    """Under-sample ``df`` with imblearn's ``NearMiss``, version 3.

    ``strategy`` is forwarded as ``sampling_strategy``; the sampler uses 3
    neighbours and ``n_jobs=4``.

    Returns the resampled features with the ``target`` column appended.
    """
    features, labels = df.drop(columns=[target]), df[target]

    sampler_kwargs = dict(sampling_strategy=strategy, version=3, n_neighbors=3, n_jobs=4)
    picked, picked_labels = NearMiss(**sampler_kwargs).fit_resample(features, labels)

    # re-attach the label column to the resampled features
    picked[target] = picked_labels
    return picked

In [59]:
run_method(apply_nearmiss_v3_undersampling, df[:20])

Before run size majority:17, minority:3
After run size majority:3, minority:3


10. InstanceHardnessThreshold
* refer notebook [here](https://github.com/solegalli/machine-learning-imbalanced-data/blob/master/Section-04-Undersampling/04-10-Instance-Hardness-Class.ipynb)
* refer doc [here]

In [60]:
def apply_instance_hardness_sampling_undersampling(df, strategy='auto', estimator=None):
    """Under-sample ``df`` with imblearn's ``InstanceHardnessThreshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the module-level ``target`` column.
    strategy : default 'auto'
        Forwarded as ``sampling_strategy``. It used to be ignored in favour
        of a hard-coded ``'auto'``, so default behaviour is unchanged.
    estimator : classifier, optional
        Classifier passed to the sampler as ``estimator``. When omitted, a
        ``RandomForestClassifier(n_estimators=100, random_state=0)`` is used,
        as before.

    Returns
    -------
    pandas.DataFrame
        The resampled features with the ``target`` column appended.
    """
    if estimator is None:
        # same classifier the function always used
        estimator = RandomForestClassifier(n_estimators=100, random_state=0)

    iht = InstanceHardnessThreshold(
        estimator=estimator,
        sampling_strategy=strategy,  # honour the caller's choice
        random_state=0,
        n_jobs=4,
        cv=3                         # cross validation folds
    )

    # split features / label, then fit and apply the transform
    x_train = df.drop(columns=[target])
    y_train = df[target]
    x_train_resampled, y_train_resampled = iht.fit_resample(x_train, y_train)

    # merging y to x
    x_train_resampled[target] = y_train_resampled
    return x_train_resampled

In [61]:
run_method(apply_instance_hardness_sampling_undersampling, df[:20])

Before run size majority:17, minority:3
After run size majority:4, minority:3


<!-- #### COMPARISON
![image.png](attachment:image.png)
![image-2.png](attachment:image-2.png) -->

In [62]:
# COMPARISON
# ![image.png](attachment:image.png)
# ![image-2.png](attachment:image-2.png)

#### OVER SAMPLING

1. Random Over Sampler [Notebook](https://github.com/solegalli/machine-learning-imbalanced-data/blob/master/Section-05-Oversampling/05-01-Random-Oversampling.ipynb), [Document]()


In [64]:
def apply_random_oversampling(df, strategy='auto'):
    """Randomly over-sample ``df`` with imblearn's ``RandomOverSampler``.

    ``strategy`` is forwarded as ``sampling_strategy``; ``random_state=0``
    keeps the draw reproducible.

    Returns the resampled features with the ``target`` column appended.
    """
    labels = df[target]
    features = df.drop(columns=[target])

    sampler = RandomOverSampler(random_state=0, sampling_strategy=strategy)
    grown, grown_labels = sampler.fit_resample(features, labels)

    # re-attach the label column to the resampled features
    grown[target] = grown_labels
    return grown

In [65]:
run_method(apply_random_oversampling, df[:20])

Before run size majority:17, minority:3
After run size majority:17, minority:17


2. SMOTE [Notebook](https://github.com/solegalli/machine-learning-imbalanced-data/blob/master/Section-05-Oversampling/05-02-SMOTE.ipynb) [Document]()

In [74]:
def apply_smote_oversampling(df, strategy='auto'):
    """Over-sample ``df`` with imblearn's ``SMOTE``.

    ``strategy`` is forwarded as ``sampling_strategy``; the sampler runs with
    ``random_state=42`` and ``n_jobs=4``.

    Returns the resampled features with the ``target`` column appended.
    """
    features, labels = df.drop(columns=[target]), df[target]

    sampler_kwargs = dict(random_state=42, sampling_strategy=strategy, n_jobs=4)
    synthetic, synthetic_labels = SMOTE(**sampler_kwargs).fit_resample(features, labels)

    # re-attach the label column to the resampled features
    synthetic[target] = synthetic_labels
    return synthetic

In [75]:
run_method(apply_smote_oversampling, df[:100])

Before run size majority:88, minority:12
After run size majority:88, minority:88


3. SMOTE-NC (Nominal and Continuous) for datasets with categorical features [notebook](https://github.com/solegalli/machine-learning-imbalanced-data/blob/master/Section-05-Oversampling/05-03-SMOTE-NC.ipynb), [Document]()

In [79]:
def apply_smotenc_oversampling(df, strategy='auto', categorical_features=None):
    """Over-sample ``df`` with imblearn's ``SMOTENC``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the module-level ``target`` column.
    strategy : default 'auto'
        Forwarded as ``sampling_strategy``.
    categorical_features : list of int, optional
        Positions of the categorical columns in the feature frame, i.e.
        after the ``target`` column is dropped. Defaults to ``[2, 3]``,
        the value that used to be hard-coded.
        NOTE(review): with the column order shown by ``df.head()``,
        positions 2 and 3 are ``Income`` and ``ZIP Code``; confirm these are
        the columns that should be treated as categorical.

    Returns
    -------
    pandas.DataFrame
        The resampled features with the ``target`` column appended.
    """
    if categorical_features is None:
        # historical default, kept for backward compatibility
        categorical_features = [2, 3]

    smnc = SMOTENC(
        sampling_strategy=strategy,
        random_state=0,            # for reproducibility
        k_neighbors=5,
        n_jobs=4,
        categorical_features=categorical_features
    )

    # separate features and label
    x_train = df.drop(columns=[target])
    y_train = df[target]

    # fit and apply the transform
    x_train_resampled, y_train_resampled = smnc.fit_resample(x_train, y_train)

    # merging y to x
    x_train_resampled[target] = y_train_resampled
    return x_train_resampled

In [82]:
run_method(apply_smotenc_oversampling, df[:100])

Before run size majority:88, minority:12
After run size majority:88, minority:88


4. ADASYN [notebook](https://github.com/solegalli/machine-learning-imbalanced-data/blob/master/Section-05-Oversampling/05-04-ADASYN.ipynb), [Document]()

In [83]:
def apply_adasyn_oversampling(df, strategy='auto'):
    """Over-sample ``df`` with imblearn's ``ADASYN``.

    ``strategy`` is forwarded as ``sampling_strategy``. The sampler uses 5
    neighbours, ``random_state=0`` and ``n_jobs=4``.

    Returns the resampled features with the ``target`` column appended.
    """
    labels = df[target]
    features = df.drop(columns=[target])

    sampler = ADASYN(
        n_jobs=4,
        n_neighbors=5,
        random_state=0,   # reproducible synthesis
        sampling_strategy=strategy,
    )
    synthetic, synthetic_labels = sampler.fit_resample(features, labels)

    # re-attach the label column to the resampled features
    synthetic[target] = synthetic_labels
    return synthetic

In [84]:
run_method(apply_adasyn_oversampling, df[:100])

Before run size majority:88, minority:12
After run size majority:88, minority:84


5. BORDERLINE SMOTE [notebook](https://github.com/solegalli/machine-learning-imbalanced-data/blob/master/Section-05-Oversampling/05-05-Borderline-SMOTE.ipynb), [Document]()

In [85]:
def apply_borderline1_oversampling(df, strategy='auto'):
    """Over-sample ``df`` with imblearn's ``BorderlineSMOTE`` (``kind='borderline-1'``).

    ``strategy`` is forwarded as ``sampling_strategy``. The sampler runs with
    ``k_neighbors=5``, ``m_neighbors=10``, ``random_state=0`` and ``n_jobs=4``.

    Returns the resampled features with the ``target`` column appended.
    """
    features, labels = df.drop(columns=[target]), df[target]

    sampler_kwargs = dict(
        sampling_strategy=strategy,
        random_state=0,   # reproducible synthesis
        k_neighbors=5,
        m_neighbors=10,
        kind='borderline-1',
        n_jobs=4,
    )
    synthetic, synthetic_labels = BorderlineSMOTE(
        **sampler_kwargs).fit_resample(features, labels)

    # re-attach the label column to the resampled features
    synthetic[target] = synthetic_labels
    return synthetic

In [86]:
run_method(apply_borderline1_oversampling, df[:100])

Before run size majority:88, minority:12
After run size majority:88, minority:88


In [87]:
def apply_borderline2_oversampling(df, strategy='auto'):
    """Over-sample ``df`` with imblearn's ``BorderlineSMOTE`` (``kind='borderline-2'``).

    ``strategy`` is forwarded as ``sampling_strategy``. The sampler runs with
    ``k_neighbors=5``, ``m_neighbors=10``, ``random_state=0`` and ``n_jobs=4``.

    Returns the resampled features with the ``target`` column appended.
    """
    features, labels = df.drop(columns=[target]), df[target]

    sampler_kwargs = dict(
        sampling_strategy=strategy,
        random_state=0,   # reproducible synthesis
        k_neighbors=5,
        m_neighbors=10,
        kind='borderline-2',
        n_jobs=4,
    )
    synthetic, synthetic_labels = BorderlineSMOTE(
        **sampler_kwargs).fit_resample(features, labels)

    # re-attach the label column to the resampled features
    synthetic[target] = synthetic_labels
    return synthetic

In [88]:
run_method(apply_borderline2_oversampling, df[:100])

Before run size majority:88, minority:12
After run size majority:88, minority:87


6. SVM SMOTE [notebook](https://github.com/solegalli/machine-learning-imbalanced-data/blob/master/Section-05-Oversampling/05-06-SVM-SMOTE.ipynb), [Document]()

In [93]:
def apply_svmsmote_oversampling(df, strategy='auto'):
    """Over-sample ``df`` with imblearn's ``SVMSMOTE``.

    ``strategy`` is forwarded as ``sampling_strategy``. The sampler runs with
    a linear-kernel ``SVC`` as ``svm_estimator``, ``k_neighbors=5``,
    ``m_neighbors=10``, ``random_state=0`` and ``n_jobs=4``.

    Returns the resampled features with the ``target`` column appended.
    """
    labels = df[target]
    features = df.drop(columns=[target])

    sampler = SVMSMOTE(
        svm_estimator=SVC(kernel='linear'),
        n_jobs=4,
        m_neighbors=10,
        k_neighbors=5,
        random_state=0,   # reproducible synthesis
        sampling_strategy=strategy,
    )
    synthetic, synthetic_labels = sampler.fit_resample(features, labels)

    # re-attach the label column to the resampled features
    synthetic[target] = synthetic_labels
    return synthetic

In [94]:
run_method(apply_svmsmote_oversampling, df[:100])

Before run size majority:88, minority:12
After run size majority:88, minority:57


7. K-Means SMOTE [notebook](https://github.com/solegalli/machine-learning-imbalanced-data/blob/master/Section-05-Oversampling/05-07-K-Means-SMOTE.ipynb), [Document]()

In [98]:
def apply_kmeanssmote_oversampling(df, strategy='auto'):
    """Over-sample ``df`` with imblearn's ``KMeansSMOTE``.

    ``strategy`` is forwarded as ``sampling_strategy``. Clustering is done by
    ``KMeans(n_clusters=3, random_state=0)``; the sampler runs with
    ``k_neighbors=2``, ``cluster_balance_threshold=0.1``,
    ``density_exponent='auto'``, ``random_state=0`` and ``n_jobs=None``.

    Returns the resampled features with the ``target`` column appended.
    """
    features, labels = df.drop(columns=[target]), df[target]

    sampler_kwargs = dict(
        sampling_strategy=strategy,
        random_state=0,   # reproducible synthesis
        k_neighbors=2,
        n_jobs=None,
        kmeans_estimator=KMeans(n_clusters=3, random_state=0),
        cluster_balance_threshold=0.1,
        density_exponent='auto',
    )
    synthetic, synthetic_labels = KMeansSMOTE(
        **sampler_kwargs).fit_resample(features, labels)

    # re-attach the label column to the resampled features
    synthetic[target] = synthetic_labels
    return synthetic

In [101]:
run_method(apply_kmeanssmote_oversampling, df[:100])

Before run size majority:88, minority:12
After run size majority:88, minority:89


#### COMBINATION OF UNDER AND OVER SAMPLING

1. SMOTE + ENN [notebook](https://github.com/solegalli/machine-learning-imbalanced-data/blob/master/Section-06-Over-and-Undersampling/06-01-SMOTEENN-and-SMOTETomek.ipynb), [Document]()

In [103]:
def apply_sm_enn_sampling(df, strategy='auto'):
    """Resample ``df`` with SMOTE over-sampling followed by ENN cleaning (``SMOTEENN``).

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the module-level ``target`` column.
    strategy : default 'auto'
        Forwarded as ``sampling_strategy`` to the SMOTE step only. It used
        to be forwarded to the ENN step as well, where an over-sampling
        value such as a float ratio is not meaningful. The ENN step now
        uses ``'all'``, the same setting the sibling SMOTE + Tomek helper
        uses for its cleaning step.

    Returns
    -------
    pandas.DataFrame
        The resampled features with the ``target`` column appended.
    """
    # over-sampling step
    sm = SMOTE(
        sampling_strategy=strategy,
        random_state=0,  # for reproducibility
        k_neighbors=5,
        n_jobs=4
    )

    # cleaning step, passed to SMOTEENN as its ``enn`` argument
    enn = EditedNearestNeighbours(
        sampling_strategy='all',  # cleaning setting, independent of ``strategy``
        n_neighbors=3,
        kind_sel='all',
        n_jobs=4)

    smenn = SMOTEENN(
        sampling_strategy='auto',
        random_state=0,  # for reproducibility
        smote=sm,
        enn=enn,
        n_jobs=4
    )

    # separate features and label
    x_train = df.drop(columns=[target])
    y_train = df[target]

    # fit and apply the transform
    x_train_resampled, y_train_resampled = smenn.fit_resample(x_train, y_train)

    # merging y to x
    x_train_resampled[target] = y_train_resampled
    return x_train_resampled

In [104]:
run_method(apply_sm_enn_sampling, df[:100])

Before run size majority:88, minority:12
After run size majority:88, minority:88


2. SMOTE + Tomek [notebook](https://github.com/solegalli/machine-learning-imbalanced-data/blob/master/Section-06-Over-and-Undersampling/06-01-SMOTEENN-and-SMOTETomek.ipynb), [Document]()

In [105]:
def apply_sm_tomek_sampling(df, strategy='auto'):
    """Resample ``df`` with SMOTE followed by Tomek-link removal (``SMOTETomek``).

    ``strategy`` is forwarded as the ``sampling_strategy`` of the SMOTE step.
    The Tomek step is configured with ``sampling_strategy='all'`` and the
    combined sampler with ``sampling_strategy='auto'``.

    Returns the resampled features with the ``target`` column appended.
    """
    features, labels = df.drop(columns=[target]), df[target]

    # both steps are handed to SMOTETomek as ready-made objects
    combined = SMOTETomek(
        sampling_strategy='auto',
        random_state=0,
        smote=SMOTE(
            sampling_strategy=strategy,
            random_state=0,
            k_neighbors=5,
            n_jobs=4,
        ),
        tomek=TomekLinks(sampling_strategy='all', n_jobs=4),
        n_jobs=4,
    )
    balanced, balanced_labels = combined.fit_resample(features, labels)

    # re-attach the label column to the resampled features
    balanced[target] = balanced_labels
    return balanced

In [106]:
run_method(apply_sm_tomek_sampling, df[:100])

Before run size majority:88, minority:12
After run size majority:88, minority:88


#### ENSEMBLE IMBALANCED LEARNING TECHNIQUE
* *TODO: imblearn ships techniques that combine a model with sampling; review whether to move this section to a more appropriate place*, as these techniques are different from the pure data sampling methods above

In [107]:
# just re-sampling methods (no classifier)

# Plain re-sampling methods (no classifier), looked up by short name.
resampling_dict = dict(
    # random under-sampling, drawn without replacement
    random=RandomUnderSampler(
        sampling_strategy='auto',
        random_state=0,
        replacement=False,
    ),
    # SMOTE over-sampling
    smote=SMOTE(
        sampling_strategy='auto',
        random_state=0,
        k_neighbors=5,
        n_jobs=4,
    ),
)

In [108]:
# ensemble methods (with or without resampling)

# Ensemble models (with or without resampling), looked up by short name.
# Entries are added in the order they are later evaluated.
ensemble_dict = {}

# balanced random forests (bagging)
ensemble_dict['balancedRF'] = BalancedRandomForestClassifier(
    n_estimators=20,
    criterion='gini',
    max_depth=3,
    sampling_strategy='auto',
    n_jobs=4,
    random_state=2909,
)

# bagging of logistic regression, no resampling
ensemble_dict['bagging'] = BaggingClassifier(
    base_estimator=LogisticRegression(random_state=2909),
    n_estimators=20,
    n_jobs=4,
    random_state=2909,
)

# bagging of logistic regression, with resampling
ensemble_dict['balancedbagging'] = BalancedBaggingClassifier(
    base_estimator=LogisticRegression(random_state=2909),
    n_estimators=20,
    max_samples=1.0,   # the number of samples to draw from X to train each base estimator
    max_features=1.0,  # the number of features to draw from X to train each base estimator
    bootstrap=True,
    bootstrap_features=False,
    sampling_strategy='auto',
    n_jobs=4,
    random_state=2909,
)

# boosting + under-sampling
ensemble_dict['rusboost'] = RUSBoostClassifier(
    base_estimator=None,
    n_estimators=20,
    learning_rate=1.0,
    sampling_strategy='auto',
    random_state=2909,
)

# bagging + boosting + under-sampling
ensemble_dict['easyEnsemble'] = EasyEnsembleClassifier(
    n_estimators=20,
    sampling_strategy='auto',
    n_jobs=4,
    random_state=2909,
)

In [117]:
## function to train an ensemble model and evaluate its performance (ROC-AUC on the data it was trained on)

# ensemble = ensemble_dict['choose_ensemble_technique']

def run_ensemble(model_name, X_train, y_train):
    """Fit one model from ``ensemble_dict`` and report its ROC-AUC.

    Parameters
    ----------
    model_name : str
        Key into the module-level ``ensemble_dict``.
    X_train, y_train
        Features and labels. The model is fitted on them and scored on the
        same data, so the value is a training-set score.

    Returns
    -------
    float
        ROC-AUC computed from the second column of ``predict_proba``.

    Notes
    -----
    The estimator stored in ``ensemble_dict`` is fitted in place.
    """
    print("{0}".format(model_name), end=': ')

    ensemble = ensemble_dict[model_name]
    ensemble.fit(X_train, y_train)
    pred = ensemble.predict_proba(X_train)

    # compute the score once and reuse it for both the printout and the return value
    score = roc_auc_score(y_train, pred[:, 1])
    print("roc_auc: {0}".format(score))

    return score

In [118]:
# fit and score every configured ensemble on the full dataset
for model_name in ensemble_dict:
    run_ensemble(model_name, df.drop(columns=[target]), df[target])

balancedRF: roc_auc: 0.9743754609144544
bagging: roc_auc: 0.959160674778761
balancedbagging: roc_auc: 0.9609554756637169
rusboost: roc_auc: 0.9831065634218289
easyEnsemble: roc_auc: 0.9853774889380531


#### Cost Sensitive learning approaches
* **Misclassification cost as part of learning**
    1. Setting `class_weight` when the estimator is created, for those estimators that allow it. It can take the values `None`, `'balanced'`, or a dict such as `{0: 1, 1: 10}` (misclassification of class 1 is penalized 10 times more than class 0).
    2. Passing a `sample_weight` vector with a weight for every single observation when we *fit the estimator*. The sample weight is a vector of the same length as y, containing the weight or penalty for each individual observation. It is more flexible, as it lets us set weights per observation rather than per class.
    NOTE: costs such as `class_weight` can be optimized using hyperparameter optimization techniques.
    
* **MetaCost learning** - This is a more recent method and has most likely not been introduced in the popular libraries. The idea is to use the conditional risk of misclassifying the observations, based on Bayes conditional probabilities. For an example, refer to the [notebook](https://github.com/solegalli/machine-learning-imbalanced-data/blob/master/Section-08-Cost-Sensitive-Learning/08-03-MetaCost.ipynb) and the video in the Udemy course.

#### Probability Calibration
* Refer to the slides [here](https://amueller.github.io/COMS4995-s20/slides/aml-10-calibration-imbalanced-data/#53) for understanding the topic
* Refer [notebook](https://github.com/solegalli/machine-learning-imbalanced-data/tree/master/Section-09-Probability-Calibration) here for example for calibrated classifiers.

### Save transformed dataset

In [119]:
# df_balanced.head()
# df_balanced.to_excel('../Bank_Personal_Loan_Modelling_balanced.xlsx')