<a href="https://colab.research.google.com/github/saikanthatluri/Class-Imbalance-comparision-in-Over-Sampling-Methods/blob/main/Over_sampling_Methods_Comparision.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Comparison of different over-sampling algorithms for improving the performance of Random Forests on several datasets with imbalanced classes.




**Importing Libraries**

In [1]:
!pip install -U imbalanced-learn

Collecting imbalanced-learn
[?25l  Downloading https://files.pythonhosted.org/packages/c8/81/8db4d87b03b998fda7c6f835d807c9ae4e3b141f978597b8d7f31600be15/imbalanced_learn-0.7.0-py3-none-any.whl (167kB)
[K     |██                              | 10kB 27.6MB/s eta 0:00:01[K     |████                            | 20kB 31.7MB/s eta 0:00:01[K     |█████▉                          | 30kB 17.6MB/s eta 0:00:01[K     |███████▉                        | 40kB 16.0MB/s eta 0:00:01[K     |█████████▉                      | 51kB 15.2MB/s eta 0:00:01[K     |███████████▊                    | 61kB 13.5MB/s eta 0:00:01[K     |█████████████▊                  | 71kB 13.9MB/s eta 0:00:01[K     |███████████████▊                | 81kB 15.1MB/s eta 0:00:01[K     |█████████████████▋              | 92kB 13.7MB/s eta 0:00:01[K     |███████████████████▋            | 102kB 12.7MB/s eta 0:00:01[K     |█████████████████████▋          | 112kB 12.7MB/s eta 0:00:01[K     |███████████████████████▌  

In [2]:
from collections import Counter

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC


In [4]:
from imblearn.over_sampling import (
    SMOTE,
    BorderlineSMOTE,
    SVMSMOTE,
)

from imblearn.datasets import fetch_datasets

from imblearn.under_sampling import EditedNearestNeighbours, TomekLinks

from imblearn.combine import SMOTEENN, SMOTETomek

In [5]:
# Resamplers under comparison, keyed by the short label that is printed next
# to each result. Every sampler is seeded with random_state=0.
oversampler_dict = dict(
    # SVMSMOTE driven by a linear-kernel SVC
    svm=SVMSMOTE(
        svm_estimator=SVC(kernel='linear'),
        m_neighbors=10,
        k_neighbors=5,
        n_jobs=4,
        random_state=0,
        sampling_strategy='auto',
    ),
    # BorderlineSMOTE, 'borderline-1' kind
    border1=BorderlineSMOTE(
        kind='borderline-1',
        m_neighbors=10,
        k_neighbors=5,
        n_jobs=4,
        random_state=0,
        sampling_strategy='auto',
    ),
    # Plain SMOTE
    smote=SMOTE(
        k_neighbors=5,
        n_jobs=4,
        random_state=0,
        sampling_strategy='auto',
    ),
    # Combined sampler: SMOTE plus EditedNearestNeighbours
    smenn=SMOTEENN(
        smote=SMOTE(k_neighbors=5, random_state=0, sampling_strategy='auto'),
        enn=EditedNearestNeighbours(
            kind_sel='all', n_neighbors=3, sampling_strategy='auto'),
        n_jobs=4,
        random_state=0,
        sampling_strategy='auto',
    ),
    # Combined sampler: SMOTE plus TomekLinks
    smtomek=SMOTETomek(
        smote=SMOTE(k_neighbors=5, random_state=0, sampling_strategy='auto'),
        tomek=TomekLinks(sampling_strategy='all'),
        n_jobs=4,
        random_state=0,
        sampling_strategy='auto',
    ),
)

Fetching Datasets from imblearn

In [6]:
# Names of the imblearn benchmark datasets to evaluate, in processing order
datasets_ls = ['car_eval_34', 'thyroid_sick', 'arrhythmia']


Print Class Imbalance in Dataset

In [7]:
# Load only the requested benchmark datasets, and do it once: calling
# fetch_datasets() with no filter inside the loop re-loads the whole
# collection on every iteration.
benchmark_data = fetch_datasets(filter_data=tuple(datasets_ls))

# Report how many samples each class has in every dataset
for dataset in datasets_ls:
    data = benchmark_data[dataset]
    print(dataset)
    print(Counter(data.target))
    print()

car_eval_34
Counter({-1: 1594, 1: 134})

thyroid_sick
Counter({-1: 3541, 1: 231})

arrhythmia
Counter({-1: 427, 1: 25})



Function to train Random Forest and evaluate performance

In [8]:
def run_randomForests(X_train, X_test, y_train, y_test,
                      n_estimators=100, max_depth=2, random_state=39,
                      n_jobs=4):
    """Fit a random forest and report its ROC-AUC on the train and test splits.

    The forest is fitted on (X_train, y_train). The ROC-AUC of each split is
    printed and the test-set value is returned.

    Parameters
    ----------
    X_train, X_test : feature matrices for the training and test splits.
    y_train, y_test : target labels for the training and test splits.
    n_estimators, max_depth, random_state, n_jobs :
        Forwarded unchanged to RandomForestClassifier. The defaults are the
        values this notebook has always used, so existing four-argument
        calls behave exactly as before.

    Returns
    -------
    The ROC-AUC score on the test split.
    """
    rf = RandomForestClassifier(
        n_estimators=n_estimators,
        random_state=random_state,
        max_depth=max_depth,
        n_jobs=n_jobs)
    rf.fit(X_train, y_train)

    # Column 1 of predict_proba is used as the score.
    # NOTE(review): this assumes a binary target whose second class is the
    # positive (minority) one -- the datasets printed above use labels -1 / 1.
    print('Train set')
    train_auc = roc_auc_score(y_train, rf.predict_proba(X_train)[:, 1])
    print('Random Forests roc-auc: {}'.format(train_auc))

    print('Test set')
    # Computed once and reused for both the report and the return value
    test_auc = roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1])
    print('Random Forests roc-auc: {}'.format(test_auc))

    return test_auc

In [11]:
# Fetch the requested datasets once, instead of re-loading the full benchmark
# collection on every pass through the loop.
benchmark_data = fetch_datasets(filter_data=tuple(datasets_ls))

# Test-set ROC-AUC per dataset and per resampling strategy
# ('baseline' = no resampling), kept so the scores can be compared afterwards
# instead of being discarded.
roc_results = {}

for dataset in datasets_ls:

    print("Dataset :"+dataset)

    data = benchmark_data[dataset]
    roc_results[dataset] = {}

    # separate train and test
    X_train, X_test, y_train, y_test = train_test_split(
        data.data,
        data.target,
        test_size=0.3,
        random_state=0)

    # as some oversampling techniques use KNN
    # we set variables in the same scale
    # (the scaler is fitted on the training split only)
    scaler = MinMaxScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # reference score on the original, imbalanced training data
    roc = run_randomForests(X_train, X_test, y_train, y_test)
    roc_results[dataset]['baseline'] = roc

    print()

    for oversampler, sampler in oversampler_dict.items():

        print(oversampler)

        # resample the training split only; the test split is left untouched
        X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)

        # evaluate performance
        roc = run_randomForests(X_resampled, X_test, y_resampled, y_test)
        roc_results[dataset][oversampler] = roc

        print()

    print()

Dataset :car_eval_34
Train set
Random Forests roc-auc: 0.9581261802905924
Test set
Random Forests roc-auc: 0.9440504133074803

svm
Train set
Random Forests roc-auc: 0.9868208913040686
Test set
Random Forests roc-auc: 0.9754056536381264

border1
Train set
Random Forests roc-auc: 0.9889084801068846
Test set
Random Forests roc-auc: 0.9839524441269516

smote
Train set
Random Forests roc-auc: 0.9898270191801236
Test set
Random Forests roc-auc: 0.9777018063067661

smenn
Train set
Random Forests roc-auc: 0.9885568016854871
Test set
Random Forests roc-auc: 0.9751505255638331

smtomek
Train set
Random Forests roc-auc: 0.9898270191801236
Test set
Random Forests roc-auc: 0.9777018063067661


Dataset :thyroid_sick
Train set
Random Forests roc-auc: 0.9646448684059303
Test set
Random Forests roc-auc: 0.9521203914568843

svm
Train set
Random Forests roc-auc: 0.9694301766481193
Test set
Random Forests roc-auc: 0.9401797254877824

border1
Train set
Random Forests roc-auc: 0.9617160200097215
Test set
Ra