In [1]:
%pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Installing collected packages: imblearn
Successfully installed imblearn-0.0
Note: you may need to restart the kernel to use updated packages.


## Expand a labeled dataset with SMOTE-style oversampling

Increase the number of minority (exceptional) cases in the sequence dataset by oversampling. The primary target is the binary binned dataset.

[reference 1](<ver5-ordinal-binning-grid-searches/step 2-0, ranged clustering, with time.ipynb>)

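Oversampling tools compared: naive random oversampling, SMOTE, ADASYN, BorderlineSMOTE, SMOTENC, SVMSMOTE and KMeansSMOTE, plus the combined over/under-samplers SMOTEENN and SMOTETomek.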

In [2]:
import time
from collections import Counter

import altair as alt
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import shap
from scipy import stats

from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier, RUSBoostClassifier
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN, BorderlineSMOTE, SVMSMOTE, SMOTENC, KMeansSMOTE

from sklearn.cluster import MiniBatchKMeans
from sklearn.datasets import make_classification
from sklearn.ensemble import HistGradientBoostingClassifier, GradientBoostingClassifier, RandomForestClassifier, \
ExtraTreesClassifier, RandomTreesEmbedding, BaggingClassifier, VotingClassifier
from sklearn.inspection import permutation_importance
from sklearn.linear_model import RidgeCV, LassoCV, LogisticRegression, RidgeClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


In [4]:
# Load the binary-binned pipeline table that the rest of the notebook works on.
m2_pipeline = pd.read_csv(filepath_or_buffer="binary_binned_pipeline.csv")

In [5]:
# Show the column names so the position of any categorical feature can be looked up
# for the `categorical_features` arguments used in the sampler setup below.
m2_pipeline.columns

Index(['Unnamed: 0', 'precursor_buy_cap_pct_change',
       'precursor_ask_cap_pct_change', 'precursor_bid_vol_pct_change',
       'precursor_ask_vol_pct_change', 'sum_change', 'length',
       'surge_targets_met_pct', 'time', 'label'],
      dtype='object')

In [None]:
# Oversampling toolset: every build_* function loads the labelled dataset `d`, rebalances it
# with one imbalanced-learn sampler, prints the resulting class counts and returns
# (X_resampled, y_resampled). Features stay a DataFrame and labels a Series so callers can use `.values`.

def _load_features_and_labels(d, label_col="label"):
    """Split the dataset ``d`` into a feature frame and a label series.

    Parameters
    ----------
    d : str, path-like, file-like or pandas.DataFrame
        Anything ``pandas.read_csv`` can read, or an already loaded frame.
    label_col : str, default "label"
        Name of the target column.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        Features (without the label and without "Unnamed: N" columns) and labels.
    """
    frame = d if isinstance(d, pd.DataFrame) else pd.read_csv(d)
    # pandas names header-less CSV columns "Unnamed: N".
    # NOTE(review): presumably a row index written by an earlier to_csv() - confirm it is not a real feature.
    index_like = [col for col in frame.columns if str(col).startswith("Unnamed:")]
    X = frame.drop(columns=index_like + [label_col])
    y = frame[label_col]
    return X, y


def _resample(sampler, d):
    """Fit ``sampler`` on the features/labels loaded from ``d`` and return its resampled pair."""
    X, y = _load_features_and_labels(d)
    return sampler.fit_resample(X, y)


def build_naive(d):  #https://imbalanced-learn.org/stable/over_sampling.html#naive-random-over-sampling
    """Random over-sampling of the minority class of dataset ``d``."""
    ros = RandomOverSampler(random_state=42, sampling_strategy='minority')
    X_resampled, y_resampled = _resample(ros, d)
    print("ROS",sorted(Counter(y_resampled).items()))
    return X_resampled, y_resampled

def build_smote(d): #https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.SMOTE.html#smote
    """SMOTE over-sampling of dataset ``d``.

    The categorical_* keyword arguments previously passed here belong to SMOTENC,
    not SMOTE, so they are no longer forwarded.
    """
    X_resampled, y_resampled = _resample(SMOTE(random_state=42), d)
    print("SMOTE",sorted(Counter(y_resampled).items()))
    return X_resampled, y_resampled

def build_adasyn(d): #https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.ADASYN.html#adasyn
    """ADASYN over-sampling of the minority class of dataset ``d``."""
    X_resampled, y_resampled = _resample(ADASYN(random_state=42,sampling_strategy='minority'), d)
    print("ADASYN",sorted(Counter(y_resampled).items()))
    return X_resampled, y_resampled

def build_borderline(d): #https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.BorderlineSMOTE.html#borderlinesmote
    """BorderlineSMOTE over-sampling of the minority class of dataset ``d``."""
    X_resampled, y_resampled = _resample(BorderlineSMOTE(random_state=42,sampling_strategy='minority'), d)
    print("BORDERLINE",sorted(Counter(y_resampled).items()))
    return X_resampled, y_resampled

def build_smotenc(d, categorical_features="infer"): #https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.SMOTENC.html#smotenc
    """SMOTENC over-sampling of the minority class of dataset ``d``.

    Parameters
    ----------
    d : see ``_load_features_and_labels``.
    categorical_features : "infer" or array-like, default "infer"
        With "infer", a boolean mask is computed here that marks every
        non-numeric feature column as categorical. Any other value is handed
        to SMOTENC unchanged (indices, names or a boolean mask).

    Notes
    -----
    NOTE(review): SMOTENC is expected to reject data without any categorical
    feature; if every feature column is numeric, pass the categorical columns
    explicitly or use another build_* function.
    """
    X, y = _load_features_and_labels(d)
    if isinstance(categorical_features, str) and categorical_features == "infer":
        # Computed locally so behaviour does not depend on which string shortcuts the sampler accepts.
        categorical_features = [not pd.api.types.is_numeric_dtype(dtype) for dtype in X.dtypes]
    smote_nc = SMOTENC(categorical_features=categorical_features, random_state=42,sampling_strategy='minority')
    X_resampled, y_resampled = smote_nc.fit_resample(X, y)
    print("SMOTENC",sorted(Counter(y_resampled).items()))
    return X_resampled, y_resampled

def build_svmsmote(d): #https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.SVMSMOTE.html
    """SVMSMOTE over-sampling of dataset ``d``."""
    sm = SVMSMOTE(random_state=42)
    X_res, y_res = _resample(sm, d)
    print("SVMSMOTE",'Resampled dataset shape %s' % Counter(y_res))
    return X_res, y_res

def build_kmsmote(d): #https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.KMeansSMOTE.html
    """KMeansSMOTE over-sampling of the minority class of dataset ``d``.

    Reports the class counts like the sibling builders; the former
    "middle blob" count came from a documentation example and indexed the
    result with ``[:, 0]``, which a DataFrame does not support.
    """
    sm = KMeansSMOTE(kmeans_estimator=MiniBatchKMeans(n_init=1, random_state=42), random_state=42,sampling_strategy='minority')
    X_res, y_res = _resample(sm, d)
    print("KMSMOTE",sorted(Counter(y_res).items()))
    return X_res, y_res

def build_smoteenn(d): #https://imbalanced-learn.org/stable/combine.html#combination-of-over-and-under-sampling
    """SMOTE over-sampling followed by Edited Nearest Neighbours cleaning of dataset ``d``."""
    smote_enn = SMOTEENN(random_state=42,sampling_strategy='minority')
    X_resampled, y_resampled = _resample(smote_enn, d)
    print("SMOTEENN",sorted(Counter(y_resampled).items()))
    return X_resampled, y_resampled

def build_smotetomek(d): #https://imbalanced-learn.org/stable/combine.html#combination-of-over-and-under-sampling
    """SMOTE over-sampling followed by Tomek-link cleaning of dataset ``d``."""
    smote_tomek = SMOTETomek(random_state=42,sampling_strategy='minority')
    X_resampled, y_resampled = _resample(smote_tomek, d)
    print("SMOTETOMEK",sorted(Counter(y_resampled).items()))
    return X_resampled, y_resampled

### Find the optimal classifier parameters for each oversampling technique

In [None]:

# Split one resampled dataset into train/test, then grid-search a fixed panel of classifiers on it.
def getBestClassifier(name, dataset):   #'kmsmote', build_svmsmote(source)
    """Grid-search several classifiers on one dataset and report the best of each.

    Parameters
    ----------
    name : str
        Label of the oversampling method that produced ``dataset``; copied into
        every result row under the "sampler" key.
    dataset : tuple
        ``(X, y)`` pair; pandas objects (as returned by the build_* functions)
        or plain arrays.

    Returns
    -------
    list of dict
        One row per classifier with the keys "sampler", "classifier",
        "best_params" and "accuracy" (score of the refit best estimator on the
        held-out 20 % split).

    Notes
    -----
    The split happens after resampling, so resampled rows can end up in the test
    fold and the reported accuracy is optimistic (see the imbalanced-learn
    "common pitfalls" page on data leakage).
    """
    features, labels = dataset[0], dataset[1]
    X = getattr(features, "values", features)
    y = getattr(labels, "values", labels)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    scaler = StandardScaler()  #normalize all numeric columns
    X_train_scaled = scaler.fit_transform(X_train)
    # The test fold must go through the scaler fitted on the training fold,
    # otherwise the model is scored on differently scaled inputs.
    X_test_scaled = scaler.transform(X_test)

    classifiers = [  # Define the classifiers; their hyperparameter grids are keyed by class name below
        RandomForestClassifier(),
        GradientBoostingClassifier(),
        HistGradientBoostingClassifier(),
        ExtraTreesClassifier(),
        BaggingClassifier(),
        # RidgeCV/LassoCV are regressors (their score is R^2, not accuracy). Ridge is
        # replaced by its classifier counterpart; the L1 case is covered by the
        # 'l1' penalty of LogisticRegression below.
        RidgeClassifier(),
        SVC(),
        LogisticRegression(),
        BernoulliNB(),
        KNeighborsClassifier(),
    ]
    # 'saga' is used for LogisticRegression because it supports every penalty searched here.
    logreg_common = {'solver': ['saga'], 'max_iter': [1000], 'random_state': [42]}
    params = {
        'RandomForestClassifier': {'n_estimators': [10, 100, 1000], 'max_depth': [None, 10, 100]},
        'GradientBoostingClassifier': {'n_estimators': [10, 100, 1000], 'max_depth': [None, 10, 100]},
        'HistGradientBoostingClassifier': {'learning_rate': [0.1, 0.01], 'max_iter': [100, 200]},
        'ExtraTreesClassifier':{'n_estimators': [10, 100, 1000], 'max_depth': [None, 10, 100]},
        'BaggingClassifier':{ 'n_estimators':[10],  'random_state':[42]},
        'RidgeClassifier':{'alpha':[0.1, 1.0, 10.0]},
        'SVC':{'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
        'LogisticRegression':[
            {'C': [0.1, 1, 10], 'penalty': ['l1', 'l2'], **logreg_common},
            # elasticnet additionally needs an l1_ratio
            {'C': [0.1, 1, 10], 'penalty': ['elasticnet'], 'l1_ratio': [0.5], **logreg_common},
            # unpenalised fit: the None object, not the string 'None'; C has no effect here
            {'penalty': [None], **logreg_common},
        ],
        'BernoulliNB':{'fit_prior':[True, False]},
        # n_jobs only changes parallelism, never the predictions, so it is not searched
        'KNeighborsClassifier':{'n_neighbors':[3,4,5,6,7,8], 'algorithm':['auto']},
    }
    comparative = []
    # Perform the grid search
    for clf in classifiers:
        clf_name = clf.__class__.__name__  # kept separate from `name`, the sampler label
        if clf_name not in params:
            continue
        grid_search = GridSearchCV(clf, params[clf_name], cv=5)
        grid_search.fit(X_train_scaled, y_train)

        print(f"Best parameters for {clf_name}: {grid_search.best_params_}")
        accuracy = grid_search.score(X_test_scaled, y_test)

        comparative.append({
            "sampler": name,
            "classifier": clf_name,
            "best_params": grid_search.best_params_,
            "accuracy": accuracy,
        })
    return comparative
    # dg = pd.DataFrame(comparative) #display grid

In [None]:
#trigger the search mechanism, per SMOTE method

# For each oversampling method: build the resampled dataset, run the classifier
# search on it, and collect the per-classifier result rows in one list.
source = 'binary_binned_pipeline.csv'
builders = {
    'naive': build_naive,
    'smote': build_smote,
    'adasyn': build_adasyn,
    'borderline': build_borderline,
    'smotenc': build_smotenc,
    'svmsmote': build_svmsmote,
    'kmsmote': build_kmsmote,
    'smoteenn': build_smoteenn,
    'smotetomek': build_smotetomek,
}

resultSet = []
for method, build in builders.items():
    rows = getBestClassifier(method, build(source))  #many rows
    for row in rows:
        # record which oversampling method produced the row, unless it is already tagged
        row.setdefault("sampler", method)
    resultSet.extend(rows)  # lists have no .concat(); extend appends all rows

optimals = pd.DataFrame(resultSet)
# rank every (sampler, classifier) combination, best accuracy first
optimals.sort_values("accuracy", ascending=False)

[pitfalls of oversampling](https://imbalanced-learn.org/stable/common_pitfalls.html#data-leakage)

## sampling based ensemble methods

Score each sampling-based ensemble method individually, then check whether they can be combined into a voting classifier.

[validation curve model selection](https://imbalanced-learn.org/stable/auto_examples/model_selection/plot_validation_curve.html#plotting-validation-curves)

In [None]:
# balanced bagger

# Build the train/test split here so the cell runs on a fresh kernel; no earlier cell defines it.
# NOTE(review): the intended feature list is not visible in the notebook; every column except
# the label and the "Unnamed: 0" column is used - confirm.
keepable = [col for col in m2_pipeline.columns if col not in ('Unnamed: 0', 'label')]
X = m2_pipeline[keepable].values
y = m2_pipeline['label'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# `estimator` is the current name of the former `base_estimator` argument.
bbc = BalancedBaggingClassifier(estimator=DecisionTreeClassifier(),
                                sampling_strategy='not majority',
                                replacement=False,
                                random_state=42)
bbc.fit(X_train, y_train)
y_pred = bbc.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
#balanced tree estimator
# Fit a BalancedRandomForestClassifier on the training split and show its balanced
# accuracy on the test split. Expects X_train, X_test, y_train and y_test to exist
# in the kernel namespace already.
brf = BalancedRandomForestClassifier(
    sampling_strategy="all",
    replacement=True,
    n_estimators=100,
    random_state=42,
)
y_pred = brf.fit(X_train, y_train).predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Fit a RUSBoostClassifier on the training split and show its balanced accuracy on
# the test split. Expects X_train, X_test, y_train and y_test to exist in the
# kernel namespace already.
# NOTE(review): 'SAMME.R' may be deprecated in recent scikit-learn / imbalanced-learn
# releases - confirm against the installed versions.
rusboost = RUSBoostClassifier(
    random_state=42,
    algorithm='SAMME.R',
    n_estimators=200,
)
y_pred = rusboost.fit(X_train, y_train).predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Compare the individual classifiers with a weighted hard-voting ensemble.
# shap is deliberately not used on this model: explaining it was estimated to take about
# 30 hours; it would work better on a single tree.

# NOTE(review): the intended feature list is not visible in the notebook; every column except
# the label and the "Unnamed: 0" column is used - confirm.
keepable = [col for col in m2_pipeline.columns if col not in ('Unnamed: 0', 'label')]
X = m2_pipeline[keepable].values  #.drop(columns=['label']).values #per https://stackoverflow.com/a/73095562/12001832
y = m2_pipeline['label'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#normalize all numeric columns
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Base estimators. They were never defined anywhere in the notebook, so defaults with a
# fixed seed are used. TODO: substitute the best_params found by the grid search above.
rfc = RandomForestClassifier(random_state=42)
gbc = GradientBoostingClassifier(random_state=42)
hgbc = HistGradientBoostingClassifier(random_state=42)
etc = ExtraTreesClassifier(random_state=42)
bc = BaggingClassifier(random_state=42)
svc = SVC(random_state=42)

# Define VotingClassifier (one weight per voting estimator; svc is scored but does not vote)
weights = [0.1, 0.5, 0.1, 0.1, 0.2]
vc = VotingClassifier(estimators=[('rfc',rfc),('gbc',gbc),('hgbc',hgbc),('etc',etc),('bc',bc)], voting='hard', weights=weights)

models = {
    'RandomForestClassifier': rfc,
    'GradientBoostingClassifier': gbc,
    'HistGradientBoostingClassifier': hgbc,
    'ExtraTreesClassifier': etc,
    'BaggingClassifier': bc,
    'SVC': svc,
    'Voting': vc,
}

# fit all, so the fitted models stay available to later cells
for clf in models.values():
    clf.fit(X_train_scaled, y_train)

# voting classifier scoring: 5-fold cross-validated accuracy for every model
for label, clf in models.items():
    scores = cross_val_score(clf, X_train_scaled, y_train, scoring='accuracy', cv=5)
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))