In [2]:
from random import randint

import matplotlib.pyplot as plt
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

Using TensorFlow backend.


In [4]:
def smote(X, Y, ratio = 1, random_state = None):
    """Oversample a dataset with imblearn's SMOTE.

    Parameters
    ----------
    X : feature rows, passed unchanged to ``SMOTE.fit_resample``.
    Y : class labels, one per row of ``X``.
    ratio : forwarded as SMOTE's ``sampling_strategy``.
    random_state : forwarded as SMOTE's ``random_state``. The default
        ``None`` keeps the previous unseeded behaviour; pass an int to make
        the synthetic samples reproducible across runs.

    Returns
    -------
    The ``(X_resampled, Y_resampled)`` pair returned by ``fit_resample``.
    """
    sampler = SMOTE(sampling_strategy = ratio, random_state = random_state)
    return sampler.fit_resample(X, Y)

In [7]:
# Toy imbalanced dataset: 20 class-0 rows (one 4-row pattern repeated 5 times)
# followed by 10 class-1 rows.
X = [
    list(row)
    for _ in range(5)
    for row in ([100, 1, 2], [150, 0.2, 4], [123, 6, 4], [200, 6, 9])
] + [
    [3, 4, 5], [5, 6, 7], [3.1, 4, 5], [5.2, 6, 7], [3.6, 4, 5],
    [5.3, 6, 7], [3.5, 4, 5], [5.6, 6, 7], [3.9, 4, 5], [9, 6, 7],
]
Y = [0] * 20 + [1] * 10

# Oversample with the default ratio and show the resampled rows and labels.
X_new, Y_new = smote(X, Y)
print(X_new)
print(Y_new)

[[100.0, 1.0, 2.0], [150.0, 0.2, 4.0], [123.0, 6.0, 4.0], [200.0, 6.0, 9.0], [100.0, 1.0, 2.0], [150.0, 0.2, 4.0], [123.0, 6.0, 4.0], [200.0, 6.0, 9.0], [100.0, 1.0, 2.0], [150.0, 0.2, 4.0], [123.0, 6.0, 4.0], [200.0, 6.0, 9.0], [100.0, 1.0, 2.0], [150.0, 0.2, 4.0], [123.0, 6.0, 4.0], [200.0, 6.0, 9.0], [100.0, 1.0, 2.0], [150.0, 0.2, 4.0], [123.0, 6.0, 4.0], [200.0, 6.0, 9.0], [3.0, 4.0, 5.0], [5.0, 6.0, 7.0], [3.1, 4.0, 5.0], [5.2, 6.0, 7.0], [3.6, 4.0, 5.0], [5.3, 6.0, 7.0], [3.5, 4.0, 5.0], [5.6, 6.0, 7.0], [3.9, 4.0, 5.0], [9.0, 6.0, 7.0], [3.6181726959910634, 4.0, 5.0], [3.2280618210836685, 4.0, 5.0], [3.2953696389781566, 4.0, 5.0], [5.052720053470037, 6.0, 7.0], [3.7377577695010022, 4.0, 5.0], [5.135033550917499, 6.0, 7.0], [7.61633258500434, 6.0, 7.0], [3.2677456752594294, 4.0, 5.0], [5.19519847342316, 6.0, 7.0], [3.4278388767986323, 4.0, 5.0]]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [None]:
# Assumption: All the minority class samples appear after majority class samples
def esmote(X, Y, ratio = 1, num_candidates = 10, iter = 15, clf = None):
    """Evolve a population of SMOTE-resampled datasets and return the fittest.

    ``num_candidates`` resampled copies of ``(X, Y)`` are generated with
    ``smote``. Each is scored by the mean 5-fold cross-validated F1 of ``clf``.
    For ``iter`` generations the ``2 * int(0.2 * num_candidates)`` lowest
    scoring candidates are dropped and replaced by children obtained through
    single-point crossover of surviving parents, which are drawn with
    probability proportional to their score.

    Parameters
    ----------
    X, Y : features and labels, forwarded to ``smote``.
    ratio : forwarded to ``smote``.
    num_candidates : population size (must be >= 1). With fewer than 5
        candidates no pairs are formed and no evolution takes place.
    iter : number of generations. The name shadows the builtin and is kept
        only so existing keyword callers keep working.
    clf : estimator handed to ``cross_val_score``. Required; it defaults to
        ``None`` only because it follows defaulted parameters.

    Returns
    -------
    ``(X_best, Y_best)`` as NumPy arrays: the highest-scoring candidate.

    Raises
    ------
    ValueError : if ``clf`` is missing or ``num_candidates`` < 1.

    Notes
    -----
    * The crossover point is drawn from ``[len(X), len(resampled) - 1]``, so
      the first ``len(X)`` rows are never mixed. This presumes ``smote``
      appends synthetic rows after the original ones - TODO confirm for the
      installed imblearn version.
    * Children reuse a parent's label vector, which presumes every candidate
      has the same label layout - TODO confirm for multi-class data.
    * ``scoring='f1'`` presumably requires binary labels - verify before
      using other label sets.
    """
    if clf is None:
        raise ValueError("esmote requires a classifier; pass clf=<estimator>")
    if num_candidates < 1:
        raise ValueError("num_candidates must be at least 1")

    def fitness(candidate):
        # Mean cross-validated F1; NaN (failed folds) counts as zero fitness
        # so it cannot poison the ranking or the selection probabilities.
        folds = StratifiedKFold(n_splits=5)
        fold_scores = cross_val_score(clf, X = candidate[0], y = candidate[1], scoring = 'f1', cv = folds)
        return float(np.nan_to_num(np.mean(fold_scores)))

    num_pairs = int(num_candidates*0.2)
    original_size = len(X)

    # Generate the candidates (stored as arrays so slicing and concatenation
    # work regardless of the container type smote returns)
    candidates = []
    for _ in range(num_candidates):
        X_res, Y_res = smote(X, Y, ratio)
        candidates.append([np.asarray(X_res), np.asarray(Y_res)])

    res_size = len(candidates[0][0])

    # Fitness is computed once per candidate and carried along with it;
    # only newly created children are scored in later generations.
    f1_scores = [fitness(candidate) for candidate in candidates]

    # Nothing to evolve when no pairs are formed or no synthetic rows exist
    # (randint would otherwise be called on an empty range).
    generations = iter if (num_pairs > 0 and res_size > original_size) else 0

    # Evolution steps
    for _generation in range(generations):
        # Eliminate the least fit candidates, keeping survivors in order
        ranked = np.argsort(f1_scores, kind = 'stable')
        survivors = sorted(int(k) for k in ranked[num_pairs*2:])
        candidates = [candidates[k] for k in survivors]
        f1_scores = [f1_scores[k] for k in survivors]

        # Selection probabilities proportional to fitness; fall back to a
        # uniform draw when fewer than two survivors have non-zero fitness,
        # because two distinct parents could not be drawn otherwise.
        weights = np.asarray(f1_scores, dtype = float)
        if np.count_nonzero(weights) >= 2:
            probabilities = weights / weights.sum()
        else:
            probabilities = None

        # Crossover the remaining candidates with probability
        # proportional to their fitness level
        children = []
        for _pair in range(num_pairs):
            first, second = np.random.choice(len(candidates), size = 2, replace = False, p = probabilities)
            crossover_point = randint(original_size, res_size-1)
            p1_X, p1_Y = candidates[first]
            p2_X, p2_Y = candidates[second]

            # Actual crossover: swap the tails after the crossover point
            c1_X = np.concatenate([p1_X[:crossover_point], p2_X[crossover_point:]])
            c2_X = np.concatenate([p2_X[:crossover_point], p1_X[crossover_point:]])

            children.append([c1_X, p1_Y])
            children.append([c2_X, p2_Y])

        # Add the children to the candidates set
        candidates.extend(children)
        f1_scores.extend(fitness(child) for child in children)

    best = int(np.argmax(f1_scores))
    X_best, Y_best = candidates[best]
    return X_best, Y_best
            
            
            