<a href="https://colab.research.google.com/github/heroza/sia-smote/blob/main/SIA_SMOTE%20on%20fmnist%20ir%2040.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Library

In [None]:
# Image geometry (width, height, channels) and number of target classes
# shared by the rest of the notebook.
IMAGE_W, IMAGE_H, IMAGE_C = 28, 28, 1
IMG_SIZE = (IMAGE_W, IMAGE_H)
INPUT_SHAPE = (IMAGE_H, IMAGE_W)
num_classes = 2

In [None]:
# Request TensorFlow 2.x through the Colab line magic; any exception it raises is swallowed.
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Flatten, Dense, Dropout, Lambda
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.datasets import fashion_mnist
from tensorflow.python.keras.utils.vis_utils import plot_model
from tensorflow.keras import backend as K

import numpy as np
import matplotlib.pyplot as plt
from PIL import Image, ImageFont, ImageDraw
import random

from typing_extensions import Counter
from sklearn.utils import shuffle
import heapq
from sklearn.metrics import precision_recall_fscore_support, balanced_accuracy_score, confusion_matrix, accuracy_score, fbeta_score
from imblearn.over_sampling import SMOTE, BorderlineSMOTE
from sklearn.neighbors import NearestNeighbors

Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.


In [None]:
def make_fmnist_im(n_minority_train=150, n_minority_val=25, seed=None):
  """Build an imbalanced binary subset of Fashion-MNIST (class 0 vs. class 1).

  Every class-0 image is kept; class 1 is cut down to create the imbalance.

  Args:
    n_minority_train: number of class-1 training images to keep, drawn at
      random without replacement.
    n_minority_val: number of class-1 validation images to keep (the first
      ones in dataset order).
    seed: optional seed for the random draw of the class-1 training subset.
      With the default ``None`` the global NumPy random state is used, as
      before, so the drawn subset is not reproducible between runs.

  Returns:
    ``(X_train, y_train), (X_val, y_val)``; each pair is shuffled with a
    fixed ``random_state=42``.
  """
  (X_train, y_train), (X_val, y_val) = fashion_mnist.load_data()

  # Fall back to the module-level generator so the default call behaves as before.
  rng = np.random if seed is None else np.random.RandomState(seed)

  # Indices of the training rows we keep: all of class 0 + a random class-1 subset.
  idx_train = np.concatenate((
      np.where(y_train == 0)[0],
      rng.choice(np.where(y_train == 1)[0], n_minority_train, replace=False),
  ))
  X_train, y_train = shuffle(X_train[idx_train], y_train[idx_train], random_state=42)

  # Indices of the validation rows we keep: all of class 0 + the first class-1 rows.
  idx_val = np.concatenate((
      np.where(y_val == 0)[0],
      np.where(y_val == 1)[0][:n_minority_val],
  ))
  X_val, y_val = shuffle(X_val[idx_val], y_val[idx_val], random_state=42)

  return (X_train, y_train), (X_val, y_val)

def biased_get_class(X, y, c):
    """Return the rows of ``X`` and the entries of ``y`` whose label equals ``c``."""
    in_class = (y == c)
    return X[in_class], y[in_class]

def join_data(X_train,y_train,resx1,resy1):
  """Prepend extra samples to a training set.

  Both image arrays are flattened per sample, the extra rows (``resx1`` /
  ``resy1``) are stacked in front of the training rows, and the images are
  reshaped to ``(-1, IMAGE_W, IMAGE_H, IMAGE_C)``.
  """
  flat_train = X_train.reshape(len(X_train), -1)
  flat_extra = resx1.reshape(len(resx1), -1)
  merged_y = np.hstack((resy1, y_train))
  merged_X = np.vstack((flat_extra, flat_train)).reshape(-1, IMAGE_W, IMAGE_H, IMAGE_C)
  return merged_X, merged_y

# Data Loading

In [None]:
# Load the imbalanced Fashion-MNIST subset built by make_fmnist_im()
# (train and validation splits, images plus integer labels).
(X_train, y_train), (X_val, y_val) = make_fmnist_im()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz


In [None]:
# Normalize: cast both splits to float32 and divide the pixel values by 255.
X_train = X_train.astype('float32') / 255.0
X_val = X_val.astype('float32') / 255.0

# Siamese Training

In [None]:
def create_pairs(x, digit_indices):
    '''Build alternating positive / negative sample pairs.

    For every class and every position ``i`` below the smallest class size
    minus one, emits one same-class pair (label 1) followed by one pair with
    a randomly chosen different class (label 0).

    Returns the pairs and their labels as two numpy arrays.
    '''
    pairs, labels = [], []
    n = min(len(digit_indices[d]) for d in range(num_classes)) - 1

    for d in range(num_classes):
        own_indices = digit_indices[d]
        for i in range(n):
            anchor = x[own_indices[i]]
            # positive pair: two consecutive samples of the same class
            pairs.append([anchor, x[own_indices[i + 1]]])
            # negative pair: same anchor with a sample from another class
            other_class = (d + random.randrange(1, num_classes)) % num_classes
            pairs.append([anchor, x[digit_indices[other_class][i]]])
            labels.extend((1, 0))

    return np.array(pairs), np.array(labels)


def create_pairs_on_set(images, labels):
    """Create Siamese training pairs for a labelled image set.

    Collects the sample indices of each class in ``range(num_classes)`` and
    delegates to ``create_pairs``; the pair labels are returned as float32.
    """
    per_class_indices = [np.where(labels == cls)[0] for cls in range(num_classes)]
    pairs, pair_labels = create_pairs(images, per_class_indices)
    return pairs, pair_labels.astype('float32')


def show_image(image):
    """Display ``image`` in a new matplotlib figure with a colorbar and the grid turned off."""
    plt.figure()
    plt.imshow(image)
    plt.colorbar()
    plt.grid(False)
    plt.show()

In [None]:
# create pairs on train and test sets
tr_pairs, tr_y = create_pairs_on_set(X_train, y_train)
ts_pairs, ts_y = create_pairs_on_set(X_val, y_val)
# Count how many positive (1.0) and negative (0.0) pairs each split contains.
print(Counter(tr_y))
print(Counter(ts_y))

Counter({1.0: 298, 0.0: 298})
Counter({1.0: 48, 0.0: 48})


In [None]:
def initialize_base_network():
    """Build the embedding network used by both branches of the Siamese model.

    The network flattens an ``(IMAGE_H, IMAGE_W)`` input and passes it through
    three 128-unit ReLU dense layers, with 10% dropout after the first two.
    """
    base_input = Input(shape=(IMAGE_H, IMAGE_W,), name="base_input")

    features = Flatten(name="flatten_input")(base_input)
    features = Dense(128, activation='relu', name="first_base_dense")(features)
    features = Dropout(0.1, name="first_dropout")(features)
    features = Dense(128, activation='relu', name="second_base_dense")(features)
    features = Dropout(0.1, name="second_dropout")(features)
    features = Dense(128, activation='relu', name="third_base_dense")(features)

    return Model(inputs=base_input, outputs=features)


def euclidean_distance(vects):
    """Return the row-wise Euclidean distance between the two tensors in ``vects``."""
    left, right = vects
    squared_distance = K.sum(K.square(left - right), axis=1, keepdims=True)
    # The argument of the square root is kept at least K.epsilon().
    return K.sqrt(K.maximum(squared_distance, K.epsilon()))


def eucl_dist_output_shape(shapes):
    """Output shape of the distance layer: first dimension of the first input shape, then 1."""
    first_shape, _ = shapes
    leading_dim = first_shape[0]
    return (leading_dim, 1)

def contrastive_loss_with_margin(margin):
    """Return a contrastive-loss function closed over the given ``margin``."""
    def contrastive_loss(y_true, y_pred):
        '''Contrastive loss from Hadsell-et-al.'06
        http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf

        ``y_pred`` is the predicted distance; pairs with ``y_true == 1`` are
        penalised by the squared distance, the others by the squared
        shortfall of the distance below ``margin``.
        '''
        similar_term = y_true * K.square(y_pred)
        dissimilar_term = (1 - y_true) * K.square(K.maximum(margin - y_pred, 0))
        return (similar_term + dissimilar_term)
    return contrastive_loss

def compute_accuracy(y_true, y_pred):
    '''Accuracy of pair classification with a fixed 0.5 distance threshold.

    A pair is predicted "similar" when its flattened predicted distance is
    below 0.5; the result is the fraction of predictions equal to ``y_true``.
    '''
    predicted_similar = y_pred.ravel() < 0.5
    matches = (predicted_similar == y_true)
    return np.mean(matches)

In [None]:
# Embedding network instance built by initialize_base_network().
base_network = initialize_base_network()
#plot_model(base_network, show_shapes=True, show_layer_names=True, to_file='base-model.png')

In [None]:
# Assemble the Siamese model: two image inputs go through the SAME base_network
# instance and the model outputs the Euclidean distance between the two embeddings.

# create the left input and point to the base network
input_shape = (IMAGE_H, IMAGE_W,)
input_a = Input(shape=input_shape, name="left_input")
vect_output_a = base_network(input_a)

# create the right input and point to the base network
input_b = Input(shape=input_shape, name="right_input")
vect_output_b = base_network(input_b)

# measure the similarity of the two vector outputs
output = Lambda(euclidean_distance, name="output_layer", output_shape=eucl_dist_output_shape)([vect_output_a, vect_output_b])

# specify the inputs and output of the model
model = Model([input_a, input_b], output)

# plot model graph
#plot_model(model, show_shapes=True, show_layer_names=True, to_file='outer-model.png')

In [None]:
# Train the Siamese model with contrastive loss (margin 1) and RMSprop.
# tr_pairs[:,0] / tr_pairs[:,1] are the first / second member of every pair;
# the validation pairs are used only for monitoring.
rms = RMSprop()
model.compile(loss=contrastive_loss_with_margin(margin=1), optimizer=rms)
history = model.fit([tr_pairs[:,0], tr_pairs[:,1]], tr_y, epochs=20, batch_size=128, validation_data=([ts_pairs[:,0], ts_pairs[:,1]], ts_y))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Contrastive loss on the validation pairs.
loss = model.evaluate(x=[ts_pairs[:,0],ts_pairs[:,1]], y=ts_y)

# Pair-classification accuracy (distance < 0.5 means "same class") on the training pairs.
y_pred_train = model.predict([tr_pairs[:,0], tr_pairs[:,1]])
train_accuracy = compute_accuracy(tr_y, y_pred_train)

# Same accuracy measure on the validation pairs.
y_pred_test = model.predict([ts_pairs[:,0], ts_pairs[:,1]])
test_accuracy = compute_accuracy(ts_y, y_pred_test)

print("Loss = {}, Train Accuracy = {} Test Accuracy = {}".format(loss, train_accuracy, test_accuracy))

Loss = 0.011534787714481354, Train Accuracy = 1.0 Test Accuracy = 1.0


# SIA-SMOTE Oversampling

In [None]:
def Euclidean_Metric(a,b):
      """Return ``np.linalg.norm(a - b)`` (the Euclidean distance for 1-D vectors)."""
      return np.linalg.norm(a - b)
      
def G_SM(All_X,samples_Y, n_to_sample, cl):
    """Generate synthetic samples between borderline minority points and their majority neighbour.

    A minority sample (label 1) is eligible as a base only when its nearest
    neighbour in ``All_X`` (points at distance 0 are ignored) has label 0.
    Each synthetic sample lies on the segment between a randomly drawn
    eligible base and *that base's own* nearest neighbour.

    NOTE: a later cell of this notebook defines another function that is also
    named ``G_SM`` but returns two values; running cells out of order rebinds
    the name.

    Args:
        All_X: array of samples, one row per sample.
        samples_Y: labels aligned with ``All_X`` (1 = minority, 0 = majority).
        n_to_sample: number of synthetic samples to create.
        cl: label attached to every synthetic sample.

    Returns:
        ``(X_base, samples, labels)``: the base sample of each synthetic
        point, the synthetic points, and ``[cl] * n_to_sample``.

    Raises:
        ValueError: if no minority sample has a majority nearest neighbour.
    """
    samples_Y = np.asarray(samples_Y)
    Minority_X = All_X[samples_Y == 1]  # 1 is Minority class
    print(f'all x len: {All_X.shape}, Minority_X shape: {Minority_X.shape}')

    # Distances are computed on per-sample flattened views, one minority row at a time.
    flat_all = All_X.reshape(All_X.shape[0], -1)
    flat_minority = Minority_X.reshape(Minority_X.shape[0], -1)

    base_indices = []      # positions in Minority_X
    neighbor_indices = []  # positions in All_X, aligned with base_indices
    for i in range(flat_minority.shape[0]):
        distances = np.linalg.norm(flat_all - flat_minority[i], axis=1)
        # Exclude the sample itself (and exact duplicates) from the neighbour search.
        distances[distances == 0] = np.inf
        nearest = int(np.argmin(distances))
        if samples_Y[nearest] == 0:
            base_indices.append(i)
            neighbor_indices.append(nearest)
    print('base_indices len',len(base_indices))

    if not base_indices:
        raise ValueError('no minority sample has a majority-class nearest neighbour')

    # Draw (base, neighbour) PAIRS with one shared index so that every base is
    # interpolated toward its own neighbour; drawing the two lists
    # independently would pair a base with some other base's neighbour.
    picks = np.random.choice(len(base_indices), n_to_sample)
    base_indices = np.asarray(base_indices)[picks]
    neighbor_indices = np.asarray(neighbor_indices)[picks]

    X_base = Minority_X[base_indices]
    X_neighbor = All_X[neighbor_indices]
    samples = X_base + np.multiply(np.random.rand(n_to_sample,1), X_neighbor - X_base)

    return X_base, samples, [cl]*n_to_sample

def SIAMESE_SMOTE_Data(X_train, y_train, one_hot = False, n_to_sample = 5400):
  """Generate candidate synthetic minority samples for later Siamese filtering.

  Args:
    X_train: training images; flattened per sample before generation.
    y_train: labels, or one-hot rows when ``one_hot`` is true.
    one_hot: when true, ``y_train`` is converted to class indices with argmax.
    n_to_sample: number of synthetic samples to request from ``G_SM``
      (defaults to the previously hard-coded 5400).

  Returns:
    ``(xbase, xsamp, ysamp)``: flattened base samples, flattened synthetic
    samples, and the synthetic labels (all 1) as a numpy array.
  """
  X_train = X_train.reshape(X_train.shape[0], -1)
  if one_hot:
    y_train = np.argmax(y_train, axis=1)

  # Synthetic samples are labelled 1 (the minority class).
  xbase, xsamp, ysamp = G_SM(X_train, y_train, n_to_sample, 1)
  return xbase, xsamp, np.array(ysamp)

In [None]:
# Generate the candidate synthetic samples together with the base sample each one was derived from.
xbase, xsamp, ysamp = SIAMESE_SMOTE_Data(X_train, y_train, one_hot = False)
print(f'xbase shape {xbase.shape}, xsamp shape {xsamp.shape}, ysamp shape {ysamp.shape}')

all x len: (6150, 784), Minority_X shape: (150, 784)
here
base_indices len 12
xbase shape (5400, 784), xsamp shape (5400, 784), ysamp shape (5400,)


In [None]:
# Restore the image layout so the arrays can be fed to the Siamese model inputs.
xbase = xbase.reshape(xbase.shape[0],IMAGE_W,IMAGE_H)
xsamp = xsamp.reshape(xsamp.shape[0],IMAGE_W,IMAGE_H)
print(f'xbase shape {xbase.shape}, xsamp shape {xsamp.shape}')

xbase shape (5400, 28, 28), xsamp shape (5400, 28, 28)


In [None]:
#selection of new samples based on Siamese Network
y_pred = model.predict([xbase, xsamp])
pred = y_pred.ravel() < 0.03
xsamp_sia = xsamp[pred]
ysamp_sia = ysamp[pred]
print(f'xsamp_sia shape {xsamp_sia.shape}, ysamp_sia shape {ysamp_sia.shape}')

xsamp_sia shape (160, 28, 28), ysamp_sia shape (160,)


# SMOTE

In [None]:
def G_SM(X, y,n_to_sample,cl):
    """SMOTE-style generation of ``n_to_sample`` synthetic rows from ``X``.

    Each synthetic row is a random point on the segment between a randomly
    drawn row of ``X`` and one of the neighbours in columns 1..4 of its
    5-nearest-neighbour list. Returns the synthetic rows and a list with
    ``n_to_sample`` copies of ``cl``; ``y`` is not used.
    """
    n_neigh = 5
    knn = NearestNeighbors(n_neighbors=n_neigh, n_jobs=1)
    knn.fit(X)
    _, neighbor_table = knn.kneighbors(X)

    # generating samples
    chosen_rows = np.random.choice(np.arange(len(X)), n_to_sample)
    chosen_cols = np.random.choice(np.arange(1, n_neigh), n_to_sample)

    origins = X[chosen_rows]
    targets = X[neighbor_table[chosen_rows, chosen_cols]]
    gaps = np.random.rand(n_to_sample, 1)

    synthetic = origins + np.multiply(gaps, targets - origins)
    return synthetic, [cl]*n_to_sample

def SMOTE_Data(X_train, y_train, one_hot = False):
  """Balance a training set by SMOTE-oversampling every under-represented class.

  Args:
    X_train: training images; flattened per sample for generation.
    y_train: numpy array of integer class labels in ``range(num_classes)``,
      or one-hot rows when ``one_hot`` is true.
    one_hot: when true, ``y_train`` is converted to class indices with argmax.

  Returns:
    ``(X, y)`` with the synthetic rows first, followed by the original rows;
    ``X`` is reshaped to ``(-1, IMAGE_W, IMAGE_H, 1)``. When no class needs
    oversampling the original data is returned (reshaped) unchanged.
  """
  X_train = X_train.reshape(X_train.shape[0], -1)
  if one_hot:
    y_train = np.argmax(y_train, axis=1)

  # Count per class label explicitly so a missing class cannot shift the
  # counts onto the wrong label.
  class_counts = [int(np.sum(y_train == i)) for i in range(num_classes)]
  target = max(class_counts)

  #oversampling
  resx = []
  resy = []
  for i, count in enumerate(class_counts):
      n = target - count
      # Nothing to add, or nothing to interpolate from.
      if n <= 0 or count == 0:
        continue
      xclass, yclass = biased_get_class(X_train, y_train, i)
      xsamp, ysamp = G_SM(xclass,yclass,n,i)
      resx.append(xsamp)
      resy.append(np.array(ysamp))

  # Already-balanced data yields no synthetic rows; stacking an empty list would raise.
  if resx:
    X_train = np.vstack(resx + [X_train])
    y_train = np.hstack(resy + [y_train])

  X_train = X_train.reshape(-1, IMAGE_W, IMAGE_H, 1)
  return X_train, y_train

In [None]:
def G_SM2(X, y,n_to_sample,cl):
    """SMOTE-style generation that also returns the base row of every synthetic sample.

    Each synthetic row is a random point on the segment between a randomly
    drawn row of ``X`` and one of the neighbours in columns 1..4 of its
    5-nearest-neighbour list. Returns ``(bases, synthetic, labels)`` where
    ``labels`` is a list with ``n_to_sample`` copies of ``cl``; ``y`` is not used.
    """
    n_neigh = 5
    knn = NearestNeighbors(n_neighbors=n_neigh, n_jobs=1)
    knn.fit(X)
    _, neighbor_table = knn.kneighbors(X)

    # generating samples
    chosen_rows = np.random.choice(np.arange(len(X)), n_to_sample)
    chosen_cols = np.random.choice(np.arange(1, n_neigh), n_to_sample)

    origins = X[chosen_rows]
    targets = X[neighbor_table[chosen_rows, chosen_cols]]
    gaps = np.random.rand(n_to_sample, 1)

    synthetic = origins + np.multiply(gaps, targets - origins)
    return origins, synthetic, [cl]*n_to_sample

def SMOTE_Data2(X_train, y_train, one_hot = False, threshold = 0.1):
  """SMOTE oversampling followed by Siamese-network filtering of the synthetic samples.

  Synthetic samples are generated per under-represented class with ``G_SM2``;
  a synthetic sample is kept only when the global Siamese ``model`` predicts
  a distance below ``threshold`` between it and its base sample.

  Args:
    X_train: training images; flattened per sample for generation.
    y_train: numpy array of integer class labels in ``range(num_classes)``,
      or one-hot rows when ``one_hot`` is true.
    one_hot: when true, ``y_train`` is converted to class indices with argmax.
    threshold: maximum Siamese distance for a synthetic sample to be kept
      (defaults to the previously hard-coded 0.1).

  Returns:
    ``(X, y)`` with the kept synthetic rows first, followed by the original
    rows; ``X`` is reshaped to ``(-1, IMAGE_W, IMAGE_H, 1)``. When no class
    needs oversampling the original data is returned (reshaped) unchanged.
  """
  X_train = X_train.reshape(X_train.shape[0], -1)
  if one_hot:
    y_train = np.argmax(y_train, axis=1)

  # Count per class label explicitly so a missing class cannot shift the
  # counts onto the wrong label.
  class_counts = [int(np.sum(y_train == i)) for i in range(num_classes)]
  target = max(class_counts)

  #oversampling
  resx = []
  basex = []
  resy = []
  for i, count in enumerate(class_counts):
      n = target - count
      # Nothing to add, or nothing to interpolate from.
      if n <= 0 or count == 0:
        continue
      xclass, yclass = biased_get_class(X_train, y_train, i)
      xbase, xsamp, ysamp = G_SM2(xclass,yclass,n,i)
      resx.append(xsamp)
      basex.append(xbase)
      resy.append(np.array(ysamp))

  # Already-balanced data yields no synthetic rows; stacking an empty list would raise.
  if not resx:
    return X_train.reshape(-1, IMAGE_W, IMAGE_H, 1), y_train

  xsamp = np.vstack(resx)
  xbase = np.vstack(basex)
  resy1 = np.hstack(resy)

  xbase = xbase.reshape(xbase.shape[0],IMAGE_W,IMAGE_H)
  xsamp = xsamp.reshape(xsamp.shape[0],IMAGE_W,IMAGE_H)
  print(f'xbase shape {xbase.shape}, xsamp shape {xsamp.shape}')

  #selection of new samples based on Siamese Network
  y_pred = model.predict([xbase, xsamp])
  pred = y_pred.ravel() < threshold
  resx1 = xsamp[pred]
  resy1 = resy1[pred]
  print('shape after selection with siamese',resx1.shape)

  resx1 = resx1.reshape(resx1.shape[0],-1)
  X_train = np.vstack((resx1,X_train))
  y_train = np.hstack((resy1,y_train))
  X_train = X_train.reshape(-1, IMAGE_W, IMAGE_H, 1)
  return X_train, y_train

# ASN-SMOTE

In [None]:
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 11 17:25:10 2021

@author: 易新凯
"""
#NM-Smote

# Classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
# Standardization tools
from sklearn.preprocessing import StandardScaler

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# Dataset splitting, cross-validation, grid search
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV,cross_validate,KFold,StratifiedKFold
from sklearn.metrics import roc_auc_score,roc_curve,auc,confusion_matrix
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# SMOTE oversampling
# from imblearn.over_sampling import SMOTE,BorderlineSMOTE,SVMSMOTE,ADASYN,KMeansSMOTE,RandomOverSampler
# # Undersampling
# from imblearn.under_sampling import RandomUnderSampler
import random
import math
import heapq
import time
def generate_x(samples,N,k):
    """Oversample the minority class (label 1) by interpolating toward nearby neighbours.

    Steps:
      1. Minority samples whose nearest neighbour (zero-distance points are
         ignored) is a majority sample are treated as noise and dropped as
         bases.
      2. For every remaining minority sample the ``k`` nearest neighbours are
         scanned in order of distance; scanning stops at the first majority
         neighbour (which is itself kept as a candidate, as in the original
         implementation).
      3. ``n`` synthetic samples per base are drawn on the segment between
         the base and a randomly chosen candidate.

    Args:
        samples: pandas DataFrame; every column but the last is a feature,
            the last column is the label (1 = minority, 0 = majority).
        N: unused; kept for call compatibility.
        k: maximum number of nearest neighbours examined per base sample.

    Returns:
        numpy array containing the original rows followed by the synthetic
        rows (features plus a label column set to 1).
    """
    samples_X = samples.iloc[:, 0:-1]
    samples_Y = np.asarray(samples.iloc[:, -1])

    All_X = np.asarray(samples_X, dtype=float)
    Minority_X = All_X[samples_Y == 1]  # 1 is Minority class

    # Number of synthetic rows aimed for in total.
    n1 = All_X.shape[0] - 2 * Minority_X.shape[0]
    print(n1)

    # Populate distance matrix: row i holds the distances from minority sample i to every sample.
    dis_matrix = np.zeros((Minority_X.shape[0], All_X.shape[0]), dtype=float)
    for i in range(Minority_X.shape[0]):
        dis_matrix[i] = np.linalg.norm(All_X - Minority_X[i], axis=1)
    # Exclude the sample itself (and exact duplicates) from neighbour searches.
    dis_matrix[dis_matrix == 0] = np.inf

    # Noise filtering: drop minority samples whose nearest neighbour is a majority sample.
    if dis_matrix.size:
        nearest = dis_matrix.argmin(axis=1)
        keep = samples_Y[nearest] != 0
    else:
        keep = np.zeros(Minority_X.shape[0], dtype=bool)
    Minority_X = Minority_X[keep]
    # Keep the distance rows aligned with the filtered minority samples;
    # otherwise row i would still describe the i-th UNFILTERED sample.
    dis_matrix = dis_matrix[keep]

    n_bases = Minority_X.shape[0]
    # No usable base, or nothing left to generate, means zero synthetic rows.
    n = max(int(n1 / n_bases), 0) if n_bases else 0
    synthetic = np.zeros((n_bases * n, Minority_X.shape[1]), dtype=float)

    g_index = 0
    for i in range(n_bases):
        # Scan the k nearest neighbours in order of increasing distance.
        order = np.argsort(dis_matrix[i], kind='stable')[:k]
        best_index = []
        for idx in order:
            best_index.append(idx)
            if samples_Y[idx] == 0:  # 0 is Majority class
                break  # safe radius has been reached

        # Synthesize samples by interpolating the base toward a random candidate.
        for j in range(n):
            nn = random.randint(0, len(best_index) - 1)
            dif = All_X[best_index[nn]] - Minority_X[i]
            gap = random.random()
            synthetic[g_index] = Minority_X[i] + gap * dif
            g_index += 1

    # Append the label column (all ones) and stack below the original rows.
    labels = np.ones(synthetic.shape[0])
    synthetic = np.insert(synthetic, synthetic.shape[1], values=labels, axis=1)
    examples = np.concatenate((samples, synthetic), axis=0)
    return examples
def RandomforClassifier(xtrain,ytrain,xtest,ytest):
    """Fit a random forest on standardized features and score it on the test split.

    The scaler is fitted on ``xtrain`` only and then applied to ``xtest``.

    Returns:
        ``(F_Measure, G_Mean, AUC, Acc, bal_acc)`` computed on the test split.
    """
    scaler = StandardScaler()
    xtrain = scaler.fit_transform(xtrain)
    xtest = scaler.transform(xtest)

    # Random forest model
    forest = RandomForestClassifier(
        n_estimators=100,
        criterion='gini',
        max_depth=15,
        min_samples_split=2,
        min_samples_leaf=2,
        random_state=6,
    )
    forest = forest.fit(xtrain, ytrain)

    # Evaluation on the test split
    AUC = roc_auc_score(ytest, forest.predict_proba(xtest)[:, 1])
    test_pred = forest.predict(xtest)
    cm = confusion_matrix(ytest, test_pred)
    TN, FP = cm[0][0], cm[0][1]
    FN, TP = cm[1][0], cm[1][1]

    Acc = (TP + TN) / (TP + TN + FP + FN)
    Pos_Precision = TP / (TP + FP)
    Sensitivity = TP / (TP + FN)
    Specificity = TN / (TN + FP)
    F_Measure = 2 * Sensitivity * Pos_Precision / (Sensitivity + Pos_Precision)
    G_Mean = np.sqrt(Sensitivity * Specificity)
    bal_acc = balanced_accuracy_score(ytest, test_pred)

    return F_Measure,G_Mean,AUC,Acc,bal_acc

# Classification

In [None]:
# model with padded convolutions for the fashion mnist dataset
from numpy import mean
from numpy import std
from matplotlib import pyplot
from sklearn.model_selection import KFold
from keras.datasets import fashion_mnist
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Conv2D
from keras.layers import MaxPooling2D
from keras.layers import Dense
from keras.layers import Flatten
from keras.optimizers import SGD

# load train and test dataset
def load_dataset():
	"""Load the imbalanced Fashion-MNIST subset with an explicit single channel axis.

	Returns ``trainX, trainY, testX, testY``; the labels are left as returned
	by ``make_fmnist_im`` (not one-hot encoded).
	"""
	(trainX, trainY), (testX, testY) = make_fmnist_im()
	# add the trailing channel dimension expected by the CNN
	trainX = trainX.reshape((len(trainX), 28, 28, 1))
	testX = testX.reshape((len(testX), 28, 28, 1))
	return trainX, trainY, testX, testY

# scale pixels
def prep_pixels(train, test):
	"""Return float32 copies of ``train`` and ``test`` with values divided by 255."""
	train_norm, test_norm = (
		images.astype('float32') / 255.0 for images in (train, test)
	)
	return train_norm, test_norm

# define cnn model
def define_model():
	"""Build and compile the binary CNN classifier for 28x28x1 images.

	Architecture: one padded 3x3 convolution (32 filters), 2x2 max pooling,
	a 100-unit dense layer and a single sigmoid output; trained with SGD
	(learning rate 0.01, momentum 0.9) on binary cross-entropy.
	"""
	cnn = Sequential([
		Conv2D(32, (3, 3), padding='same', activation='relu', kernel_initializer='he_uniform', input_shape=(28, 28, 1)),
		MaxPooling2D((2, 2)),
		Flatten(),
		Dense(100, activation='relu', kernel_initializer='he_uniform'),
		Dense(1, activation='sigmoid'),
	])
	# compile model
	cnn.compile(
		optimizer=SGD(learning_rate=0.01, momentum=0.9),
		loss='binary_crossentropy',
		metrics=['accuracy'],
	)
	return cnn

# evaluate a model using k-fold cross-validation
def evaluate_model(dataX, dataY, xsamp_sia,ysamp_sia, n_folds=10, k_n=5):
	"""Cross-validate the CNN, oversampling each training fold with ``generate_x``.

	Args:
		dataX: images, reshaped internally to ``(n, 28, 28, 1)``.
		dataY: binary labels (0 = majority, 1 = minority).
		xsamp_sia, ysamp_sia: pre-generated Siamese-filtered samples; only
			referenced by the commented-out ``join_data`` alternative below.
		n_folds: number of shuffled K-fold splits (fixed ``random_state=1``).
		k_n: neighbourhood size ``k`` passed to ``generate_x``.

	Returns:
		``(scores, histories, confusion)``: per-fold balanced accuracy, per-fold
		Keras training histories, and the confusion matrix summed over folds.
		Mean/std of every metric are printed as percentages.
	"""
	dataX = dataX.reshape((dataX.shape[0], 28, 28, 1))
	confusion = np.array([[0, 0], [0, 0]])
	scores, histories = list(), list()
	Acc_arr, bal_acc_arr, G_Mean_arr, F_Measure_arr, Precision_arr, Sensitivity_arr, Specificity_arr = list(), list(), list(), list(), list(), list(), list()
	# prepare cross validation
	kfold = KFold(n_folds, shuffle=True, random_state=1)
	# enumerate splits
	for train_ix, test_ix in kfold.split(dataX):
		# define model (local name; does not touch the global Siamese `model`)
		model = define_model()
		# select rows for train and test
		X_train, y_train, X_val, y_val = dataX[train_ix], dataY[train_ix], dataX[test_ix], dataY[test_ix]

		# Oversampling is applied to the training fold only.
		# Alternative oversamplers (enable one instead of the ASN-SMOTE block):
		# sm = SMOTE(k_neighbors=k_n)
		# X_train, y_train = sm.fit_resample(X_train.reshape(X_train.shape[0], IMAGE_W * IMAGE_H), y_train)
		# X_train = X_train.reshape(X_train.shape[0], IMAGE_W, IMAGE_H, 1)
		# X_train, y_train = SMOTE_Data2(X_train, y_train)
		# X_train,y_train = join_data(X_train,y_train,xsamp_sia,ysamp_sia)

		#ASN-SMOTE: generate_x expects a frame whose last column is the label
		kdata=pd.DataFrame(np.column_stack((X_train.reshape(X_train.shape[0], IMAGE_W * IMAGE_H),y_train)))
		g_sample=generate_x(kdata,100,k_n)
		X_train, y_train = g_sample[:,0:-1], g_sample[:,-1]
		X_train = X_train.reshape(X_train.shape[0], IMAGE_W, IMAGE_H, 1)

		# fit model
		history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val), verbose=0)
		# predict: sigmoid output thresholded at 0.5
		y_val_pred = np.where(model.predict(X_val) < 0.5, 0, 1)

		# evaluate model
		bal_acc = balanced_accuracy_score(y_val, y_val_pred)
		# Fix the label set so the matrix is always 2x2, even when a fold or
		# its predictions contain a single class.
		fold_confusion = confusion_matrix(y_val, y_val_pred, labels=[0, 1])
		TN, FP, FN, TP = fold_confusion.ravel()
		print('TN, FP, FN, TP: ',TN, FP, FN, TP)
		Acc=(TP+TN)/(TP+TN+FP+FN)
		# NOTE: the ratios below evaluate to nan when their denominator is zero.
		Precision=TP/(TP+FP)
		Sensitivity=TP/(TP+FN)
		Specificity=TN/(TN+FP)
		F_Measure=2*Sensitivity*Precision/(Sensitivity+Precision)
		G_Mean=np.sqrt(Sensitivity*Specificity)
		# append scores
		confusion += fold_confusion
		Acc_arr.append(Acc)
		bal_acc_arr.append(bal_acc)
		G_Mean_arr.append(G_Mean)
		F_Measure_arr.append(F_Measure)
		Precision_arr.append(Precision)
		Sensitivity_arr.append(Sensitivity)
		Specificity_arr.append(Specificity)

		scores.append(bal_acc)
		histories.append(history)
	print('Acc_arr: mean=%.3f std=%.3f' % (mean(Acc_arr)*100, std(Acc_arr)*100))
	print('bal_acc_arr: mean=%.3f std=%.3f' % (mean(bal_acc_arr)*100, std(bal_acc_arr)*100))
	print('G_Mean_arr: mean=%.3f std=%.3f' % (mean(G_Mean_arr)*100, std(G_Mean_arr)*100))
	print('F_Measure_arr: mean=%.3f std=%.3f' % (mean(F_Measure_arr)*100, std(F_Measure_arr)*100))
	print('Precision_arr: mean=%.3f std=%.3f' % (mean(Precision_arr)*100, std(Precision_arr)*100))
	print('Sensitivity_arr: mean=%.3f std=%.3f' % (mean(Sensitivity_arr)*100, std(Sensitivity_arr)*100))
	print('Specificity_arr: mean=%.3f std=%.3f' % (mean(Specificity_arr)*100, std(Specificity_arr)*100))

	return scores, histories, confusion

# plot diagnostic learning curves
def summarize_diagnostics(histories):
	"""Overlay the loss and accuracy curves of every training history in one two-panel figure."""
	for run in histories:
		curves = run.history
		# plot loss
		pyplot.subplot(211)
		pyplot.title('Cross Entropy Loss')
		pyplot.plot(curves['loss'], color='blue', label='train')
		pyplot.plot(curves['val_loss'], color='orange', label='test')
		# plot accuracy
		pyplot.subplot(212)
		pyplot.title('Classification Accuracy')
		pyplot.plot(curves['accuracy'], color='blue', label='train')
		pyplot.plot(curves['val_accuracy'], color='orange', label='test')
	pyplot.show()

# summarize model performance
def summarize_performance(scores):
	"""Print the scores with their mean/std (as percentages) and show a box plot of them."""
	print(scores)
	mean_pct = mean(scores)*100
	std_pct = std(scores)*100
	print('Accuracy: mean=%.3f std=%.3f' % (mean_pct, std_pct))
	# box and whisker plots of results
	pyplot.boxplot(scores)
	pyplot.show()

# run the test harness for evaluating a model
# def run_test_harness(xsamp_sia,ysamp_sia):
	# load dataset
# trainX, trainY, testX, testY = load_dataset()
# prepare pixel data
# trainX, testX = prep_pixels(X_train, X_val)
# evaluate model
# Run the cross-validated evaluation once per neighbourhood size k.
# NOTE: scores / histories / confusion are overwritten on every iteration, so
# only the results of the last k remain available after the loop.
k_values = [1, 2, 3, 4, 5, 6, 7]
for k in k_values:
	scores, histories, confusion = evaluate_model(X_train, y_train, xsamp_sia,ysamp_sia,k_n=k)
# # learning curves
# summarize_diagnostics(histories)
# # summarize estimated performance
# summarize_performance(scores) 
# print(confusion)

# entry point, run the test harness
# run_test_harness(xsamp_sia,ysamp_sia)

5269
TN, FP, FN, TP:  598 0 0 17
5263
TN, FP, FN, TP:  600 1 1 13
5269
TN, FP, FN, TP:  597 1 3 14
5265
TN, FP, FN, TP:  598 2 1 14
5271
TN, FP, FN, TP:  596 1 0 18
5257
TN, FP, FN, TP:  604 0 0 11
5267
TN, FP, FN, TP:  598 1 0 16
5263
TN, FP, FN, TP:  600 1 1 13
5257
TN, FP, FN, TP:  603 1 0 11
5269
TN, FP, FN, TP:  596 2 2 15
Acc_arr: mean=99.707 std=0.228
bal_acc_arr: mean=97.398 std=2.954
G_Mean_arr: mean=97.321 std=3.070
F_Measure_arr: mean=94.169 std=4.310
Precision_arr: mean=93.530 std=3.929
Sensitivity_arr: mean=94.964 std=5.854
Specificity_arr: mean=99.833 std=0.106
5269
TN, FP, FN, TP:  597 1 0 17
5263
TN, FP, FN, TP:  600 1 1 13
5269
TN, FP, FN, TP:  597 1 4 13
5265
TN, FP, FN, TP:  597 3 1 14
5271
TN, FP, FN, TP:  590 7 0 18
5257
TN, FP, FN, TP:  601 3 0 11
5267
TN, FP, FN, TP:  598 1 0 16
5263
TN, FP, FN, TP:  600 1 0 14
5257
TN, FP, FN, TP:  602 2 0 11
5269
TN, FP, FN, TP:  590 8 2 15
Acc_arr: mean=99.415 std=0.461
bal_acc_arr: mean=97.311 std=3.710
G_Mean_arr: mean=97.20