In [None]:
import numpy as np
import pandas as pd

# SMOTE Algorithm


In [None]:
def smote(X,y,N,k):
  """Generate synthetic minority-class samples with SMOTE.

  Parameters
  ----------
  X : feature table accepted by ``get_minority_samples`` (one row per sample).
  y : class labels aligned row-by-row with ``X``.
  N : amount of over-sampling in percent. Values below 100 keep only a random
      ``N`` percent of the minority samples and create one synthetic point for
      each of them; larger values create ``int(N/100)`` points per sample
      (any remainder beyond a multiple of 100 is truncated).
  k : number of nearest neighbours considered for each minority sample.

  Returns
  -------
  numpy.ndarray of shape ``(int(N/100) * T, numattrs)`` holding the synthetic
  samples, where ``T`` is the number of minority samples actually used.

  Raises
  ------
  ValueError
      If ``N`` is negative, ``k`` is smaller than 1, or synthetic samples are
      requested while fewer than two minority samples exist.
  """
  if N < 0:
    raise ValueError(f'N must be non-negative, got {N}')
  if k < 1:
    raise ValueError(f'k must be at least 1, got {k}')

  sample = get_minority_samples(X,y)
  T = len(sample)                         # number of minority class samples

  if (N < 100):
    np.random.shuffle(sample)             # randomize so the first T rows are a random subset
    T = int(N/100 * T)
    N = 100

  N = int(N/100)                          # synthetic points to create per minority sample
  numattrs = sample.shape[1]              # number of attributes/features
  synthetic = np.empty((N*T,numattrs))

  if T == 0:
    return synthetic                      # nothing to generate

  # A sample cannot be its own neighbour, so at most len(sample)-1 are available.
  k = min(k, len(sample) - 1)
  if k < 1:
    raise ValueError('SMOTE needs at least two minority samples to interpolate between')

  for i in range(T):
    # Neighbours are searched among ALL minority samples, not only the first T
    # rows that were kept when N < 100.
    distances = np.sqrt(((sample - sample[i])**2).sum(axis=1))
    order = np.argsort(distances, kind='stable')
    nnarray = order[order != i][:k]       # drop the sample itself by index, keep k nearest
    # populate() writes starting at row 0 of the array it is given, so pass the
    # slice reserved for sample i; a basic slice is a view, so writes land in
    # `synthetic`.
    populate(N,i,nnarray,k,numattrs,sample,synthetic[i*N:(i+1)*N])

  return synthetic

In [4]:
def euclidean_distance(point1,point2):
  """Return the Euclidean distance between two points.

  Parameters
  ----------
  point1, point2 : array-like of numbers with identical shape.

  Returns
  -------
  The square root of the sum of squared coordinate differences
  (summed along the first axis).

  Raises
  ------
  ValueError
      If the two points do not have the same shape. Previously a mismatch was
      either printed and turned into a ``None`` return value, or silently
      ignored when ``point2`` was the longer one.
  """
  a = np.asarray(point1, dtype=float)
  b = np.asarray(point2, dtype=float)
  if a.shape != b.shape:
    raise ValueError(
      f'points must have the same dimension, got {a.shape} and {b.shape}'
    )
  return np.sqrt(np.sum((a - b)**2, axis=0)) if a.ndim else np.sqrt((a - b)**2)


In [5]:
def populate(N,i,nnarray,k,numattrs,sample,synthetic):
  """Write N synthetic points derived from ``sample[i]`` into ``synthetic``.

  Each synthetic point is built by picking one of the supplied neighbours at
  random and, for every attribute, moving a random fraction of the way from
  ``sample[i]`` towards that neighbour.

  Parameters
  ----------
  N         : number of synthetic points to create.
  i         : row index (into ``sample``) of the point being over-sampled.
  nnarray   : row indices (into ``sample``) of its nearest neighbours.
  k         : number of neighbours to choose from; capped at ``len(nnarray)``.
  numattrs  : number of attributes/features to fill in.
  sample    : 2-D collection of minority-class points.
  synthetic : output array; rows ``0 .. N-1`` are overwritten in place.

  Raises
  ------
  ValueError
      If no neighbour is available to interpolate towards.
  """
  k = min(k, len(nnarray))                # never index past the neighbours actually supplied
  if k < 1:
    raise ValueError('populate needs at least one neighbour')

  base = np.asarray(sample[i], dtype=float)[:numattrs]
  for newindex in range(int(N)):          # newindex counts the synthetic samples generated
    nn = np.random.randint(0,k)           # choose one of the k nearest neighbours
    neighbour = np.asarray(sample[nnarray[nn]], dtype=float)[:numattrs]
    dif = neighbour - base
    # One independent gap per attribute, drawn uniformly from [0, 1) so the new
    # point lies between the sample and its neighbour. (randint(0, 2) only
    # ever produced the two endpoints.)
    gap = np.random.random(numattrs)
    synthetic[newindex][:numattrs] = base + gap * dif

In [6]:
def get_minority_samples(X,y):
  """Return the rows of ``X`` whose label is not the most frequent class.

  Parameters
  ----------
  X : 2-D numeric feature table (e.g. a DataFrame or array), one row per sample.
  y : class labels, matched to the rows of ``X`` by position.

  Returns
  -------
  numpy.ndarray of floats with one row per non-majority sample, in the
  original row order. The result is a fresh copy, so callers may modify or
  shuffle it without touching ``X``.

  Raises
  ------
  ValueError
      If ``X`` and ``y`` do not contain the same number of rows.
  """
  features = np.asarray(X, dtype=float)
  # Work on positional labels so a non-default index on y cannot misalign rows.
  labels = pd.Series(np.asarray(y).ravel())
  if len(features) != len(labels):
    raise ValueError(
      f'X and y must have the same number of rows, got {len(features)} and {len(labels)}'
    )
  if len(labels) == 0:
    return features[:0].copy()            # no samples at all, hence no minority rows

  majority_class = labels.mode()[0]       # most frequent label (first one on ties)
  is_minority = (labels != majority_class).to_numpy()
  return features[is_minority]            # boolean indexing returns a copy