<a href="https://colab.research.google.com/github/pritam-banik-roy/Software_Defect_Prediction/blob/main/Entropy_SMOTE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.neighbors import NearestNeighbors
import random
from sklearn import preprocessing
from imblearn.over_sampling import SMOTE
from sklearn import svm
from sklearn.model_selection import cross_validate

In [None]:
# Mount Google Drive at /content/drive so that the dataset path used later in
# this notebook (under /content/drive/MyDrive) resolves.
# NOTE(review): presumably this cell only works inside the Colab runtime, where
# the google.colab package is available — confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from sklearn.cluster import KMeans
from scipy.stats import entropy

In [None]:
def load_norm_data(path):
    """Load a binary-labelled CSV dataset and min-max scale its features.

    The last CSV column is taken as the class label and every other column as
    a feature. Rows labelled 0 are counted as the majority class and rows
    labelled 1 as the minority class. A one-line summary is printed.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file; its first row is read as the header.

    Returns
    -------
    x : numpy.ndarray
        Feature matrix, each column rescaled with ``MinMaxScaler``.
    label : numpy.ndarray
        The label column, in file order.
    Maj_num : int
        Number of rows labelled 0.
    Min_num : int
        Number of rows labelled 1.
    IR : float
        Imbalance ratio ``Maj_num / Min_num`` rounded to two decimals, or
        ``inf`` when the file contains no row labelled 1.
    n_features : int
        Number of feature columns.
    """
    df = pd.read_csv(path)
    data = df.values
    label = data[:, -1]
    n_features = data.shape[1] - 1
    x = data[:, :n_features]

    x = preprocessing.MinMaxScaler().fit_transform(x)

    # Count the labels once and read both classes from the same Counter.
    class_counts = Counter(label)
    Maj_num = class_counts[0]
    Min_num = class_counts[1]

    # A file without any minority row used to crash here with
    # ZeroDivisionError; report an infinite imbalance ratio instead.
    IR = round(Maj_num / Min_num, 2) if Min_num else float("inf")

    print("Instances: {0} ,Features: {1} ,Maj: {2} ,Min: {3} ,IR: {4} ".format(len(label), n_features, Maj_num,
                                                                               Min_num,
                                                                               IR))
    return x, label, Maj_num, Min_num, IR, n_features



In [None]:
def get_entropy(labels, base=None):
  """Return the Shannon entropy of the label distribution in ``labels``.

  Every distinct value in ``labels`` is counted and the counts are handed to
  ``scipy.stats.entropy``. ``base`` is forwarded unchanged, so ``None`` keeps
  that function's default logarithm base.
  """
  # Only the per-value counts matter; the unique values themselves are unused.
  label_counts = np.unique(labels, return_counts=True)[1]
  return entropy(label_counts, base=base)

In [None]:
def entropy_smote(X,y,n_clus = 5,entropy_threshold = 0.2,random_state = 0):
  """Oversample the minority class with SMOTE, seeded only by minority samples from low-entropy clusters.

  The data is clustered with KMeans. Every cluster whose label entropy (as
  returned by ``get_entropy`` with its default base) is at most
  ``entropy_threshold`` is kept, and the minority samples (label 1) inside the
  kept clusters become the seeds for SMOTE. The data passed to SMOTE is every
  majority sample (label 0) plus those seeds; minority samples from
  higher-entropy clusters are not part of the returned data.

  Parameters
  ----------
  X : array-like of shape (n_samples, n_features)
      Feature matrix.
  y : array-like of shape (n_samples,)
      Labels; 0 marks the majority class and 1 the minority class.
  n_clus : int, default 5
      Number of KMeans clusters.
  entropy_threshold : float, default 0.2
      Largest cluster entropy for which the cluster is kept.
  random_state : int or None, default 0
      Seed used for both KMeans and SMOTE so repeated runs give the same
      result. KMeans was already seeded with 0; SMOTE was previously unseeded.

  Returns
  -------
  X_resampled, y_resampled : numpy.ndarray
      Output of ``SMOTE.fit_resample`` on the majority samples plus the
      selected minority samples.

  Raises
  ------
  ValueError
      If fewer than two minority samples fall in the kept clusters, because
      SMOTE needs at least one neighbour to interpolate towards.
  """
  X = np.asarray(X)
  y = np.asarray(y)

  # Find clusters
  kmeans = KMeans(n_clusters=n_clus, random_state=random_state).fit(X)

  # Collect the row indices of every cluster that is pure enough.
  kept_clusters = []
  for cluster in range(n_clus):
    cluster_index = np.where(kmeans.labels_ == cluster)[0]
    if get_entropy(y[cluster_index]) <= entropy_threshold:
      kept_clusters.append(cluster_index)

  # Keep an integer dtype even when nothing was selected; an empty float
  # array cannot be used as an index.
  if kept_clusters:
    select_data_index = np.concatenate(kept_clusters)
  else:
    select_data_index = np.array([], dtype=int)

  # Mask the selected indices themselves so the result still refers to rows of
  # the full data set, not to positions inside the selected subset.
  min_sample_index = select_data_index[y[select_data_index] == 1]
  n_selected = min_sample_index.shape[0]
  print('No. of minority samples selected: ',n_selected)

  if n_selected < 2:
    raise ValueError(
        "entropy_smote selected {0} minority sample(s) from low-entropy clusters; "
        "at least 2 are needed for SMOTE. Increase entropy_threshold or change n_clus.".format(n_selected))

  # SMOTE looks up k_neighbors neighbours among the minority samples, which
  # requires k_neighbors < n_selected; 5 is the library default.
  k_neighbors = min(5, n_selected - 1)

  # Resample using all majority samples plus the selected minority samples.
  majority_data_index = np.where(y == 0)[0]
  X_train = np.vstack((X[majority_data_index,:],X[min_sample_index,:]))
  y_train = np.hstack((y[majority_data_index],y[min_sample_index]))

  smote = SMOTE(sampling_strategy = 'minority', k_neighbors = k_neighbors, random_state = random_state)
  X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

  return X_resampled, y_resampled

In [None]:
# Load the ecoli1 CSV from the mounted Drive folder; load_norm_data scales the
# features and prints the instance/feature counts and the imbalance ratio.
path = r'/content/drive/MyDrive/imbalanced_datasets/ecoli1.csv'
X, y, Maj_num, Min_num, IR, features = load_norm_data(path)
# Entropy-guided SMOTE with the default number of clusters and a cluster-entropy
# cutoff of 0.18; X and y themselves are left untouched for later comparison.
X_resampled, y_resampled = entropy_smote(X,y,entropy_threshold=0.18)

Instances: 336 ,Features: 7 ,Maj: 259 ,Min: 77 ,IR: 3.36 
No. of minority samples selected:  8




In [None]:
# Class counts before and after entropy-guided SMOTE. `y` is still the original
# label vector here, so its line must not be labelled "Resampled".
print('Original dataset shape %s' % Counter(y))
print('Resampled dataset shape %s' % Counter(y_resampled))

Resampled dataset shape Counter({0.0: 259, 1.0: 77})
Resampled dataset shape Counter({0.0: 259, 1.0: 259})


In [None]:
# Compare the 5-fold macro-F1 of a linear SVM on three versions of the data:
# the original set, plain SMOTE, and entropy-guided SMOTE (ESMOTE).
# NOTE(review): cross-validating data that was resampled beforehand lets
# synthetic points land in the test folds, so the two "After ..." scores are
# likely optimistic; consider resampling inside each training fold instead.
clf = svm.SVC(kernel='linear', C=1, random_state=42)

scores = cross_validate(clf, X, y, cv=5, scoring='f1_macro')
print("Before Resampling: %0.2f (%0.2f)" % (scores['test_score'].mean(), scores['test_score'].std()))

# Seed SMOTE so the reported score is reproducible.
X_smote, y_smote = SMOTE(sampling_strategy = 'minority', random_state=42).fit_resample(X, y)
# Score the SMOTE output, not the original X, y.
scores = cross_validate(clf, X_smote, y_smote, cv=5, scoring='f1_macro')
print("After SMOTE: %0.2f (%0.2f)" % (scores['test_score'].mean(), scores['test_score'].std()))

# Score the entropy_smote output produced in the earlier cell.
scores = cross_validate(clf, X_resampled, y_resampled, cv=5, scoring='f1_macro')
print("After ESMOTE: %0.2f (%0.2f)" % (scores['test_score'].mean(), scores['test_score'].std()))

Before Resampling: 0.79 (0.14)
After SMOTE: 0.79 (0.14)
After ESMOTE: 0.79 (0.14)
