<a href="https://colab.research.google.com/github/pritam-banik-roy/Software_Defect_Prediction/blob/main/PRITAM_ESMOTE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U imbalanced-learn



In [None]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.neighbors import NearestNeighbors
import random
from sklearn import preprocessing
from imblearn.over_sampling import SMOTE
from sklearn import svm
from sklearn.model_selection import cross_val_score

In [None]:
from sklearn.cluster import KMeans
from scipy.stats import entropy

In [None]:
def load_norm_data(path):
    #df = pd.read_csv(path, header=None)
    df = pd.read_csv(path)
    data = df.values
    label = data[:, -1]
    columns = data.shape[1]
    x = data[:, :columns - 1]

    min_max_scaler = preprocessing.MinMaxScaler()
    x = min_max_scaler.fit_transform(x)

    Maj_num = Counter(label)[0]
    Min_num = Counter(label)[1]
    IR = Maj_num / Min_num

    print("Instances: {0} ,Features: {1} ,Maj: {2} ,Min: {3} ,IR: {4} ".format(len(label), columns - 1, Maj_num,
                                                                               Min_num,
                                                                               round(IR, 2)))
    return x, label, Maj_num, Min_num, round(IR, 2), columns - 1



In [None]:
def get_entropy(labels, base=None):
  value,counts = np.unique(labels, return_counts=True)
  return entropy(counts, base=base)

In [None]:
def entropy_smote(X,y,n_clus = 5,entropy_threshold = 0.2):
  # Find clusters
  kmeans = KMeans(n_clusters=n_clus, random_state=0).fit(X)
  select_data_index = []

  # Find entropy for every cluster

  for cluster in range(n_clus):

    # Find index of data points which belongs to a particular cluster

    cluster_index = np.where(kmeans.labels_ == cluster)[0]

    # Calculate Entropy of that cluster

    cluster_entropy = get_entropy(y[cluster_index])

    # If entropy is less than threshold that means its purer
    # then add index values to select data pool

    if cluster_entropy <= entropy_threshold:
      select_data_index.extend(cluster_index)

  # Find index of minority samples from selected data
  min_sample_index = []
  for id in select_data_index:
    if y[id] == 1:
      min_sample_index.append(id)

  # Following not working as y[select_data_index] is resetting index values
  # min_sample_index = np.where(y[select_data_index] == 1)[0]

  min_sample_index = np.array(min_sample_index)
  print('No. of minority samples selected: ',min_sample_index.shape[0])

  # Resample the minority data samples whose index values are stored in min_sample_index

  majority_data_index = np.where(y == 0)[0]
  X_maj = X[majority_data_index,:]
  y_maj = y[majority_data_index]
  X = np.vstack((X_maj,X[min_sample_index,:]))
  y = np.hstack((y_maj,y[min_sample_index]))
  #print("y_maj:{}, y_min_select:{}, X:{}, y:{}, y_count: {}, y_maj_count:{}, y_min_count:{}".format(y_maj.shape[0], min_sample_index.shape[0], X.shape[0], y.shape[0], Counter(y),Counter(y_maj),Counter(y[min_sample_index])))
  X_resampled, y_resampled = SMOTE(sampling_strategy = 'minority').fit_resample(X, y)

  return X_resampled, y_resampled

In [None]:
path = r'/content/cm1.csv'

In [None]:
X, y, Maj_num, Min_num, IR, features = load_norm_data(path)
X_resampled, y_resampled = entropy_smote(X,y,entropy_threshold=0.27)

Instances: 498 ,Features: 21 ,Maj: 449 ,Min: 49 ,IR: 9.16 




No. of minority samples selected:  21


In [None]:
print('Resampled dataset shape %s' % Counter(y))
print('Resampled dataset shape %s' % Counter(y_resampled))

Resampled dataset shape Counter({0.0: 449, 1.0: 49})
Resampled dataset shape Counter({0.0: 449, 1.0: 449})


In [None]:
import pandas as pd
from sklearn import svm
from sklearn.model_selection import cross_val_score
from imblearn.over_sampling import SMOTE




# Create an SVM classifier with linear kernel
clf = svm.SVC(kernel='linear', C=1, random_state=42)

# Perform cross-validation without resampling
scores = cross_val_score(clf, X, y, cv=5)
print("Before Resampling: %0.2f (%0.2f)" % (scores.mean(), scores.std()))

# Perform SMOTE resampling
X_smote, y_smote = SMOTE(sampling_strategy='minority').fit_resample(X, y)

# Perform cross-validation after SMOTE resampling
scores = cross_val_score(clf, X_smote, y_smote, cv=5)
print("After SMOTE: %0.2f (%0.2f)" % (scores.mean(), scores.std()))

# Perform E-SMOTE resampling (assuming you have previously defined X_resampled and y_resampled)
scores = cross_val_score(clf, X_resampled, y_resampled, cv=5)
print("After E-SMOTE: %0.2f (%0.2f)" % (scores.mean(), scores.std()))


Before Resampling: 0.90 (0.00)
After SMOTE: 0.77 (0.04)
After E-SMOTE: 0.66 (0.05)


In [None]:
clf = svm.SVC(kernel='linear', C=1, random_state=42)
scores = cross_val_score(clf, X, y, cv=5)
print("Before Resampling: %0.2f (%0.2f)" % (scores.mean(), scores.std()))
X_smote, y_smote = SMOTE(sampling_strategy = 'minority').fit_resample(X, y)
scores = cross_val_score(clf, X_smote, y_smote, cv=5)
print("After SMOTE: %0.2f (%0.2f)" % (scores.mean(), scores.std()))
scores = cross_val_score(clf, X_resampled, y_resampled, cv=5)
print("After ESMOTE: %0.2f (%0.2f)" % (scores.mean(), scores.std()))

Before Resampling: 0.90 (0.00)
After SMOTE: 0.75 (0.04)
After ESMOTE: 0.66 (0.05)


# **SVM**

In [None]:
from sklearn.metrics import precision_score, recall_score, accuracy_score


svm_model = svm.SVC(kernel='linear', C=1, random_state=42)
svm_model.fit(X_resampled, y_resampled)

# Predict on the original test set
y_pred = svm_model.predict(X_resampled)

# Calculate evaluation metrics
accuracy = accuracy_score(y_resampled, y_pred)
precision = precision_score(y_resampled, y_pred)
recall = recall_score(y_resampled, y_pred)
g_mean = np.sqrt(recall * (1 - precision))


# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("G-Mean:", g_mean)


Accuracy: 0.678173719376392
Precision: 0.6120448179271709
Recall: 0.9732739420935412
G-Mean: 0.6144808128913708


# **DT**

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score


dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_resampled, y_resampled)

# Predict on the original test set
y_pred = dt_model.predict(X_resampled)

# Calculate evaluation metrics
accuracy = accuracy_score(y_resampled, y_pred)
precision = precision_score(y_resampled, y_pred)
recall = recall_score(y_resampled, y_pred)
g_mean = np.sqrt(recall * (1 - precision))

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("G-mean:", g_mean)


Accuracy: 0.9988864142538976
Precision: 1.0
Recall: 0.9977728285077951
G-mean: 0.0


# **KNN**

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score


knn_model = KNeighborsClassifier()
knn_model.fit(X_resampled, y_resampled)


y_pred = knn_model.predict(X_resampled)

accuracy = accuracy_score(y_resampled, y_pred)
precision = precision_score(y_resampled, y_pred)
recall = recall_score(y_resampled, y_pred)
g_mean = np.sqrt(recall * (1 - precision))

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("G-mean:", g_mean)

Accuracy: 0.9476614699331849
Precision: 0.9068825910931174
Recall: 0.9977728285077951
G-mean: 0.3048114506844144


# **MLP**

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score


mlp_model = MLPClassifier(random_state=42)
mlp_model.fit(X_resampled, y_resampled)

y_pred = mlp_model.predict(X_resampled)


accuracy = accuracy_score(y_resampled, y_pred)
precision = precision_score(y_resampled, y_pred)
recall = recall_score(y_resampled, y_pred)
g_mean = np.sqrt(recall * (1 - precision))

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("G-mean:", g_mean)


Accuracy: 0.8485523385300668
Precision: 0.832271762208068
Recall: 0.8730512249443207
G-mean: 0.3826687124158422




# **LR**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, accuracy_score


lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_resampled, y_resampled)


y_pred = lr_model.predict(X_resampled)


accuracy = accuracy_score(y_resampled, y_pred)
precision = precision_score(y_resampled, y_pred)
recall = recall_score(y_resampled, y_pred)
g_mean = np.sqrt(recall * (1 - precision))

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("G-mean:", g_mean)



Accuracy: 0.7405345211581291
Precision: 0.6692789968652038
Recall: 0.9510022271714922
G-mean: 0.5608176268213948


# **NB**

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_score, recall_score, accuracy_score

nb_model = GaussianNB()
nb_model.fit(X_resampled, y_resampled)


y_pred = nb_model.predict(X_resampled)

accuracy = accuracy_score(y_resampled, y_pred)
precision = precision_score(y_resampled, y_pred)
recall = recall_score(y_resampled, y_pred)
g_mean = np.sqrt(recall * (1 - precision))

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("G-mean:", g_mean)



Accuracy: 0.6414253897550112
Precision: 0.5836627140974967
Recall: 0.9866369710467706
G-mean: 0.640916342978301
