<a href="https://colab.research.google.com/github/sujitkumar205/RICEVD/blob/main/Feature_Selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1591]:
import numpy as np
import pandas as pd
from scipy.spatial import distance
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.feature_extraction.text  import CountVectorizer
from sklearn.datasets import make_friedman1
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
from scipy.spatial import distance
import statistics
from collections import Counter
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from imblearn.metrics import geometric_mean_score
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import precision_score, recall_score, f1_score
from scipy.stats import gmean

In [1592]:
from datetime import datetime
start_time = datetime.now()


# ReliefF

In [1593]:
def reliefF(df, number_of_neighbours, instances_to_select, number_of_features):
  features = df.iloc[:,:-1]
  labels = df.iloc[:,-1]
  rows,columns = features.shape

  #initialize weights to zero
  weights = np.zeros(columns,dtype = 'int')

  #unique labels
  unique_labels = np.unique(labels)

  #used to select random instance
  instances=np.array(list(range(1,rows)))

  #difference between maximum and minimum of each feature used to calculate diff
  minimums=np.min(features.values,axis=0)
  maximums=np.max(features.values,axis=0)
  difference=np.subtract(maximums,minimums)

  for i in range(instances_to_select):

    #choose a random instance and remove from instances to avoid selecting same thing again
    random_instance = np.random.choice(instances[:-1])
    instances=np.delete(instances,np.where(instances==random_instance))

    #features of random instance used to calculate diff later on
    random_instance_features = features.iloc[random_instance,:].values

    #label of random instance and probability of label class
    random_instance_label = labels[random_instance]
    probability_random_instance_label = len(np.where(labels==random_instance_label)[0])/rows

    #calculate euclidean distance between random instance and all other instances
    distances = []
    for temp in instances:
      temp_features = features.iloc[temp,:].values
      dist = distance.euclidean(random_instance_features,temp_features)
      distances.append(dist)
    
    #sort instances based on distances
    distances = np.array(distances)
    arr1inds = distances.argsort()
    sorted_distances = distances[arr1inds[::]]
    sorted_instances = instances[arr1inds[::]]

    #initialize list of nearest hits for random instance label and dictionary of nearest misses for every other label
    nearest_hits = []
    nearest_misses = {}

    #finding nearest hits for random instance label
    for temp in sorted_instances:
      if labels[temp] == random_instance_label:
        nearest_hits.append(temp)
      if len(nearest_hits) == number_of_neighbours:
        break
      
    #finding nearest misses for all other labels
    for x in unique_labels:
      if x == random_instance_label:
        continue      
      nearest_misses[x] = []
      for temp in sorted_instances:
        if labels[temp] == x:
          nearest_misses[x].append(temp)
        if len(nearest_misses[x]) == number_of_neighbours:
          break

    #used to find sum of diff function in weights equation for hits
    total_hit = np.zeros(columns,dtype='int')

    #find sum of diff function in weights equation for hits
    for hit in range(len(nearest_hits)):
      hI = features.iloc[nearest_hits[hit],:].values
      dRH = np.divide(np.abs(np.subtract(random_instance_features,hI)),difference)
      dRH = dRH/(instances_to_select * number_of_neighbours)
      total_hit = np.add(total_hit,dRH)

    #used to find sum of diff function in weights equation for misses
    total_miss=np.zeros(columns,dtype='int')

    #find sum of diff function in weights equation for misses in each class
    for each_label in nearest_misses:
      temp_miss=np.zeros(columns,dtype='int')
      pclass=len(np.where(labels==each_label)[0])/rows #getting the probability of getting this class
      postProb=pclass/(1-probability_random_instance_label) #calculating the posterior probability of getting this class

      for each_miss in nearest_misses[each_label]:
        mI = features.iloc[each_miss,:].values
        dRM = np.divide(np.abs(np.subtract(random_instance_features,mI)),difference)
        dRM = dRM/(instances_to_select * number_of_neighbours)
        temp_miss = np.add(temp_miss,dRM)

      total_miss = np.add(total_miss,(temp_miss*postProb))
    
    #update value of weights based on total hits and total miss and diff function values
    weights=np.add(weights,total_miss)
    weights=np.subtract(weights,total_hit) 
    

  #select number_of_features weights with highest values and sort
  ind = np.argpartition(weights, -number_of_features)[-number_of_features:]
  ind = np.sort(ind)[::-1]

  #column names of data frame
  feature_names = list(df.columns.values)
  feature_names = np.array(feature_names)

  #top features based on weights
  top_features = feature_names[ind]

  return top_features



# Chi-Square

In [1594]:
def chi_square(df, number_of_features):
  features = df.iloc[:,:-1]
  labels = df.iloc[:,-1]
  labels=labels.astype('int') 
  test = SelectKBest(score_func=chi2, k=number_of_features)
  fit = test.fit(features, labels)
  chisquare_features = fit.get_feature_names_out(input_features=None)
  return chisquare_features

# SVM-RFE

In [1595]:
def svmrfe(df, number_of_features):
  features = df.iloc[:,:-1]
  labels = df.iloc[:,-1]
  #estimator = SVR(kernel="linear",cache_size=7000)
  #estimator = LinearSVR(max_iter=100000,dual = True)
  estimator = LinearSVC(random_state=0, tol=1e-5)
  selector = RFE(estimator, n_features_to_select=number_of_features, step=10)
  selector = selector.fit(features, labels)
  feature_ranks = list(selector.ranking_)
  feature_names = list(df.columns.values)
  rank_dictionary = dict(zip(feature_names, feature_ranks))
  svmrfe_features = [feature for feature, rank in rank_dictionary.items() if rank == 1]
  return svmrfe_features


# ReliefF Variable Distance

https://docs.scipy.org/doc/scipy/reference/spatial.distance.html

In [1596]:
def calcDistance(random_instance_features, temp_features, distance_variable):
  # print("inside the calc distance function")
  # print("distance variable is") 
  # print(distance_variable)
  if distance_variable == 0:
    dist = distance.braycurtis(random_instance_features,temp_features)
    return dist
  if distance_variable == 1:
    dist = distance.canberra(random_instance_features,temp_features)
    return dist
  if distance_variable == 2:
    dist = distance.chebyshev(random_instance_features,temp_features)
    return dist
  if distance_variable == 3:
    dist = distance.cityblock(random_instance_features,temp_features)
    return dist
  if distance_variable == 4:
    dist = distance.correlation(random_instance_features,temp_features)
    return dist
  if distance_variable == 5:
    dist = distance.cosine(random_instance_features,temp_features)
    return dist
  if distance_variable == 6:
    dist = distance.euclidean(random_instance_features,temp_features)
    return dist
  if distance_variable == 7:
    dist = distance.jensenshannon(random_instance_features,temp_features)
    return dist
  if distance_variable == 8:
    dist = distance.sqeuclidean(random_instance_features,temp_features)
    return dist
  return 0

In [1597]:
# def reliefF_variable(df, number_of_neighbours, instances_to_select, number_of_features, distance_variable):
#   print("inside the reliefF_variable func")

#   features = df.iloc[:,:-1]
#   labels = df.iloc[:,-1]
#   rows,columns = features.shape

#   #initialize weights to zero
#   # weights = np.zeros(columns,dtype = 'int')

#   #unique labels
#   unique_labels = np.unique(labels)

#   #used to select random instance
#   instances=np.array(list(range(1,rows)))

#   #difference between maximum and minimum of each feature used to calculate diff
#   minimums=np.min(features.values,axis=0)
#   maximums=np.max(features.values,axis=0)
#   difference=np.subtract(maximums,minimums)
  
#   total_instances = instances_to_select * len(unique_labels)
#   label_count = {}

#   for i in unique_labels:
#     label_count[i] = 0

#   for i in range(total_instances):

#     #choose a random instance and remove from instances to avoid selecting same thing again
#     random_instance = np.random.choice(instances[:-1])
#     instances=np.delete(instances,np.where(instances==random_instance))

#     #features of random instance used to calculate diff later on
#     random_instance_features = features.iloc[random_instance,:].values

#     #label of random instance and probability of label class
#     random_instance_label = labels[random_instance]
#     probability_random_instance_label = len(np.where(labels==random_instance_label)[0])/rows

#     if label_count[random_instance_label] >= instances_to_select:
#       i = i-1
#       continue
    
#     else:
#       label_count[random_instance_label] = label_count[random_instance_label] + 1

#     #calculate euclidean distance between random instance and all other instances
#     distances = []
#     for temp in instances:
#       temp_features = features.iloc[temp,:].values
#       dist = calcDistance(random_instance_features, temp_features, distance_variable)
#       distances.append(dist)
    
#     #sort instances based on distances
#     distances = np.array(distances)
#     arr1inds = distances.argsort()
#     sorted_distances = distances[arr1inds[::]]
#     sorted_instances = instances[arr1inds[::]]

#     #initialize list of nearest hits for random instance label and dictionary of nearest misses for every other label
#     nearest_hits = []
#     nearest_misses = {}

#     #finding nearest hits for random instance label
#     for temp in sorted_instances:
#       if labels[temp] == random_instance_label:
#         nearest_hits.append(temp)
#       if len(nearest_hits) == number_of_neighbours:
#         break
      
#     #finding nearest misses for all other labels
#     for x in unique_labels:
#       if x == random_instance_label:
#         continue      
#       nearest_misses[x] = []
#       for temp in sorted_instances:
#         if labels[temp] == x:
#           nearest_misses[x].append(temp)
#         if len(nearest_misses[x]) == number_of_neighbours:
#           break

#     #used to find sum of diff function in weights equation for hits
#     total_hit = np.zeros(columns,dtype='int')

#     #find sum of diff function in weights equation for hits
#     for hit in range(len(nearest_hits)):
#       hI = features.iloc[nearest_hits[hit],:].values
#       dRH = np.divide(np.abs(np.subtract(random_instance_features,hI)),difference)
#       dRH = dRH/(instances_to_select * number_of_neighbours)
#       total_hit = np.add(total_hit,dRH)

#     #used to find sum of diff function in weights equation for misses
#     total_miss=np.zeros(columns,dtype='int')

#     #find sum of diff function in weights equation for misses in each class
#     for each_label in nearest_misses:
#       temp_miss=np.zeros(columns,dtype='int')
#       pclass=len(np.where(labels==each_label)[0])/rows #getting the probability of getting this class
#       postProb=pclass/(1-probability_random_instance_label) #calculating the posterior probability of getting this class

#       for each_miss in nearest_misses[each_label]:
#         mI = features.iloc[each_miss,:].values
#         dRM = np.divide(np.abs(np.subtract(random_instance_features,mI)),difference)
#         dRM = dRM/(instances_to_select * number_of_neighbours)
#         temp_miss = np.add(temp_miss,dRM)

#       total_miss = np.add(total_miss,(temp_miss*postProb))
    
#     #update value of weights based on total hits and total miss and diff function values
#     weights=np.add(weights,total_miss)
#     weights=np.subtract(weights,total_hit) 
    

#   #select number_of_features weights with highest values and sort
#   ind = np.argpartition(weights, -number_of_features)[-number_of_features:]
#   ind = np.sort(ind)[::-1]

#   #column names of data frame
#   feature_names = list(df.columns.values)
#   feature_names = np.array(feature_names)

#   #top features based on weights
#   top_features = feature_names[ind]

#   return top_features


In [1598]:
def reliefF_variable(df, number_of_neighbours, instances_to_select, number_of_features, distance_variable):
  print("SSSSSSSSSSSSSSSSSSS")
  print("inside the reliefF_variable func2")

  features = df.iloc[:,:-1]
  labels = df.iloc[:,-1]
  rows,columns = features.shape

  #initialize weights to zero
  weights = np.zeros(columns,dtype = 'int')

  #unique labels
  unique_labels = np.unique(labels)

  #used to select random instance
  instances=np.array(list(range(1,rows)))

  #difference between maximum and minimum of each feature used to calculate diff
  minimums=np.min(features.values,axis=0)
  maximums=np.max(features.values,axis=0)
  difference=np.subtract(maximums,minimums)
  
  total_instances = instances_to_select * len(unique_labels)
  label_count = {}

  for i in unique_labels:
    label_count[i] = 0

  for i in range(total_instances):

    #choose a random instance and remove from instances to avoid selecting same thing again
    random_instance = np.random.choice(instances[:-1])
    instances=np.delete(instances,np.where(instances==random_instance))

    #features of random instance used to calculate diff later on
    random_instance_features = features.iloc[random_instance,:].values

    #label of random instance and probability of label class
    random_instance_label = labels[random_instance]
    probability_random_instance_label = len(np.where(labels==random_instance_label)[0])/rows

    if label_count[random_instance_label] >= instances_to_select:
      i = i-1
      continue
    
    else:
      label_count[random_instance_label] = label_count[random_instance_label] + 1

    #calculate euclidean distance between random instance and all other instances
    distances = []
    for temp in instances:
      temp_features = features.iloc[temp,:].values
      dist = calcDistance(random_instance_features, temp_features, distance_variable)
      distances.append(dist)
    
    #sort instances based on distances
    distances = np.array(distances)
    arr1inds = distances.argsort()
    sorted_distances = distances[arr1inds[::]]
    sorted_instances = instances[arr1inds[::]]

    #initialize list of nearest hits for random instance label and dictionary of nearest misses for every other label
    nearest_hits = []
    nearest_misses = {}

    #finding nearest hits for random instance label
    for temp in sorted_instances:
      if labels[temp] == random_instance_label:
        nearest_hits.append(temp)
      if len(nearest_hits) == number_of_neighbours:
        break
      
    #finding nearest misses for all other labels
    for x in unique_labels:
      if x == random_instance_label:
        continue      
      nearest_misses[x] = []
      for temp in sorted_instances:
        if labels[temp] == x:
          nearest_misses[x].append(temp)
        if len(nearest_misses[x]) == number_of_neighbours:
          break

    #used to find sum of diff function in weights equation for hits
    total_hit = np.zeros(columns,dtype='int')

    #find sum of diff function in weights equation for hits
    for hit in range(len(nearest_hits)):
      hI = features.iloc[nearest_hits[hit],:].values
      dRH = np.divide(np.abs(np.subtract(random_instance_features,hI)),difference)
      dRH = dRH/(instances_to_select * number_of_neighbours)
      total_hit = np.add(total_hit,dRH)

    #used to find sum of diff function in weights equation for misses
    total_miss=np.zeros(columns,dtype='int')

    #find sum of diff function in weights equation for misses in each class
    for each_label in nearest_misses:
      temp_miss=np.zeros(columns,dtype='int')
      pclass=len(np.where(labels==each_label)[0])/rows #getting the probability of getting this class
      postProb=pclass/(1-probability_random_instance_label) #calculating the posterior probability of getting this class

      for each_miss in nearest_misses[each_label]:
        mI = features.iloc[each_miss,:].values
        dRM = np.divide(np.abs(np.subtract(random_instance_features,mI)),difference)
        dRM = dRM/(instances_to_select * number_of_neighbours)
        temp_miss = np.add(temp_miss,dRM)

      total_miss = np.add(total_miss,(temp_miss*postProb))
    
    #update value of weights based on total hits and total miss and diff function values
    weights=np.add(weights,total_miss)
    weights=np.subtract(weights,total_hit) 
    

  #select number_of_features weights with highest values and sort
  ind = np.argpartition(weights, -number_of_features)[-number_of_features:]
  ind = np.sort(ind)[::-1]

  #column names of data frame
  feature_names = list(df.columns.values)
  feature_names = np.array(feature_names)

  #top features based on weights
  top_features = feature_names[ind]

  return top_features
# 

#### Balanced Sampling: Instead of random sampling, use stratified sampling to ensure that instances from both minority and majority classes are selected proportional to their representation in the dataset. This can help in giving equal importance to all classes.

#### This implementation includes the following modifications for balanced sampling: #### Class Proportions Calculation: Determines how many instances to select from each class based on the class distribution within the dataset..


#### Instance Selection Per Class: For each class, randomly selects instances up to the calculated limit, ensuring balanced representation.
#### Distance Calculation and Nearest Neighbor Identification: Separately for hits (same class) and misses (different classes), ensuring that the algorithm correctly identifies relevant neighbors within the balanced sampling framework.

In [1599]:
import numpy as np
import pandas as pd

# def calcDistance(instance1, instance2):
#     """
#     Calculate the Euclidean distance between two instances.
#     """
#     return np.sqrt(np.sum((instance1 - instance2) ** 2))

def reliefF_variable_balanced(df, number_of_neighbours, instances_to_select, number_of_features,distance_variable):
    features = df.iloc[:, :-1]
    labels = df.iloc[:, -1]
    rows, columns = features.shape

    # Initialize weights to zero
    weights = np.zeros(columns)

    # Difference between maximum and minimum of each feature for normalization
    minimums = np.min(features.values, axis=0)
    maximums = np.max(features.values, axis=0)
    difference = np.subtract(maximums, minimums)

    # Calculate class proportions and determine instances to select per class
    class_counts = labels.value_counts()
    instances_to_select_per_class = {label: max(1, int(count / class_counts.sum() * instances_to_select))
                                     for label, count in class_counts.items()}

    for label, count in instances_to_select_per_class.items():
        # class_instances = df[df['Label'] == label].index.tolist()
        class_instances = df[df.iloc[:, -1] == label].index.tolist()


        for _ in range(count):
            if len(class_instances) > 0:
                random_instance_index = np.random.choice(class_instances)
                class_instances.remove(random_instance_index)

                random_instance_features = features.iloc[random_instance_index, :].values
                random_instance_label = labels[random_instance_index]

                # Calculate distances to all other instances using calcDistance
                distances = [calcDistance(random_instance_features, features.iloc[i, :].values,distance_variable) for i in range(rows)]

                # Identifying nearest hits within the same class
                same_class_mask = labels == random_instance_label
                same_class_distances = np.array(distances)
                same_class_distances[same_class_mask] = np.max(distances) + 1  # Exclude the instance itself
                nearest_hits_indices = np.argsort(same_class_distances)[:number_of_neighbours]

                # Identifying nearest misses for each different class
                nearest_misses_indices = {}
                for other_label in class_counts.index:
                    if other_label == random_instance_label:
                        continue
                    other_class_mask = labels == other_label
                    other_class_distances = np.array(distances)
                    other_class_distances[~other_class_mask] = np.max(distances) + 1
                    nearest_misses_indices[other_label] = np.argsort(other_class_distances)[:number_of_neighbours]

                # Update weights based on nearest hits
                for hit_index in nearest_hits_indices:
                    hit_features = features.iloc[hit_index, :].values
                    weights -= np.abs(random_instance_features - hit_features) / difference / rows

                # Update weights based on nearest misses
                for other_label, miss_indices in nearest_misses_indices.items():
                    for miss_index in miss_indices:
                        miss_features = features.iloc[miss_index, :].values
                        weights += np.abs(random_instance_features - miss_features) / difference / rows

    # Select top features based on weights
    top_features_indices = np.argsort(weights)[-number_of_features:]
    top_features = features.columns[top_features_indices]

    return top_features


#### Below is a full implementation of the modified ReliefF algorithm incorporating cost-sensitive learning to address class imbalance. This implementation includes the calculation of class frequencies and their inverses to adjust the feature weight updates accordingly

#### Class Frequencies and Weights: Before the loop, the algorithm calculates the frequency of each class and its inverse weight. This makes sure that minority classes are given more importance in the feature weighting process.

In [1600]:
import numpy as np
import pandas as pd

# def calcDistance(instance1, instance2, distance_variable):
#     if distance_variable == 'euclidean':
#         return np.sqrt(np.sum((instance1 - instance2) ** 2))
#     else:
#         # Placeholder for other distance calculations
#         return np.linalg.norm(instance1 - instance2)

def reliefF_variable_weight(df, number_of_neighbours, instances_to_select, number_of_features, distance_variable):
    print("inside the reliefF_variable func")

    features = df.iloc[:, :-1]
    labels = df.iloc[:, -1]
    rows, columns = features.shape

    # Initialize weights to zero
    weights = np.zeros(columns, dtype=float)

    # Unique labels
    unique_labels = np.unique(labels)

    # Calculate class frequencies and inverse weights
    class_frequencies = {label: len(np.where(labels == label)[0]) for label in unique_labels}
    total_instances = len(labels)
    class_weights = {label: total_instances / (len(unique_labels) * freq) for label, freq in class_frequencies.items()}

    # Used to select random instance
    instances = np.array(list(range(rows)))

    # Difference between maximum and minimum of each feature used to calculate diff
    minimums = np.min(features.values, axis=0)
    maximums = np.max(features.values, axis=0)
    difference = np.subtract(maximums, minimums)

    total_instances = instances_to_select * len(unique_labels)
    label_count = {i: 0 for i in unique_labels}

    for i in range(total_instances):
        # Choose a random instance and remove from instances to avoid selecting the same thing again
        random_instance = np.random.choice(instances)
        instances = np.delete(instances, np.where(instances == random_instance))

        # Features of random instance used to calculate diff later on
        random_instance_features = features.iloc[random_instance, :].values

        # Label of random instance and probability of label class
        random_instance_label = labels.iloc[random_instance]
        probability_random_instance_label = len(np.where(labels == random_instance_label)[0]) / rows

        if label_count[random_instance_label] >= instances_to_select:
            continue
        else:
            label_count[random_instance_label] += 1

        # Calculate euclidean distance between random instance and all other instances
        distances = np.array([calcDistance(random_instance_features, features.iloc[temp, :].values, distance_variable) for temp in instances])

        # Sort instances based on distances
        sorted_indices = distances.argsort()
        sorted_instances = instances[sorted_indices]

        # Initialize list of nearest hits for random instance label and dictionary of nearest misses for every other label
        nearest_hits = []
        nearest_misses = {label: [] for label in unique_labels if label != random_instance_label}

        # Finding nearest hits and misses
        for temp in sorted_instances:
            temp_label = labels.iloc[temp]
            if temp_label == random_instance_label and len(nearest_hits) < number_of_neighbours:
                nearest_hits.append(temp)
            elif temp_label != random_instance_label and len(nearest_misses[temp_label]) < number_of_neighbours:
                nearest_misses[temp_label].append(temp)

        # Calculate diff for hits
        total_hit = np.sum([np.divide(np.abs(np.subtract(random_instance_features, features.iloc[hit, :].values)), difference) for hit in nearest_hits], axis=0)
        total_hit *= class_weights[random_instance_label] / (instances_to_select * number_of_neighbours)

        # Calculate diff for misses
        total_miss = np.zeros(columns, dtype=float)
        for label, misses in nearest_misses.items():
            temp_miss = np.sum([np.divide(np.abs(np.subtract(random_instance_features, features.iloc[miss, :].values)), difference) for miss in misses], axis=0)
            temp_miss *= class_weights[label] / (instances_to_select * number_of_neighbours)
            total_miss += temp_miss * (class_frequencies[label] / (rows - class_frequencies[random_instance_label]))

        # Update weights
        weights += total_miss - total_hit

    # Select top features based on weights
    ind = np.argpartition(weights, -number_of_features)[-number_of_features:]
    top_features = df.columns[ind].tolist()

    return top_features


In [1601]:
def reliefF_variable_main(df, number_of_neighbours, instances_to_select, number_of_features):
  # print("inside the main function")

  features = df.iloc[:,:-1]

  braycurtis = []
  canberra = []
  chebyshev = []
  cityblock = []
  correlation = []
  cosine = []
  euclidean = []
  jensenshannon = []
  sqeuclidean = []

  for i in range(len(features)-1):
    for j in range(i+1, len(features)):
      braycurtis.append(distance.braycurtis(features.iloc[i],features.iloc[j]))
      canberra.append(distance.canberra(features.iloc[i],features.iloc[j]))
      chebyshev.append(distance.chebyshev(features.iloc[i],features.iloc[j]))
      cityblock.append(distance.cityblock(features.iloc[i],features.iloc[j]))
      correlation.append(distance.correlation(features.iloc[i],features.iloc[j]))
      cosine.append(distance.cosine(features.iloc[i],features.iloc[j]))
      euclidean.append(distance.euclidean(features.iloc[i],features.iloc[j]))
      jensenshannon.append(distance.jensenshannon(features.iloc[i],features.iloc[j]))
      sqeuclidean.append(distance.sqeuclidean(features.iloc[i],features.iloc[j]))
  # print("braycurtis")
  # print(braycurtis)
  # print("canberra")
  # print(canberra)
  standard_deviation = []
  standard_deviation.append(statistics.stdev(braycurtis))
  standard_deviation.append(statistics.stdev(canberra))
  standard_deviation.append(statistics.stdev(chebyshev))
  standard_deviation.append(statistics.stdev(cityblock))
  standard_deviation.append(statistics.stdev(correlation))
  standard_deviation.append(statistics.stdev(cosine))
  standard_deviation.append(statistics.stdev(euclidean))
  standard_deviation.append(statistics.stdev(jensenshannon))
  standard_deviation.append(statistics.stdev(sqeuclidean))
  # print("standard dev") 
  # print(standard_deviation)

  max_value = max(standard_deviation)
  # print("max_value")
  # print(max_value)
  distance_variable = standard_deviation.index(max_value)
  
  # print("distance_variable")
  # print(distance_variable)
  #distance_variable = 1
  features_combined = []

  print(number_of_neighbours)
    # print(instances_to_select)
    # print(number_of_features) 
    # print(distance_variable)

  for i in range(10):
    # print(df)
    print(number_of_neighbours)
    print(instances_to_select)
    print(number_of_features) 
    print(distance_variable)
    a = reliefF_variable(df, number_of_neighbours, instances_to_select, number_of_features, distance_variable)
    # a = reliefF_variable_balanced(df, number_of_neighbours, instances_to_select, number_of_features, distance_variable)
    # a = reliefF_variable_weight(df, number_of_neighbours, instances_to_select, number_of_features, distance_variable)
    # print(a)
    features_combined = features_combined + list(a)

  features_count = Counter(features_combined)
  # print(features_count)

  features_count_sorted = sorted(features_count.items(), key=lambda x: x[1], reverse=True)
  # print(features_count_sorted)

  variable_distance_relieff_features = []
  for i in range(number_of_features):
    variable_distance_relieff_features.append(features_count_sorted[i][0])
  # print(variable_distance_relieff_features)

  return variable_distance_relieff_features

In [1602]:
def reliefF_variable_main_weight(df, number_of_neighbours, instances_to_select, number_of_features):
  # print("inside the main function")

  features = df.iloc[:,:-1]

  braycurtis = []
  canberra = []
  chebyshev = []
  cityblock = []
  correlation = []
  cosine = []
  euclidean = []
  jensenshannon = []
  sqeuclidean = []

  for i in range(len(features)-1):
    for j in range(i+1, len(features)):
      braycurtis.append(distance.braycurtis(features.iloc[i],features.iloc[j]))
      canberra.append(distance.canberra(features.iloc[i],features.iloc[j]))
      chebyshev.append(distance.chebyshev(features.iloc[i],features.iloc[j]))
      cityblock.append(distance.cityblock(features.iloc[i],features.iloc[j]))
      correlation.append(distance.correlation(features.iloc[i],features.iloc[j]))
      cosine.append(distance.cosine(features.iloc[i],features.iloc[j]))
      euclidean.append(distance.euclidean(features.iloc[i],features.iloc[j]))
      jensenshannon.append(distance.jensenshannon(features.iloc[i],features.iloc[j]))
      sqeuclidean.append(distance.sqeuclidean(features.iloc[i],features.iloc[j]))
  # print("braycurtis")
  # print(braycurtis)
  # print("canberra")
  # print(canberra)
  standard_deviation = []
  standard_deviation.append(statistics.stdev(braycurtis))
  standard_deviation.append(statistics.stdev(canberra))
  standard_deviation.append(statistics.stdev(chebyshev))
  standard_deviation.append(statistics.stdev(cityblock))
  standard_deviation.append(statistics.stdev(correlation))
  standard_deviation.append(statistics.stdev(cosine))
  standard_deviation.append(statistics.stdev(euclidean))
  standard_deviation.append(statistics.stdev(jensenshannon))
  standard_deviation.append(statistics.stdev(sqeuclidean))
  # print("standard dev") 
  # print(standard_deviation)

  max_value = max(standard_deviation)
  # print("max_value")
  # print(max_value)
  distance_variable = standard_deviation.index(max_value)
  
  # print("distance_variable")
  # print(distance_variable)
  #distance_variable = 1
  features_combined = []

  print(number_of_neighbours)
    # print(instances_to_select)
    # print(number_of_features) 
    # print(distance_variable)

  for i in range(10):
    # print(df)
    print(number_of_neighbours)
    print(instances_to_select)
    print(number_of_features) 
    print(distance_variable)
    # a = reliefF_variable(df, number_of_neighbours, instances_to_select, number_of_features, distance_variable)
    # a = reliefF_variable_balanced(df, number_of_neighbours, instances_to_select, number_of_features, distance_variable)
    a = reliefF_variable_weight(df, number_of_neighbours, instances_to_select, number_of_features, distance_variable)
    # print(a)
    features_combined = features_combined + list(a)

  features_count = Counter(features_combined)
  # print(features_count)

  features_count_sorted = sorted(features_count.items(), key=lambda x: x[1], reverse=True)
  # print(features_count_sorted)

  variable_distance_relieff_features = []
  for i in range(number_of_features):
    variable_distance_relieff_features.append(features_count_sorted[i][0])
  # print(variable_distance_relieff_features)

  return variable_distance_relieff_features

##### Cross-validation: Instead of selecting the distance metric based on the highest standard deviation of distances, cross-validation is used to evaluate how well each metric performs in terms of predictive accuracy.  This involves running the reliefF_variable function with each distance metric and evaluating the performance using a classifier. This method ensures the chosen metric is the most effective for the dataset.

In [1603]:
# from sklearn.model_selection import cross_val_score
# from sklearn.ensemble import RandomForestClassifier  # Example classifier
# from sklearn.ensemble import GradientBoostingClassifier
#   # Example classifier

# def calcDistance(random_instance_features, temp_features, distance_variable):
#   # print("inside the calc distance function")
#   # print("distance variable is") 
#   # print(distance_variable)
#   if distance_variable == 0:
#     dist = distance.braycurtis(random_instance_features,temp_features)
#     return dist
#   if distance_variable == 1:
#     dist = distance.canberra(random_instance_features,temp_features)
#     return dist
#   if distance_variable == 2:
#     dist = distance.chebyshev(random_instance_features,temp_features)
#     return dist
#   if distance_variable == 3:
#     dist = distance.cityblock(random_instance_features,temp_features)
#     return dist
#   if distance_variable == 4:
#     dist = distance.correlation(random_instance_features,temp_features)
#     return dist
#   if distance_variable == 5:
#     dist = distance.cosine(random_instance_features,temp_features)
#     return dist
#   if distance_variable == 6:
#     dist = distance.euclidean(random_instance_features,temp_features)
#     return dist
#   if distance_variable == 7:
#     dist = distance.jensenshannon(random_instance_features,temp_features)
#     return dist
#   if distance_variable == 8:
#     dist = distance.sqeuclidean(random_instance_features,temp_features)
#     return dist
#   return 0

# def reliefF_variable(df, number_of_neighbours, instances_to_select, number_of_features, distance_variable):
#   print("SSSSSSSSSSSSSSSSSSS")
#   print("inside the reliefF_variable func")

#   features = df.iloc[:,:-1]
#   labels = df.iloc[:,-1]
#   rows,columns = features.shape

#   #initialize weights to zero
#   weights = np.zeros(columns,dtype = 'int')

#   #unique labels
#   unique_labels = np.unique(labels)

#   #used to select random instance
#   instances=np.array(list(range(1,rows)))

#   #difference between maximum and minimum of each feature used to calculate diff
#   minimums=np.min(features.values,axis=0)
#   maximums=np.max(features.values,axis=0)
#   difference=np.subtract(maximums,minimums)
  
#   total_instances = instances_to_select * len(unique_labels)
#   label_count = {}

#   for i in unique_labels:
#     label_count[i] = 0

#   for i in range(total_instances):

#     #choose a random instance and remove from instances to avoid selecting same thing again
#     random_instance = np.random.choice(instances[:-1])
#     instances=np.delete(instances,np.where(instances==random_instance))

#     #features of random instance used to calculate diff later on
#     random_instance_features = features.iloc[random_instance,:].values

#     #label of random instance and probability of label class
#     random_instance_label = labels[random_instance]
#     probability_random_instance_label = len(np.where(labels==random_instance_label)[0])/rows

#     if label_count[random_instance_label] >= instances_to_select:
#       i = i-1
#       continue
    
#     else:
#       label_count[random_instance_label] = label_count[random_instance_label] + 1

#     #calculate euclidean distance between random instance and all other instances
#     distances = []
#     for temp in instances:
#       temp_features = features.iloc[temp,:].values
#       dist = calcDistance(random_instance_features, temp_features, distance_variable)
#       distances.append(dist)
    
#     #sort instances based on distances
#     distances = np.array(distances)
#     arr1inds = distances.argsort()
#     sorted_distances = distances[arr1inds[::]]
#     sorted_instances = instances[arr1inds[::]]

#     #initialize list of nearest hits for random instance label and dictionary of nearest misses for every other label
#     nearest_hits = []
#     nearest_misses = {}

#     #finding nearest hits for random instance label
#     for temp in sorted_instances:
#       if labels[temp] == random_instance_label:
#         nearest_hits.append(temp)
#       if len(nearest_hits) == number_of_neighbours:
#         break
      
#     #finding nearest misses for all other labels
#     for x in unique_labels:
#       if x == random_instance_label:
#         continue      
#       nearest_misses[x] = []
#       for temp in sorted_instances:
#         if labels[temp] == x:
#           nearest_misses[x].append(temp)
#         if len(nearest_misses[x]) == number_of_neighbours:
#           break

#     #used to find sum of diff function in weights equation for hits
#     total_hit = np.zeros(columns,dtype='int')

#     #find sum of diff function in weights equation for hits
#     for hit in range(len(nearest_hits)):
#       hI = features.iloc[nearest_hits[hit],:].values
#       dRH = np.divide(np.abs(np.subtract(random_instance_features,hI)),difference)
#       dRH = dRH/(instances_to_select * number_of_neighbours)
#       total_hit = np.add(total_hit,dRH)

#     #used to find sum of diff function in weights equation for misses
#     total_miss=np.zeros(columns,dtype='int')

#     #find sum of diff function in weights equation for misses in each class
#     for each_label in nearest_misses:
#       temp_miss=np.zeros(columns,dtype='int')
#       pclass=len(np.where(labels==each_label)[0])/rows #getting the probability of getting this class
#       postProb=pclass/(1-probability_random_instance_label) #calculating the posterior probability of getting this class

#       for each_miss in nearest_misses[each_label]:
#         mI = features.iloc[each_miss,:].values
#         dRM = np.divide(np.abs(np.subtract(random_instance_features,mI)),difference)
#         dRM = dRM/(instances_to_select * number_of_neighbours)
#         temp_miss = np.add(temp_miss,dRM)

#       total_miss = np.add(total_miss,(temp_miss*postProb))
    
#     #update value of weights based on total hits and total miss and diff function values
#     weights=np.add(weights,total_miss)
#     weights=np.subtract(weights,total_hit) 
    

#   #select number_of_features weights with highest values and sort
#   ind = np.argpartition(weights, -number_of_features)[-number_of_features:]
#   ind = np.sort(ind)[::-1]

#   #column names of data frame
#   feature_names = list(df.columns.values)
#   feature_names = np.array(feature_names)

#   #top features based on weights
#   top_features = feature_names[ind]

#   return top_features
# # 

# def reliefF_variable_crossvalidated(df, number_of_neighbours, instances_to_select, number_of_features, distance_variable, cv=5):
#     """
#     Evaluate ReliefF variable selection with cross-validation for a given distance metric.
    
#     """
#     print("inside the releifF_variable_crossvalidated")
#     print("distance variable value is")
#     print(distance_variable)
    
#     selected_features = reliefF_variable(df, number_of_neighbours, instances_to_select, number_of_features, distance_variable)
#     # print("selected_features")
#     # print(selected_features)
    
#     X = df[selected_features].values
#     y = df.iloc[:,-1].values
#     classifier = GradientBoostingClassifier()  # Example classifier, can be replaced
#     scores = cross_val_score(classifier, X, y, cv=cv)
#     print("scores value")
#     print(scores)
#     return np.mean(scores), selected_features

# def adaptive_distance_metric_selection(df, number_of_neighbours, instances_to_select, number_of_features):
#     print("inside adaptive distance_metric_selection")
#     cv = 5
#     metrics_performance = {}
#     for distance_variable in range(9):  # For each distance metric
#         print("distance_variable")
#         print(distance_variable)
#         score, _ = reliefF_variable_crossvalidated(df, number_of_neighbours, instances_to_select, number_of_features, distance_variable, cv)
#         print("score")
#         print(score)
#         metrics_performance[distance_variable] = score
#     print("metrics performance")
#     print(metrics_performance)
    
#     # Select the best performing metric
#     best_metric = max(metrics_performance, key=metrics_performance.get)
#     print("best_metric")
#     print(best_metric)
#     features_combined = []
#     best_score, best_features = reliefF_variable_crossvalidated(df, number_of_neighbours, instances_to_select, number_of_features, best_metric, cv)
#   #   for i in range(10):
        
#   #       a = reliefF_variable(df, number_of_neighbours, instances_to_select, number_of_features, distance_variable)
#   #   # print(a)
#   #       features_combined = features_combined + list(a)

#   #   features_count = Counter(features_combined)

#   #   features_count_sorted = sorted(features_count.items(), key=lambda x: x[1], reverse=True)
#   # # print(features_count_sorted)

#   #   variable_distance_relieff_two_features = []
#   #   for i in range(number_of_features):
#   #       variable_distance_relieff_two_features.append(features_count_sorted[i][0])
#   # # print(variable_distance_relieff_features)

#   #   return variable_distance_relieff_two_features    
    

#     # print(best_features)
    
#     # print(f"Best distance metric: {best_metric} with cross-validated score: {best_score}")
#     return best_features


##### The calcDistance function is updated. This function is modified to calculate the distance between two instances using a different metric for each feature, based on a list of metrics provided. 
##### The reliefF_variable function is adjusted. This function is modified to use the updated calcDistance function. The distance_variable parameter now receives a list of metrics corresponding to each feature rather than a single metric.


In [1604]:
# from scipy.spatial import distance
# import numpy as np

# def calcDistance_feature_specific(value1, value2, metric_index):
#     # Convert scalar values to arrays for compatibility with distance functions
#     value1, value2 = np.array([value1]), np.array([value2])
    
#     # Select and apply the appropriate distance metric
#     if metric_index == 0:
#         # Bray-Curtis distance
#         dist = distance.braycurtis(value1, value2)
#     elif metric_index == 1:
#         # Canberra distance
#         dist = distance.canberra(value1, value2)
#     elif metric_index == 2:
#         # Chebyshev distance
#         dist = distance.chebyshev(value1, value2)
#     elif metric_index == 3:
#         # City Block (Manhattan) distance
#         dist = distance.cityblock(value1, value2)
#     elif metric_index == 4:
#         # Correlation distance
#         dist = distance.correlation(value1, value2)
#     elif metric_index == 5:
#         # Cosine distance
#         dist = distance.cosine(value1, value2)
#     elif metric_index == 6:
#         # Euclidean distance
#         dist = distance.euclidean(value1, value2)
#     elif metric_index == 7:
#         # Jensen-Shannon distance, note: requires probability distributions
#         # Here, you need to ensure the inputs are suitable for Jensen-Shannon,
#         # for simplicity, this example treats inputs as distributions
#         dist = distance.jensenshannon(value1, value2, base=2)
#     elif metric_index == 8:
#         # Squared Euclidean distance
#         # Not directly available in scipy.spatial.distance, so compute manually
#         dist = np.sum((value1 - value2) ** 2)
#     else:
#         # Default case or error handling
#         raise ValueError("Unsupported metric_index: {}".format(metric_index))
    
#     # Handle cases where the distance calculation returns nan or inf
#     if np.isnan(dist) or np.isinf(dist):
#         return 0.0
#     return dist


In [1605]:
# import numpy as np

# def reliefF_variable_custom(df, number_of_neighbours, instances_to_select, number_of_features, distance_variables_per_feature):
#     features = df.iloc[:, :-1]
#     labels = df.iloc[:, -1]
#     rows, columns = features.shape

#     # Initialize weights to zero
#     weights = np.zeros(columns, dtype='float')

#     # Unique labels in the dataset
#     unique_labels = np.unique(labels)

#     # For storing minimum and maximum of each feature for normalization
#     minimums = np.min(features.values, axis=0)
#     maximums = np.max(features.values, axis=0)
#     difference = maximums - minimums

#     for _ in range(instances_to_select):
#         # Select a random instance
#         random_index = np.random.randint(rows)
#         random_instance_features = features.iloc[random_index, :].values
#         random_instance_label = labels[random_index]

#         # Calculate distances to all other instances
#         distances = np.zeros((rows, columns))
#         for i in range(rows):
#             for j in range(columns):
#                 distance_metric_index = distance_variables_per_feature[j]
#                 distances[i, j] = calcDistance_feature_specific(random_instance_features[j], features.iloc[i, j], distance_metric_index)

#         # Find nearest hits and misses
#         hits_mask = labels == random_instance_label
#         misses_mask = ~hits_mask
#         hits_distances = np.where(hits_mask, distances, np.inf).min(axis=0)
#         misses_distances = np.where(misses_mask, distances, np.inf).min(axis=0)

#         # Update weights
#         weights += misses_distances - hits_distances

#     # Normalize weights
#     weights = (weights - np.min(weights)) / (np.max(weights) - np.min(weights))

#     # Select top features based on weights
#     top_feature_indices = np.argsort(weights)[-number_of_features:]

#     # Map indices to feature names
#     top_features = features.columns[top_feature_indices].tolist()

#     return top_features




In [1606]:
import numpy as np
from scipy.spatial import distance
from collections import Counter

def calcDistanceCustom(feature_a, feature_b, metric):
    if metric == 0:
        return distance.braycurtis(feature_a, feature_b)
    elif metric == 1:
        return distance.canberra(feature_a, feature_b)
    elif metric == 2:
        return distance.chebyshev(feature_a, feature_b)
    elif metric == 3:
        return distance.cityblock(feature_a, feature_b)
    elif metric == 4:
        return distance.correlation(feature_a, feature_b)
    elif metric == 5:
        return distance.cosine(feature_a, feature_b)
    elif metric == 6:
        return distance.euclidean(feature_a, feature_b)
    elif metric == 7:
        return distance.jensenshannon(feature_a, feature_b)
    elif metric == 8:
        return distance.sqeuclidean(feature_a, feature_b)
    else:
        raise ValueError("Unsupported distance metric")

def reliefF_variable_custom(df, number_of_neighbours, instances_to_select, number_of_features, distance_variables_per_feature):
    features = df.iloc[:, :-1]
    labels = df.iloc[:, -1]
    rows, columns = features.shape
    weights = np.zeros(columns)

    for i in range(instances_to_select):
        random_instance_index = np.random.randint(0, rows)
        random_instance_features = features.iloc[random_instance_index, :].values
        random_instance_label = labels[random_instance_index]

        hit_distances = np.zeros(columns)
        miss_distances = {label: np.zeros(columns) for label in np.unique(labels)}

        for other_index, (other_features, other_label) in enumerate(zip(features.values, labels)):
            if other_index == random_instance_index:
                continue

            # Calculate distances feature-wise using the specified metric for each feature
            for feature_index in range(columns):
                metric = distance_variables_per_feature[feature_index]
                dist = calcDistanceCustom([random_instance_features[feature_index]], [other_features[feature_index]], metric)

                if other_label == random_instance_label:
                    hit_distances[feature_index] += dist
                else:
                    miss_distances[other_label][feature_index] += dist

        # Update weights
        for label, miss_dist in miss_distances.items():
            # Normalize distances by the number of instances
            norm_hit_dist = hit_distances / number_of_neighbours
            norm_miss_dist = miss_dist / number_of_neighbours

            # Update weights: Increase weight for features where misses are far and hits are close
            weights += norm_miss_dist - norm_hit_dist

    # Select top features based on weights
    top_feature_indices = np.argsort(weights)[-number_of_features:]
    feature_names = df.columns[:-1][top_feature_indices]

    return feature_names


In [1607]:
import pandas as pd
import numpy as np
from scipy.spatial import distance
import statistics
from collections import Counter

def calculate_distances(feature_values):
    # Initialize lists to store distances
    print("inside calculate_distances")
    braycurtis, canberra, chebyshev, cityblock, correlation, cosine, euclidean, jensenshannon, sqeuclidean = ([] for i in range(9))
    
    # Calculate distances for each pair of instances for a single feature
    for i in range(len(feature_values)-1):
        for j in range(i+1, len(feature_values)):
            # Adapt these calculations for single feature comparisons
            braycurtis.append(distance.braycurtis([feature_values[i]], [feature_values[j]]))
            canberra.append(distance.canberra([feature_values[i]], [feature_values[j]]))
            chebyshev.append(distance.chebyshev([feature_values[i]], [feature_values[j]]))
            cityblock.append(distance.cityblock([feature_values[i]], [feature_values[j]]))
            correlation.append(distance.correlation([feature_values[i]], [feature_values[j]]))
            cosine.append(distance.cosine([feature_values[i]], [feature_values[j]]))
            euclidean.append(distance.euclidean([feature_values[i]], [feature_values[j]]))
            jensenshannon.append(distance.jensenshannon([feature_values[i]], [feature_values[j]], base=2))
            sqeuclidean.append(distance.sqeuclidean([feature_values[i]], [feature_values[j]]))
    
    return [braycurtis, canberra, chebyshev, cityblock, correlation, cosine, euclidean, jensenshannon, sqeuclidean]
    # return [braycurtis, canberra]


def find_most_variable_distance(distances):
    # Calculate standard deviation for each distance metric
    # standard_deviations = [statistics.stdev(list(d) if d else 0 for d in distances]
    # standard_deviations = [statistics.stdev(list(d)) if d else 0 for d in distances]
    # standard_deviations = [statistics.stdev(list(filter(np.isfinite, np.nditer(d)))) if d else 0 for d in distances]
    # standard_deviations = [statistics.stdev(list(filter(np.isfinite, d))) if d else 0 for d in distances]
    standard_deviations = [statistics.stdev(list(filter(np.isfinite, d))) if len(list(filter(np.isfinite, d))) >= 2 else 0 for d in distances]



    # Find the index of the maximum standard deviation
    max_std_dev_index = standard_deviations.index(max(standard_deviations))
    return max_std_dev_index

# Helper function to calculate most variable distances
def calculate_most_variable_distances(features):
    print("inside calculate_most_variable_distances")
    count = 0 

    distance_variables_per_feature = {}
    for feature_index in range(features.shape[1]):
        current_feature_values = features.iloc[:, feature_index].values
        distances = calculate_distances(current_feature_values)
        print("distances")
        
        most_variable_distance_index = find_most_variable_distance(distances)
        distance_variables_per_feature[feature_index] = most_variable_distance_index
        print(count)
        count = count + 1 
    return distance_variables_per_feature


def reliefF_variable_main_modified(df, number_of_neighbours, instances_to_select, number_of_features):
    print("inside the main function")
    
    features = df.iloc[:,:-1]
    num_features = features.shape[1]
    
    # Calculate the most variable distance metric for each feature
    distance_variables_per_feature = calculate_most_variable_distances(features)
    
    # This assumes reliefF_variable is adapted as outlined
    features_combined = []
    for i in range(10):  # Example iteration count; adjust as needed
        selected_features = reliefF_variable_custom(df, number_of_neighbours, instances_to_select, number_of_features, distance_variables_per_feature)
        features_combined += list(selected_features)
    
    # Aggregate and select top features based on frequency of selection
    features_count = Counter(features_combined)
    features_count_sorted = sorted(features_count.items(), key=lambda x: x[1], reverse=True)
    variable_distance_relieff_features = [feature for feature, count in features_count_sorted[:number_of_features]]
    
    return variable_distance_relieff_features






# Comparing Feature Selection Algorithms

## Control Function

In [1608]:
#@title
def normalize_data(dataframe):
  dataframe=(dataframe-dataframe.mean())/dataframe.std()
  dataframe=(dataframe-dataframe.min())/(dataframe.max()-dataframe.min())
  dataframe[np.isnan(dataframe)] = 0
  return dataframe

In [1609]:
#@title
def plot_roc_auc(x_test, y_test, model):
  #define metrics
  y_pred_proba = model.predict_proba(x_test)[::,1]
  y_pred_proba = normalize_data(y_pred_proba)
  fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)
  auc = metrics.roc_auc_score(y_test, y_pred_proba)
  rocauc_score = metrics.roc_auc_score(y_test, y_pred_proba)
  #print('The ROCAUC is {}'.format(rocauc_score))


  #create ROC curve
  #plt.plot(fpr,tpr,label="AUC="+str(auc))
  #plt.ylabel('True Positive Rate')
  #plt.xlabel('False Positive Rate')
  #plt.legend(loc=4)
  #plt.show()

  return rocauc_score

In [1610]:
#@title
def training_model(train, test, fold_no, model):
  x_train = train.drop(['label'],axis=1)
  y_train = train.label
  x_test = test.drop(['label'],axis=1)
  y_test = test.label
  model.fit(x_train, y_train)
  
  x_test = np.ascontiguousarray(x_test)
  y_test = np.ascontiguousarray(y_test)

  score = model.score(x_test,y_test)
  scores_list = []

  #print('For Fold {} the accuracy is {}'.format(str(fold_no),score))
  scores_list.append(score)

  f1_scores = f1_score(y_test,model.predict(x_test))
  #print('F1 score is {}'.format(f1_scores))
  scores_list.append(f1_scores)

  unique_labels = np.unique(y_test)
  predicted = model.predict(x_test)
  #y_test = list(y_test)
  #print(y_test)

  '''

  recall = []
  for k in unique_labels:
    indices = [int(i) for i, x in enumerate(y_test) if x == k]
    res_list_pred = list(map(predicted.__getitem__, indices))
    res_list_test = list(map(list(y_test).__getitem__, indices))
    recall_class = recall_score(res_list_pred, res_list_test)
    recall.append(recall_class)
  
  recall = [0.000001 if x == 0 else x for x in recall]
  geometric_mean = gmean(recall)

  '''


  geometric_mean = geometric_mean_score(y_test,model.predict(x_test))
  #print('The geometric mean is {}'.format(geometric_mean))
  scores_list.append(geometric_mean)

  roc_scores = plot_roc_auc(x_test, y_test,model)
  scores_list.append(roc_scores)

  # Calculate precision
  precision_scores = precision_score(y_test, predicted)
  scores_list.append(precision_scores)
  
    
  # Calculate recall
  recall_scores = recall_score(y_test, predicted)
  scores_list.append(recall_scores)

  return scores_list

In [1611]:
def control_function(dataframe, n_of_splits, df_name, path_name, df_scores):
  #dataframe = dataframe.reset_index()
  #dataframe = dataframe.replace(np.inf, dataframe.mean())
  #dataframe = dataframe.fillna(dataframe.mean())
  skf = StratifiedKFold(n_splits=n_of_splits)
  x = dataframe
  y = dataframe.label

  svc_model = SVC(kernel='rbf',probability=True)
# rbf 
  dtc_model = DecisionTreeClassifier(random_state = 0)
  rfc_model = RandomForestClassifier(random_state=0)
  naive_bayes_model = GaussianNB()
  gradient_boosting_model = GradientBoostingClassifier(random_state=0)
  knn_model = KNeighborsClassifier(n_neighbors=5)

  # dtc_model = DecisionTreeClassifier(random_state = 0)

  models = [svc_model, dtc_model, rfc_model, naive_bayes_model, gradient_boosting_model, knn_model]
  models_name = ['Support Vector Classifier', 'Decision Tree Classifier', 'Random Forest Classifier', 'Gaussian Naive Bayes', 'Gradient Boosting Classifier', 'K Nearest Neighbour']
  # models = [dtc_model]
  # models_name = ['Decision Tree Classifier']


  for i in range(len(models)):
    print(models_name[i], " + ", df_name, " + ", path_name, "\n")    
    fold_no = 1
    for train_index,test_index in skf.split(x, y):
      train = dataframe.iloc[train_index,:]
      test = dataframe.iloc[test_index,:]
      model_scores = training_model(train, test, fold_no, models[i])
      df_scores.loc[0 if pd.isnull(df_scores.index.max()) else df_scores.index.max() + 1] = [df_name, path_name, models_name[i], fold_no] + model_scores
      fold_no += 1
      #print("\n")

In [1612]:
def normalize_dataframe(dataframe):
  dataframe=(dataframe-dataframe.mean())/dataframe.std()
  dataframe=(dataframe-dataframe.min())/(dataframe.max()-dataframe.min())
  dataframe.fillna(0,inplace=True)
  return dataframe

In [1613]:
def main_function(path_name, df_scores, feature_names_path, count):
  df = pd.read_csv(path_name)
  print("df shapes")
  print(df.shape)
  num_rows, num_columns = df.shape
  
  # num_features = int(num_columns * 0.70)

  # number_of_neighbours = 5
  # instances_to_select = 10
  # Combination A ( Using more features) 
  num_features = int(num_columns * 0.55)

  number_of_neighbours = 7
  instances_to_select = 10



  # # Combination B ( Using more instances) 
  # num_features = int(num_columns * 0.70)

  # number_of_neighbours = 5
  # instances_to_select = 15


  # # Combination C ( Using more numder of neighbors) 
  # num_features = int(num_columns * 0.70)

  # number_of_neighbours = 8
  # instances_to_select = 10

    
  number_of_features = num_features

  
  df = normalize_dataframe(df)

  #Chi-Square
  chisquare_features = chi_square(df, number_of_features)
  df_chisquare = pd.read_csv(path_name, usecols = chisquare_features)
  df_chisquare = normalize_dataframe(df_chisquare)
  print('chi square') 
 
  

  #ReliefF
  reliefF_features = reliefF(df, number_of_neighbours, instances_to_select, number_of_features)
  df_reliefF = pd.read_csv(path_name, usecols = reliefF_features)
  df_reliefF = normalize_dataframe(df_reliefF)
  print('reliefF')
  
  

  #SVM-RFE
  svmrfe_features = svmrfe(df, number_of_features)
  df_svmrfe = pd.read_csv(path_name, usecols = svmrfe_features)
  df_svmrfe = normalize_dataframe(df_svmrfe)
  print('SVM')
 
  

  #Variable-ReliefF
  print(number_of_neighbours)
  variable_distance_relieff_features = reliefF_variable_main(df, number_of_neighbours, instances_to_select, number_of_features)
  df_variable_reliefF = pd.read_csv(path_name, usecols = variable_distance_relieff_features)
  df_variable_reliefF = normalize_dataframe(df_variable_reliefF)
  print('variable rfe') 
  
  


  #Original
  df_original = pd.read_csv(path_name).iloc[:,:-1]
  df_original = normalize_dataframe(df_original)
  print('df_original')
# adaptive_distance_metric_selection
  # Variable-ReliefF
  # variable_distance_relieffTwo_features = adaptive_distance_metric_selection(df, number_of_neighbours, instances_to_select, number_of_features)
  # df_variable_reliefFTwo = pd.read_csv(path_name, usecols = variable_distance_relieffTwo_features)
  # df_variable_reliefFTwo = normalize_dataframe(df_variable_reliefFTwo)
  # print('variable reliefFTwo') 

  variable_distance_relieffTwo_features = reliefF_variable_main_weight(df, number_of_neighbours, instances_to_select, number_of_features)
  df_variable_reliefFTwo = pd.read_csv(path_name, usecols = variable_distance_relieffTwo_features)
  df_variable_reliefFTwo = normalize_dataframe(df_variable_reliefFTwo)
  print('variable reliefFTwo') 

  features = pd.DataFrame(
    {'svmrfe': svmrfe_features,
     'chisquare': chisquare_features,
     'relief': reliefF_features,
     'variable': variable_distance_relieff_features,
     'reliefFTwo': variable_distance_relieffTwo_features,

    })
    
  # feature_names_path = feature_names_path + str(count)
  # features.to_csv(feature_names_path)

  features = pd.DataFrame(
    {
     'reliefFTwo': variable_distance_relieffTwo_features,
    })
  # features = pd.DataFrame(
  #   {
  #    'variable': variable_distance_relieff_features,
  #   })
  feature_names_path = feature_names_path + str(count)
  features.to_csv(feature_names_path)


  #Labels
  df_labels = pd.read_csv(path_name).iloc[:,-1]
  df_original['label'] = df_labels
  df_reliefF['label'] = df_labels
  df_chisquare['label'] = df_labels
  df_svmrfe['label'] = df_labels
  df_variable_reliefF['label'] = df_labels
  df_variable_reliefFTwo['label'] = df_labels


  n_of_splits = 5
  cr = 1

  df_list = [df_original, df_reliefF, df_chisquare, df_svmrfe, df_variable_reliefF,df_variable_reliefFTwo]
  df_list_name = ['Original', 'ReliefF', 'ChiSquare', 'SVMRFE', 'Variable ReliefF', 'ReliefFTwo']
  # df_list = [df_variable_reliefFTwo]
  # df_list_name = [ 'ReliefFTwo']

  for df_index in range(len(df_list)):
    print("cr")
    print(cr)
    control_function(df_list[df_index], n_of_splits, df_list_name[df_index], path_name, df_scores)
    cr = cr + 1

In [1614]:
# path_name_list = ['/content/gastroenterology.csv','/content/leukemia.csv', '/content/colon 2000.csv', '/content/DLBCL.csv', '/content/LSVT_voice_rehabilitation.csv', '/content/gastric cancer.csv',]
#path_name_list = ['/content/staDynBenignLab.csv','/content/qsar_androgen_receptor.csv','/content/qsar_oral_toxicity.csv']
# path_name_list = ['setapProcessT1_updated1.csv']
# path_name_list = ['staDynBenignLab.csv','taiwan_bank_data_updated.csv','pd_speech_data_updated.csv','aps_data_updated.csv','swarm_grouped_data_updated.csv','tuandromd_data_updated.csv']
# path_name_list = ['leukemia.csv','colon 2000.csv']

path_name_list = ['setapProcessT1_updated1.csv','setapProcessT2_updated.csv','setapProcessT3_updated.csv','setapProcessT4_updated.csv','setapProcessT5_updated.csv','setapProcessT6_updated.csv','setapProcessT7_updated.csv','setapProcessT8_updated.csv','setapProcessT9_updated.csv','setapProcessT10_updated.csv','setapProcessT11_updated.csv','period_data_updated.csv','darwin_data_updated.csv','toxicity_data_updated.csv','voice_data_updated.csv','colon 2000.csv','DLBCL.csv','gastric cancer.csv','gastroenterology.csv','leukemia.csv']
# path_name_list = ['gastric cancer.csv']
# path_name_list = ['setapProcessT6_updated.csv']
# path_name_list = ['colon 2000.csv','DLBCL.csv','gastric cancer.csv','gastroenterology.csv','leukemia.csv']
df_scores = pd.DataFrame(columns=['Algorithm','Dataset','Model','Fold Number', 'Accuracy','F1','Geometric Mean','AUC','Precision','Recall'])
# Add Precision and recall 
feature_names_path = 'featuresLow.csv'
count = 1
for path_name in path_name_list:
  main_function(path_name, df_scores,feature_names_path, count)
  

df shapes
(64, 85)
chi square
reliefF
SVM
7
7
7
10
46
1
SSSSSSSSSSSSSSSSSSS
inside the reliefF_variable func2
7
10
46
1
SSSSSSSSSSSSSSSSSSS
inside the reliefF_variable func2
7
10
46
1
SSSSSSSSSSSSSSSSSSS
inside the reliefF_variable func2
7
10
46
1
SSSSSSSSSSSSSSSSSSS
inside the reliefF_variable func2
7
10
46
1
SSSSSSSSSSSSSSSSSSS
inside the reliefF_variable func2
7
10
46
1
SSSSSSSSSSSSSSSSSSS
inside the reliefF_variable func2
7
10
46
1
SSSSSSSSSSSSSSSSSSS
inside the reliefF_variable func2
7
10
46
1
SSSSSSSSSSSSSSSSSSS
inside the reliefF_variable func2
7
10
46
1
SSSSSSSSSSSSSSSSSSS
inside the reliefF_variable func2
7
10
46
1
SSSSSSSSSSSSSSSSSSS
inside the reliefF_variable func2
variable rfe
df_original
7
7
10
46
1
inside the reliefF_variable func
7
10
46
1
inside the reliefF_variable func
7
10
46
1
inside the reliefF_variable func
7
10
46
1
inside the reliefF_variable func
7
10
46
1
inside the reliefF_variable func
7
10
46
1
inside the reliefF_variable func
7
10
46
1
inside the reliefF_

KeyboardInterrupt: 

In [None]:
df_scores.to_csv('scores_main_combinations.csv')
df_scores

In [None]:
end_time = datetime.now()
print("Total execution time:", end_time - start_time)


In [None]:
# #path_name_list = ['/content/gastroenterology.csv','/content/leukemia.csv', '/content/colon 2000.csv', '/content/DLBCL.csv', '/content/LSVT_voice_rehabilitation.csv', '/content/gastric cancer.csv',]
# # path_name_list = ['/content/staDynBenignLab.csv','/content/qsar_androgen_receptor.csv','/content/qsar_oral_toxicity.csv']
# path_name_list = ['gastroenterology.csv']
# df_scores = pd.DataFrame(columns=['Algorithm','Dataset','Model','Fold Number', 'Accuracy','F1','Geometric Mean','AUC'])
# feature_names_path = 'featuresHigh.csv'
# count = 1
# for path_name in path_name_list:
#   main_function(path_name, df_scores, feature_names_path, count)
#   count = count + 1

In [None]:
# df_scores.to_csv('h1.csv')
# df_scores