In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
from google.colab import drive
import random
import time
from enum import Enum
from sklearn.preprocessing import StandardScaler

drive.mount('/content/gdrive/', force_remount=True)

kd_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'Heart Rate', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age','class_label']
kd_data_initial = pd.read_csv('/content/gdrive/MyDrive/kidney_disease.csv', names = kd_columns )
print(kd_data_initial.shape)

ww_columns = ['Alcohol', 'Malic acid', 'Ash', 'Alkalinity of ash', 'Magnesium', 'Total phenols', 'Flavanoids', 'Nonflavanoid phenols' , 'Proanthocyanins', 'Hue', 'class_label']
ww_data_initial = pd.read_csv('/content/gdrive/MyDrive/white_wine_quality.csv',names = ww_columns)
print(ww_data_initial.shape)


Mounted at /content/gdrive/
(330, 10)
(1599, 11)


In [3]:
def shuffle_data(df):
    random.seed(0)  # Use a fixed seed for the random number generator
    df = df.sample(frac=1, random_state=0).reset_index(drop=True) 
    return df

In [4]:

kd_data = shuffle_data(kd_data_initial)
ww_data = shuffle_data(ww_data_initial)


In [5]:
#get feature columns and lable column and convert it to array
kd_x =  kd_data.drop('class_label', axis=1).to_numpy()
kd_y = kd_data.to_numpy()[:, -1].reshape(-1,1)


ww_x = ww_data.drop('class_label', axis=1).to_numpy()
ww_y = ww_data.to_numpy()[:, -1].reshape(-1,1)



In [6]:
#defined a dataframe for storing the result of model, an enum for different learning types
model_data = pd.DataFrame(columns=['model_name','description','learning_rate','iteration','weights','epsilon','elapsed_time',
                                   'is_max_reached','loss','accuracy_kfold','variable'])
learning_rate_type = Enum('lr_type', ['independent', 'iteration', 'iteration_plus_one','sample_size','ten_sample_size','hundred_sample_size'])


In [7]:
#utility functions

def train_test_split(x, y, train_size=0.8):
    num_rows = x.shape[0]
    num_rows_train = int(num_rows * train_size )
    num_rows_test = num_rows - num_rows_train

    x_train = x[:num_rows_train, :]
    x_test =  x[num_rows_train:, :]
    y_train = y[:num_rows_train]
    y_test = y[num_rows_train:]

    return x_train, y_train, x_test, y_test


#convert feature to Gaussian distribution
def log_transform_normalize(df, index):
    df_copy = df.copy()
    df_copy.iloc[:, index] = df_copy.iloc[:, index].apply(lambda x: np.nan if x <= 0 else x)
    #add small values to avoid NaN
    df_copy.iloc[:, index] = np.log(df_copy.iloc[:, index] + 1e-10) 
    mean_val = df_copy.iloc[:, index].mean()
    df_copy.iloc[:, index] = df_copy.iloc[:, index].fillna(mean_val)
    scaler = StandardScaler()
    df_copy.iloc[:, index] = scaler.fit_transform(df_copy.iloc[:, index].values.reshape(-1, 1))
    return df_copy


#convert feature to the power of feature
def power_n_feature(df, index, power_number):
    df_copy = df.copy()
    df_copy.iloc[:, index] = df_copy.iloc[:, index].apply(lambda x: x ** power_number)
    return df_copy



In [8]:
class Logistic_Regression:

    def __init__(self, learning_rate , learning_rate_type , max_iterations , epsilon):
        self.x = []
        self.y = []
        self.weights = []
        self.learning_rate = learning_rate
        self.learning_rate_type = learning_rate_type
        self.max_iterations = max_iterations
        self.epsilon = epsilon


    def fit(self,x, y,is_add_bias = False):
        self.x = x;
        self.y = y;
        loss_list = []
        t_start = time.time()
        n, m = self.x.shape 
        is_max_reached = False

        #add bias/dummy feature
        #initialize weights with zero
        if(is_add_bias == True):
          bias = np.ones((n,1), dtype=np.double)
          self.x = np.append(self.x, bias, axis = 1)
          self.weights = np.zeros(((m+1),1))
        else:
          self.weights = np.zeros(((m),1))


        #define initial norm value 
        norm_weights = 1e8
        iteration = 1

        while (iteration < self.max_iterations) & (norm_weights > self.epsilon):
          #if(iteration%10000 == 0):
            #print('iteration number:',iteration)

          if(self.learning_rate_type == learning_rate_type.iteration):
              self.learning_rate = (1/ iteration)
          elif(self.learning_rate_type == learning_rate_type.iteration_plus_one):
              self.learning_rate = (1/ (1 + iteration))
          elif(self.learning_rate_type == learning_rate_type.sample_size):
              self.learning_rate = (1/ (1 + n))
          elif(self.learning_rate_type == learning_rate_type.ten_sample_size):
              self.learning_rate = (1/ (10 * n))
          elif(self.learning_rate_type == learning_rate_type.hundred_sample_size):
              self.learning_rate = (1/ (100 * n))
   
          #if(iteration == 15):
           # print("learning rate type:",self.learning_rate_type)
           # print("learning rate:",self.learning_rate)

          if iteration % 100 == 0: 
            loss_model = self.cross_entropy_loss(self.x, self.y);
            loss_list.append((iteration, loss_model))

          # Store current weights before updating
          weight_previous = self.weights
         
          # Compute gradient
          gradient =  np.sum(
          self.x * (self.y - self.sigmoid(np.dot(self.x, weight_previous))), axis=0
          ).reshape(-1, 1)

          # Update weights
          self.weights = weight_previous + self.learning_rate * gradient

          # Compute change in weights
          norm_weights = np.linalg.norm(self.weights - weight_previous) ** 2    
          iteration += 1  

        if(iteration == self.max_iterations):
         print (f"**************failed to reach minimum in {self.max_iterations} iterations")
         is_max_reached = True

        
        t_end = time.time()

        #time elapsed for model training
        elapsed_time = round(t_end - t_start,3)
        return iteration,self.weights, elapsed_time,is_max_reached,loss_list
        
      
     ############################################# end fit
    # Decision boundary(threshold)
    def predict(self):
      decision_boundary = 0.5  
      y_predict = self.sigmoid(np.dot(self.x, self.weights))
      y_pred = np.where(y_predict < decision_boundary, 0, 1)
      return y_pred
    
    #compute accuracy of model 
    def accu_eval(self, y_pred):
      accuracy = np.count_nonzero(self.y == y_pred) / len(self.y)
      return accuracy
    

    def sigmoid(self, arg):
        return 1 / (1 + np.exp(-arg))

    #compute cross entropy loss
    def cross_entropy_loss(self, x_data, y_data):

        y_pred_0 = self.sigmoid(np.dot(x_data,self.weights))
        y_pred_1 = 1 - y_pred_0
        # Replace small values to avoid NAN (log0)
        y_pred_0 = np.where(y_pred_0 < 1e-6, 1e-6, y_pred_0)
        y_pred_1 = np.where(y_pred_1 < 1e-6, 1e-6, y_pred_1)

        loss_0 = y_data * np.log(y_pred_0)
        loss_1 = (1-y_data) * np.log(y_pred_1)
        loss = -np.sum(loss_0 + loss_1)
        return loss




In [9]:
#k fold cross validation function
def kfold_cross_validation(lgr_model , k = 10, x_train_initial = [] ,y_train_initial= [] ):

  partition_size = int(len(x_train_initial)/k) 

  model_accuracy_list = []
  model_loss_list = []

  for i in range(k):
    print("i======>",i)


    # Split data
    i_start = partition_size * i
    i_end = partition_size*(i+1)

    if i != (k-1):
      x_train_fold = np.concatenate((x_train_initial[:i_start,:], x_train_initial[i_end:,:]),axis=0)
      y_train_fold = np.concatenate((y_train_initial[:i_start,:], y_train_initial[i_end:,:]),axis=0)
      x_validation_fold = x_train_initial[i_start:i_end,:]
      y_validation_fold = y_train_initial[i_start:i_end,:]
    
    else:
      # For final partition
      x_train_fold = lgr_model.x[:i_start,:]
      y_train_fold = lgr_model.y[:i_start,:]
      x_validation_fold = lgr_model.x[i_start:,:]
      y_validation_fold = lgr_model.y[i_start:,:]


    iteration, weight_store , epalsed_time_one , is_max_reached , loss_list = lgr_model.fit(x_train_fold,y_train_fold,False)
    y_predict = lgr_model.predict()

    model_accuracy = lgr_model.accu_eval(y_predict)
    model_accuracy_list.append(model_accuracy)
    cross_entropy = lgr_model.cross_entropy_loss(x_validation_fold,y_validation_fold)
    model_loss_list.append(cross_entropy)
    #print("model_accuracy:",model_accuracy)
    #return model_accuracy
  return np.mean(model_accuracy_list),np.mean(model_loss_list)

   

In [10]:
#define a function that gets the weights and runs k-fold algorithm,then stores it in model data
def run_model(model_name , description,learning_rate , learning_rate_type, max_iterations , epsilon,  x_train ,   y_train , model_data , variable):

  model = Logistic_Regression(learning_rate = learning_rate, learning_rate_type = learning_rate_type , max_iterations = max_iterations,epsilon = epsilon)
  model_iteration_num , model_weights  , model_elapsed_time , is_max_iteration_reached, loss_list  = model.fit(x_train,y_train, True)
  model_accuracy_kfold,model_loss_kfold = kfold_cross_validation(model, k=10,x_train_initial = model.x , y_train_initial =  model.y)

  final_learning_rate = model.learning_rate if  learning_rate_type == learning_rate_type.independent else learning_rate_type

  model_data = model_data.append({'model_name':model_name,'description':description,'learning_rate' : final_learning_rate,
                                  'iteration' : model_iteration_num,'weights' : model_weights,'epsilon':model.epsilon,
                                  'elapsed_time':model_elapsed_time, 'is_max_reached': is_max_iteration_reached, 'loss':model_loss_kfold,
                              'accuracy_kfold':model_accuracy_kfold , 'variable':variable}, ignore_index = True)
  return model_data



In [11]:
kd_x_train, kd_y_train, kd_x_test, kd_y_test = train_test_split(kd_x,kd_y,1)
print(kd_x_train.shape)
print(kd_y_train.shape)

ww_x_train, ww_y_train, ww_x_test, ww_y_test = train_test_split(ww_x,ww_y,1)


(330, 9)
(330, 1)


In [12]:
#function sort dataframe by accuracy and delete dataframe
def show_sorted_model(data):
  return data.sort_values(by=['accuracy_kfold'], ascending=False)
def delete_model(data):
  return data.drop(model_data.index,inplace=True) 
def delete_last_model(data,number):
  return data.drop(data.tail(number).index,inplace=True)

In [13]:
#delete_model(model_data)
show_sorted_model(model_data)
#print(kd_x_train.shape)
#delete_model(model_data)

Unnamed: 0,model_name,description,learning_rate,iteration,weights,epsilon,elapsed_time,is_max_reached,loss,accuracy_kfold,variable


In [14]:
#train whole model
model_data = run_model('kd','whole model',learning_rate = 0.01, learning_rate_type = learning_rate_type.independent ,max_iterations = 150000,
                       epsilon = 1e-6,x_train = kd_x_train , y_train = kd_y_train , model_data = model_data,variable = 'all features')
show_sorted_model(model_data)



Unnamed: 0,model_name,description,learning_rate,iteration,weights,epsilon,elapsed_time,is_max_reached,loss,accuracy_kfold,variable
0,kd,whole model,0.01,956,"[[2.5819277747484066], [6.780007402086465], [-...",1e-06,0.049,False,15.855793,0.749495,all features


In [15]:
#explore different constant learning rates
learning_rates = [1/2,1/4,1/8,0.05]
for i in range(len(learning_rates)):
  title = 'whole model-lr:' + str(learning_rates[i])
  model_data = run_model('kd',title,learning_rate =  learning_rates[i], learning_rate_type = learning_rate_type.independent ,max_iterations = 150000,
                       epsilon = 1e-6,x_train = kd_x_train , y_train = kd_y_train , model_data = model_data,variable = 'learning rate')
show_sorted_model(model_data)

**************failed to reach minimum in 150000 iterations
**************failed to reach minimum in 150000 iterations
**************failed to reach minimum in 150000 iterations
**************failed to reach minimum in 150000 iterations
**************failed to reach minimum in 150000 iterations
**************failed to reach minimum in 150000 iterations
**************failed to reach minimum in 150000 iterations
**************failed to reach minimum in 150000 iterations
**************failed to reach minimum in 150000 iterations
**************failed to reach minimum in 150000 iterations
**************failed to reach minimum in 150000 iterations
**************failed to reach minimum in 150000 iterations
**************failed to reach minimum in 150000 iterations
**************failed to reach minimum in 150000 iterations
**************failed to reach minimum in 150000 iterations
**************failed to reach minimum in 150000 iterations
**************failed to reach minimum in 150000 iteratio

Unnamed: 0,model_name,description,learning_rate,iteration,weights,epsilon,elapsed_time,is_max_reached,loss,accuracy_kfold,variable
0,kd,whole model,0.01,956,"[[2.5819277747484066], [6.780007402086465], [-...",1e-06,0.049,False,15.855793,0.749495,all features
4,kd,whole model-lr:0.05,0.05,150000,"[[9.970728371948313], [26.491694165903258], [-...",1e-06,6.472,True,46.838322,0.688552,learning rate
2,kd,whole model-lr:0.25,0.25,150000,"[[47.64433276868854], [130.6790054571249], [-1...",1e-06,6.5,True,106.221172,0.683502,learning rate
1,kd,whole model-lr:0.5,0.5,150000,"[[95.81933692733199], [262.7825656994754], [-3...",1e-06,8.592,True,118.189617,0.683165,learning rate
3,kd,whole model-lr:0.125,0.125,150000,"[[24.070810464619214], [65.46666343519061], [-...",1e-06,8.383,True,87.970034,0.683165,learning rate


In [16]:
#explore different dependent learning rates
learning_rates_types = [learning_rate_type.iteration, learning_rate_type.iteration_plus_one,learning_rate_type.sample_size,
                        learning_rate_type.ten_sample_size,learning_rate_type.hundred_sample_size]
for i in range(len(learning_rates_types)):
  title = 'whole model-lr:' + str(learning_rates_types[i])
  model_data = run_model('kd',title,learning_rate =  0, learning_rate_type = learning_rates_types[i] ,max_iterations = 150000,
                       epsilon = 1e-6,x_train = kd_x_train , y_train = kd_y_train , model_data = model_data,variable = 'learning rate')
show_sorted_model(model_data)



Unnamed: 0,model_name,description,learning_rate,iteration,weights,epsilon,elapsed_time,is_max_reached,loss,accuracy_kfold,variable
6,kd,whole model-lr:lr_type.iteration_plus_one,lr_type.iteration_plus_one,1476,"[[3.175241989157885], [9.517996847051375], [-1...",1e-06,0.105,False,16.436353,0.751515,learning rate
5,kd,whole model-lr:lr_type.iteration,lr_type.iteration,2220,"[[3.6602142620593963], [11.56162794637247], [-...",1e-06,0.162,False,17.234261,0.750505,learning rate
0,kd,whole model,0.01,956,"[[2.5819277747484066], [6.780007402086465], [-...",1e-06,0.049,False,15.855793,0.749495,all features
7,kd,whole model-lr:lr_type.sample_size,lr_type.sample_size,2048,"[[2.5011580589018196], [6.514304926750582], [-...",1e-06,0.155,False,15.84957,0.749495,learning rate
8,kd,whole model-lr:lr_type.ten_sample_size,lr_type.ten_sample_size,3516,"[[1.8917011563673924], [3.7425174840678586], [...",1e-06,0.266,False,16.462365,0.741414,learning rate
4,kd,whole model-lr:0.05,0.05,150000,"[[9.970728371948313], [26.491694165903258], [-...",1e-06,6.472,True,46.838322,0.688552,learning rate
2,kd,whole model-lr:0.25,0.25,150000,"[[47.64433276868854], [130.6790054571249], [-1...",1e-06,6.5,True,106.221172,0.683502,learning rate
1,kd,whole model-lr:0.5,0.5,150000,"[[95.81933692733199], [262.7825656994754], [-3...",1e-06,8.592,True,118.189617,0.683165,learning rate
3,kd,whole model-lr:0.125,0.125,150000,"[[24.070810464619214], [65.46666343519061], [-...",1e-06,8.383,True,87.970034,0.683165,learning rate
9,kd,whole model-lr:lr_type.hundred_sample_size,lr_type.hundred_sample_size,2,"[[0.0003065942121212121], [0.00038259484848484...",1e-06,0.0,False,20.585959,0.516835,learning rate


In [17]:
#explore different epsilons 
epsilon_list = [1e-2,1e-3,1e-4,1e-5,1e-6,1e-7,1e-8,1e-9]
for i in range(len(epsilon_list)):
  title = 'whole model-epsilon:' + str(epsilon_list[i])
  model_data = run_model('kd',title,learning_rate =  0, learning_rate_type = learning_rate_type.iteration_plus_one ,max_iterations = 150000,
                       epsilon = epsilon_list[i],x_train = kd_x_train , y_train = kd_y_train , model_data = model_data,variable = 'epsilon')
show_sorted_model(model_data)



Unnamed: 0,model_name,description,learning_rate,iteration,weights,epsilon,elapsed_time,is_max_reached,loss,accuracy_kfold,variable
16,kd,whole model-epsilon:1e-08,lr_type.iteration_plus_one,9021,"[[2.96533714092659], [8.33217514426845], [-1.0...",1e-08,0.413,False,16.097559,0.751515,epsilon
14,kd,whole model-epsilon:1e-06,lr_type.iteration_plus_one,1476,"[[3.175241989157885], [9.517996847051375], [-1...",1e-06,0.072,False,16.436353,0.751515,epsilon
6,kd,whole model-lr:lr_type.iteration_plus_one,lr_type.iteration_plus_one,1476,"[[3.175241989157885], [9.517996847051375], [-1...",1e-06,0.105,False,16.436353,0.751515,learning rate
15,kd,whole model-epsilon:1e-07,lr_type.iteration_plus_one,3643,"[[3.0601381991207917], [8.83862515342757], [-1...",1e-07,0.166,False,16.225842,0.751178,epsilon
17,kd,whole model-epsilon:1e-09,lr_type.iteration_plus_one,22164,"[[2.887385852612343], [7.9553323804450375], [-...",1e-09,0.94,False,16.019264,0.750842,epsilon
5,kd,whole model-lr:lr_type.iteration,lr_type.iteration,2220,"[[3.6602142620593963], [11.56162794637247], [-...",1e-06,0.162,False,17.234261,0.750505,learning rate
13,kd,whole model-epsilon:1e-05,lr_type.iteration_plus_one,636,"[[3.3327577160627104], [10.39377495841507], [-...",1e-05,0.027,False,16.77591,0.749495,epsilon
0,kd,whole model,0.01,956,"[[2.5819277747484066], [6.780007402086465], [-...",1e-06,0.049,False,15.855793,0.749495,all features
7,kd,whole model-lr:lr_type.sample_size,lr_type.sample_size,2048,"[[2.5011580589018196], [6.514304926750582], [-...",1e-06,0.155,False,15.84957,0.749495,learning rate
12,kd,whole model-epsilon:0.0001,lr_type.iteration_plus_one,314,"[[3.6109084400585387], [11.42702526345321], [-...",0.0001,0.013,False,17.369442,0.747475,epsilon


In [18]:
show_sorted_model(model_data)

Unnamed: 0,model_name,description,learning_rate,iteration,weights,epsilon,elapsed_time,is_max_reached,loss,accuracy_kfold,variable
16,kd,whole model-epsilon:1e-08,lr_type.iteration_plus_one,9021,"[[2.96533714092659], [8.33217514426845], [-1.0...",1e-08,0.413,False,16.097559,0.751515,epsilon
14,kd,whole model-epsilon:1e-06,lr_type.iteration_plus_one,1476,"[[3.175241989157885], [9.517996847051375], [-1...",1e-06,0.072,False,16.436353,0.751515,epsilon
6,kd,whole model-lr:lr_type.iteration_plus_one,lr_type.iteration_plus_one,1476,"[[3.175241989157885], [9.517996847051375], [-1...",1e-06,0.105,False,16.436353,0.751515,learning rate
15,kd,whole model-epsilon:1e-07,lr_type.iteration_plus_one,3643,"[[3.0601381991207917], [8.83862515342757], [-1...",1e-07,0.166,False,16.225842,0.751178,epsilon
17,kd,whole model-epsilon:1e-09,lr_type.iteration_plus_one,22164,"[[2.887385852612343], [7.9553323804450375], [-...",1e-09,0.94,False,16.019264,0.750842,epsilon
5,kd,whole model-lr:lr_type.iteration,lr_type.iteration,2220,"[[3.6602142620593963], [11.56162794637247], [-...",1e-06,0.162,False,17.234261,0.750505,learning rate
13,kd,whole model-epsilon:1e-05,lr_type.iteration_plus_one,636,"[[3.3327577160627104], [10.39377495841507], [-...",1e-05,0.027,False,16.77591,0.749495,epsilon
0,kd,whole model,0.01,956,"[[2.5819277747484066], [6.780007402086465], [-...",1e-06,0.049,False,15.855793,0.749495,all features
7,kd,whole model-lr:lr_type.sample_size,lr_type.sample_size,2048,"[[2.5011580589018196], [6.514304926750582], [-...",1e-06,0.155,False,15.84957,0.749495,learning rate
12,kd,whole model-epsilon:0.0001,lr_type.iteration_plus_one,314,"[[3.6109084400585387], [11.42702526345321], [-...",0.0001,0.013,False,17.369442,0.747475,epsilon


In [19]:
show_sorted_model(model_data)

Unnamed: 0,model_name,description,learning_rate,iteration,weights,epsilon,elapsed_time,is_max_reached,loss,accuracy_kfold,variable
16,kd,whole model-epsilon:1e-08,lr_type.iteration_plus_one,9021,"[[2.96533714092659], [8.33217514426845], [-1.0...",1e-08,0.413,False,16.097559,0.751515,epsilon
14,kd,whole model-epsilon:1e-06,lr_type.iteration_plus_one,1476,"[[3.175241989157885], [9.517996847051375], [-1...",1e-06,0.072,False,16.436353,0.751515,epsilon
6,kd,whole model-lr:lr_type.iteration_plus_one,lr_type.iteration_plus_one,1476,"[[3.175241989157885], [9.517996847051375], [-1...",1e-06,0.105,False,16.436353,0.751515,learning rate
15,kd,whole model-epsilon:1e-07,lr_type.iteration_plus_one,3643,"[[3.0601381991207917], [8.83862515342757], [-1...",1e-07,0.166,False,16.225842,0.751178,epsilon
17,kd,whole model-epsilon:1e-09,lr_type.iteration_plus_one,22164,"[[2.887385852612343], [7.9553323804450375], [-...",1e-09,0.94,False,16.019264,0.750842,epsilon
5,kd,whole model-lr:lr_type.iteration,lr_type.iteration,2220,"[[3.6602142620593963], [11.56162794637247], [-...",1e-06,0.162,False,17.234261,0.750505,learning rate
13,kd,whole model-epsilon:1e-05,lr_type.iteration_plus_one,636,"[[3.3327577160627104], [10.39377495841507], [-...",1e-05,0.027,False,16.77591,0.749495,epsilon
0,kd,whole model,0.01,956,"[[2.5819277747484066], [6.780007402086465], [-...",1e-06,0.049,False,15.855793,0.749495,all features
7,kd,whole model-lr:lr_type.sample_size,lr_type.sample_size,2048,"[[2.5011580589018196], [6.514304926750582], [-...",1e-06,0.155,False,15.84957,0.749495,learning rate
12,kd,whole model-epsilon:0.0001,lr_type.iteration_plus_one,314,"[[3.6109084400585387], [11.42702526345321], [-...",0.0001,0.013,False,17.369442,0.747475,epsilon


In [20]:
kd_x_train_np = pd.DataFrame(kd_x_train)

In [21]:
#explore different logs of features
for i in range(kd_x_train.shape[1]):
    print (i,',log , column=>',kd_columns[i])
    kd_x_train_modified = log_transform_normalize(kd_x_train_np,i)
    title = 'log {}'.format(kd_columns[i])
    model_data = run_model('kd',title,learning_rate = 0, learning_rate_type = learning_rate_type.iteration_plus_one ,max_iterations = 150000,
                       epsilon = 1e-6,x_train = kd_x_train_modified , y_train = kd_y_train , model_data = model_data,variable = 'log')
show_sorted_model(model_data)

0 ,log , column=> Pregnancies
1 ,log , column=> Glucose
2 ,log , column=> BloodPressure
3 ,log , column=> Heart Rate
4 ,log , column=> SkinThickness
5 ,log , column=> Insulin
6 ,log , column=> BMI
7 ,log , column=> DiabetesPedigreeFunction
8 ,log , column=> Age


Unnamed: 0,model_name,description,learning_rate,iteration,weights,epsilon,elapsed_time,is_max_reached,loss,accuracy_kfold,variable
23,kd,log Insulin,lr_type.iteration_plus_one,802,"[[2.686500388124719], [8.005602930239544], [-0...",1e-06,0.037,False,15.82128,0.755556,log
25,kd,log DiabetesPedigreeFunction,lr_type.iteration_plus_one,1118,"[[3.1317646256736045], [8.935591338567065], [-...",1e-06,0.051,False,16.145685,0.751852,log
26,kd,log Age,lr_type.iteration_plus_one,1425,"[[3.2317379647520714], [9.47235968785085], [-1...",1e-06,0.07,False,16.404716,0.751515,log
14,kd,whole model-epsilon:1e-06,lr_type.iteration_plus_one,1476,"[[3.175241989157885], [9.517996847051375], [-1...",1e-06,0.072,False,16.436353,0.751515,epsilon
6,kd,whole model-lr:lr_type.iteration_plus_one,lr_type.iteration_plus_one,1476,"[[3.175241989157885], [9.517996847051375], [-1...",1e-06,0.105,False,16.436353,0.751515,learning rate
16,kd,whole model-epsilon:1e-08,lr_type.iteration_plus_one,9021,"[[2.96533714092659], [8.33217514426845], [-1.0...",1e-08,0.413,False,16.097559,0.751515,epsilon
15,kd,whole model-epsilon:1e-07,lr_type.iteration_plus_one,3643,"[[3.0601381991207917], [8.83862515342757], [-1...",1e-07,0.166,False,16.225842,0.751178,epsilon
17,kd,whole model-epsilon:1e-09,lr_type.iteration_plus_one,22164,"[[2.887385852612343], [7.9553323804450375], [-...",1e-09,0.94,False,16.019264,0.750842,epsilon
5,kd,whole model-lr:lr_type.iteration,lr_type.iteration,2220,"[[3.6602142620593963], [11.56162794637247], [-...",1e-06,0.162,False,17.234261,0.750505,learning rate
18,kd,log Pregnancies,lr_type.iteration_plus_one,1129,"[[0.47048312539781034], [8.606990965621858], [...",1e-06,0.048,False,16.526437,0.750168,log


In [22]:
#combine log insulin-age
kd_x_train_log_insulin = log_transform_normalize(kd_x_train_np,5)
kd_x_train_logai = log_transform_normalize(kd_x_train_log_insulin,8)

model_data = run_model('kd','log age-insulin',learning_rate = 0, learning_rate_type = learning_rate_type.iteration_plus_one ,max_iterations = 150000,
                       epsilon = 1e-6,x_train = kd_x_train_logai , y_train = kd_y_train , model_data = model_data,variable = 'log')
show_sorted_model(model_data)



Unnamed: 0,model_name,description,learning_rate,iteration,weights,epsilon,elapsed_time,is_max_reached,loss,accuracy_kfold,variable
27,kd,log age-insulin,lr_type.iteration_plus_one,836,"[[2.7683606646024232], [8.122909449757739], [-...",1e-06,0.035,False,15.737679,0.757576,log
23,kd,log Insulin,lr_type.iteration_plus_one,802,"[[2.686500388124719], [8.005602930239544], [-0...",1e-06,0.037,False,15.82128,0.755556,log
25,kd,log DiabetesPedigreeFunction,lr_type.iteration_plus_one,1118,"[[3.1317646256736045], [8.935591338567065], [-...",1e-06,0.051,False,16.145685,0.751852,log
6,kd,whole model-lr:lr_type.iteration_plus_one,lr_type.iteration_plus_one,1476,"[[3.175241989157885], [9.517996847051375], [-1...",1e-06,0.105,False,16.436353,0.751515,learning rate
26,kd,log Age,lr_type.iteration_plus_one,1425,"[[3.2317379647520714], [9.47235968785085], [-1...",1e-06,0.07,False,16.404716,0.751515,log
16,kd,whole model-epsilon:1e-08,lr_type.iteration_plus_one,9021,"[[2.96533714092659], [8.33217514426845], [-1.0...",1e-08,0.413,False,16.097559,0.751515,epsilon
14,kd,whole model-epsilon:1e-06,lr_type.iteration_plus_one,1476,"[[3.175241989157885], [9.517996847051375], [-1...",1e-06,0.072,False,16.436353,0.751515,epsilon
15,kd,whole model-epsilon:1e-07,lr_type.iteration_plus_one,3643,"[[3.0601381991207917], [8.83862515342757], [-1...",1e-07,0.166,False,16.225842,0.751178,epsilon
17,kd,whole model-epsilon:1e-09,lr_type.iteration_plus_one,22164,"[[2.887385852612343], [7.9553323804450375], [-...",1e-09,0.94,False,16.019264,0.750842,epsilon
5,kd,whole model-lr:lr_type.iteration,lr_type.iteration,2220,"[[3.6602142620593963], [11.56162794637247], [-...",1e-06,0.162,False,17.234261,0.750505,learning rate


In [23]:
#combine log DiabetesPedigreeFunction
kd_x_train_logaid = log_transform_normalize(kd_x_train_logai,7)

model_data = run_model('kd','log age-insulin-Dpf',learning_rate = 0, learning_rate_type = learning_rate_type.iteration_plus_one ,max_iterations = 150000,
                       epsilon = 1e-6,x_train = kd_x_train_logaid , y_train = kd_y_train , model_data = model_data,variable = 'log')
show_sorted_model(model_data)



Unnamed: 0,model_name,description,learning_rate,iteration,weights,epsilon,elapsed_time,is_max_reached,loss,accuracy_kfold,variable
27,kd,log age-insulin,lr_type.iteration_plus_one,836,"[[2.7683606646024232], [8.122909449757739], [-...",1e-06,0.035,False,15.737679,0.757576,log
28,kd,log age-insulin-Dpf,lr_type.iteration_plus_one,742,"[[2.81722351586145], [8.11597903143088], [-0.8...",1e-06,0.036,False,15.629731,0.756902,log
23,kd,log Insulin,lr_type.iteration_plus_one,802,"[[2.686500388124719], [8.005602930239544], [-0...",1e-06,0.037,False,15.82128,0.755556,log
25,kd,log DiabetesPedigreeFunction,lr_type.iteration_plus_one,1118,"[[3.1317646256736045], [8.935591338567065], [-...",1e-06,0.051,False,16.145685,0.751852,log
16,kd,whole model-epsilon:1e-08,lr_type.iteration_plus_one,9021,"[[2.96533714092659], [8.33217514426845], [-1.0...",1e-08,0.413,False,16.097559,0.751515,epsilon
26,kd,log Age,lr_type.iteration_plus_one,1425,"[[3.2317379647520714], [9.47235968785085], [-1...",1e-06,0.07,False,16.404716,0.751515,log
14,kd,whole model-epsilon:1e-06,lr_type.iteration_plus_one,1476,"[[3.175241989157885], [9.517996847051375], [-1...",1e-06,0.072,False,16.436353,0.751515,epsilon
6,kd,whole model-lr:lr_type.iteration_plus_one,lr_type.iteration_plus_one,1476,"[[3.175241989157885], [9.517996847051375], [-1...",1e-06,0.105,False,16.436353,0.751515,learning rate
15,kd,whole model-epsilon:1e-07,lr_type.iteration_plus_one,3643,"[[3.0601381991207917], [8.83862515342757], [-1...",1e-07,0.166,False,16.225842,0.751178,epsilon
17,kd,whole model-epsilon:1e-09,lr_type.iteration_plus_one,22164,"[[2.887385852612343], [7.9553323804450375], [-...",1e-09,0.94,False,16.019264,0.750842,epsilon


In [24]:
#combine log bmi-skintickness
kd_x_train_log_skin = log_transform_normalize(kd_x_train_np,4)
kd_x_train_skinbmi = log_transform_normalize(kd_x_train_log_skin,6)

model_data = run_model('kd','log skin-bmi',learning_rate = 0, learning_rate_type = learning_rate_type.iteration_plus_one ,max_iterations = 150000,
                       epsilon = 1e-6,x_train = kd_x_train_skinbmi , y_train = kd_y_train , model_data = model_data,variable = 'log')
show_sorted_model(model_data)



Unnamed: 0,model_name,description,learning_rate,iteration,weights,epsilon,elapsed_time,is_max_reached,loss,accuracy_kfold,variable
27,kd,log age-insulin,lr_type.iteration_plus_one,836,"[[2.7683606646024232], [8.122909449757739], [-...",1e-06,0.035,False,15.737679,0.757576,log
28,kd,log age-insulin-Dpf,lr_type.iteration_plus_one,742,"[[2.81722351586145], [8.11597903143088], [-0.8...",1e-06,0.036,False,15.629731,0.756902,log
23,kd,log Insulin,lr_type.iteration_plus_one,802,"[[2.686500388124719], [8.005602930239544], [-0...",1e-06,0.037,False,15.82128,0.755556,log
25,kd,log DiabetesPedigreeFunction,lr_type.iteration_plus_one,1118,"[[3.1317646256736045], [8.935591338567065], [-...",1e-06,0.051,False,16.145685,0.751852,log
26,kd,log Age,lr_type.iteration_plus_one,1425,"[[3.2317379647520714], [9.47235968785085], [-1...",1e-06,0.07,False,16.404716,0.751515,log
6,kd,whole model-lr:lr_type.iteration_plus_one,lr_type.iteration_plus_one,1476,"[[3.175241989157885], [9.517996847051375], [-1...",1e-06,0.105,False,16.436353,0.751515,learning rate
16,kd,whole model-epsilon:1e-08,lr_type.iteration_plus_one,9021,"[[2.96533714092659], [8.33217514426845], [-1.0...",1e-08,0.413,False,16.097559,0.751515,epsilon
14,kd,whole model-epsilon:1e-06,lr_type.iteration_plus_one,1476,"[[3.175241989157885], [9.517996847051375], [-1...",1e-06,0.072,False,16.436353,0.751515,epsilon
15,kd,whole model-epsilon:1e-07,lr_type.iteration_plus_one,3643,"[[3.0601381991207917], [8.83862515342757], [-1...",1e-07,0.166,False,16.225842,0.751178,epsilon
17,kd,whole model-epsilon:1e-09,lr_type.iteration_plus_one,22164,"[[2.887385852612343], [7.9553323804450375], [-...",1e-09,0.94,False,16.019264,0.750842,epsilon


In [25]:
model_data = run_model('kd','log age-insulin',learning_rate = 0, learning_rate_type = learning_rate_type.iteration_plus_one ,max_iterations = 150000,
                       epsilon = 1e-6,x_train = kd_x_train_logai , y_train = kd_y_train , model_data = model_data,variable = 'log')
show_sorted_model(model_data)





Unnamed: 0,model_name,description,learning_rate,iteration,weights,epsilon,elapsed_time,is_max_reached,loss,accuracy_kfold,variable
30,kd,log age-insulin,lr_type.iteration_plus_one,836,"[[2.7683606646024232], [8.122909449757739], [-...",1e-06,0.049,False,15.737679,0.757576,log
27,kd,log age-insulin,lr_type.iteration_plus_one,836,"[[2.7683606646024232], [8.122909449757739], [-...",1e-06,0.035,False,15.737679,0.757576,log
28,kd,log age-insulin-Dpf,lr_type.iteration_plus_one,742,"[[2.81722351586145], [8.11597903143088], [-0.8...",1e-06,0.036,False,15.629731,0.756902,log
23,kd,log Insulin,lr_type.iteration_plus_one,802,"[[2.686500388124719], [8.005602930239544], [-0...",1e-06,0.037,False,15.82128,0.755556,log
25,kd,log DiabetesPedigreeFunction,lr_type.iteration_plus_one,1118,"[[3.1317646256736045], [8.935591338567065], [-...",1e-06,0.051,False,16.145685,0.751852,log
14,kd,whole model-epsilon:1e-06,lr_type.iteration_plus_one,1476,"[[3.175241989157885], [9.517996847051375], [-1...",1e-06,0.072,False,16.436353,0.751515,epsilon
26,kd,log Age,lr_type.iteration_plus_one,1425,"[[3.2317379647520714], [9.47235968785085], [-1...",1e-06,0.07,False,16.404716,0.751515,log
6,kd,whole model-lr:lr_type.iteration_plus_one,lr_type.iteration_plus_one,1476,"[[3.175241989157885], [9.517996847051375], [-1...",1e-06,0.105,False,16.436353,0.751515,learning rate
16,kd,whole model-epsilon:1e-08,lr_type.iteration_plus_one,9021,"[[2.96533714092659], [8.33217514426845], [-1.0...",1e-08,0.413,False,16.097559,0.751515,epsilon
15,kd,whole model-epsilon:1e-07,lr_type.iteration_plus_one,3643,"[[3.0601381991207917], [8.83862515342757], [-1...",1e-07,0.166,False,16.225842,0.751178,epsilon


In [26]:
#check for removing variables
for i in range(kd_x_train_logai.shape[1]):
    print (i,',column=>',kd_columns[i])
    kd_x_train_modified = np.delete(kd_x_train_logai.to_numpy(), [i], 1)
    title = 'log age-insulin-no {}'.format(kd_columns[i])
    model_data = run_model('kd',title,learning_rate = 0, learning_rate_type = learning_rate_type.iteration_plus_one ,max_iterations = 150000,
                       epsilon = 1e-6,x_train = kd_x_train_modified , y_train = kd_y_train , model_data = model_data,variable = 'feature removal')
show_sorted_model(model_data)


0 ,column=> Pregnancies
1 ,column=> Glucose
2 ,column=> BloodPressure
3 ,column=> Heart Rate
4 ,column=> SkinThickness
5 ,column=> Insulin
6 ,column=> BMI
7 ,column=> DiabetesPedigreeFunction
8 ,column=> Age


Unnamed: 0,model_name,description,learning_rate,iteration,weights,epsilon,elapsed_time,is_max_reached,loss,accuracy_kfold,variable
34,kd,log age-insulin-no Heart Rate,lr_type.iteration_plus_one,813,"[[2.7129340471248065], [8.273684993858577], [-...",1e-06,0.062,False,15.717127,0.759933,feature removal
27,kd,log age-insulin,lr_type.iteration_plus_one,836,"[[2.7683606646024232], [8.122909449757739], [-...",1e-06,0.035,False,15.737679,0.757576,log
30,kd,log age-insulin,lr_type.iteration_plus_one,836,"[[2.7683606646024232], [8.122909449757739], [-...",1e-06,0.049,False,15.737679,0.757576,log
28,kd,log age-insulin-Dpf,lr_type.iteration_plus_one,742,"[[2.81722351586145], [8.11597903143088], [-0.8...",1e-06,0.036,False,15.629731,0.756902,log
23,kd,log Insulin,lr_type.iteration_plus_one,802,"[[2.686500388124719], [8.005602930239544], [-0...",1e-06,0.037,False,15.82128,0.755556,log
38,kd,log age-insulin-no DiabetesPedigreeFunction,lr_type.iteration_plus_one,995,"[[3.528294663986213], [8.888662016489207], [-0...",1e-06,0.072,False,15.632706,0.751852,feature removal
25,kd,log DiabetesPedigreeFunction,lr_type.iteration_plus_one,1118,"[[3.1317646256736045], [8.935591338567065], [-...",1e-06,0.051,False,16.145685,0.751852,log
16,kd,whole model-epsilon:1e-08,lr_type.iteration_plus_one,9021,"[[2.96533714092659], [8.33217514426845], [-1.0...",1e-08,0.413,False,16.097559,0.751515,epsilon
26,kd,log Age,lr_type.iteration_plus_one,1425,"[[3.2317379647520714], [9.47235968785085], [-1...",1e-06,0.07,False,16.404716,0.751515,log
14,kd,whole model-epsilon:1e-06,lr_type.iteration_plus_one,1476,"[[3.175241989157885], [9.517996847051375], [-1...",1e-06,0.072,False,16.436353,0.751515,epsilon


In [27]:
#remove heartrate from features
kd_x_train_features = np.delete(kd_x_train_logai.to_numpy(), [3], 1)
title = 'log age-insulin-no heartrate'.format(kd_columns[i])
model_data = run_model('kd',title,learning_rate = 0, learning_rate_type = learning_rate_type.iteration_plus_one ,max_iterations = 150000,
                       epsilon = 1e-6,x_train = kd_x_train_features , y_train = kd_y_train , model_data = model_data,variable = 'feature removal')
show_sorted_model(model_data)




Unnamed: 0,model_name,description,learning_rate,iteration,weights,epsilon,elapsed_time,is_max_reached,loss,accuracy_kfold,variable
40,kd,log age-insulin-no heartrate,lr_type.iteration_plus_one,813,"[[2.7129340471248065], [8.273684993858577], [-...",1e-06,0.045,False,15.717127,0.759933,feature removal
34,kd,log age-insulin-no Heart Rate,lr_type.iteration_plus_one,813,"[[2.7129340471248065], [8.273684993858577], [-...",1e-06,0.062,False,15.717127,0.759933,feature removal
27,kd,log age-insulin,lr_type.iteration_plus_one,836,"[[2.7683606646024232], [8.122909449757739], [-...",1e-06,0.035,False,15.737679,0.757576,log
30,kd,log age-insulin,lr_type.iteration_plus_one,836,"[[2.7683606646024232], [8.122909449757739], [-...",1e-06,0.049,False,15.737679,0.757576,log
28,kd,log age-insulin-Dpf,lr_type.iteration_plus_one,742,"[[2.81722351586145], [8.11597903143088], [-0.8...",1e-06,0.036,False,15.629731,0.756902,log
23,kd,log Insulin,lr_type.iteration_plus_one,802,"[[2.686500388124719], [8.005602930239544], [-0...",1e-06,0.037,False,15.82128,0.755556,log
38,kd,log age-insulin-no DiabetesPedigreeFunction,lr_type.iteration_plus_one,995,"[[3.528294663986213], [8.888662016489207], [-0...",1e-06,0.072,False,15.632706,0.751852,feature removal
25,kd,log DiabetesPedigreeFunction,lr_type.iteration_plus_one,1118,"[[3.1317646256736045], [8.935591338567065], [-...",1e-06,0.051,False,16.145685,0.751852,log
6,kd,whole model-lr:lr_type.iteration_plus_one,lr_type.iteration_plus_one,1476,"[[3.175241989157885], [9.517996847051375], [-1...",1e-06,0.105,False,16.436353,0.751515,learning rate
26,kd,log Age,lr_type.iteration_plus_one,1425,"[[3.2317379647520714], [9.47235968785085], [-1...",1e-06,0.07,False,16.404716,0.751515,log


In [28]:
#check power 2 of features
for i in range(kd_x_train_features.shape[1]):
    print (i,',column=>',kd_columns[i])
    kd_x_power = power_n_feature(pd.DataFrame(kd_x_train_features),i,2).to_numpy()
    title = 'log age-insulin-no heartrate-*2 {}'.format(kd_columns[i])
    print(title)
    model_data = run_model('kd',title,learning_rate = 0, learning_rate_type = learning_rate_type.iteration_plus_one ,max_iterations = 150000,
                       epsilon = 1e-6,x_train = kd_x_power , y_train = kd_y_train , model_data = model_data,variable = 'feature *2')  
show_sorted_model(model_data)

0 ,column=> Pregnancies
log age-insulin-no heartrate-*2 Pregnancies
1 ,column=> Glucose
log age-insulin-no heartrate-*2 Glucose
2 ,column=> BloodPressure
log age-insulin-no heartrate-*2 BloodPressure
3 ,column=> Heart Rate
log age-insulin-no heartrate-*2 Heart Rate
4 ,column=> SkinThickness
log age-insulin-no heartrate-*2 SkinThickness
5 ,column=> Insulin
log age-insulin-no heartrate-*2 Insulin
6 ,column=> BMI
log age-insulin-no heartrate-*2 BMI
7 ,column=> DiabetesPedigreeFunction
log age-insulin-no heartrate-*2 DiabetesPedigreeFunction


Unnamed: 0,model_name,description,learning_rate,iteration,weights,epsilon,elapsed_time,is_max_reached,loss,accuracy_kfold,variable
34,kd,log age-insulin-no Heart Rate,lr_type.iteration_plus_one,813,"[[2.7129340471248065], [8.273684993858577], [-...",1e-06,0.062,False,15.717127,0.759933,feature removal
40,kd,log age-insulin-no heartrate,lr_type.iteration_plus_one,813,"[[2.7129340471248065], [8.273684993858577], [-...",1e-06,0.045,False,15.717127,0.759933,feature removal
42,kd,log age-insulin-no heartrate-*2 Glucose,lr_type.iteration_plus_one,706,"[[2.7707684939995394], [5.782931625664704], [-...",1e-06,0.032,False,15.705729,0.758586,feature *2
43,kd,log age-insulin-no heartrate-*2 BloodPressure,lr_type.iteration_plus_one,702,"[[2.6840445568619615], [8.100362853592744], [-...",1e-06,0.033,False,15.651793,0.758249,feature *2
41,kd,log age-insulin-no heartrate-*2 Pregnancies,lr_type.iteration_plus_one,832,"[[3.7667195356425127], [8.238912283417783], [-...",1e-06,0.046,False,15.915845,0.758249,feature *2
27,kd,log age-insulin,lr_type.iteration_plus_one,836,"[[2.7683606646024232], [8.122909449757739], [-...",1e-06,0.035,False,15.737679,0.757576,log
30,kd,log age-insulin,lr_type.iteration_plus_one,836,"[[2.7683606646024232], [8.122909449757739], [-...",1e-06,0.049,False,15.737679,0.757576,log
28,kd,log age-insulin-Dpf,lr_type.iteration_plus_one,742,"[[2.81722351586145], [8.11597903143088], [-0.8...",1e-06,0.036,False,15.629731,0.756902,log
47,kd,log age-insulin-no heartrate-*2 BMI,lr_type.iteration_plus_one,938,"[[3.1911630539778977], [8.668229576381282], [-...",1e-06,0.039,False,15.849282,0.756566,feature *2
46,kd,log age-insulin-no heartrate-*2 Insulin,lr_type.iteration_plus_one,741,"[[2.5822636064767592], [7.952642108792551], [-...",1e-06,0.031,False,15.840965,0.755556,feature *2


In [29]:
#model_data.tail(1)
#model_data.drop(model_data.tail(1).index,inplace=True) # drop last n rows
show_sorted_model(model_data)

Unnamed: 0,model_name,description,learning_rate,iteration,weights,epsilon,elapsed_time,is_max_reached,loss,accuracy_kfold,variable
34,kd,log age-insulin-no Heart Rate,lr_type.iteration_plus_one,813,"[[2.7129340471248065], [8.273684993858577], [-...",1e-06,0.062,False,15.717127,0.759933,feature removal
40,kd,log age-insulin-no heartrate,lr_type.iteration_plus_one,813,"[[2.7129340471248065], [8.273684993858577], [-...",1e-06,0.045,False,15.717127,0.759933,feature removal
42,kd,log age-insulin-no heartrate-*2 Glucose,lr_type.iteration_plus_one,706,"[[2.7707684939995394], [5.782931625664704], [-...",1e-06,0.032,False,15.705729,0.758586,feature *2
43,kd,log age-insulin-no heartrate-*2 BloodPressure,lr_type.iteration_plus_one,702,"[[2.6840445568619615], [8.100362853592744], [-...",1e-06,0.033,False,15.651793,0.758249,feature *2
41,kd,log age-insulin-no heartrate-*2 Pregnancies,lr_type.iteration_plus_one,832,"[[3.7667195356425127], [8.238912283417783], [-...",1e-06,0.046,False,15.915845,0.758249,feature *2
27,kd,log age-insulin,lr_type.iteration_plus_one,836,"[[2.7683606646024232], [8.122909449757739], [-...",1e-06,0.035,False,15.737679,0.757576,log
30,kd,log age-insulin,lr_type.iteration_plus_one,836,"[[2.7683606646024232], [8.122909449757739], [-...",1e-06,0.049,False,15.737679,0.757576,log
28,kd,log age-insulin-Dpf,lr_type.iteration_plus_one,742,"[[2.81722351586145], [8.11597903143088], [-0.8...",1e-06,0.036,False,15.629731,0.756902,log
47,kd,log age-insulin-no heartrate-*2 BMI,lr_type.iteration_plus_one,938,"[[3.1911630539778977], [8.668229576381282], [-...",1e-06,0.039,False,15.849282,0.756566,feature *2
46,kd,log age-insulin-no heartrate-*2 Insulin,lr_type.iteration_plus_one,741,"[[2.5822636064767592], [7.952642108792551], [-...",1e-06,0.031,False,15.840965,0.755556,feature *2


In [30]:
#check power 3 of features
for i in range(kd_x_train_features.shape[1]):
    print (i,',column=>',kd_columns[i])
    kd_x_power = power_n_feature(pd.DataFrame(kd_x_train_features),i,3).to_numpy()
    title = 'log age-insulin-no heartrate-*3 {}'.format(kd_columns[i])
    print(title)
    model_data = run_model('kd',title,learning_rate = 0, learning_rate_type = learning_rate_type.iteration_plus_one ,max_iterations = 150000,
                       epsilon = 1e-6,x_train = kd_x_power , y_train = kd_y_train , model_data = model_data,variable = 'feature *3')  
show_sorted_model(model_data)

0 ,column=> Pregnancies
log age-insulin-no heartrate-*3 Pregnancies
1 ,column=> Glucose
log age-insulin-no heartrate-*3 Glucose
2 ,column=> BloodPressure
log age-insulin-no heartrate-*3 BloodPressure
3 ,column=> Heart Rate
log age-insulin-no heartrate-*3 Heart Rate
4 ,column=> SkinThickness
log age-insulin-no heartrate-*3 SkinThickness


  return 1 / (1 + np.exp(-arg))


5 ,column=> Insulin
log age-insulin-no heartrate-*3 Insulin
6 ,column=> BMI
log age-insulin-no heartrate-*3 BMI
7 ,column=> DiabetesPedigreeFunction
log age-insulin-no heartrate-*3 DiabetesPedigreeFunction


Unnamed: 0,model_name,description,learning_rate,iteration,weights,epsilon,elapsed_time,is_max_reached,loss,accuracy_kfold,variable
53,kd,log age-insulin-no heartrate-*3 SkinThickness,lr_type.iteration_plus_one,877,"[[2.7512737233256144], [8.386789991625008], [-...",1e-06,0.058,False,15.777801,0.773737,feature *3
49,kd,log age-insulin-no heartrate-*3 Pregnancies,lr_type.iteration_plus_one,858,"[[3.733327173884576], [8.180972680577565], [-1...",1e-06,0.043,False,16.092013,0.761279,feature *3
40,kd,log age-insulin-no heartrate,lr_type.iteration_plus_one,813,"[[2.7129340471248065], [8.273684993858577], [-...",1e-06,0.045,False,15.717127,0.759933,feature removal
34,kd,log age-insulin-no Heart Rate,lr_type.iteration_plus_one,813,"[[2.7129340471248065], [8.273684993858577], [-...",1e-06,0.062,False,15.717127,0.759933,feature removal
50,kd,log age-insulin-no heartrate-*3 Glucose,lr_type.iteration_plus_one,708,"[[2.8106015374663595], [5.369363601783941], [-...",1e-06,0.035,False,15.784424,0.758923,feature *3
42,kd,log age-insulin-no heartrate-*2 Glucose,lr_type.iteration_plus_one,706,"[[2.7707684939995394], [5.782931625664704], [-...",1e-06,0.032,False,15.705729,0.758586,feature *2
43,kd,log age-insulin-no heartrate-*2 BloodPressure,lr_type.iteration_plus_one,702,"[[2.6840445568619615], [8.100362853592744], [-...",1e-06,0.033,False,15.651793,0.758249,feature *2
41,kd,log age-insulin-no heartrate-*2 Pregnancies,lr_type.iteration_plus_one,832,"[[3.7667195356425127], [8.238912283417783], [-...",1e-06,0.046,False,15.915845,0.758249,feature *2
27,kd,log age-insulin,lr_type.iteration_plus_one,836,"[[2.7683606646024232], [8.122909449757739], [-...",1e-06,0.035,False,15.737679,0.757576,log
30,kd,log age-insulin,lr_type.iteration_plus_one,836,"[[2.7683606646024232], [8.122909449757739], [-...",1e-06,0.049,False,15.737679,0.757576,log


In [31]:
(model_data.sort_values(by=['accuracy_kfold'], ascending=False)).to_csv('kidney_disease_models.csv', index=False)

In [32]:
import os
cwd = os.getcwd()
print(cwd)

/content


In [33]:
def recursive_feature_elimination(X, y, model, num_features):
    num_samples, num_total_features = X.shape
    
    # Initialize the mask to include all features
    mask = np.ones(num_total_features, dtype=bool)
    
    def rfe(X, y, model, mask, num_features):
      
        if np.sum(mask) == num_features:
            return X[:, mask]
      
        model.fit(X[:, mask], y)
        feature_importances = np.zeros(num_total_features)
        feature_importances[mask] = model.feature_importances_
        
        least_important_feature_idx = np.argmin(feature_importances)
        mask[least_important_feature_idx] = False
        return rfe(X, y, model, mask, num_features)
    return rfe(X, y, model, mask, num_features)


In [34]:
(model_data.sort_values(by=['accuracy_kfold'], ascending=False))

Unnamed: 0,model_name,description,learning_rate,iteration,weights,epsilon,elapsed_time,is_max_reached,loss,accuracy_kfold,variable
53,kd,log age-insulin-no heartrate-*3 SkinThickness,lr_type.iteration_plus_one,877,"[[2.7512737233256144], [8.386789991625008], [-...",1e-06,0.058,False,15.777801,0.773737,feature *3
49,kd,log age-insulin-no heartrate-*3 Pregnancies,lr_type.iteration_plus_one,858,"[[3.733327173884576], [8.180972680577565], [-1...",1e-06,0.043,False,16.092013,0.761279,feature *3
40,kd,log age-insulin-no heartrate,lr_type.iteration_plus_one,813,"[[2.7129340471248065], [8.273684993858577], [-...",1e-06,0.045,False,15.717127,0.759933,feature removal
34,kd,log age-insulin-no Heart Rate,lr_type.iteration_plus_one,813,"[[2.7129340471248065], [8.273684993858577], [-...",1e-06,0.062,False,15.717127,0.759933,feature removal
50,kd,log age-insulin-no heartrate-*3 Glucose,lr_type.iteration_plus_one,708,"[[2.8106015374663595], [5.369363601783941], [-...",1e-06,0.035,False,15.784424,0.758923,feature *3
42,kd,log age-insulin-no heartrate-*2 Glucose,lr_type.iteration_plus_one,706,"[[2.7707684939995394], [5.782931625664704], [-...",1e-06,0.032,False,15.705729,0.758586,feature *2
43,kd,log age-insulin-no heartrate-*2 BloodPressure,lr_type.iteration_plus_one,702,"[[2.6840445568619615], [8.100362853592744], [-...",1e-06,0.033,False,15.651793,0.758249,feature *2
41,kd,log age-insulin-no heartrate-*2 Pregnancies,lr_type.iteration_plus_one,832,"[[3.7667195356425127], [8.238912283417783], [-...",1e-06,0.046,False,15.915845,0.758249,feature *2
27,kd,log age-insulin,lr_type.iteration_plus_one,836,"[[2.7683606646024232], [8.122909449757739], [-...",1e-06,0.035,False,15.737679,0.757576,log
30,kd,log age-insulin,lr_type.iteration_plus_one,836,"[[2.7683606646024232], [8.122909449757739], [-...",1e-06,0.049,False,15.737679,0.757576,log


In [47]:
#(model_data['variable']=='learning rate')
model_data.loc[model_data['variable'] == 'feature removal']
#model_data.loc[model_data['variable'] == 'all features']

#.sort_values(by=['accuracy_kfold'], ascending=False))

Unnamed: 0,model_name,description,learning_rate,iteration,weights,epsilon,elapsed_time,is_max_reached,loss,accuracy_kfold,variable
31,kd,log age-insulin-no Pregnancies,lr_type.iteration_plus_one,877,"[[7.954655592251466], [-1.0448422866347953], [...",1e-06,0.041,False,16.310581,0.750842,feature removal
32,kd,log age-insulin-no Glucose,lr_type.iteration_plus_one,484,"[[2.360917183337187], [-0.8182987407890269], [...",1e-06,0.035,False,17.264986,0.72963,feature removal
33,kd,log age-insulin-no BloodPressure,lr_type.iteration_plus_one,656,"[[2.726699034298592], [7.741999408211606], [-0...",1e-06,0.046,False,15.583091,0.750842,feature removal
34,kd,log age-insulin-no Heart Rate,lr_type.iteration_plus_one,813,"[[2.7129340471248065], [8.273684993858577], [-...",1e-06,0.062,False,15.717127,0.759933,feature removal
35,kd,log age-insulin-no SkinThickness,lr_type.iteration_plus_one,987,"[[2.8454163866127455], [7.444158178181417], [-...",1e-06,0.073,False,15.788618,0.750505,feature removal
36,kd,log age-insulin-no Insulin,lr_type.iteration_plus_one,1293,"[[2.820836202474606], [9.936728220495777], [-0...",1e-06,0.099,False,17.179665,0.739731,feature removal
37,kd,log age-insulin-no BMI,lr_type.iteration_plus_one,711,"[[2.5109910019024677], [7.816325231383992], [-...",1e-06,0.049,False,15.701077,0.746465,feature removal
38,kd,log age-insulin-no DiabetesPedigreeFunction,lr_type.iteration_plus_one,995,"[[3.528294663986213], [8.888662016489207], [-0...",1e-06,0.072,False,15.632706,0.751852,feature removal
39,kd,log age-insulin-no Age,lr_type.iteration_plus_one,790,"[[2.5616285768893263], [7.794299182419026], [-...",1e-06,0.061,False,15.972236,0.740067,feature removal
40,kd,log age-insulin-no heartrate,lr_type.iteration_plus_one,813,"[[2.7129340471248065], [8.273684993858577], [-...",1e-06,0.045,False,15.717127,0.759933,feature removal
