In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file shipped with the notebook under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)


# INTRODUCTION
* Bu kernelde amacımız veri setini kullanarak bir hastanın diyabet olup olmadığını tahmin etmektir. Tahmin etme işlemini Logistic Regression ile yaparak 0 veya 1 sınıf etiketlerine ulaşacağız.

        Bu kernelde, pima-indians-diabetes veri seti ile Logistic Regression gerçekleştirilmiştir.
        Features:
        Pregnancies: Gebelik Sayısı
        Glucose: Oral glukoz tolerans testinde glikoz konsantrasyonu değeri.
        BloodPressure: Kan Basıncı (mm Hg)
        SkinThickness: Cilt Kalınlığı(mm)
        Insulin: 2 saatlik serum insulini(mu U/ml)
        BMI:  Vücut Kütle indeksi
        DiabetesPedigreeFunction: Diyabet soyağacı işlevi
        Age: Yaş
        Outcome: Diyabet olup olmaması 1 veya 0

  

<br>
1.) [Data Reading and Data Pre-Processing](#1)<br>
2.) [Train Test Split Data](#2)<br>
3.) [Parameter Initialize and Sigmoid Function](#3)<br>
4.) [Forward and Back Propagation Function](#4)<br>
5.) [Update Function for Parameters (Weight,Bias)](#5)<br>
6.) [Predict Method](#6)<br>
7.) [Logistic Regression (test main)](#7)<br>
8.) [Logistic Regression with Sklearn Library](#8)<br>



<a id="1"></a><br>
# Data Reading and Data Pre-Processing

In [None]:
# Load the Pima Indians diabetes CSV and print pandas' summary of the frame.
DIABETES_CSV = "/kaggle/input/pima-indians-diabetes-database/diabetes.csv"
data = pd.read_csv(DIABETES_CSV)
data.info()

In [None]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

In [None]:
# Target vector: the "Outcome" class label of every row, as a NumPy array.
y = data.loc[:, "Outcome"].values
y

In [None]:
# Feature frame: every column except the "Outcome" label.
x_data = data.drop(columns=["Outcome"])
x_data.head(n=5)

In [None]:
# Min-max normalization, applied column by column:
#   x_normalized = (x - x_min) / (x_max - x_min)
# DataFrame.min()/max() are called directly so the reduction is always per
# column. np.min(DataFrame)/np.max(DataFrame) forward axis=None, which
# pandas >= 2.0 interprets as a reduction over the whole frame, i.e. a single
# global min/max shared by all features.
feature_min = x_data.min()
feature_max = x_data.max()
x = (x_data - feature_min) / (feature_max - feature_min)
x.head()

<a id="2"></a>
# Train Test Split Data
    Train Test Split data==> 80% of data set for Train, 20% of data set for Test
    train_test_split method'unun içine x ve y eğitim için verildi.
    test_size=0.2 parametresi ile veri setinin %20'si test verisi olarak ayrıldı.
    random_state=42 parametresi rastgele bölme işlemini sabitler; aynı işlem bir daha yapılırsa
    aynı bölümleme yapılır ve aynı sonuçlara ulaşılır.
    Bu method sonucunda oluşan sonuçlar da belirtilen değişkenlere aktarılır.
    x'in %80 x_train , x'in %20'si x_test ; y'nin %80'i y_train, y'nin %20'si y_test


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; random_state pins the shuffle so
# the same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

for part in (x_train, x_test, y_train, y_test):
    print(part.shape)

# Swap the places of features and samples by transposing every split.
x_train, x_test = x_train.T, x_test.T
y_train, y_test = y_train.T, y_test.T

print("Changed of Features and Values place.")

for part in (x_train, x_test, y_train, y_test):
    print(part.shape)

<a id="3"></a>
# Parameter Initialize and Sigmoid Function

    Dimension Parameter: Features count ==> dimension = 8
    
    Sigmoid Function : f(x) = 1 / (1 + e^(-x))
    Initialize weight = 0.01 for each data
    Initialize bias = 0

In [None]:
# Initialize
# w = Create matrix array and all array values are 0.01
# np.exp = exponent method in numpy library
def initialize_weights_and_bias(dimension):
    """Create the starting parameters for logistic regression.

    Parameters
    ----------
    dimension : int
        Number of weights to create.

    Returns
    -------
    tuple
        ``(w, b)`` where ``w`` is a ``(dimension, 1)`` array filled with
        0.01 and ``b`` is the float ``0.0``.
    """
    initial_weights = np.full(shape=(dimension, 1), fill_value=0.01)
    initial_bias = 0.0
    return initial_weights, initial_bias

def sigmoid(z):
    """Return the logistic sigmoid ``1 / (1 + e^(-z))`` of ``z``, element-wise."""
    return 1 / (1 + np.exp(-z))

<a id="4"></a>
# Forward and Back Propagation Function

*  $z = b + x_1 w_1 + x_2 w_2 + \dots + x_n w_n$
*  $\text{loss} = -(1 - y)\,\log(1 - \hat{y}) - y\,\log(\hat{y})$ &nbsp; ($\hat{y}$ = `y_head`)
*  $\text{cost} = \dfrac{\sum \text{loss}}{\text{train dataset sample count}}$
   
         Her bir weightin, kendisine ait her bir x_train ile çarpılması gerekir.
         
         Örneğin:
         
         x_train,y_train = Öğrenme veri seti, w=weightler ==> (30,1) * (30,455) matrixlerinin çarpması olamaz.
         
             1. matrix'in sütunu ile 2. matrix'in satırı birbirine uymalıdır => (1,30) * (30,455) olmalıdır.
             
         Bu işlem sonucunda da (1,455) lik bir matrix elde edilir. ==> np.dot(w.T,x_train)
         
         Forward - Backward İşleminde yapılacaklar:
         
      *forward için:*
      
           1.) x_train değerleri ile ağırlıkları(weights) çarp ve bias ekle.
           2.) y_head değerini sigmoid function ile hesapla.
           3.) loss function formulünden yola çıkarak loss değerini hesapla
           4.) cost functionu hesapla => sum(loss) / sample_count

      *Backward için:*
      
           1.)backward işleminde gerekli weighte göre türev al.
           2.)backward işleminde gerekli bias'a göre türev al
           cost ve gradients(derivative_weight, derivative_bias) return et.

In [None]:
def forward_backward_propagation(w,b,x_train,y_train):
    """Run one forward and one backward pass of logistic regression.

    Parameters
    ----------
    w : array
        Weights; combined with the inputs as ``np.dot(w.T, x_train)``.
    b : float
        Bias added to every linear output.
    x_train : array-like
        Feature matrix; the sample count is read from ``x_train.shape[1]``.
    y_train : array
        Class labels (0 or 1), broadcast against the predictions.

    Returns
    -------
    tuple
        ``(cost, gradients)`` where ``cost`` is the mean cross-entropy loss
        and ``gradients`` is a dict with the keys ``"derivative_weight"`` and
        ``"derivative_bias"``.
    """
    sample_count = x_train.shape[1]

    # forward propagation
    z = np.dot(w.T, x_train) + b
    y_head = sigmoid(z)

    # If the sigmoid saturates to exactly 0 or 1, log(0) = -inf and
    # 0 * -inf = nan would turn the whole cost into nan/inf. Flooring the log
    # arguments at the smallest positive normal float keeps the cost finite
    # and leaves every non-saturated value untouched.
    tiny = np.finfo(float).tiny
    log_y_head = np.log(np.maximum(y_head, tiny))
    log_one_minus_y_head = np.log(np.maximum(1 - y_head, tiny))
    loss = -(1 - y_train) * log_one_minus_y_head - y_train * log_y_head
    cost = np.sum(loss) / sample_count

    # backward propagation (uses the unclipped predictions)
    error = y_head - y_train
    derivative_weight = np.dot(x_train, error.T) / sample_count  # derivative based on weight
    derivative_bias = np.sum(error) / sample_count  # derivative based on bias

    # the derivatives of weight and bias are kept in a dictionary (gradients)
    gradients = {"derivative_weight": derivative_weight, "derivative_bias": derivative_bias}

    return cost, gradients

<a id="5"></a>
# Update Function

     Update işlemi arka arkaya forward propagation ve backward propagation işleminin n defa yapılması işlemidir.
     Bu nedenle parametrelerimiz:
         güncellenecek weightler             : w                    => parameter 1
         güncellenecek biaslar               : b                    => parameter 2
         forward için x_train input1         : x_train              => inputs features values
         forward için y_train input_label    : y_train              => inputs class labels
         slope için learning_rate değeri     : learning_rate        => hyper parameter 1
         forward ve backward tekrar sayısı   : number_of_iteration  => hyper parameter 2
     Not: Gradients, weight ve bias'ın türevlerini tutar.
     Costları, number_of_iteration sayısını belirleyebilmek için tutarız.
     learning_rate fazla veya az olursa öğrenme işlemi kazaya uğrayabilir.
     cost_list2'nin pek bir işlevi yoktur; sadece her 10 adımda bir costları tutuyoruz.
     Bunun sebebi costları 10'ar adımda bir plot ettirmektir.

In [None]:
# Updating(learning) parameters
def update(w, b, x_train, y_train, learning_rate,number_of_iterarion):
    """Learn the weights and bias with gradient descent and plot the cost.

    Parameters
    ----------
    w, b :
        Starting weights and bias.
    x_train, y_train :
        Training features and labels, passed unchanged to
        ``forward_backward_propagation``.
    learning_rate : float
        Step size applied to both gradients.
    number_of_iterarion : int
        Number of forward/backward passes to run.

    Returns
    -------
    tuple
        ``(parameters, gradients, cost_list)``: ``parameters`` is a dict with
        the keys ``"weight"`` and ``"bias"``; ``gradients`` holds the gradients
        of the last iteration (an empty dict when no iteration ran);
        ``cost_list`` contains the cost of every iteration.
    """
    cost_list = []
    cost_list2 = []  # cost sampled every 10 iterations, used only for the plot
    index = []
    # Bound before the loop so the return below also works when
    # number_of_iterarion is 0 (the loop body never runs in that case).
    gradients = {}
    # updating(learning) parameters is number_of_iterarion times
    for i in range(number_of_iterarion):
        # make forward and backward propagation and find cost and gradients
        cost, gradients = forward_backward_propagation(w, b, x_train, y_train)
        cost_list.append(cost)
        # gradient descent step
        w = w - learning_rate * gradients["derivative_weight"]
        b = b - learning_rate * gradients["derivative_bias"]
        if i % 10 == 0:
            cost_list2.append(cost)
            index.append(i)
            print ("Cost after iteration %i: %f" %(i, cost))
    # the learned weights and bias
    parameters = {"weight": w,"bias": b}
    plt.plot(index,cost_list2)
    plt.xticks(index,rotation='vertical')
    plt.xlabel("Number of Iterarion")
    plt.ylabel("Cost")
    plt.show()
    return parameters, gradients, cost_list
#parameters, gradients, cost_list = update(w, b, x_train, y_train, learning_rate = 0.009,number_of_iterarion = 200)

<a id="6"></a>
# Predict Method

In [None]:
# Predict Method

def predict(w,b,x_test):
    """Predict class labels for ``x_test`` with the given weights and bias.

    The sigmoid of ``np.dot(w.T, x_test) + b`` is thresholded: values less
    than or equal to 0.5 become 0, all others become 1.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(1, x_test.shape[1])`` holding 0.0 / 1.0.
    """
    probabilities = sigmoid(np.dot(w.T, x_test) + b)
    # Threshold the first (only) row of probabilities in one vectorised step.
    labels = np.where(probabilities[0] <= 0.5, 0.0, 1.0)
    return labels.reshape(1, x_test.shape[1])
    

<a id="7"></a>
# Logistic Regression (test main)

In [None]:
def logistic_regression(x_train, y_train, x_test, y_test, learning_rate ,  num_iterations):
    """Train the hand-written logistic regression and print its test accuracy.

    Parameters
    ----------
    x_train, y_train :
        Training features and labels.
    x_test, y_test :
        Test features and labels used for the accuracy report.
    learning_rate : float
        Step size forwarded to ``update``.
    num_iterations : int
        Number of gradient-descent iterations forwarded to ``update``.

    Returns
    -------
    None
        The accuracy is only printed.
    """
    # One weight per row of x_train.
    feature_count = x_train.shape[0]
    initial_w, initial_b = initialize_weights_and_bias(feature_count)

    # Learn the parameters by repeated forward/backward propagation.
    learned, _, _ = update(initial_w, initial_b, x_train, y_train, learning_rate, num_iterations)

    y_prediction_test = predict(learned["weight"], learned["bias"], x_test)

    # Test accuracy in percent: 100 minus the mean absolute difference
    # between predicted and true labels.
    mean_abs_error = np.mean(np.abs(y_prediction_test - y_test))
    print("test accuracy: {} %".format(100 - mean_abs_error * 100))
    

In [None]:
# Train and evaluate the hand-written model on the prepared splits.
logistic_regression(
    x_train,
    y_train,
    x_test,
    y_test,
    learning_rate=5,
    num_iterations=300,
)

<a id="8"></a>
# Logistic Regression with Sklearn Library

In [None]:
from sklearn.linear_model import LogisticRegression

# Same task with scikit-learn's estimator. The splits are transposed again
# before being handed to fit() and score().
lr = LogisticRegression()
lr.fit(x_train.T, y_train.T)
sklearn_test_accuracy = lr.score(x_test.T, y_test.T)
print("Test Accuracy : {}".format(sklearn_test_accuracy))

In [None]:
from sklearn import linear_model

# Fit once and reuse the fitted estimator for both scores instead of
# refitting the identical model for every printed line.
logreg = linear_model.LogisticRegression(random_state = 42,max_iter= 150)
logreg.fit(x_train.T, y_train.T)
print("test accuracy: {} ".format(logreg.score(x_test.T, y_test.T)))
print("train accuracy: {} ".format(logreg.score(x_train.T, y_train.T)))