# The source code for implementing logistic regression in Python

<div class="mark">
----------------------------------------------------------------------------------------------------------------</div><i class="fa fa-lightbulb-o "></i>

## Preliminary preparation before data analysis and machine learning

The following code shows the whole process of logistic regression without calling any existing implementation, such as the logistic regression classes in scikit-learn or PyTorch. The packages used in the program are imported below.

In [None]:
import numpy as np                          
# The numpy package can help us with the calculation of vector and matrix
import pandas as pd                        
# The pandas package can offer us some new data structure like dataframe 
import matplotlib.pyplot as plt              
# The matplotlib package can realize the visualization of the process
%matplotlib inline                           
# This command is an IPython magic function,
# which embeds the charts in the Jupyter notebook
import os 

The next step is to take a brief look at the dataset. We load the CSV file into a DataFrame.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the /kaggle/input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory being visited with the file name to get the full path.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


In [None]:
pdData=pd.read_csv("/kaggle/input/diabetes-dataset/diabetes2.csv")  
# Read the CSV file into a pandas DataFrame.
pdData.info()   
# info() prints a summary of the DataFrame (columns, non-null counts, dtypes).

The DataFrame has 768 rows and 9 columns. According to the output of `info()`, there are no null values in the dataset, so we do not need to do any missing-value processing.

## Visualization part


In [None]:
# Display the column labels of the dataset.
pdData.columns

In [None]:
# Split the samples by class so that each class gets its own marker and
# legend entry. These two frames are reused by the 3D plots further down.
positive=pdData[pdData["Outcome"]==1]
negative=pdData[pdData["Outcome"]==0]
plt.figure(figsize=(24,7))

# Each (x, y) feature pair is drawn in its own panel of a one-row grid.
feature_pairs=[("Glucose","DiabetesPedigreeFunction"),
               ("BloodPressure","Insulin")]
for panel,(x_name,y_name) in enumerate(feature_pairs,start=1):
    plt.subplot(1,len(feature_pairs),panel)
    # Legend labels spell out "Outcome" (they were misspelled "Outcom").
    plt.scatter(positive[x_name],positive[y_name],
                marker="o",label="Outcome=1")
    plt.scatter(negative[x_name],negative[y_name],
                marker="x",label="Outcome=0")
    plt.legend()
    plt.xlabel(x_name)
    plt.ylabel(y_name)

#plt.savefig("visual")
plt.show()



In [None]:
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure(dpi=100)

# Create the 3D axes through the figure so that they are attached to it.
# A bare Axes3D(fig) is not added to the figure automatically by recent
# matplotlib releases, which leaves the shown/saved figure empty.
ax = fig.add_subplot(projection="3d")

# No cmap argument: without per-point colour values (c=...) matplotlib
# ignores it with a warning; each scatter call gets its own colour from the
# default colour cycle.
ax.scatter(positive["Glucose"],positive["BloodPressure"],
           positive["SkinThickness"],label="outcome=1")
ax.scatter(negative["Glucose"],negative["BloodPressure"],
           negative["SkinThickness"],label="outcome=0")

ax.set_xlabel("Glucose")
ax.set_ylabel("BloodPressure")
ax.set_zlabel("SkinThickness")
ax.set_title("Visualization of the classification")
ax.legend()
# Save before show(): the image is written to "3D" (format/extension chosen
# by matplotlib's defaults).
plt.savefig("3D")
plt.show()

In [None]:
fig = plt.figure(dpi=100)

# Create the 3D axes through the figure so that they are attached to it.
# A bare Axes3D(fig) is not added to the figure automatically by recent
# matplotlib releases, which leaves the shown/saved figure empty.
ax = fig.add_subplot(projection="3d")

# No cmap argument: without per-point colour values (c=...) matplotlib
# ignores it with a warning; each scatter call gets its own colour from the
# default colour cycle.
ax.scatter(positive["BMI"],positive["DiabetesPedigreeFunction"],
           positive["Age"],label="outcome=1")
ax.scatter(negative["BMI"],negative["DiabetesPedigreeFunction"],
           negative["Age"],label="outcome=0")
ax.set_xlabel("BMI")
# Axis label matches the column name (it was misspelled "Predigree").
ax.set_ylabel("DiabetesPedigreeFunction")
ax.set_zlabel("Age")
ax.set_title("Visualization of the classification")
ax.legend()
plt.savefig("3D2")
plt.show()

## Prepare all the functions we need for the logistic regression

### Sigmoid Function 

The sigmoid function is an S-shaped function, $\sigma(z) = \frac{1}{1 + e^{-z}}$, whose output lies between 0 and 1 and can be interpreted as a probability.


In [None]:
def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)) of ``z``.

    ``z`` may be a scalar or a NumPy array; ``np.exp`` is applied
    element-wise, so an array input gives an array of the same shape.
    """
    denominator = 1 + np.exp(-z)
    return 1 / denominator

### Model function

The model function uses NumPy to compute the matrix product of the sample matrix X and the transposed parameter vector theta, and then applies the sigmoid function to the result.

In [None]:
def model(X,theta):
    """Return the sigmoid of the linear combination of ``X`` and ``theta``.

    The linear part is ``np.dot(X, theta.T)``, i.e. the matrix product of the
    samples with the transposed parameters. In this notebook ``X`` has one
    row per sample and ``theta`` has shape (1, number of columns of X), so
    the result has one predicted probability per row.
    """
    linear_part = np.dot(X, theta.T)
    return sigmoid(linear_part)

### The loss function

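As illustrated in the essay, the cross-entropy is used as the loss: $J(\theta) = -\frac{1}{m}\sum_{i=1}^{m}\left[y_i \log h_\theta(x_i) + (1-y_i)\log\left(1-h_\theta(x_i)\right)\right]$, where $h_\theta$ is the model function and $m$ is the number of samples.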


In [None]:
def cost(X,y,theta):
    """Return the mean binary cross-entropy of the model on ``(X, y)``.

    Args:
        X: sample matrix passed to ``model``.
        y: 0/1 labels, shaped like the output of ``model(X, theta)``.
        theta: model parameters passed to ``model``.

    Returns:
        The sum over all samples of
        ``-y*log(p) - (1-y)*log(1-p)`` divided by ``len(X)``, where
        ``p = model(X, theta)``.
    """
    # Evaluate the model once and reuse it for both terms of the loss.
    prob = model(X, theta)
    # Keep the probabilities strictly inside (0, 1): log(0) is -inf and
    # 0 * -inf is nan, which would turn the whole cost into nan once a
    # prediction saturates.
    tiny = 1e-15
    prob = np.clip(prob, tiny, 1 - tiny)
    left = np.multiply(-y, np.log(prob))
    right = np.multiply(1 - y, np.log(1 - prob))
    # np.sum adds up every element of the per-sample loss array.
    return np.sum(left - right) / len(X)

### The gradient calculation function

According to the result derived in the essay, $\frac{\partial J}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^{m}\left(h_\theta(x_i) - y_i\right)x_{ij}$, we use the following code to calculate the gradient.

In [None]:
def gradient(X,y,theta):
    """Return the gradient of the cost with respect to ``theta``.

    For every parameter index j the entry is the sum over the samples of
    ``(model(X, theta) - y) * X[:, j]`` divided by ``len(X)``. The result
    has the same shape as ``theta`` and is filled through ``grad[0, j]``,
    so ``theta`` is expected to be a 2-D row vector.
    """
    residual = (model(X, theta) - y).ravel()   # prediction error per sample
    grad = np.zeros(theta.shape)
    for j in range(theta.size):
        weighted = residual * X[:, j]
        grad[0, j] = np.sum(weighted) / len(X)
    return grad

### The shuffling function

The shuffling function is used to randomly arrange the data.

In [None]:
import numpy.random
def shuffleData(data):
    """Return a row-shuffled ``(X, y)`` split of ``data`` without modifying it.

    Args:
        data: 2-D array whose last column is the label and whose other
            columns are the features.

    Returns:
        ``(X, y)`` where ``X`` holds every column but the last and ``y``
        holds the last column (kept 2-D, one column), both taken from the
        same shuffled copy so that rows stay paired.
    """
    # np.random.permutation shuffles a copy along the first axis, so the
    # caller's array keeps its row order. np.random.shuffle would reorder
    # the caller's array in place and break any row-aligned arrays the
    # caller sliced earlier.
    shuffled = np.random.permutation(data)
    cols = shuffled.shape[1]
    X = shuffled[:, 0:cols-1]
    y = shuffled[:, cols-1:]
    return X, y

### The criteria for stopping the program

Here we define three different stop conditions so that we can compare them and pick the best one.

In [None]:
# Integer tags identifying the three available stop conditions.
STOP_ITER=0
STOP_COST=1
STOP_GRAD=2
def stopCriterion(type,value,threshold):
    """Tell whether the stop condition selected by ``type`` is met.

    * ``STOP_ITER``: ``value`` is the iteration count; met once it exceeds
      ``threshold``.
    * ``STOP_COST``: ``value`` is the sequence of costs; met when the
      absolute difference between the last two costs is below ``threshold``.
    * ``STOP_GRAD``: ``value`` is the gradient; met when its norm is below
      ``threshold``.

    Any other ``type`` yields ``None``.
    """
    if type==STOP_ITER:
        reached=value>threshold
    elif type==STOP_COST:
        reached=abs(value[-1]-value[-2])<threshold
    elif type==STOP_GRAD:
        reached=np.linalg.norm(value)<threshold
    else:
        reached=None
    return reached

### The Gradient descent function 

In [None]:
import time  
# this package is used to calculate the time for running the program
def descent(data,theta,batchSize,stopType,thresh,alpha):
    """Run (mini-batch) gradient descent and return the fitted parameters.

    Args:
        data: 2-D array of samples; split into features and labels by
            ``shuffleData``.
        theta: initial parameters (not modified; a new array is produced
            on every update).
        batchSize: number of rows used for each gradient step.
        stopType: one of ``STOP_ITER``, ``STOP_COST``, ``STOP_GRAD``.
        thresh: threshold handed to ``stopCriterion``.
        alpha: learning rate.

    Returns:
        ``(theta, i - 1, costs, grad, duration)``: the final parameters,
        the number of performed updates minus one, the list of costs (the
        initial cost followed by one entry per update), the last gradient
        and the elapsed wall-clock time in seconds.

    Raises:
        ValueError: if ``stopType`` is not one of the three known tags.
    """
    if stopType not in (STOP_ITER,STOP_COST,STOP_GRAD):
        # Fail before doing any work instead of hitting an unbound local
        # variable after the first update.
        raise ValueError("unknown stopType: {!r}".format(stopType))

    init_time=time.time()
    # Size of one pass over the data. Taken from the data itself rather
    # than from a module-level variable that may not be defined yet.
    n_samples=len(data)
    i=0  # number of parameter updates performed so far
    k=0  # start row of the current batch
    X,y=shuffleData(data)
    grad=np.zeros(theta.shape)
    costs=[cost(X,y,theta)]  # cost before any update

    while True:
        grad=gradient(X[k:k+batchSize],y[k:k+batchSize],theta)
        k+=batchSize

        if k>=n_samples:
            # All rows have been used: start over with a fresh shuffle.
            k=0
            X,y=shuffleData(data)

        theta=theta-alpha*grad
        # The cost is always evaluated on the full data set, not the batch.
        costs.append(cost(X,y,theta))
        i+=1

        # Pick the quantity the selected stop condition looks at.
        if stopType==STOP_ITER:
            value=i
        elif stopType==STOP_COST:
            value=costs
        else:
            value=grad
        if stopCriterion(stopType,value,thresh):
            break

    return theta,i-1,costs,grad,time.time()-init_time

### The  plotting function for visualization 

In [None]:
def runExpe(data,theta,batchSize,stopType,thresh,alpha):
    """Run one gradient-descent experiment, report it and plot its cost curve.

    Calls ``descent`` with the given arguments, prints a summary line
    (settings, final theta, iteration count, last cost, duration) and
    plots the cost against the iteration index.

    Args:
        data, theta, batchSize, stopType, thresh, alpha: forwarded
            unchanged to ``descent``.

    Returns:
        The parameters returned by ``descent``.
    """
    # The last gradient returned by descent is not needed here.
    theta,n_iter,costs,_,dur=descent(data,theta,batchSize,stopType,thresh,alpha)

    # Describe the stop condition that was used.
    if stopType==STOP_ITER:
        strStop="{} iterations".format(thresh)
    elif stopType==STOP_COST:
        strStop="costs change < {}".format(thresh)
    else:
        strStop="gradient norm < {}".format(thresh)

    # Title of the experiment; "Scaled data" is two words (the label used
    # to be rendered as "Scaleddata").
    name="Scaled data - learning rate :{} - ".format(alpha)+strStop
    print("***{}\nTheta:{} - Iter: {} - Last cost: {:03.2f} - Duration: {:03.2f}s".format(name,theta,n_iter,costs[-1],dur))

    # Cost curve on a 12 x 4 inch canvas.
    _,ax=plt.subplots(figsize=(12,4))
    ax.plot(np.arange(len(costs)),costs,'r')
    ax.set_xlabel("Iteration")
    ax.set_ylabel("Cost")
    ax.set_title(name.upper())

    return theta

## Process the data

In [None]:
pdData.insert(0,'ones',1)           
# Insert a constant column named 'ones' (value 1) at position 0, so that the
# first entry of theta multiplies 1 in the dot product and plays the role of
# the intercept b.

In [None]:
orig_data=pdData.values             
# .values gives the contents of the DataFrame as a NumPy array.
# NOTE(review): the previous comment tied this to the Python version and
#       mentioned a to_matrix() alternative; availability depends on the
#       pandas version, and the legacy method was presumably
#       DataFrame.as_matrix() (absent from current pandas) - confirm.

cols=orig_data.shape[1]             
# Number of columns of the array (features plus the label column).

X=orig_data[:,0:cols-1]             
# X: every column except the last one, i.e. the features (including 'ones').
y=orig_data[:,cols-1:cols]          
# y: the last column (the label), kept two-dimensional with a single column.
theta = np.zeros ([1,9])            
# Initial parameters: a 1 x 9 row vector of zeros.
# NOTE(review): 9 is hard-coded; presumably it must equal cols-1 - confirm.

In [None]:
X.shape,y.shape,theta.shape         
# Display the shapes of X, y and theta as a sanity check.

## Using the sklearn to scale the data 

Different variables may differ greatly in their variance, so we scale the data to ensure that each variable has a mean of 0 and a variance of 1.


In [None]:
from sklearn import preprocessing as pp                
# Import scikit-learn's preprocessing module, which provides scale().
scaled_data=orig_data.copy()                           
# Work on a copy so that orig_data keeps the unscaled values.
scaled_data[:,1:9]=pp.scale(orig_data[:,1:9])         
# Scale columns 1 to 8 only; column 0 (the constant 'ones') and the last
# column (the label) are left untouched.

The scaled data is the data after standardization, which helps the gradient-descent training.

## Test the result of different methods 

### The full gradient descent 

Here we set the batch size to n, the number of samples, so that every gradient-descent step uses the whole dataset.

In [None]:
n=768
# n is the number of rows in the dataset; it is passed as the batch size, so
# every update uses all samples. STOP_ITER ends the run once the iteration
# count exceeds thresh.
runExpe(scaled_data,theta,n,STOP_ITER,thresh=20000,alpha=0.001)

### Using the stop_cost as the stop condition

In [None]:
n=768
# Full batch again (batch size n), but with STOP_COST: the run ends when the
# difference between two consecutive costs drops below thresh.
runExpe(scaled_data,theta,n,STOP_COST,thresh=0.000004,alpha=0.001)

### Stochastic Gradient Descent 

Here we set the batch size to 1 and update the parameters after every single sample. The advantage of this method is that each update is very fast, so
we can set the number of iterations much larger.

In [None]:
n=768
# Batch size 1: each update uses a single sample. STOP_ITER with
# thresh=50000 and a larger learning rate (0.01) than the full-batch runs.
runExpe(scaled_data,theta,1,STOP_ITER,thresh=50000,alpha=0.01)

### Mini-batch Gradient Descent 

In [None]:
n=768
# Batch size 20: each update uses 20 samples. STOP_COST with a very small
# threshold on the change between consecutive costs.
runExpe(scaled_data,theta,20,STOP_COST,thresh=0.0000000001,alpha=0.001)

## Calculate the accuracy of the final result 

We choose the best result, which has a cost of 0.47.

In [None]:
newtheta=np.array([-0.8265755 ,  0.37491152,  1.03329477, -0.21938382, -0.00491618,-0.08607098,  0.65185757,  0.2965955 ,  0.20040424])
# Hard-coded parameter vector with 9 entries, used below for prediction.
# NOTE(review): presumably copied from the printed theta of the best run
# above - confirm which run produced it.

In [None]:
def predict(X,theta):
    """Return a list with the predicted class (0 or 1) for each row of ``X``.

    A row is labelled 1 when ``model(X, theta)`` gives it a value of at
    least 0.5, and 0 otherwise.
    """
    labels = []
    for probability in model(X, theta):
        if probability >= 0.5:
            labels.append(1)
        else:
            labels.append(0)
    return labels

In [None]:
# Feature columns of the scaled data: every column except the last one.
scaled_X=scaled_data[:,0:cols-1]

In [None]:
predictions=predict(scaled_X,newtheta)
# Predicted class (0 or 1) for every row of scaled_X.

# Take the true labels from scaled_data itself, the array scaled_X is sliced
# from, so that features and labels are guaranteed to be row-aligned. The
# module-level y was sliced from orig_data, whose row order differs from
# scaled_data once the training data has been shuffled.
labels=scaled_data[:,cols-1]

correct=[1 if a==b else 0 for (a,b) in zip(predictions,labels)]
# 1 where the prediction agrees with the label, 0 otherwise.

accuracy=sum(correct)/len(correct)
# Fraction of correctly classified samples.
print('accuracy={}'.format(accuracy))      # print the accuracy


## Visualization of the result 

In [None]:
visual=pd.DataFrame(scaled_data)   
# Wrap the scaled array in a new DataFrame for plotting.
# Name the ten columns; the first one, labelled "zero" here, is column 0 of
# scaled_data.
visual.columns=["zero",'Pregnancies', 'Glucose', 
                'BloodPressure', 'SkinThickness', 'Insulin',
                'BMI', 'DiabetesPedigreeFunction', 
                'Age', 'Outcome']

In [None]:
visual.head(10)  # show the first 10 rows of the scaled data

In [None]:
# Split the scaled samples by class: Outcome == 1 and Outcome == 0.
pos=visual[visual["Outcome"]==1]
neg=visual[visual["Outcome"]==0]

In [None]:
plt.figure()  # start a new figure

plt.scatter(pos["Glucose"],pos["BloodPressure"],marker="o",label="Outcome=1")  
# scaled samples with Outcome == 1
plt.scatter(neg["Glucose"],neg["BloodPressure"],marker="x",label='Outcome=0') 
# scaled samples with Outcome == 0

plt.legend()    
plt.xlabel("Glucose")       # x-axis label
plt.ylabel("BloodPressure") # y-axis label

# Straight line drawn over the scatter plot as the decision boundary in the
# Glucose / BloodPressure plane.
# NOTE(review): the coefficients are hard-coded and differ from the matching
# entries of newtheta (-0.8265755, 1.03329477, -0.21938382); presumably they
# come from a different run - confirm.
# NOTE(review): a boundary theta0 + a*x + b*y = 0 gives y = (a*x + theta0)/(-b).
# If -0.19376083 is the BloodPressure weight itself, the divisor should be
# its negation - check the sign.
# NOTE: this rebinds the module-level names x and y; y held the label column
# before this cell.
x= np.linspace(0,1.3,30)
y= (0.97485299*x-0.78869174)/(-0.19376083)
plt.plot(x,y,lw=3,c='r')    # red line of width 3
plt.show()

In [None]:
plt.figure()  # start a new figure

# Scaled samples of both classes in the Glucose / BMI plane.
plt.scatter(pos["Glucose"],pos["BMI"],marker="o",label="Outcome=1")
plt.scatter(neg["Glucose"],neg["BMI"],marker="x",label='Outcome=0')

plt.legend()
plt.xlabel("Glucose")
plt.ylabel("BMI")

# Straight line drawn over the scatter plot as the decision boundary in the
# Glucose / BMI plane.
# NOTE(review): the coefficients are hard-coded and differ from the matching
# entries of newtheta (-0.8265755, 1.03329477, 0.65185757); presumably they
# come from a different run - confirm.
# NOTE: this rebinds the module-level names x and y.
x= np.linspace(-1.3,2.2,30)
y= (0.97485299*x-0.78869174)/(-0.60651927)
plt.plot(x,y,lw=3,c='r')    # red line of width 3

plt.show()

<div class="mark">
------------------------------------------------------------------------------------------------------------------------</div><i class="fa fa-lightbulb-o "></i>

---END