In [0]:
#To download the dataset and its assorted details
!wget https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.csv
!wget https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.names
#It contains 768 rows and 9 columns. All of the values in the file are numeric, specifically floating point values

In [0]:
from math import exp

#Prediction function of logistic regression 
def predict(rows,coefficients):
  yhat = coefficients[0]
  for i in range(len(rows)-1):
    yhat+=coefficients[i+1]*rows[i]
  return 1.0/(1.0+exp(-yhat))

In [0]:
def coefficients_sgd(train,l_rate,n_epoch):
  coef = [ 0.0 for i in range(len(train[0])) ]
  for epoch in range(n_epoch):
    sum_error = 0
    for row in train:
      yhat = predict(row,coef)
      error = row[-1] - yhat
      sum_error = sum_error + error**2
      coef[0] = coef[0] + l_rate*error*(1.0 - yhat)
      for i in range(len(row) - 1):
        coef[i+1] = coef[i+1] + l_rate*error*(1.0-yhat)*row[i]
    print('\t>>epoch = %d,l_rate = %.3f,error=%.3f'%(epoch,l_rate,sum_error))
  return coef

In [0]:
#Naive initial test without training
#Dataset picked randomly
dataset = [[2.7810836,2.550537003,0],
	[1.465489372,2.362125076,0],
	[3.396561688,4.400293529,0],
	[1.38807019,1.850220317,0],
	[3.06407232,3.005305973,0],
	[7.627531214,2.759262235,1],
	[5.332441248,2.088626775,1],
	[6.922596716,1.77106367,1],
	[8.675418651,-0.242068655,1],
	[7.673756466,3.508563011,1]]

l_rate = 0.3
n_epoch = 100
coef = coefficients_sgd(dataset, l_rate, n_epoch)
print(coef)

In [0]:
#Logistic Regression on Diabetes Dataset

from random import seed,randrange
from csv import reader

In [0]:
#Loading the csv file and building dataset array
def load_csv(filename):
  dataset = list()
  with open(filename,'r') as file:
    csv_reader = reader(file)
    for row in csv_reader:
      if not row:
        continue
      dataset.append(row)
  return dataset

In [0]:
#Convert string column to float
def str_column_to_float(dataset,column):
  for row in dataset:
    row[column] = float(row[column].strip())

In [0]:
# Find the min and max values for each column
def dataset_minmax(dataset):
  minmax = list()
  for i in range(len(dataset[0])):
    col_values = [ row[i] for row in dataset ]
    value_min = min(col_values)
    value_max = max(col_values)
    minmax.append([value_min,value_max])
  return minmax

In [0]:
#Rescale dataset to range 0-1
def normalize_dataset(dataset, minmax):
  for row in dataset:
    for i in range(len(row)):
      row[i] = (row[i] - minmax[i][0])/(minmax[i][1] - minmax[i][0])

In [0]:
#Split the dataset into k folds
def cross_validation_split(dataset,n_folds):
  dataset_split = list()
  dataset_copy = list(dataset)
  fold_size = int(len(dataset) / n_folds)
  for i in range(n_folds):
    fold = list()
    while len(fold) < fold_size:
      fold.append(dataset_copy.pop(randrange(len(dataset_copy))))
    dataset_split.append(fold)
  return dataset_split

In [0]:
#Calculate accuracy percentage
def accuracy_metric(actual, predicted):
  correct = 0
  for i in range(len(actual)):
    if actual[i] == predicted[i]:
      correct = correct + 1
  return correct/ float(len(actual)) *100.0

In [0]:
#Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset,algorithm, n_folds, *args):
  folds = cross_validation_split(dataset,n_folds)
  scores = list()
  for fold in folds:
    train_set = list(folds)
    train_set.remove(fold)
    train_set = sum(train_set,[])
    test_set = list()
    for row in fold:
      row_copy = list(row)
      test_set.append(row_copy)
      row_copy[-1] = None
    predicted = algorithm(train_set, test_set, *args)
    actual = [ row[-1] for row in fold]
    accuracy = accuracy_metric(actual,predicted)
    scores.append(accuracy)
  return scores

In [0]:
# Linear Regression Algorithm With Stochastic Gradient Descent
def logistic_regression(train, test, l_rate, n_epoch):
	predictions = list()
	coef = coefficients_sgd(train, l_rate, n_epoch)
	for row in test:
		yhat = predict(row, coef)
		yhat = round(yhat)
		predictions.append(yhat)
	return(predictions)

In [31]:
# Test the logistic regression algorithm on the diabetes dataset
seed(1)
# load and prepare data
filename = 'pima-indians-diabetes.csv'
dataset = load_csv(filename)
for i in range(len(dataset[0])):
	str_column_to_float(dataset, i)
# normalize
minmax = dataset_minmax(dataset)
normalize_dataset(dataset, minmax)
# evaluate algorithm
n_folds = 5
l_rate = 0.1
n_epoch = 100
scores = evaluate_algorithm(dataset, logistic_regression, n_folds, l_rate, n_epoch)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

	>>epoch = 0,l_rate = 0.100,error=137.200
	>>epoch = 1,l_rate = 0.100,error=124.854
	>>epoch = 2,l_rate = 0.100,error=117.891
	>>epoch = 3,l_rate = 0.100,error=113.475
	>>epoch = 4,l_rate = 0.100,error=110.424
	>>epoch = 5,l_rate = 0.100,error=108.182
	>>epoch = 6,l_rate = 0.100,error=106.459
	>>epoch = 7,l_rate = 0.100,error=105.092
	>>epoch = 8,l_rate = 0.100,error=103.980
	>>epoch = 9,l_rate = 0.100,error=103.056
	>>epoch = 10,l_rate = 0.100,error=102.278
	>>epoch = 11,l_rate = 0.100,error=101.614
	>>epoch = 12,l_rate = 0.100,error=101.042
	>>epoch = 13,l_rate = 0.100,error=100.545
	>>epoch = 14,l_rate = 0.100,error=100.109
	>>epoch = 15,l_rate = 0.100,error=99.725
	>>epoch = 16,l_rate = 0.100,error=99.385
	>>epoch = 17,l_rate = 0.100,error=99.083
	>>epoch = 18,l_rate = 0.100,error=98.813
	>>epoch = 19,l_rate = 0.100,error=98.570
	>>epoch = 20,l_rate = 0.100,error=98.353
	>>epoch = 21,l_rate = 0.100,error=98.156
	>>epoch = 22,l_rate = 0.100,error=97.979
	>>epoch = 23,l_rate = 0.100,