<a href="https://colab.research.google.com/github/stefanoridolfi/ML_From_scratch/blob/master/ch_1_9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
# FROM ch_1_1.ipynb##########################
##############################################
import pandas as pd
from random import seed
from random import randrange
from csv import reader
from math import sqrt,exp

''' if no headers in csv
def load_csv(filename_url):
  CSV_url=filename_url
  pdfile=pd.read_csv(CSV_url,header=None)
  dataset=pdfile.values.tolist()
  return dataset
  '''
'''if header in csv
#CVS with headers ##################
def load_csv(filename_url):
  CSV_url=filename_url
  pdfile=pd.read_csv(CSV_url,sep=';',nrows=48)
  #pdfile=pd.read_csv(CSV_url,sep=';')
  headers=pdfile.head()
  dataset=pdfile.values.tolist()
  return dataset, list(headers)
  '''
def load_csv(filename_url):
  CSV_url=filename_url
  pdfile=pd.read_csv(CSV_url,header=None)
  dataset=pdfile.values.tolist()
  return dataset


##################

# Dataset are in https://github.com/jbrownlee/Datasets

#dataset winequality
#CSV_url='https://raw.githubusercontent.com/stefanoridolfi/ML_From_scratch/master/winequality-white.csv'

# URL contenente dataset CSV Pima Indians dataset
CSV_url='https://goo.gl/2tMFne'

dataset= load_csv(CSV_url)
print('Loaded data file {0} with {1} rows and {2} columns\n\n'.format(CSV_url, len(dataset),len(dataset[0])))

# Convert string column to float
def str_column_to_float(dataset, column):
  for row in dataset:
    row[column] = float(row[column].strip())


# Make a prediction for logistic regression: function that calculates the output Y using coeffcients b0, b1 ...etc
def predict(row, coefficients):
  yhat = coefficients[0]
  for i in range(len(row)-1):
    yhat += coefficients[i + 1] * row[i]
  return 1.0/(1.0 +exp(-yhat))



# Estimate linear regression coefficients using stochastic gradient descent
def coefficients_sgd(train, l_rate, n_epoch):
  coef = [0.0 for i in range(len(train[0]))]
  for epoch in range(n_epoch):
    sum_error = 0
    for row in train:
      yhat = predict(row, coef)
      error = yhat - row[-1]
      sum_error += error**2
      coef[0] = coef[0] + l_rate * error*yhat*(1-yhat)
      for i in range(len(row)-1):
        coef[i + 1] = coef[i + 1] + l_rate * error * yhat*(1-yhat)*row[i]
    #print('>epoch=%d, lrate=%.3f, error=%.3f' % (epoch, l_rate, sum_error))
  return coef

#Find the min and max values for each column
def dataset_minmax(dataset):
  minmax = list()
  for i in range(len(dataset[0])):
    col_values = [row[i] for row in dataset]
    value_min = min(col_values)
    value_max = max(col_values)
    minmax.append([value_min, value_max])
  return minmax

#Rescale dataset columns to the range 0-1
def normalize_dataset(dataset, minmax):
  dataset_normal=list(dataset)
  for row in dataset_normal:
    for i in range(len(row)):
      row[i] = (row[i] - minmax[i][0]) / (minmax[i][1] - minmax[i][0])
  return dataset_normal

#Calculate root mean squared error
def rmse_metric(actual, predicted):
  sum_error = 0.0
  for i in range(len(actual)):
    prediction_error = predicted[i] - actual[i]
    sum_error += (prediction_error ** 2)
    mean_error = sum_error / float(len(actual))
  return sqrt(mean_error)

# function that splits dataset in two parts, train and test
def train_test_split(dataset,split=0.7):
  train_list=[]
  train_size=int(split*len(dataset))
  test_list=list(dataset)
  while len(train_list)  <train_size:
    index=randrange(len(test_list))
    #print("index=",index)
    elem=test_list.pop(index)
    train_list.append(elem)
  return train_list, test_list

# Function that splits  dataset in k folds : result is a list of list of folds, inside is fold there are lists that are the columns of dataset
def cross_validation_split(dataset, folds=4):
  dataset_split=list()
  dataset_copy=list(dataset)
  print("len dataset", len(dataset))
  fold_size=int(len(dataset)/folds)
  for i in range(folds):
    fold=[]
    while len(fold)<fold_size:
      index=randrange(len(dataset_copy))
      elem=dataset_copy.pop(index)
      fold.append(elem)
    dataset_split.append(fold)
  return dataset_split


###############START code###########################
seed(1)
n_folds=2
lr=0.01
epoches=200
minmax=dataset_minmax(dataset)
dataset_normal=normalize_dataset(dataset, minmax)
dataset_split_folds=cross_validation_split(dataset_normal,n_folds)
train, test =train_test_split(dataset_normal)
#print("dataset split 2 parts: train:",train,"\ntest:",test)

################# Case dataset splitted in 2 parts########################################
coef = coefficients_sgd(train, 0.01, 500)
#print("Coefficients:", coef)
predicted=[]
for row in test:
  predicted.append(predict(row, coef))
  #print("Expected=%.3f, Predicted=%.3f" % (row[-1], yhat))
actual=[row[-1] for row in test]
rmse_error=rmse_metric(actual, predicted)
print("rmse error split in 2 parts case: %.3f" % rmse_error)

################# Case dataset splitted in folds########################################
scores=[]
for fold in dataset_split_folds:
  train_set=list(dataset_split_folds)
  train_set.remove(fold)
  train_set=sum(train_set,[])
  test_set=fold
  coef = coefficients_sgd(train_set, lr, epoches)
  predicted=[]
  for row in test_set:
    predicted.append(predict(row, coef))
    actual=[row[-1] for row in test_set]
    print("actual=",actual,"  predicted=",predict(row, coef))
    rmse_error=rmse_metric(actual, predicted)
    scores.append(rmse_error)
  score=sum(scores)/float(len(scores))
print("scores",scores,"\nmean score cross validation: %.3f" % score)





Loaded data file https://goo.gl/2tMFne with 768 rows and 9 columns


len dataset 768
rmse error split in 2 parts case: 0.797
actual= [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0

IndexError: ignored