<a href="https://colab.research.google.com/github/stefanoridolfi/ML_From_scratch/blob/master/ch_1_8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# FROM ch_1_1.ipynb##########################
##############################################
import pandas as pd
from random import seed
from random import randrange
from csv import reader
from math import sqrt

''' if no headers in csv
def load_csv(filename_url):
  CSV_url=filename_url
  pdfile=pd.read_csv(CSV_url,header=None)
  dataset=pdfile.values.tolist()
  return dataset
  '''

# CSV with headers ##################
def load_csv(filename_url):
  """Load a semicolon-separated CSV file whose first line is a header row.

  Args:
    filename_url: Anything ``pd.read_csv`` accepts as its first argument
      (local path, URL or file-like object).

  Returns:
    A ``(dataset, headers)`` tuple. ``dataset`` is a list with one list of
    cell values per data row; ``headers`` is the list of column names.
  """
  frame = pd.read_csv(filename_url, sep=';')
  # Read the labels from ``columns`` directly; going through ``head()`` would
  # build a temporary 5-row frame only to iterate over its column names.
  return frame.values.tolist(), list(frame.columns)
##################

# Other example datasets are available at https://github.com/jbrownlee/Datasets

# White-wine quality dataset (semicolon-separated, first line holds the column names).
CSV_url='https://raw.githubusercontent.com/stefanoridolfi/ML_From_scratch/master/winequality-white.csv'

# Download and parse the file; `dataset` is a list of rows, `headers` the column names.
dataset, headers = load_csv(CSV_url)
# Report the shape of what was loaded (column count is taken from the first row).
print('Loaded data file {0} with {1} rows and {2} columns\n\n'.format(CSV_url, len(dataset),len(dataset[0])))
print("dataset with",len(headers),"headers\n Headers:  ",headers)
# NOTE(review): this dumps every row of the dataset to the output.
print("dataset:\n",dataset)

# Convert a string column to float
def str_column_to_float(dataset, column):
  """Turn the text cells of one column into floats, modifying rows in place.

  Args:
    dataset: List of rows; each row is an indexable, mutable sequence.
    column: Index of the column to convert. Every cell in that column must
      be a string; surrounding whitespace is stripped before parsing.

  Returns:
    None. The rows of ``dataset`` are updated directly.
  """
  for record in dataset:
    text = record[column]
    record[column] = float(text.strip())

# Make a prediction: compute the output y from the coefficients b0, b1, ...
def predict(row, coefficients):
  """Evaluate the linear model on a single row.

  Args:
    row: Sequence of values; every element except the last is used as an
      input feature (the last element is ignored here).
    coefficients: ``coefficients[0]`` is the intercept and
      ``coefficients[k + 1]`` is the weight applied to ``row[k]``.

  Returns:
    The intercept plus the weighted sum of the input features.
  """
  yhat = coefficients[0]
  # Accumulate term by term, in column order, starting from the intercept.
  for position, feature in enumerate(row[:-1]):
    yhat += coefficients[position + 1] * feature
  return yhat

# Estimate linear regression coefficients using stochastic gradient descent
def coefficients_sgd(train, l_rate, n_epoch):
  """Fit linear-regression coefficients with per-row gradient updates.

  Args:
    train: List of rows; the last element of each row is the expected
      output and the preceding elements are the input features.
    l_rate: Step size applied to every coefficient update.
    n_epoch: Number of full passes over ``train``.

  Returns:
    A list with one coefficient per column of ``train[0]``: the intercept
    first, followed by one weight per input feature.
  """
  coef = [0.0] * len(train[0])
  for _ in range(n_epoch):
    # Squared-error total for the epoch; only useful for debugging output.
    sum_error = 0
    for sample in train:
      error = predict(sample, coef) - sample[-1]
      sum_error += error ** 2
      # The intercept has an implicit input of 1.
      coef[0] -= l_rate * error
      for slot, feature in enumerate(sample[:-1], start=1):
        coef[slot] -= l_rate * error * feature
  return coef

#Find the min and max values for each column
def dataset_minmax(dataset):
  """Collect the smallest and largest value of every column.

  Args:
    dataset: Non-empty list of rows; the number of columns is taken from
      the first row.

  Returns:
    A list with one ``[min, max]`` pair per column, in column order.
  """
  n_columns = len(dataset[0])
  # Lazily produce one column at a time as a list of its values.
  columns = ([record[idx] for record in dataset] for idx in range(n_columns))
  return [[min(values), max(values)] for values in columns]

#Rescale dataset columns to the range 0-1
def normalize_dataset(dataset, minmax):
  """Return a min-max scaled copy of ``dataset``.

  Each value is mapped to ``(value - min) / (max - min)`` using the bounds
  of its column. The input rows are left untouched: new row lists are
  built, because a shallow ``list(dataset)`` copy would still share (and
  overwrite) the caller's rows.

  Args:
    dataset: List of rows of numeric values.
    minmax: One ``[min, max]`` pair per column, e.g. the result of
      ``dataset_minmax``.

  Returns:
    A new list of new rows holding the rescaled values. Columns whose
    minimum equals their maximum are mapped to ``0.0`` rather than
    dividing by zero.
  """
  dataset_normal = []
  for row in dataset:
    scaled = []
    for i, value in enumerate(row):
      low = minmax[i][0]
      span = minmax[i][1] - low
      # A constant column carries no spread to rescale; pin it to 0.0.
      scaled.append((value - low) / span if span else 0.0)
    dataset_normal.append(scaled)
  return dataset_normal

#Calculate root mean squared error
def rmse_metric(actual, predicted):
  """Compute the root mean squared error between two sequences.

  Args:
    actual: Sequence of expected values; its length determines how many
      pairs are compared.
    predicted: Sequence of predicted values, indexed in step with
      ``actual``.

  Returns:
    The square root of the mean of the squared differences.

  Raises:
    ValueError: If ``actual`` is empty, since the mean is undefined.
  """
  if len(actual) == 0:
    raise ValueError("rmse_metric requires at least one value")
  sum_error = 0.0
  for i, expected in enumerate(actual):
    prediction_error = predicted[i] - expected
    sum_error += (prediction_error ** 2)
  # The mean is taken once, after all squared errors have been accumulated.
  mean_error = sum_error / float(len(actual))
  return sqrt(mean_error)





# Rescale every column (inputs and the last, expected-output column alike) with
# its own min/max, so the error reported below is in normalized units.
minmax=dataset_minmax(dataset)
dataset_normal=normalize_dataset(dataset, minmax)
# TODO: split dataset_normal into train and test sets; use train to fit the
# coefficients and test to compute the RMSE. For now the same rows serve both
# purposes, so the figure printed below is an in-sample error.
# Fit with learning rate 0.01 for 500 epochs.
coef = coefficients_sgd(dataset_normal, 0.01, 500)
#print("Coefficients:", coef)
# Predict each row with the fitted coefficients.
predicted=[]
for row in dataset_normal:
  predicted.append(predict(row, coef))
  #print("Expected=%.3f, Predicted=%.3f" % (row[-1], yhat))

# The expected output is the last element of each (normalized) row.
actual=[row[-1] for row in dataset_normal]
rmse_error=rmse_metric(actual, predicted)
print("rmse error",rmse_error)