In [None]:
from csv import reader
from math import sqrt
from random import randrange
from random import seed

import numpy as np
import pandas as pd

In [None]:
# Pre Processing

def getdata(filename, q_rows, q_cols):
    """Load numeric values from a CSV file into a 2-D float array.

    The first line of the file is treated as a header and skipped, and
    blank lines are ignored.

    Args:
        filename: Path of the CSV file to read.
        q_rows: Number of data rows to load, or ``'all'`` to load every row.
            When a number is given the result always has that many rows;
            rows the file does not provide stay filled with zeros.
        q_cols: Number of leading columns to load from each row, or
            ``'all'`` to load every column (the width is then taken from
            the first data row).

    Returns:
        A ``numpy.ndarray`` of floats with one row per loaded record.

    Raises:
        ValueError: If a field is not numeric, a row has fewer fields than
            requested, or (with ``q_cols='all'``) rows differ in width.
    """
    take_all_rows = q_rows == 'all'
    take_all_cols = q_cols == 'all'
    width = None if take_all_cols else q_cols
    records = []

    # newline="" is what the csv module expects so that it can handle
    # line endings embedded in quoted fields itself.
    with open(filename, "r", newline="") as file:
        csv_reader = reader(file)
        next(csv_reader, None)  # the first line is the header
        for row in csv_reader:
            if not row:
                continue
            if not take_all_rows and len(records) >= q_rows:
                break
            if width is None:
                width = len(row)
            if len(row) < width or (take_all_cols and len(row) != width):
                raise ValueError(
                    "line %d of %s has %d fields, expected %d"
                    % (csv_reader.line_num, filename, len(row), width)
                )
            records.append([float(value) for value in row[:width]])

    # The array is sized only after reading, so 'all' never reaches np.zeros.
    n_rows = len(records) if take_all_rows else q_rows
    n_cols = 0 if width is None else width
    dataset = np.zeros((n_rows, n_cols))
    if records:
        dataset[:len(records)] = records
    return dataset

class NormStandart():
    """Min-max normalise and then z-score standardise a 2-D numeric dataset.

    The transformations operate in place on the array handed to the
    constructor, so the caller's array is modified as well.
    """

    def __init__(self, dataset, dsformat):
        """Store the data and the desired output format.

        Args:
            dataset: 2-D NumPy array of numeric values.
            dsformat: ``"numpy"`` or ``"pandas"``; selects what
                ``normalize_and_standartize`` returns.
        """
        self.dataset = dataset
        self.format = dsformat

    def normalize(self, dataset):
        """Rescale all values using the dataset-wide minimum and maximum.

        The single smallest value of the whole array maps to 0 and the
        single largest to 1 (the range is global, not per column). Every
        column is rescaled, whatever the width of the array.

        Args:
            dataset: NumPy array; modified in place.

        Returns:
            The same array object that was passed in.
        """
        lowest = dataset.min()
        highest = dataset.max()
        span = highest - lowest
        if span == 0:
            # A constant dataset would otherwise evaluate 0/0; map it to 0.
            dataset[...] = 0
        else:
            dataset[...] = (dataset - lowest) / span
        return dataset

    def mean(self, dataset):
        """Return the mean of every column as a 1-D float array."""
        return np.asarray(dataset, dtype=float).mean(axis=0)

    # calculate column standard deviations
    def stdev(self, dataset, means):
        """Return the sample standard deviation (n - 1) of every column.

        Args:
            dataset: 2-D array-like of numeric rows.
            means: Per-column means, as returned by ``mean``.

        Returns:
            1-D NumPy array with one standard deviation per column.
        """
        values = np.asarray(dataset, dtype=float)
        squared_dev = (values - np.asarray(means, dtype=float)) ** 2
        return np.sqrt(squared_dev.sum(axis=0) / (len(values) - 1))

    def standardize_dataset(self, dataset, means, stdevs):
        """Replace each value by ``(value - column mean) / column stdev``.

        Rows are updated in place and nothing is returned. A zero standard
        deviation is not guarded against.

        Args:
            dataset: Iterable of mutable rows (e.g. a 2-D NumPy array).
            means: Per-column means.
            stdevs: Per-column standard deviations.
        """
        means = np.asarray(means, dtype=float)
        stdevs = np.asarray(stdevs, dtype=float)
        for row in dataset:
            row[:] = (np.asarray(row, dtype=float) - means) / stdevs

    def normalize_and_standartize(self):
        """Normalise, then standardise, the stored dataset.

        Returns:
            The transformed NumPy array when the format is ``"numpy"``, a
            ``pandas.DataFrame`` wrapping it when the format is
            ``"pandas"``, and ``None`` for any other format value.
        """
        self.dataset = self.normalize(self.dataset)
        means = self.mean(self.dataset)
        stdevs = self.stdev(self.dataset, means)
        # Reuse the dedicated method instead of repeating its loop here.
        self.standardize_dataset(self.dataset, means, stdevs)

        if self.format == "numpy":
            return self.dataset
        if self.format == "pandas":
            return pd.DataFrame(data=self.dataset)
        return None

# Load the CSV, then min-max normalise and z-score standardise the values.
ds = getdata("winequality-white.csv", 'all', 'all')
ns = NormStandart(ds, "numpy")
dataset_nolabels = ns.normalize_and_standartize()
# Redundant initialisation: the for loop below rebinds i.
i = 0
# Build a 100 x 5 table from the first 100 rows: the row values plus a label
# in the last column (+1 for rows 0-49, -1 for rows 50-99).
# NOTE(review): each assignment needs np.append(...) to yield exactly 5
# values, i.e. dataset_nolabels rows must hold 4 values -- confirm this
# matches the CSV being loaded.
dataset = np.zeros((100,5))
for i in range(100):
    if i <= 49:
        dataset[i] = np.append(dataset_nolabels[i], 1, axis=None)
    else:
        dataset[i] = np.append(dataset_nolabels[i], -1, axis=None)

dataset = pd.DataFrame(data=dataset)

# Column 4 holds the labels; every column except the last is a feature.
Y = dataset.loc[:, 4]
X = dataset.iloc[:, :-1]

# Append a constant column of ones (labelled 4) to X for the intercept b.
X.insert(loc=len(X.columns), column=4, value=1)

# NOTE(review): train_test_split is not imported in the visible part of this
# file -- presumably sklearn.model_selection.train_test_split; confirm.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Show the first five rows of each split as a quick sanity check.
print("Train Data")
print(X_train.head(5))
print(y_train.head(5))

print("Test Data")
print(X_test.head(5))
print(y_test.head(5))


In [None]:
# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    """Randomly partition ``dataset`` into ``n_folds`` equally sized folds.

    Rows are drawn without replacement using ``random.randrange``; each fold
    receives ``int(len(dataset) / n_folds)`` rows, so leftover rows that do
    not fill a complete fold are not assigned to any fold. The input
    sequence itself is not modified.

    Args:
        dataset: Sequence of rows.
        n_folds: Number of folds to create.

    Returns:
        A list of ``n_folds`` lists of rows.
    """
    remaining = list(dataset)
    rows_per_fold = int(len(dataset) / n_folds)
    return [
        [remaining.pop(randrange(len(remaining))) for _ in range(rows_per_fold)]
        for _ in range(n_folds)
    ]

# Calculate root mean squared error
def rmse_metric(actual, predicted):
    """Return the root mean squared error between two value sequences.

    Args:
        actual: Sequence of true values; its length sets how many pairs
            are compared.
        predicted: Sequence of predictions, indexed in step with ``actual``.

    Returns:
        The square root of the mean squared difference.
    """
    squared_total = 0.0
    for position in range(len(actual)):
        squared_total += (predicted[position] - actual[position]) ** 2
    return sqrt(squared_total / float(len(actual)))

# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    """Score ``algorithm`` with k-fold cross-validation.

    The dataset is split with ``cross_validation_split``. Each fold in turn
    is held out as the test set, with the last value of every test row
    replaced by ``None`` so the algorithm cannot see the target, while the
    rows of all other folds form the training set.

    Args:
        dataset: Sequence of rows whose last element is the target value.
        algorithm: Callable ``algorithm(train, test, *args)`` returning one
            prediction per test row.
        n_folds: Number of cross-validation folds.
        *args: Extra positional arguments forwarded to ``algorithm``.

    Returns:
        A list with the RMSE obtained on each fold.
    """
    folds = cross_validation_split(dataset, n_folds)
    scores = []
    for held_out, fold in enumerate(folds):
        # Pick the training folds by position. list.remove() compares folds
        # by value, which raises for array-like rows and deep-compares
        # every fold; flattening in one pass also avoids the quadratic
        # sum(list_of_lists, []).
        train_set = [
            row
            for position, other in enumerate(folds)
            if position != held_out
            for row in other
        ]
        test_set = []
        for row in fold:
            row_copy = list(row)
            row_copy[-1] = None  # hide the target from the algorithm
            test_set.append(row_copy)
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        scores.append(rmse_metric(actual, predicted))
    return scores

# Make a prediction with coefficients
def predict(row, coefficients):
    """Evaluate the linear model for one row.

    Args:
        row: Sequence of feature values followed by one trailing element
            (the target slot), which is ignored.
        coefficients: Sequence whose first element is the intercept and
            whose following elements are the per-feature weights.

    Returns:
        The intercept plus the weighted sum of the row's features.
    """
    yhat = coefficients[0]
    for position, feature in enumerate(row[:-1], start=1):
        yhat += coefficients[position] * feature
    return yhat

# Estimate linear regression coefficients using stochastic gradient descent
def coefficients_sgd(train, l_rate, n_epoch):
    """Fit linear-regression coefficients by stochastic gradient descent.

    All coefficients start at 0.0 and are updated after every training row,
    for ``n_epoch`` passes over ``train``.

    Args:
        train: Non-empty sequence of rows; the last element of each row is
            the target value.
        l_rate: Learning rate applied to every update.
        n_epoch: Number of passes over the training rows.

    Returns:
        A list with the intercept first, followed by one weight per feature.
    """
    weights = [0.0] * len(train[0])
    for _ in range(n_epoch):
        for sample in train:
            residual = predict(sample, weights) - sample[-1]
            weights[0] -= l_rate * residual
            for position, feature in enumerate(sample[:-1], start=1):
                weights[position] -= l_rate * residual * feature
    return weights

# Linear Regression Algorithm With Stochastic Gradient Descent
def linear_regression_sgd(train, test, l_rate, n_epoch):
    """Train on ``train`` with SGD and predict every row of ``test``.

    Args:
        train: Training rows, target value last.
        test: Rows to predict.
        l_rate: Learning rate passed to ``coefficients_sgd``.
        n_epoch: Number of epochs passed to ``coefficients_sgd``.

    Returns:
        A list with one prediction per row of ``test``.
    """
    fitted = coefficients_sgd(train, l_rate, n_epoch)
    return [predict(sample, fitted) for sample in test]

# Linear Regression on wine quality dataset
# Seed the random module so the fold assignment drawn with randrange is
# reproducible between runs.
seed(1)
# load and prepare data
# NOTE(review): load_csv, str_column_to_float, dataset_minmax and
# normalize_dataset are not defined in the visible part of this file --
# confirm where they come from before running this cell.
# This rebinds the module-level name `dataset` used by the earlier cell.
filename = 'winequality-white.csv'
dataset = load_csv(filename)
# Convert every column, addressed by index, via str_column_to_float.
for i in range(len(dataset[0])):
	str_column_to_float(dataset, i)
# normalize
minmax = dataset_minmax(dataset)
normalize_dataset(dataset, minmax)
# evaluate algorithm
# 5-fold cross-validation of SGD linear regression, learning rate 0.01,
# 50 epochs; evaluate_algorithm returns one RMSE per fold.
n_folds = 5
l_rate = 0.01
n_epoch = 50
scores = evaluate_algorithm(dataset, linear_regression_sgd, n_folds, l_rate, n_epoch)
print('Scores: %s' % scores)
# Report the average of the per-fold RMSE values.
print('Mean RMSE: %.3f' % (sum(scores)/float(len(scores))))