In [None]:
import os
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

%matplotlib inline
print("Libraries imported...")

In [None]:
def load_data(csv_path='../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv',
              quality_threshold=7):
    """Load a wine-quality CSV and build a design matrix with binary labels.

    The file is read as comma-separated numeric data with one header row.
    Every column except the last is treated as a feature; the last column is
    the quality score used to derive the label.

    Parameters
    ----------
    csv_path : str
        Location of the CSV file. Defaults to the path used by this notebook.
    quality_threshold : float
        Rows whose quality score is >= this value are labelled 1, others 0.

    Returns
    -------
    x : np.ndarray
        Feature matrix with a leading column of ones (bias term).
    y : np.ndarray
        Integer array of 0/1 labels, one per row of ``x``.
    """
    # ndmin=2 keeps a single-row file two-dimensional so the column slicing
    # below works regardless of how many data rows the file has.
    data = np.loadtxt(csv_path, delimiter=',', skiprows=1, ndmin=2)
    features, quality = data[:, :-1], data[:, -1]
    # Prepend the bias column in one vectorized step instead of row by row.
    x = np.column_stack((np.ones(len(features)), features))
    y = (quality >= quality_threshold).astype(int)
    return x, y

In [None]:
# Load the feature matrix (first column is the bias term) and the 0/1 labels,
# then report their shapes as a quick sanity check.
X, y = load_data()
print(f"Data shape: {X.shape}\nLabels Shape: {y.shape}")

In [None]:
def standardize(data):
    """Z-score every feature column while leaving the first column untouched.

    The first column is passed through unchanged (in this notebook it is the
    bias column of ones); every remaining column is shifted to zero mean and
    scaled to unit standard deviation.

    Parameters
    ----------
    data : array-like, 2-D
        Matrix whose column 0 is kept as-is and whose columns 1.. are scaled.

    Returns
    -------
    np.ndarray
        Array of the same shape as ``data`` with standardized feature columns.
    """
    data = np.asarray(data, dtype=float)
    features = data[:, 1:]
    mean = features.mean(axis=0)
    std_dev = features.std(axis=0)
    # A constant column has zero spread; dividing by 1 instead maps it to all
    # zeros rather than producing NaN/inf from a 0/0 or x/0 division.
    safe_std = np.where(std_dev == 0, 1.0, std_dev)
    z = (features - mean) / safe_std

    return np.column_stack((data[:, 0], z))

In [None]:
# Standardize the feature columns of X (the leading bias column is passed
# through unchanged) and confirm the shape is preserved.
std_X = standardize(data=X)
print(f"Standardized Data shape: {std_X.shape}")

In [None]:
def sigmoid(z):
    """Numerically stable logistic function 1 / (1 + exp(-z)).

    Parameters
    ----------
    z : scalar or array-like
        Input value(s).

    Returns
    -------
    Scalar for scalar input, otherwise an array of the same shape as ``z``,
    with every value in [0, 1].
    """
    z = np.asarray(z)
    # exp(-|z|) never exceeds 1, so it cannot overflow for large |z|; the
    # naive exp(-z) overflows for strongly negative z.
    e = np.exp(-np.abs(z))
    out = np.where(z >= 0, 1 / (1 + e), e / (1 + e))
    # Indexing with an empty tuple turns a 0-d result back into a scalar and
    # leaves n-d arrays as they are.
    return out[()]

In [None]:
# Sanity check: the logistic function evaluated at 0 should be 0.5.
print(f"Sigmoid of 0: {sigmoid(0)}, Expected: 0.5")

In [None]:
# Decision rule: turn a probability into a hard 0/1 class label
def threshold(a):
    """Return 1 when ``a`` is at least 0.5, otherwise 0."""
    return 1 if a >= 0.5 else 0


# Function to compute cost_function / Error - Logistic Regression Cost Function
def cost_function(data, labels, weights):
    """Mean binary cross-entropy of a logistic model.

    Computes the average over samples of
    ``-y*log(h) - (1-y)*log(1-h)`` with ``h = sigmoid(data @ weights)``,
    using the algebraically identical form ``log(1 + exp(z)) - y*z`` so that
    saturated predictions (h equal to 0 or 1 in floating point) do not
    produce ``log(0)`` and hence inf/NaN.

    Parameters
    ----------
    data : np.ndarray
        Design matrix, one row per sample.
    labels : array-like
        0/1 label per sample.
    weights : np.ndarray
        Model weights, one per column of ``data``.

    Returns
    -------
    float
        Average cost over the ``len(labels)`` samples.
    """
    labels = np.asarray(labels)
    m = len(labels)
    z = np.dot(data, weights)
    # logaddexp(0, z) == log(1 + exp(z)), evaluated without overflow.
    per_sample = np.logaddexp(0, z) - labels * z
    return np.sum(per_sample) / m


# Function to compute gradients
def gradient(data, labels, weights):
    """Gradient of the mean logistic cost with respect to the weights.

    Parameters
    ----------
    data : np.ndarray
        Design matrix, one row per sample.
    labels : array-like
        0/1 label per sample.
    weights : np.ndarray
        Current model weights, one per column of ``data``.

    Returns
    -------
    np.ndarray
        One partial derivative per weight: ``data.T @ (h - labels) / m`` where
        ``h = sigmoid(data @ weights)`` and ``m = len(labels)``.
    """
    m = len(labels)
    residual = sigmoid(np.dot(data, weights)) - labels
    # A single matrix-vector product yields every partial derivative at once,
    # replacing the per-feature Python loop.
    return np.dot(data.T, residual) / m


# Batch Gradient Descent: repeatedly steps the weights against the gradient to minimize the cost
def BGD(data, labels, learning_rate, epochs):
    """Fit weights with batch gradient descent, starting from all zeros.

    Each epoch takes one step of size ``learning_rate`` along the negative
    gradient computed on the whole of ``data`` and then records the cost at
    the updated weights.

    Returns
    -------
    (list, np.ndarray)
        The cost after every epoch, and the final weight vector.
    """
    thetas = np.zeros(shape=data[0].shape)
    cost_history = []
    for _epoch in range(epochs):
        step = learning_rate * gradient(data=data, labels=labels, weights=thetas)
        thetas -= step
        current_cost = cost_function(data=data, labels=labels, weights=thetas)
        cost_history.append(current_cost)

    return cost_history, thetas

In [None]:
# Draws the learning curve recorded during training
def plot_error(error):
    """Plot the sequence ``error`` as a line chart and display it.

    Applies seaborn's 'darkgrid' style, plots the values against their index,
    labels the axes and title, then shows the figure.
    """
    sns.set_style(style='darkgrid')
    plt.plot(error)
    plt.title("Cost Function")
    plt.ylabel("Error")
    plt.xlabel("Iterations")
    plt.show()


# Classifies each row with the learned weights and counts matches against y
def predict(x, y, weights):
    """Predict a 0/1 label per row of ``x`` and count agreements with ``y``.

    Each row is dotted with ``weights``, passed through ``sigmoid`` and then
    ``threshold``.

    Returns
    -------
    (list, int)
        The predicted labels, and how many of them equal the corresponding
        entry of ``y``.
    """
    preds = []
    for row in x:
        probability = sigmoid(np.dot(row, weights))
        preds.append(threshold(probability))

    count = sum(1 for i, pred in enumerate(preds) if pred == y[i])
    return preds, count


# Root Mean Squared Error
def RMSE(predictions, actual):
    """Root of the mean squared difference between predictions and targets.

    Parameters
    ----------
    predictions : array-like
        Predicted values (lists and arrays are both accepted).
    actual : array-like
        Ground-truth values, same length as ``predictions``.

    Returns
    -------
    float
        ``sqrt(mean((predictions - actual) ** 2))``.
    """
    # Coerce both sides to float arrays so plain Python lists can be
    # subtracted and integer/boolean inputs do not hit dtype-specific
    # subtraction rules.
    errors = np.asarray(predictions, dtype=float) - np.asarray(actual, dtype=float)
    mse = np.mean(np.square(errors))
    return np.sqrt(mse)

In [None]:
# Using 20% of the data for testing, and 80% for training (shuffled, fixed seed for reproducibility).
# NOTE(review): std_X was standardized with the mean/std of the full dataset before this split,
# so the test rows influenced the scaling -- consider computing the statistics on x_train only.
x_train, x_test, y_train, y_test = train_test_split(std_X, y, test_size=0.2, shuffle=True, random_state=42)
# Using Batch-Gradient-Descent to optimize weights; J holds the cost recorded after every epoch.
J, thetas = BGD(data=x_train, labels=y_train, learning_rate=0.01, epochs=5000)
# Plot the learning curve to check that the cost decreases over the epochs.
plot_error(error=J)

In [None]:
# Predict on the held-out and training splits; predict() also returns the number of correct labels.
predictions, correct_nums = predict(x=x_test, y=y_test, weights=thetas)
predictions_train, correct_nums_train = predict(x=x_train, y=y_train, weights=thetas)
# Accuracy is reported as a percentage of correctly classified samples.
print("Training data Accuracy: {}".format((correct_nums_train / len(y_train)) * 100))
print("Test data Accuracy: {}".format((correct_nums / len(y_test)) * 100))
# RMSE is computed on the hard 0/1 predictions, not on the predicted probabilities.
print("RMSE on Test data: {}".format(RMSE(predictions=predictions, actual=y_test)))

In [None]:
# With labels=[0, 1] the 2x2 confusion matrix has true classes on the rows and predicted
# classes on the columns, so flattening it row by row yields tn, fp, fn, tp in that order.
tn, fp, fn, tp = confusion_matrix(y_true=y_test, y_pred=predictions, labels=[0, 1]).ravel()
print(f"\nTrue Negatives: {tn}\nTrue Positives: {tp}\nFalse Negative: {fn}\nFalse Positive: {fp}")

In [None]:
# Heatmap of the test-set confusion matrix. confusion_matrix expects (y_true, y_pred);
# passing them by keyword keeps rows = true class and columns = predicted class,
# consistent with the tn/fp/fn/tp unpacking used elsewhere in this notebook.
ax = sns.heatmap(
    confusion_matrix(y_true=y_test, y_pred=predictions, labels=[0, 1]),
    robust=True,
    annot=True,
    fmt="d",  # show raw integer counts instead of the default 2-significant-digit format
)
ax.set(xlabel="Predicted label", ylabel="True label", title="Confusion Matrix")
plt.show()