## Importing Librarys

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler

## Loading Dataset

In [19]:
# Load the dataset
df = pd.read_csv('winequality-red.csv')

In [3]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


## Processing Dataset

In [4]:
# Create a new column for binary classification
df['good_quality'] = df['quality'] >= 7
df['good_quality'] = df['good_quality'].astype(int)
df.drop('quality', axis=1, inplace=True)

In [18]:
# Checking new column
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,good_quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0


In [7]:
# Separate features and target variable
X = df.drop('good_quality', axis=1)
y = df['good_quality']

## Training Data

In [9]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [10]:
# Scale numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [14]:
# Finding Best value of K
best_k = 0
highest_accuracy = 0

for k in range(1, 21):  # Testing k from 1 to 20
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    accuracy = knn.score(X_test, y_test)
    if accuracy > highest_accuracy:
        highest_accuracy = accuracy
        best_k = k

print(f"Best k value: {best_k}")
print(f"Highest accuracy: {highest_accuracy}")

Best k value: 4
Highest accuracy: 0.859375


In [15]:
# Logistic Regression
logistic_regression = LogisticRegression(max_iter=1000)
logistic_regression.fit(X_train, y_train)
log_reg_pred = logistic_regression.predict(X_test)

# K-Nearest Neighbors
knn = KNeighborsClassifier(n_neighbors=4)
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)

# Decision Trees
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)
decision_tree_pred = decision_tree.predict(X_test)

# Random Forest
random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train)
random_forest_pred = random_forest.predict(X_test)

## Evaluating all models

In [16]:
# Evaluate models
models = ["Logistic Regression", "K-Nearest Neighbors", "Decision Trees", "Random Forest"]
predictions = [log_reg_pred, knn_pred, decision_tree_pred, random_forest_pred]

for model, pred in zip(models, predictions):
    acc_score = accuracy_score(y_test, pred)
    f1 = f1_score(y_test, pred, average='weighted')
    print(f"Model: {model}")
    print(f"Accuracy Score: {acc_score:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("-------------------------------------")

Model: Logistic Regression
Accuracy Score: 0.8594
F1 Score: 0.8344
-------------------------------------
Model: K-Nearest Neighbors
Accuracy Score: 0.8594
F1 Score: 0.8182
-------------------------------------
Model: Decision Trees
Accuracy Score: 0.8812
F1 Score: 0.8802
-------------------------------------
Model: Random Forest
Accuracy Score: 0.9062
F1 Score: 0.8980
-------------------------------------


## Logistic Regression from Scratch

In [56]:
# Logistic Regression from Scratch
class LogisticRegressionScratch:
    def __init__(self, learning_rate=0.01, n_iterations=1000):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        self.theta = np.zeros(X.shape[1])
        m = len(y)
        for _ in range(self.n_iterations):
            z = np.dot(X, self.theta)
            h = self.sigmoid(z)
            gradient = np.dot(X.T, (h - y)) / m
            self.theta -= self.learning_rate * gradient

    def predict(self, X):
        return np.round(self.sigmoid(np.dot(X, self.theta)))

# Instantiate and train Logistic Regression from Scratch
log_reg_scratch = LogisticRegressionScratch()
log_reg_scratch.fit(X_train, y_train)

# Predict on test set
predictions_scratch = log_reg_scratch.predict(X_test)

# Evaluate Logistic Regression from Scratch
acc_score_scratch = accuracy_score(y_test, predictions_scratch)
f1_scratch = f1_score(y_test, predictions_scratch, average='weighted')

print("Logistic Regression from Scratch")
print(f"Accuracy Score: {acc_score_scratch:.4f}")
print(f"F1 Score: {f1_scratch:.4f}")

Logistic Regression from Scratch
Accuracy Score: 0.8531
F1 Score: 0.7855
