In [49]:
import pandas as pd
import numpy as np
import sklearn 


## (i)

In [50]:
vinho_wine=pd.read_csv("winequality-red.csv")


In [51]:
vinho_wine.head()


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [52]:
vinho_wine.insert(12,"good_quality",0)

In [53]:
vinho_wine.head()
print(vinho_wine.shape)

(1599, 13)


In [54]:
for index, row in vinho_wine.iterrows():
    if row['quality'] >= 7:
        vinho_wine.at[index, 'good_quality'] = 1

In [55]:
vinho_wine.drop('quality', axis=1, inplace=True)

In [56]:
vinho_wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,good_quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0


## (ii)

In [57]:
print(vinho_wine.isnull().sum())

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
good_quality            0
dtype: int64


In [58]:
X = vinho_wine[['fixed acidity', 'volatile acidity', 'citric acid','residual sugar','chlorides','free sulfur dioxide','total sulfur dioxide','density','pH','sulphates','alcohol','good_quality']]


In [59]:
Y=vinho_wine[['good_quality']].values.ravel()

In [60]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [61]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, Y, test_size=0.25, random_state=42)


## (iii)

### (a)

In [72]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score


reg = LogisticRegression()
reg.fit(X_train, y_train)


y_pred = reg.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)
print("accuracy", accuracy)

f1score = f1_score(y_test, y_pred)
print("f1_score", f1score)

accuracy 1.0
f1_score 1.0


### (b)

In [63]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score

knn = KNeighborsClassifier(n_neighbors=4)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("accuracy", accuracy)

f1score = f1_score(y_test, y_pred)
print("f1_score", f1score)

accuracy 0.9975
f1_score 0.9904761904761905


### (c)

In [64]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score
decision_tree = DecisionTreeClassifier() 
decision_tree.fit(X_train, y_train)

y_pred = decision_tree.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("accuracy", accuracy)

f1score = f1_score(y_test, y_pred)
print("f1_score", f1score)

accuracy 1.0
f1_score 1.0


### (d)

In [65]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

random_forest = RandomForestClassifier()  

random_forest.fit(X_train,y_train)

y_pred=random_forest.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("accuracy", accuracy)

f1score = f1_score(y_test, y_pred)
print("f1_score", f1score)

accuracy 1.0
f1_score 1.0


### (e)

In [73]:
import numpy as np 
from numpy import log,dot,e,shape
import matplotlib.pyplot as plt


In [74]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

In [75]:
def compute_cost(X, y, theta):
    m = len(y)
    h = sigmoid(np.dot(X, theta))
    cost = (-1 / m) * np.sum(y * np.log(h ) + (1 - y) * np.log(1 - h ))
    return cost

In [76]:
def gradient_descent(X, y, theta, alpha, iterations):
    m = len(y)
    cost_history = []

    for i in range(iterations):
        h = sigmoid(np.dot(X, theta))
        gradient = np.dot(X.T, (h - y)) / m
        theta -= alpha * gradient

        cost = compute_cost(X, y, theta)
        cost_history.append(cost)


    return theta, cost_history

In [77]:
def f1_score(y_true, y_pred):
    true_positives = np.sum((y_true == 1) & (y_pred == 1))
    false_positives = np.sum((y_true == 0) & (y_pred == 1))
    false_negatives = np.sum((y_true == 1) & (y_pred == 0))

    precision = true_positives / (true_positives + false_positives + 1e-8)
    recall = true_positives / (true_positives + false_negatives + 1e-8)
    f1 = 2 * (precision * recall) / (precision + recall + 1e-8)

    return  f1


In [81]:
theta = np.zeros(X_train.shape[1])  
alpha = 0.001  
iterations = 10000

trained_theta, cost_history = gradient_descent(X_train, y_train, theta, alpha, iterations)

def predict(X, theta):
    return sigmoid(np.dot(X, theta))

predictions = predict(X_test, trained_theta)
predictions = np.round(predictions) 

accuracy = np.mean(predictions == y_test)
print("accuracy", accuracy)

f1 = f1_score(y_test, predictions)
print("f1_score", f1)

accuracy 0.9525
f1_score 0.8479999949798401
