# White Wine Quality

## Q1

In [17]:
import pandas as pd

# Semicolon-separated CSV, resolved relative to the notebook's working directory.
file_path = 'winequality-white.csv'
df = pd.read_csv(file_path, sep=';')

# Binary target: 1 when the quality score is above 5, otherwise 0.
df['y'] = df['quality'].gt(5).astype(int)

# Last five rows, to confirm the new column sits next to `quality`.
df.tail()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,y
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.5,11.2,6,1
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.9949,3.15,0.46,9.6,5,0
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,1
4896,5.5,0.29,0.3,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,1
4897,6.0,0.21,0.38,0.8,0.02,22.0,98.0,0.98941,3.26,0.32,11.8,6,1


## Q2 

In [15]:
# Share of rows in each class of the binary target; normalize=True yields fractions instead of raw counts.
class_counts = df['y'].value_counts(normalize=True)

# Plain {label: fraction} dict so the proportions print on a single line.
class_imbalance_ratio = class_counts.to_dict()
print(class_imbalance_ratio)


{1: 0.6651694569211923, 0: 0.33483054307880766}


## Q3

In [20]:
from sklearn.model_selection import train_test_split

# Features are every column except the raw score and the label derived from it.
X = df.drop(columns=['quality', 'y'])
y = df['y']

# Step 1: hold out 40% of the rows, keeping the class proportions of y.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42, stratify=y
)

# Step 2: cut the held-out part in half to get validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# Fraction of the full dataset that ended up in each split.
n_total = len(y)
train_size = len(y_train) / n_total
val_size = len(y_val) / n_total
test_size = len(y_test) / n_total

for fraction in (train_size, val_size, test_size):
    print(fraction)

0.5998366680277665
0.2000816659861168
0.2000816659861168


## Q4 

In [26]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then reuse it for every split,
# so validation and test rows never influence the scaling parameters.
scaler = StandardScaler().fit(X_train)

X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

# Wrap the scaled training array in a DataFrame (fresh 0..n-1 index) to inspect it.
X_train_scaled_df = pd.DataFrame(data=X_train_scaled, columns=X_train.columns)
X_train_scaled_df.tail()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
2933,1.102055,1.185956,1.314083,0.284372,0.718158,-0.074379,0.569648,0.851243,0.470388,0.781371,0.64561
2934,0.163942,-0.483471,0.151524,-0.971658,-0.459186,-0.255376,-0.597123,-0.93176,-0.193142,-0.089652,0.237645
2935,-0.422378,1.578763,1.314083,0.264747,-0.017682,-1.160364,0.73633,0.554076,-0.524907,-1.047777,-1.47581
2936,-0.656906,0.007537,-0.346716,0.088118,0.276654,0.287616,0.117227,-0.093088,1.598389,0.868473,0.400831
2937,-0.539642,0.007537,-1.343195,0.3825,-0.16485,1.554599,1.355433,0.577189,0.204976,-0.263856,-0.904658


## Q5

In [30]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Collects one (classifier name, hyperparameters, validation metrics) tuple
# per fitted configuration; reset here so re-running the cell starts clean.
results = []

# Each entry pairs an estimator with the hyperparameter settings to try on it.
# Estimators with internal randomness get a fixed random_state (the same seed
# used for the data split) so that a fresh Restart & Run All reproduces the
# same validation scores, and therefore the same model choice.
classifiers = [
    (KNeighborsClassifier(), [{'n_neighbors': n} for n in [1, 3, 5]]),
    (SVC(), [{'kernel': k} for k in ['rbf', 'linear', 'poly']]),
    (DecisionTreeClassifier(random_state=42), [{'criterion': c} for c in ['gini', 'entropy']]),
    (LogisticRegression(solver='liblinear', random_state=42), [{'penalty': p} for p in ['l1', 'l2']])
]

def compute_metrics(y_true, y_pred):
    """Score predictions against the true labels with four classification metrics.

    Args:
        y_true: True labels.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.

    Returns:
        A dict with the keys 'accuracy', 'precision', 'recall' and 'F1', each
        holding the value returned by the matching ``sklearn.metrics`` function.
    """
    scorers = (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('F1', f1_score),
    )
    return {label: score(y_true, y_pred) for label, score in scorers}

# Fit every classifier/hyperparameter combination on the scaled training data
# and score its predictions on the validation split.
for clf, hyperparams_list in classifiers:
    for params in hyperparams_list:
        clf.set_params(**params)  # reconfigures the shared estimator instance in place
        y_val_pred = clf.fit(X_train_scaled, y_train).predict(X_val_scaled)
        metrics = compute_metrics(y_val, y_val_pred)
        results.append((type(clf).__name__, params, metrics))

# One report line per evaluated configuration, each followed by blank lines.
for clf_name, clf_params, clf_metrics in results:
    print(f"Classifier: {clf_name}, Params: {clf_params}, Metrics: {clf_metrics}")
    print('\n')


Classifier: KNeighborsClassifier, Params: {'n_neighbors': 1}, Metrics: {'accuracy': 0.773469387755102, 'precision': 0.8257575757575758, 'recall': 0.8358895705521472, 'F1': 0.8307926829268293}


Classifier: KNeighborsClassifier, Params: {'n_neighbors': 3}, Metrics: {'accuracy': 0.7510204081632653, 'precision': 0.7947976878612717, 'recall': 0.843558282208589, 'F1': 0.818452380952381}


Classifier: KNeighborsClassifier, Params: {'n_neighbors': 5}, Metrics: {'accuracy': 0.753061224489796, 'precision': 0.8005865102639296, 'recall': 0.8374233128834356, 'F1': 0.8185907046476762}


Classifier: SVC, Params: {'kernel': 'rbf'}, Metrics: {'accuracy': 0.7612244897959184, 'precision': 0.7960339943342776, 'recall': 0.8619631901840491, 'F1': 0.8276877761413843}


Classifier: SVC, Params: {'kernel': 'linear'}, Metrics: {'accuracy': 0.736734693877551, 'precision': 0.7585301837270341, 'recall': 0.8865030674846626, 'F1': 0.8175388967468176}


Classifier: SVC, Params: {'kernel': 'poly'}, Metrics: {'accurac

## Q6 

Based on the validation-set F1 scores reported in Q5, the best model is the k-Nearest Neighbors (kNN) classifier with `n_neighbors = 1`, which achieves an F1 score of approximately 0.8308.

## Q7 

In [35]:
from sklearn.metrics import confusion_matrix

# Refit the selected configuration (kNN, 1 neighbour) on the scaled training
# data, then predict the test split, which is used nowhere else above.
best_model = KNeighborsClassifier(n_neighbors=1).fit(X_train_scaled, y_train)
y_test_pred = best_model.predict(X_test_scaled)

# Confusion matrix plus the same four metrics used for model selection.
conf_matrix = confusion_matrix(y_test, y_test_pred)
accuracy, precision, recall, f1 = (
    scorer(y_test, y_test_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

print("Confusion Matrix:")
print(conf_matrix)
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


Confusion Matrix:
[[212 116]
 [118 534]]
Accuracy: 0.7612
Precision: 0.8215
Recall: 0.8190
F1 Score: 0.8203
