In [None]:
#%%

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import preprocessing
%matplotlib inline

#%%

from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score

#%% md

### Load data

#%%

# Location of the red-wine quality CSV inside the Kaggle input directory.
wine_csv_path = '/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
df = pd.read_csv(wine_csv_path)
# Preview the first rows (shown as the cell output when run in a notebook).
df.head()

#%% md

### Visualize data

#%%

# Distribution of the raw quality scores, drawn with 20 bins on a wide canvas.
# NOTE(review): `sns.distplot` is deprecated in seaborn >= 0.11 in favour of
# `histplot`/`displot`; switch once the installed seaborn version is confirmed.
plt.figure(figsize=(16, 8))
sns.distplot(df["quality"], bins=20)

#%%

# Joint distribution of quality vs. alcohol.
# `jointplot` is a figure-level function: it creates its own figure, so a
# preceding `plt.figure(...)` call would only leave an empty extra figure
# behind. Columns are referenced by name because `data=df` is supplied.
# Despite the name, `ax` holds the grid object returned by `jointplot`.
ax = sns.jointplot(x="quality", y="alcohol", data=df)

#%%

# Annotated heat map of the pairwise correlations between all columns.
plt.figure(figsize=(16, 8))
sns.set(style="whitegrid")
# `corr` is kept as a global: the cluster-map cell reuses it.
corr = df.corr()
sns.heatmap(
    corr,
    annot=True,
    cmap="coolwarm",
)

#%%

# Hierarchically clustered view of the correlation matrix `corr`.
sns.clustermap(data=corr, cmap="coolwarm")

#%%

# Mean value of each listed feature per quality score, one line per feature.
plt.figure(figsize=(16, 8))
sns.set(style="whitegrid")
columns = ["volatile acidity", "citric acid", "sulphates", "alcohol"]
# Aggregate all listed columns in one groupby instead of once per column.
mean_by_quality = df.groupby("quality")[columns].mean()
for col in columns:
    ax = sns.lineplot(
        x=mean_by_quality.index,
        y=mean_by_quality[col],
        label=col,
    )
# Every lineplot call draws on the same axes, so labelling the last handle
# labels the whole chart.
ax.set_title('Wine characteristics over quality - closest correlation features')
ax.set_ylabel('Measure')
ax.set_xlabel('quality')

#%%

# Mean value of each listed feature per quality score, one line per feature.
plt.figure(figsize=(16, 8))
sns.set(style="whitegrid")
columns = [
    "residual sugar",
    "chlorides",
    "density",
    "pH",
    "fixed acidity",
    "free sulfur dioxide",
    "total sulfur dioxide",
]
# Aggregate all listed columns in one groupby instead of once per column.
mean_by_quality = df.groupby("quality")[columns].mean()
for col in columns:
    ax = sns.lineplot(
        x=mean_by_quality.index,
        y=mean_by_quality[col],
        label=col,
    )
# Every lineplot call draws on the same axes, so labelling the last handle
# labels the whole chart.
ax.set_title('Wine characteristics over quality - furthest correlation features')
ax.set_ylabel('Measure')
ax.set_xlabel('quality')

#%% md
### Data Analysis

#### Filter Method - Pearson Correlation

#%%
# The cutoff is stored as a negative number; its negation is the positive bound.
corr_cutoff = -0.25

print("Close correlation defined as values close to 1 or -1 on the heat map above with target feature.\n"
      "Feature correlation cutoff: ", corr_cutoff, " ", -corr_cutoff)

# Correlation of every column with the target column.
quality_corr = df.corr()["quality"]

# Keep columns whose correlation with quality lies outside (cutoff, -cutoff).
strongly_negative = quality_corr < corr_cutoff
strongly_positive = quality_corr > -corr_cutoff
corr_features = quality_corr[strongly_negative | strongly_positive]
# The target correlates perfectly with itself, so remove it from the selection.
corr_features.pop("quality")
corr_features.keys()

#%% md
### Pre-process data

#### Split labels for good and bad

#%%

quality_threshold = 6


def _quality_label(score):
    """Return "good" for scores strictly above the threshold, else "bad"."""
    if score > quality_threshold:
        return "good"
    return "bad"


# Overwrites the numeric quality column in place with the string labels, so
# this cell is meant to run once per data load.
df['quality'] = df['quality'].apply(_quality_label)
df['quality'].value_counts()

#%% md

#### Normalize features

#%%

# Restrict the frame to the columns chosen by the correlation filter and
# standardize them with StandardScaler.
# NOTE(review): the scaler is fitted on every row before the train/CV/test
# split, so CV/test statistics leak into the training features; consider
# fitting on the training split only.
selected_columns = df[corr_features.keys()]
features = preprocessing.StandardScaler().fit_transform(selected_columns)
features[:5]

#%%

# Feature matrix (scaled columns) and label vector ("good"/"bad" strings).
X, y = features, df['quality'].values

#%% md

### Prepare algorithm

#### Create hold-out train/cv/test sets

#%%

from sklearn.model_selection import train_test_split
# 70% of the rows go to training; the remaining 30% is split 60/40 into a
# cross-validation set and a test set. Both splits use the same fixed seed.
X_train, X_rest, y_train, y_rest = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=4)
X_cv, X_test, y_cv, y_test = train_test_split(
    X_rest, y_rest, test_size=0.4, shuffle=True, random_state=4)

# Report the shape of every split.
for split_name, split_X, split_y in (
        ('Train set:', X_train, y_train),
        ('CV set:', X_cv, y_cv),
        ('Test set:', X_test, y_test)):
    print(split_name, split_X.shape, split_y.shape)

#%% md

#### Pick best K

#%%

from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# Evaluate K = 1 .. Ks-1 on the CV split.
Ks = 10
candidate_ks = range(1, Ks)
mean_acc = np.zeros(len(candidate_ks))
std_acc = np.zeros(len(candidate_ks))

for slot, n_neighbors in enumerate(candidate_ks):
    knn = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train, y_train)
    cv_pred = knn.predict(X_cv)
    hits = cv_pred == y_cv
    mean_acc[slot] = metrics.accuracy_score(y_cv, cv_pred)
    # Standard error of the accuracy: std of the hit indicator over sqrt(n).
    std_acc[slot] = np.std(hits) / np.sqrt(cv_pred.shape[0])

mean_acc

#%%

# CV accuracy per K with shaded +/- 1 and +/- 3 standard-error bands.
k_axis = range(1, Ks)
plt.plot(k_axis, mean_acc, 'g')
for width, band_style in ((1, {}), (3, {"color": "green"})):
    plt.fill_between(
        k_axis,
        mean_acc - width * std_acc,
        mean_acc + width * std_acc,
        alpha=0.10,
        **band_style,
    )
plt.legend(('Accuracy ', '+/- 1xstd', '+/- 3xstd'))
plt.ylabel('Accuracy ')
plt.xlabel('Number of Neighbors (K)')
plt.tight_layout()
plt.show()

#%%

# argmax is a 0-based position while the evaluated K values start at 1.
best_slot = mean_acc.argmax()
k = best_slot + 1
print("The best accuracy was with", mean_acc.max(), "with k=", k)

#%% md

### Train

#%%

# Fit the final KNN classifier with the K chosen on the CV split.
model = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)

# Score the training split on its own predictions; previously the "Train"
# line reused the CV labels/predictions and just repeated the CV accuracy.
yhat_train = model.predict(X_train)
yhat_cv = model.predict(X_cv)
print( "Accuracy on Train set: ", metrics.accuracy_score(y_train, yhat_train))
print( "Accuracy on CV set: ", metrics.accuracy_score(y_cv, yhat_cv))

#%% md

### Evaluation

#### Prepare functions

#%%

from sklearn.metrics import classification_report, confusion_matrix
import itertools
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    Parameters
    ----------
    cm : numpy array
        2-D confusion matrix; rows are drawn on the "True label" axis and
        columns on the "Predicted label" axis.
    classes : sequence
        Tick labels, one per row/column of `cm`.
    normalize : bool
        When True, every row is divided by its sum before printing/plotting.
        Rows that sum to zero (a class with no true samples) are left as
        zeros instead of producing NaN cells.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Divide only where the row total is non-zero; elsewhere keep the
        # zeros from `out` so the colour threshold below stays finite.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Lay out only after the axis labels exist so they are given room.
    plt.tight_layout()


def compute_and_plot_confusion_matrix(y_true, y_pred, labels):
    """
    Print a classification report and plot the row-normalized confusion matrix.

    `labels` sets the row/column order of the confusion matrix and the tick
    labels of the plot. Numpy print precision is set to 2 decimals as a
    global side effect.
    """
    matrix = confusion_matrix(y_true, y_pred, labels=labels)
    np.set_printoptions(precision=2)

    report = classification_report(y_true, y_pred)
    print(report)

    plt.figure()
    plot_confusion_matrix(matrix, labels, normalize=True, title='Confusion matrix')

#%% md

#### Predict

#%%

# Predict labels for the held-out test split with the fitted KNN model.
yhat = model.predict(X_test)

#%% md

#### Calculate Scores

#%%

# Test-set metrics. Jaccard is computed for the "good" class only; F1 is the
# support-weighted average over both classes.
jaccard = jaccard_score(y_test, yhat, pos_label="good")
f1 = f1_score(y_test, yhat, average='weighted')

print( "Jaccard Score", jaccard)
print( "F1", f1)
# Score the test split itself; previously this line reused the CV
# labels/predictions and therefore reported the CV accuracy.
print( "Accuracy on test set: ", metrics.accuracy_score(y_test, yhat))

#%%

# Report and plot test-set performance; the distinct values of the full label
# vector `y` define the confusion-matrix rows/columns.
compute_and_plot_confusion_matrix(y_true=y_test, y_pred=yhat, labels=pd.unique(y))
