In [None]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import svm
from sklearn import neighbors
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

In [None]:
# Plot settings: default figure size for every seaborn/matplotlib figure
sns.set(rc={"figure.figsize": (11, 8)})

# One colour per wine class. The saturated colours are used for the data
# points; the paler colours (same order) fill the decision regions.
point_colors = ["#42a5f5", "#ff7043", "#66bb6a"]
region_colors = ["#bbdefb", "#ffccbc", "#c8e6c9"]
wine_label_palette = sns.color_palette(point_colors)
wine_label_cmap = ListedColormap(region_colors)

In [None]:
# Load the data and transform it into a pandas DataFrame.
# The column names are supplied explicitly through `names`; the class label
# ("target") is the first column, followed by the thirteen attributes.
wine_columns = [
    "target",
    "alcohol",
    "malic_acid",
    "ash",
    "alcalinity_of_ash",
    "magnesium",
    "total_phenols",
    "flavanoids",
    "nonflavanoid_phenols",
    "proanthocyanins",
    "color_intensity",
    "hue",
    "od280/od315_of_diluted_wines",
    "proline",
]
wine_df = pd.read_csv("../input/wineuci/Wine.csv", names=wine_columns)
wine_df.head()

In [None]:
# Select just the first two attributes as features; the class label is the target
X = wine_df[["alcohol", "malic_acid"]]
Y = wine_df["target"]

# Number of samples per class, ordered by label. The labels shown on the bar
# chart are read from the data instead of being hard-coded, so they always
# match the legend of the scatter plot and the number of classes present.
label_counts = Y.value_counts().sort_index()
label_counts_df = pd.DataFrame({"Label": label_counts.index, "Count": label_counts.values})

# Plot the data selected for the analysis: scatter of the two features
# (left) and class frequencies (right)
fig, axes = plt.subplots(1, 2, figsize=(22, 8))
sns.scatterplot(
    data=X, x="alcohol", y="malic_acid", hue=Y,
    linewidth=1, edgecolor="grey", s=100, palette=wine_label_palette, ax=axes[0],
).set(xlabel="Alcohol", ylabel="Malic Acid", title="2D scatter")
sns.barplot(
    data=label_counts_df, x="Label", y="Count",
    linewidth=1, edgecolor="grey", palette=wine_label_palette, ax=axes[1],
).set(title="Distribution of labels");

In [None]:
# Split the data in training, validation and test 5:2:3.
# First 30% of the samples are held out as the test set; then 2/7 of the
# remaining 70% (i.e. 20% of the whole) becomes the validation set.
# Both splits are stratified on the labels and use the same seed.
X_train_full, X_test, Y_train_full, Y_test = train_test_split(
    X,
    Y,
    test_size=3/10,
    random_state=176,
    stratify=Y,
)
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train_full,
    Y_train_full,
    test_size=2/7,
    random_state=176,
    stratify=Y_train_full,
)

In [None]:
# Standardize the data.
# The scaler is fitted on the training split only; every other split (and
# the complete data set) is transformed with those same fitted statistics.
scaler = StandardScaler().fit(X_train)


def _standardize(frame):
    """Apply the fitted scaler to `frame`, keeping its index and column labels."""
    return pd.DataFrame(scaler.transform(frame), index=frame.index, columns=frame.columns)


X_train_scaled = _standardize(X_train)
X_val_scaled = _standardize(X_val)
X_test_scaled = _standardize(X_test)

# Training + validation together (used for the final fits) and the whole data set
X_train_full_scaled = _standardize(X_train_full)
X_scaled = _standardize(X)

In [None]:
# Distributions of the attributes, before (left) and after (right) scaling.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases —
# consider histplot/kdeplot once the target seaborn version is confirmed.
fig, axes = plt.subplots(1, 2, figsize=(22, 8))
panels = [(X, "Non-scaled distribution"), (X_scaled, "Scaled distribution")]
for ax, (frame, panel_title) in zip(axes, panels):
    # Draw order matters: it has to match the order of the legend labels below
    for column in ("malic_acid", "alcohol"):
        sns.distplot(frame[column], ax=ax)
    ax.set(xlabel="", title=panel_title)
    ax.legend(labels=["Malic Acid", "Alcohol"])

In [None]:
# Difference between scaled data and non-scaled


def _symmetric_lower_bound(values):
    """Return the smaller of min(values) and -max(values).

    Using (bound, -bound) as axis limits gives a range that is centred on
    zero and contains every value.
    """
    return min(values.min(), -values.max())


fig, axes = plt.subplots(1, 2, figsize=(22, 8))

# Find the ranges for the plot
x_min = _symmetric_lower_bound(X["alcohol"])
y_min = _symmetric_lower_bound(X["malic_acid"])
x_min_scaled = _symmetric_lower_bound(X_scaled["alcohol"])
y_min_scaled = _symmetric_lower_bound(X_scaled["malic_acid"])

# Plot: both panels share every styling option and differ only in the data
scatter_style = dict(
    x="alcohol", y="malic_acid", hue=Y,
    linewidth=1, edgecolor="grey", palette=wine_label_palette, s=100,
)
sns.scatterplot(data=X, ax=axes[0], **scatter_style)
sns.scatterplot(data=X_scaled, ax=axes[1], **scatter_style)

# Zero-centred limits with one unit of padding on each side
axes[0].set(
    xlim=(x_min-1, -x_min+1), ylim=(y_min-1, -y_min+1),
    xlabel="Alcohol", ylabel="Malic Acid", title="Non-scaled",
)
axes[1].set(
    xlim=(x_min_scaled-1, -x_min_scaled+1), ylim=(y_min_scaled-1, -y_min_scaled+1),
    xlabel="Alcohol", ylabel="Malic Acid", title="Scaled",
);

In [None]:
# Regular grid over the scaled training data, extended by a margin of 1 on
# every side and sampled every 0.01; the classifiers are evaluated on it to
# draw their decision regions.
margin, step = 1, .01
alcohol_scaled = X_train_scaled["alcohol"]
malic_acid_scaled = X_train_scaled["malic_acid"]
x_ = np.arange(alcohol_scaled.min() - margin, alcohol_scaled.max() + margin, step)
y_ = np.arange(malic_acid_scaled.min() - margin, malic_acid_scaled.max() + margin, step)
xx, yy = np.meshgrid(x_, y_)

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(20, 16))
K_list = [1, 3, 5, 7]
accuracy = {}

# Every grid point as one row, built once instead of once per model. The
# columns carry the training feature names so the fitted models are queried
# with the same kind of input they were trained on (scikit-learn >= 1.0
# warns when a model fitted on a DataFrame predicts on a bare array).
grid_points = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()], columns=X_train_scaled.columns)

# KNN for K in [1, 3, 5, 7]: one panel per K, filled row by row
for ax, K in zip(axes.flat, K_list):
    # Train on the training split and record the accuracy on the validation split
    model = neighbors.KNeighborsClassifier(K)
    model.fit(X_train_scaled, Y_train)
    accuracy[K] = model.score(X_val_scaled, Y_val)

    # Decision boundary: predicted class of every grid point, back in grid shape
    Z = model.predict(grid_points).reshape(xx.shape)

    # Plot the decision regions with the training points on top
    ax.pcolormesh(xx, yy, Z, cmap=wine_label_cmap)
    sns.scatterplot(
        data=X_train_scaled, x="alcohol", y="malic_acid", hue=Y_train,
        linewidth=1, edgecolor="grey", palette=wine_label_palette, s=100, ax=ax,
    ).set(xlabel="Alcohol", ylabel="Malic Acid", title="K="+str(K))

In [None]:
# Accuracy plot: validation accuracy of each KNN model against its K
validation_scores = list(accuracy.values())
accuracy_ax = sns.lineplot(x=K_list, y=validation_scores)
accuracy_ax.set(xlabel="K", ylabel="Accuracy", title="Accuracy on the validation set");

In [None]:
# Test accuracy: fit on training + validation data, score on the held-out test set.
# NOTE(review): K=5 is hard-coded — presumably read off the validation
# accuracy plot; confirm it is still the best K if the data or seed change.
model = neighbors.KNeighborsClassifier(n_neighbors=5).fit(X_train_full_scaled, Y_train_full)
model.score(X_test_scaled, Y_test)

In [None]:
fig, axes = plt.subplots(4, 2, figsize=(20, 32))
C_list = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
accuracy = {}

# Every grid point as one row, built once instead of once per model. The
# columns carry the training feature names so the fitted models are queried
# with the same kind of input they were trained on (scikit-learn >= 1.0
# warns when a model fitted on a DataFrame predicts on a bare array).
grid_points = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()], columns=X_train_scaled.columns)

# Linear SVC for C in [0.001, 0.01, 0.1, 1, 10, 100, 1000]: one panel per C,
# filled row by row (the eighth panel stays unused)
for ax, C in zip(axes.flat, C_list):
    # Train on the training split and record the accuracy on the validation split
    model = svm.SVC(kernel="linear", C=C)
    model.fit(X_train_scaled, Y_train)
    accuracy[C] = model.score(X_val_scaled, Y_val)

    # Decision boundary: predicted class of every grid point, back in grid shape
    Z = model.predict(grid_points).reshape(xx.shape)

    # Plot the decision regions with the training points on top
    ax.contourf(xx, yy, Z, cmap=wine_label_cmap)
    sns.scatterplot(
        data=X_train_scaled, x="alcohol", y="malic_acid", hue=Y_train,
        linewidth=1, edgecolor="grey", palette=wine_label_palette, s=100, ax=ax,
    ).set(xlabel="Alcohol", ylabel="Malic Acid", title="C="+str(C))

# Seven models on a 4x2 grid: remove the empty last panel
fig.delaxes(axes[3, 1])

In [None]:
# Accuracy plot: validation accuracy of each linear SVC against C (log-scaled axis)
validation_scores = list(accuracy.values())
accuracy_ax = sns.lineplot(x=C_list, y=validation_scores)
accuracy_ax.set(xlabel="C", ylabel="Accuracy", title="Accuracy on the validation set", xscale="log");

In [None]:
# Test accuracy: fit on training + validation data, score on the held-out test set.
# NOTE(review): C=1000 is hard-coded — presumably read off the validation
# accuracy plot; confirm it is still the best C if the data or seed change.
model = svm.SVC(kernel="linear", C=1000).fit(X_train_full_scaled, Y_train_full)
model.score(X_test_scaled, Y_test)

In [None]:
fig, axes = plt.subplots(4, 2, figsize=(20, 32))
C_list = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
accuracy = {}

# Every grid point as one row, built once instead of once per model. The
# columns carry the training feature names so the fitted models are queried
# with the same kind of input they were trained on (scikit-learn >= 1.0
# warns when a model fitted on a DataFrame predicts on a bare array).
grid_points = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()], columns=X_train_scaled.columns)

# RBF SVC (gamma="auto") for C in [0.001, 0.01, 0.1, 1, 10, 100, 1000]:
# one panel per C, filled row by row (the eighth panel stays unused)
for ax, C in zip(axes.flat, C_list):
    # Train on the training split and record the accuracy on the validation split
    model = svm.SVC(kernel="rbf", C=C, gamma="auto")
    model.fit(X_train_scaled, Y_train)
    accuracy[C] = model.score(X_val_scaled, Y_val)

    # Decision boundary: predicted class of every grid point, back in grid shape
    Z = model.predict(grid_points).reshape(xx.shape)

    # Plot the decision regions with the training points on top
    ax.contourf(xx, yy, Z, cmap=wine_label_cmap)
    sns.scatterplot(
        data=X_train_scaled, x="alcohol", y="malic_acid", hue=Y_train,
        linewidth=1, edgecolor="grey", palette=wine_label_palette, s=100, ax=ax,
    ).set(xlabel="Alcohol", ylabel="Malic Acid", title="C="+str(C))

# Seven models on a 4x2 grid: remove the empty last panel
fig.delaxes(axes[3, 1])

In [None]:
# Accuracy plot: validation accuracy of each RBF SVC against C (log-scaled axis)
validation_scores = list(accuracy.values())
accuracy_ax = sns.lineplot(x=C_list, y=validation_scores)
accuracy_ax.set(xlabel="C", ylabel="Accuracy", title="Accuracy on the validation set", xscale="log");

In [None]:
# Test accuracy: fit on training + validation data, score on the held-out test set.
# NOTE(review): C=1 is hard-coded — presumably read off the validation
# accuracy plot; confirm it is still the best C if the data or seed change.
model = svm.SVC(kernel="rbf", C=1, gamma="auto").fit(X_train_full_scaled, Y_train_full)
model.score(X_test_scaled, Y_test)

In [None]:
fig, axes = plt.subplots(7, 7, figsize=(40, 40))
C_list = np.logspace(-3, 3, 7).tolist()
G_list = np.logspace(-9, 3, 7).tolist()
accuracy = {}

# Every grid point as one row, built once instead of once per model. The
# columns carry the training feature names so the fitted models are queried
# with the same kind of input they were trained on (scikit-learn >= 1.0
# warns when a model fitted on a DataFrame predicts on a bare array).
grid_points = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()], columns=X_train_scaled.columns)

# RBF SVC for every pair of
#   C     in [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]
#   gamma in [1e-9, 1e-7, 1e-5, 1e-3, 1e-1, 10, 1000]
# Panels are laid out with gamma along the rows and C along the columns.
for col, C in enumerate(C_list):
    accuracy[C] = {}
    for row, G in enumerate(G_list):
        # Train on the training split and record the accuracy on the validation split
        model = svm.SVC(kernel="rbf", C=C, gamma=G)
        model.fit(X_train_scaled, Y_train)
        accuracy[C][G] = model.score(X_val_scaled, Y_val)

        # Decision boundary: predicted class of every grid point, back in grid shape
        Z = model.predict(grid_points).reshape(xx.shape)

        # Plot the decision regions with the training points on top
        ax = axes[row, col]
        ax.contourf(xx, yy, Z, cmap=wine_label_cmap)
        sns.scatterplot(
            data=X_train_scaled, x="alcohol", y="malic_acid", hue=Y_train,
            linewidth=1, edgecolor="grey", palette=wine_label_palette, s=100, ax=ax,
        ).set(xlabel="Alcohol", ylabel="Malic Acid", title="C="+str(C)+" gamma="+str(G))

In [None]:
# Accuracy plot
# Validation accuracies as a 7x7 array: one row per C, one column per gamma.
# The heatmap below does not read this array; it is built from the nested
# dict directly.
accuracy_matrix = np.array(
    [[accuracy[c_value][g_value] for g_value in G_list] for c_value in C_list],
    dtype=float,
)

# A dict of dicts becomes a frame with the outer keys (C) as columns and the
# inner keys (gamma) as rows, which is what the axis labels describe.
accuracy_frame = pd.DataFrame(accuracy)
sns.heatmap(accuracy_frame, cmap="YlGnBu").set(xlabel="C", ylabel="Gamma", title="Accuracy on the validation set");

In [None]:
# Test accuracy: fit on training + validation data, score on the held-out test set.
# NOTE(review): C=10 and gamma=0.1 are hard-coded — presumably read off the
# validation heatmap; confirm they are still the best pair if the data or seed change.
model = svm.SVC(kernel="rbf", C=10, gamma=0.1).fit(X_train_full_scaled, Y_train_full)
model.score(X_test_scaled, Y_test)

In [None]:
C_list = np.logspace(-3, 3, 7).tolist()
G_list = np.logspace(-9, 3, 7).tolist()
accuracy = {}  # reset here; not read in this cell

# Grid search: 5-fold cross-validation of an SVC over every (C, gamma) pair,
# run on the training + validation data.
# - The folds are not shuffled, so no `random_state` is passed: it has no
#   effect without shuffle=True and recent scikit-learn raises an error for
#   that combination.
# - The `iid` argument is no longer accepted by GridSearchCV; scores are
#   averaged over the folds, which is what iid=False asked for.
folds = KFold(n_splits=5)
grid = GridSearchCV(svm.SVC(), param_grid=dict(gamma=G_list, C=C_list), cv=folds)
result = grid.fit(X_train_full_scaled, Y_train_full)
search = pd.DataFrame(result.cv_results_)

# Mean cross-validation score with gamma along the rows and C along the
# columns. pivot() is called with keywords because recent pandas no longer
# accepts these arguments positionally.
cv_scores = search.pivot(index="param_gamma", columns="param_C", values="mean_test_score")
sns.heatmap(cv_scores, cmap="YlGnBu").set(xlabel="C", ylabel="Gamma", title="Accuracy on the validation set");

In [None]:
# Show the (C, gamma) combination selected by the grid search
result.best_params_

In [None]:
# Test accuracy
# The grid search runs with the default refit=True, so `best_estimator_` has
# already been refit on the whole of X_train_full_scaled with the best
# parameters; fitting it a second time on the same data would be redundant.
model = result.best_estimator_
model.score(X_test_scaled, Y_test)