In [146]:
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC

from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn import preprocessing

In [118]:
df = pd.read_csv("wine_quality.csv")

In [119]:
df.head()

Unnamed: 0,country,description,points,price,variety
0,US,This tremendous 100% varietal wine hails from ...,96,235.0,Cabernet Sauvignon
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",96,110.0,Tinta de Toro
2,US,Mac Watson honors the memory of a wine once ma...,96,90.0,Sauvignon Blanc
3,US,"This spent 20 months in 30% new French oak, an...",96,65.0,Pinot Noir
4,France,"This is the top wine from La Bégude, named aft...",95,66.0,Provence red blend


In [120]:
# Pick out features X and labels y
X = df.iloc[:,2:4].values
y = df['country']

In [121]:
X, y.head()

(array([[ 96., 235.],
        [ 96., 110.],
        [ 96.,  90.],
        ...,
        [ 91.,  20.],
        [ 90.,  52.],
        [ 90.,  15.]]), 0        US
 1     Spain
 2        US
 3        US
 4    France
 Name: country, dtype: object)

In [122]:
# Split dataset into test and train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [123]:
print(X_train)
y_train.head()

[[91. 60.]
 [84. 13.]
 [87. 26.]
 ...
 [87. 28.]
 [90. 26.]
 [93. 55.]]


17649      Spain
87040         US
44300         US
101288    France
81544      Italy
Name: country, dtype: object

In [124]:
# Initialise models
LR_model         = LogisticRegression(multi_class='auto', solver='lbfgs')
LDA_model        = LinearDiscriminantAnalysis()
KNN_model        = KNeighborsClassifier()
GaussianNB_model = GaussianNB()
DTree_model      = DecisionTreeClassifier()
SVC_model        = LinearSVC()



In [147]:
xgb_model = XGBClassifier(booster='gbtree', objective='multi:softprob', random_state=42, eval_metric="auc", num_class=4)

In [125]:
LR_model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='auto',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [126]:
LDA_model.fit(X_train, y_train)

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)

In [127]:
KNN_model.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [128]:
GaussianNB_model.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [129]:
DTree_model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [130]:
SVC_model.fit(X_train, y_train)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [148]:
xgb_model.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, eval_metric='auc', gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, num_class=4, objective='multi:softprob',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=None, subsample=1, verbosity=1)

In [154]:
# Save predictions for evaluation
list_of_predictions = []

LR_prediction    = LR_model.predict(X_test)
list_of_predictions.append(LR_prediction)

LDA_prediction   = LDA_model.predict(X_test)
list_of_predictions.append(LDA_prediction)

KNN_prediction   = KNN_model.predict(X_test)
list_of_predictions.append(KNN_prediction)

GNB_prediction   = GaussianNB_model.predict(X_test)
list_of_predictions.append(GNB_prediction)

DTree_prediction = DTree_model.predict(X_test)
list_of_predictions.append(DTree_prediction)

SVC_prediction   = SVC_model.predict(X_test)
list_of_predictions.append(SVC_prediction)


In [155]:
XGB_prediction   = xgb_model.predict(X_test)
list_of_predictions.append(XGB_prediction)


In [157]:
# Accuracy score is the simplest way to evaluate
for prediction in list_of_predictions:
    print(accuracy_score(prediction, y_test))

0.6016655434677963
0.6017618176566862
0.5481370944449793
0.5994512371233273
0.6098488495234428
0.5244536439780495
0.60469818041783


In [151]:
y_test.value_counts()

US        12461
Italy      3724
France     2958
Spain      1631
Name: country, dtype: int64

In [158]:
# Accuracy score is the simplest way to evaluate
for prediction in list_of_predictions:
    print(confusion_matrix(prediction, y_test))

[[   44    12     3     6]
 [    0     0     0     0]
 [    0     0     0     0]
 [ 2914  3712  1628 12455]]
[[   57    21     7    17]
 [    0     0     0     0]
 [    0     0     0     0]
 [ 2901  3703  1624 12444]]
[[ 538  387  214  854]
 [ 421  792  233 1265]
 [ 103  107  125  410]
 [1896 2438 1059 9932]]
[[  292   169    51   293]
 [   13    27     5    34]
 [    0     0     0     0]
 [ 2653  3528  1575 12134]]
[[  249    99    27   113]
 [  162   335    72   252]
 [   21    23    55    66]
 [ 2526  3267  1477 12030]]
[[  229   141    55  1031]
 [  518   523   101  1287]
 [    0     0     0     0]
 [ 2211  3060  1475 10143]]
[[  131    47    23    37]
 [    0     5     0     1]
 [    1     0     8     5]
 [ 2826  3672  1600 12418]]


In [134]:
print(classification_report(KNN_prediction, y_test))

              precision    recall  f1-score   support

      France       0.18      0.27      0.22      1993
       Italy       0.21      0.29      0.25      2711
       Spain       0.08      0.17      0.11       745
          US       0.80      0.65      0.71     15325

   micro avg       0.55      0.55      0.55     20774
   macro avg       0.32      0.34      0.32     20774
weighted avg       0.64      0.55      0.58     20774

