In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC

from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.preprocessing import StandardScaler

In [3]:
df = pd.read_csv("wine_quality.csv")

In [4]:
df.head()

Unnamed: 0,country,description,points,price,variety
0,US,This tremendous 100% varietal wine hails from ...,96,235.0,Cabernet Sauvignon
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",96,110.0,Tinta de Toro
2,US,Mac Watson honors the memory of a wine once ma...,96,90.0,Sauvignon Blanc
3,US,"This spent 20 months in 30% new French oak, an...",96,65.0,Pinot Noir
4,France,"This is the top wine from La Bégude, named aft...",95,66.0,Provence red blend


In [10]:
# Pick out features X and labels y
X = df.iloc[:,2:4].values
y = df['country']

In [11]:
X, y.head()

(array([[ 96., 235.],
        [ 96., 110.],
        [ 96.,  90.],
        ...,
        [ 91.,  20.],
        [ 90.,  52.],
        [ 90.,  15.]]), 0        US
 1     Spain
 2        US
 3        US
 4    France
 Name: country, dtype: object)

In [50]:
# Split dataset into test and train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [13]:
print(X_train)
y_train.head()

[[91. 60.]
 [84. 13.]
 [87. 26.]
 ...
 [87. 28.]
 [90. 26.]
 [93. 55.]]


17649      Spain
87040         US
44300         US
101288    France
81544      Italy
Name: country, dtype: object

In [14]:
# Initialise models
LR_model         = LogisticRegression(multi_class='auto', solver='lbfgs')
LDA_model        = LinearDiscriminantAnalysis()
KNN_model        = KNeighborsClassifier()
GaussianNB_model = GaussianNB()
DTree_model      = DecisionTreeClassifier()
SVC_model        = LinearSVC()

In [15]:
xgb_model = XGBClassifier(booster='gbtree', objective='multi:softprob', random_state=42, eval_metric="auc", num_class=4)

## Addition of scaling here

In [60]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)

In [61]:
X_test = scaler.transform(X_test)

In [62]:
LR_model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='auto',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [63]:
LDA_model.fit(X_train, y_train)

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)

In [64]:
KNN_model.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [65]:
GaussianNB_model.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [66]:
DTree_model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [67]:
SVC_model.fit(X_train, y_train)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [68]:
xgb_model.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, eval_metric='auc', gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, num_class=4, objective='multi:softprob',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=None, subsample=1, verbosity=1)

In [69]:
# Save predictions for evaluation
list_of_predictions = []

LR_prediction    = LR_model.predict(X_test)
list_of_predictions.append(LR_prediction)

LDA_prediction   = LDA_model.predict(X_test)
list_of_predictions.append(LDA_prediction)

KNN_prediction   = KNN_model.predict(X_test)
list_of_predictions.append(KNN_prediction)

GNB_prediction   = GaussianNB_model.predict(X_test)
list_of_predictions.append(GNB_prediction)

DTree_prediction = DTree_model.predict(X_test)
list_of_predictions.append(DTree_prediction)

SVC_prediction   = SVC_model.predict(X_test)
list_of_predictions.append(SVC_prediction)


In [70]:
XGB_prediction   = xgb_model.predict(X_test)
list_of_predictions.append(XGB_prediction)


In [71]:
# Accuracy score is the simplest way to evaluate
for prediction in list_of_predictions:
    print(accuracy_score(prediction, y_test))

0.16621738711851353
0.16159622605179552
0.16313661307403485
0.31707904110907864
0.14782901704053145
0.17170501588524117
0.1425820737460287


In [72]:
y_test.value_counts()

US        12461
Italy      3724
France     2958
Spain      1631
Name: country, dtype: int64

In [73]:
# confusion_matrix for all predictions
for prediction in list_of_predictions:
    print(confusion_matrix(prediction, y_test))

[[1225 1496  328 5281]
 [1733 2228 1303 7180]
 [   0    0    0    0]
 [   0    0    0    0]]
[[1431 1798  417 6627]
 [1527 1926 1214 5834]
 [   0    0    0    0]
 [   0    0    0    0]]
[[1801 2194  583 8352]
 [1142 1515 1011 4036]
 [   0    0    0    0]
 [  15   15   37   73]]
[[1568 1955  479 7442]
 [   0    0    0    0]
 [   0    0    0    0]
 [1390 1769 1152 5019]]
[[ 2318  2974   902 10496]
 [  640   748   728  1960]
 [    0     0     0     0]
 [    0     2     1     5]]
[[ 894 1051  229 2961]
 [2064 2673 1402 9500]
 [   0    0    0    0]
 [   0    0    0    0]]
[[ 2950  3712  1607 12397]
 [    8    12    24    64]
 [    0     0     0     0]
 [    0     0     0     0]]


In [74]:
# classification_report for all predictions
for prediction in list_of_predictions:
    print(classification_report(prediction, y_test))

  'recall', 'true', average, warn_for)


              precision    recall  f1-score   support

      France       0.41      0.15      0.22      8330
       Italy       0.60      0.18      0.28     12444
       Spain       0.00      0.00      0.00         0
          US       0.00      0.00      0.00         0

   micro avg       0.17      0.17      0.17     20774
   macro avg       0.25      0.08      0.12     20774
weighted avg       0.52      0.17      0.25     20774

              precision    recall  f1-score   support

      France       0.48      0.14      0.22     10273
       Italy       0.52      0.18      0.27     10501
       Spain       0.00      0.00      0.00         0
          US       0.00      0.00      0.00         0

   micro avg       0.16      0.16      0.16     20774
   macro avg       0.25      0.08      0.12     20774
weighted avg       0.50      0.16      0.24     20774

              precision    recall  f1-score   support

      France       0.61      0.14      0.23     12930
       Italy       0.