In [26]:
import pandas as pd
import seaborn as sn

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn import preprocessing

In [4]:
df = pd.read_csv("wine_quality.csv")

In [27]:
df.head()

Unnamed: 0,country,description,points,price,variety
0,US,This tremendous 100% varietal wine hails from ...,96,235.0,Cabernet Sauvignon
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",96,110.0,Tinta de Toro
2,US,Mac Watson honors the memory of a wine once ma...,96,90.0,Sauvignon Blanc
3,US,"This spent 20 months in 30% new French oak, an...",96,65.0,Pinot Noir
4,France,"This is the top wine from La Bégude, named aft...",95,66.0,Provence red blend


# Pick out features X and labels y

In [6]:
X = df.iloc[:,2:4].values
y = df['country']

In [7]:
X, y.head()

(array([[ 96., 235.],
        [ 96., 110.],
        [ 96.,  90.],
        ...,
        [ 91.,  20.],
        [ 90.,  52.],
        [ 90.,  15.]]), 0        US
 1     Spain
 2        US
 3        US
 4    France
 Name: country, dtype: object)

# Split dataset into test and train


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

In [9]:
print(X_train)
y_train.head()

[[91. 38.]
 [89. 50.]
 [87. 38.]
 ...
 [83. 14.]
 [86. 30.]
 [83. 38.]]


102104        US
39339         US
57198         US
28598      Italy
85027     France
Name: country, dtype: object

# Initialise models

In [10]:
# Initialise models
LR_model         = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
LDA_model        = LinearDiscriminantAnalysis()
KNN_model        = KNeighborsClassifier()
GaussianNB_model = GaussianNB()
DTree_model      = DecisionTreeClassifier()
SVC_model        = LinearSVC(multi_class='ovr')

In [11]:
xgb_model = XGBClassifier(booster='gbtree', objective='multi:softprob', random_state=42, eval_metric="auc", num_class=4)

In [12]:
RandFC_model = RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=0)

# Fit the models

In [13]:
LR_model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=0, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [14]:
LDA_model.fit(X_train, y_train)

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)

In [15]:
KNN_model.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [16]:
GaussianNB_model.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [17]:
DTree_model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [18]:
SVC_model.fit(X_train, y_train)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [19]:
xgb_model.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, eval_metric='auc', gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, num_class=4, objective='multi:softprob',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=None, subsample=1, verbosity=1)

In [20]:
RandFC_model.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

# Save predictions for evaluation


In [35]:
test_set = X_test
test_labels = y_test
list_of_predictions = []

LR_prediction    = LR_model.predict(test_set)
list_of_predictions.append(LR_prediction)

LDA_prediction   = LDA_model.predict(test_set)
list_of_predictions.append(LDA_prediction)

KNN_prediction   = KNN_model.predict(test_set)
list_of_predictions.append(KNN_prediction)

GNB_prediction   = GaussianNB_model.predict(test_set)
list_of_predictions.append(GNB_prediction)

DTree_prediction = DTree_model.predict(test_set)
list_of_predictions.append(DTree_prediction)

SVC_prediction   = SVC_model.predict(test_set)
list_of_predictions.append(SVC_prediction)

XGB_prediction   = xgb_model.predict(test_set)
list_of_predictions.append(XGB_prediction)

RandFC_prediction = RandFC_model.predict(test_set)
list_of_predictions.append(RandFC_prediction)

# Evaluation on test set

In [36]:
# Accuracy score is the simplest way to evaluate
for prediction in list_of_predictions:
    print(accuracy_score(prediction, test_labels))

0.598408266743686
0.5987291807066526
0.5256891627354706
0.5945893905843843
0.6076505888771221
0.46086454221623185
0.6007509386733417
0.6026122396585475


In [37]:
# confusion_matrix for all predictions
for prediction in list_of_predictions:
    print(confusion_matrix(prediction, test_labels))

[[   77    20     8    14]
 [    0     0     0     0]
 [    0     0     0     0]
 [ 4377  5648  2447 18570]]
[[  103    31    16    30]
 [    0     0     0     0]
 [    0     0     0     0]
 [ 4351  5637  2439 18554]]
[[  963   918   366  1732]
 [  738  1149   437  2150]
 [  223   209   284   717]
 [ 2530  3392  1368 13985]]
[[  426   276   103   488]
 [   33    40     5    34]
 [    0     0     0     0]
 [ 3995  5352  2347 18062]]
[[  419   162    65   223]
 [  256   575    98   425]
 [   18    29    88    83]
 [ 3761  4902  2204 17853]]
[[    0     0     0     0]
 [ 1418  1616   660  5839]
 [    0     0     0     0]
 [ 3036  4052  1795 12745]]
[[  229    90    50   108]
 [    2    14     1     3]
 [    2     2    13     9]
 [ 4221  5562  2391 18464]]
[[  250    87    56   128]
 [   60   168    24    89]
 [   14    18    63    70]
 [ 4130  5395  2312 18297]]


In [40]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

# classification_report for all predictions
for prediction in list_of_predictions:
    print(classification_report(prediction, test_labels))

              precision    recall  f1-score   support

      France       0.02      0.65      0.03       119
       Italy       0.00      0.00      0.00         0
       Spain       0.00      0.00      0.00         0
          US       1.00      0.60      0.75     31042

   micro avg       0.60      0.60      0.60     31161
   macro avg       0.25      0.31      0.20     31161
weighted avg       1.00      0.60      0.75     31161

              precision    recall  f1-score   support

      France       0.02      0.57      0.04       180
       Italy       0.00      0.00      0.00         0
       Spain       0.00      0.00      0.00         0
          US       1.00      0.60      0.75     30981

   micro avg       0.60      0.60      0.60     31161
   macro avg       0.26      0.29      0.20     31161
weighted avg       0.99      0.60      0.74     31161

              precision    recall  f1-score   support

      France       0.22      0.24      0.23      3979
       Italy       0.

# Test models on training set

In [30]:
test_set = X_train
test_labels = y_train
list_of_predictions = []

LR_prediction    = LR_model.predict(test_set)
list_of_predictions.append(LR_prediction)

LDA_prediction   = LDA_model.predict(test_set)
list_of_predictions.append(LDA_prediction)

KNN_prediction   = KNN_model.predict(test_set)
list_of_predictions.append(KNN_prediction)

GNB_prediction   = GaussianNB_model.predict(test_set)
list_of_predictions.append(GNB_prediction)

DTree_prediction = DTree_model.predict(test_set)
list_of_predictions.append(DTree_prediction)

SVC_prediction   = SVC_model.predict(test_set)
list_of_predictions.append(SVC_prediction)

XGB_prediction   = xgb_model.predict(test_set)
list_of_predictions.append(XGB_prediction)

RandFC_prediction = RandFC_model.predict(test_set)
list_of_predictions.append(RandFC_prediction)

In [31]:
# Accuracy score is the simplest way to evaluate
for prediction in list_of_predictions:
    print(accuracy_score(prediction, test_labels))

0.6011938327808877
0.6015101709601551
0.5401955795177906
0.5974252822974404
0.6223472292901646
0.46266521792949783
0.6046047835834238
0.6093361024385547


In [32]:
y_test.value_counts()

US        18584
Italy      5668
France     4454
Spain      2455
Name: country, dtype: int64

In [33]:
# confusion_matrix for all predictions
for prediction in list_of_predictions:
    print(confusion_matrix(prediction, test_labels))

[[  183    59    11    27]
 [    0     0     0     0]
 [    0     0     0     0]
 [10148 13057  5694 43528]]
[[  242    90    23    63]
 [    0     0     0     0]
 [    0     0     0     0]
 [10089 13026  5682 43492]]
[[ 2503  1942   802  3967]
 [ 1593  2851  1021  4716]
 [  475   501   696  1646]
 [ 5760  7822  3186 33226]]
[[ 1004   708   209  1101]
 [   53    86    18   107]
 [    0     0     0     0]
 [ 9274 12322  5478 42347]]
[[ 1187   296   111   327]
 [  509  1576   221   871]
 [   42    59   242   113]
 [ 8593 11185  5131 42244]]
[[    0     0     0     0]
 [ 3430  3796  1568 13712]
 [    0     0     0     0]
 [ 6901  9320  4137 29843]]
[[  532   235    98   169]
 [   12    33     2     7]
 [    1     6    31    16]
 [ 9786 12842  5574 43363]]
[[  649   226   114   211]
 [  134   459    56   200]
 [   36    57   149    98]
 [ 9512 12374  5386 43046]]


In [159]:
# classification_report for all predictions
for prediction in list_of_predictions:
    print(classification_report(prediction, y_test))

  'recall', 'true', average, warn_for)


              precision    recall  f1-score   support

      France       0.01      0.68      0.03        65
       Italy       0.00      0.00      0.00         0
       Spain       0.00      0.00      0.00         0
          US       1.00      0.60      0.75     20709

   micro avg       0.60      0.60      0.60     20774
   macro avg       0.25      0.32      0.20     20774
weighted avg       1.00      0.60      0.75     20774

              precision    recall  f1-score   support

      France       0.02      0.56      0.04       102
       Italy       0.00      0.00      0.00         0
       Spain       0.00      0.00      0.00         0
          US       1.00      0.60      0.75     20672

   micro avg       0.60      0.60      0.60     20774
   macro avg       0.25      0.29      0.20     20774
weighted avg       0.99      0.60      0.75     20774

              precision    recall  f1-score   support

      France       0.18      0.27      0.22      1993
       Italy       0.