In [2]:
import pandas as pd
import seaborn as sn

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn import preprocessing

In [19]:
df = pd.read_csv("wine_quality.csv")

In [20]:
df.head()

Unnamed: 0,country,description,points,price,variety
0,US,This tremendous 100% varietal wine hails from ...,96,235.0,Cabernet Sauvignon
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",96,110.0,Tinta de Toro
2,US,Mac Watson honors the memory of a wine once ma...,96,90.0,Sauvignon Blanc
3,US,"This spent 20 months in 30% new French oak, an...",96,65.0,Pinot Noir
4,France,"This is the top wine from La Bégude, named aft...",95,66.0,Provence red blend


In [21]:
df.shape, df.dropna().shape

((103868, 5), (103868, 5))

In [22]:
col = ['country', 'points', 'price', 'variety']
df = df[col]
df['variety_id'] = df['variety'].factorize()[0]

variety_id_df = df[['price', 'variety_id']].drop_duplicates().sort_values('variety_id')
variety_to_id = dict(variety_id_df.values)
id_to_variety = dict(variety_id_df[['variety_id', 'price']].values)

df.head(n=20)

Unnamed: 0,country,points,price,variety,variety_id
0,US,96,235.0,Cabernet Sauvignon,0
1,Spain,96,110.0,Tinta de Toro,1
2,US,96,90.0,Sauvignon Blanc,2
3,US,96,65.0,Pinot Noir,3
4,France,95,66.0,Provence red blend,4
5,Spain,95,73.0,Tinta de Toro,1
6,Spain,95,65.0,Tinta de Toro,1
7,Spain,95,110.0,Tinta de Toro,1
8,US,95,65.0,Pinot Noir,3
9,US,95,60.0,Pinot Noir,3


# Pick out features X and labels y

In [25]:
X = df[['points', 'price', 'variety_id']].values
y = df['country']

In [26]:
X, y.head()

(array([[ 96., 235.,   0.],
        [ 96., 110.,   1.],
        [ 96.,  90.,   2.],
        ...,
        [ 91.,  20.,  34.],
        [ 90.,  52.,  78.],
        [ 90.,  15.,  32.]]), 0        US
 1     Spain
 2        US
 3        US
 4    France
 Name: country, dtype: object)

# Split dataset into test and train


In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

In [28]:
print(X_train)
y_train.head()

[[ 91.  38.   7.]
 [ 89.  50.  40.]
 [ 87.  38.  93.]
 ...
 [ 83.  14. 210.]
 [ 86.  30.  36.]
 [ 83.  38.  12.]]


102104        US
39339         US
57198         US
28598      Italy
85027     France
Name: country, dtype: object

# Initialise models

In [29]:
# Initialise models
LR_model         = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
LDA_model        = LinearDiscriminantAnalysis()
KNN_model        = KNeighborsClassifier()
GaussianNB_model = GaussianNB()
DTree_model      = DecisionTreeClassifier()
SVC_model        = LinearSVC(multi_class='ovr')

In [30]:
xgb_model = XGBClassifier(booster='gbtree', objective='multi:softprob', random_state=42, eval_metric="auc", num_class=4)

In [31]:
RandFC_model = RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=0)

# Fit the models

In [32]:
LR_model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=0, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [33]:
LDA_model.fit(X_train, y_train)

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)

In [34]:
KNN_model.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [35]:
GaussianNB_model.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [36]:
DTree_model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [37]:
SVC_model.fit(X_train, y_train)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [38]:
xgb_model.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, eval_metric='auc', gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, num_class=4, objective='multi:softprob',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=None, subsample=1, verbosity=1)

In [39]:
RandFC_model.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

# Save predictions for evaluation


In [40]:
test_set = X_test
test_labels = y_test
list_of_predictions = []

LR_prediction    = LR_model.predict(test_set)
list_of_predictions.append(LR_prediction)

LDA_prediction   = LDA_model.predict(test_set)
list_of_predictions.append(LDA_prediction)

KNN_prediction   = KNN_model.predict(test_set)
list_of_predictions.append(KNN_prediction)

GNB_prediction   = GaussianNB_model.predict(test_set)
list_of_predictions.append(GNB_prediction)

DTree_prediction = DTree_model.predict(test_set)
list_of_predictions.append(DTree_prediction)

SVC_prediction   = SVC_model.predict(test_set)
list_of_predictions.append(SVC_prediction)

XGB_prediction   = xgb_model.predict(test_set)
list_of_predictions.append(XGB_prediction)

RandFC_prediction = RandFC_model.predict(test_set)
list_of_predictions.append(RandFC_prediction)

# Evaluation on test set

In [41]:
# Accuracy score is the simplest way to evaluate
for prediction in list_of_predictions:
    print(accuracy_score(prediction, test_labels))

0.6330990661403678
0.6337408940663009
0.7867847630050383
0.6298899265107025
0.8386444594204294
0.6289913674143962
0.7719585379159847
0.7235005295080389


In [42]:
# confusion_matrix for all predictions
for prediction in list_of_predictions:
    print(confusion_matrix(prediction, test_labels))

[[   89    22     6    17]
 [  171  1564   262   492]
 [    0     0     0     0]
 [ 4194  4082  2187 18075]]
[[  112    29    11    30]
 [  149  1573   263   491]
 [    0     0     0     0]
 [ 4193  4066  2181 18063]]
[[ 2390   179   221  1008]
 [  213  4430   277  1023]
 [  100   131  1454   310]
 [ 1751   928   503 16243]]
[[  422   201    85   446]
 [  184  1584   319   519]
 [    9     7    35    32]
 [ 3839  3876  2016 17587]]
[[ 2821    84    72   902]
 [   97  4761   159   801]
 [   43    77  1914   244]
 [ 1493   746   310 16637]]
[[    0     0     0     0]
 [  220  1616   426   600]
 [    0     0     0     0]
 [ 4234  4052  2029 17984]]
[[ 1613    63   136   379]
 [  155  3972   284   848]
 [   45    18  1279   166]
 [ 2641  1615   756 17191]]
[[  957    53    83   146]
 [  198  2833   240   527]
 [   45    19   958   114]
 [ 3254  2763  1174 17797]]


In [43]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

# classification_report for all predictions
for prediction in list_of_predictions:
    print(classification_report(prediction, test_labels))

              precision    recall  f1-score   support

      France       0.02      0.66      0.04       134
       Italy       0.28      0.63      0.38      2489
       Spain       0.00      0.00      0.00         0
          US       0.97      0.63      0.77     28538

   micro avg       0.63      0.63      0.63     31161
   macro avg       0.32      0.48      0.30     31161
weighted avg       0.91      0.63      0.73     31161

              precision    recall  f1-score   support

      France       0.03      0.62      0.05       182
       Italy       0.28      0.64      0.39      2476
       Spain       0.00      0.00      0.00         0
          US       0.97      0.63      0.77     28503

   micro avg       0.63      0.63      0.63     31161
   macro avg       0.32      0.47      0.30     31161
weighted avg       0.91      0.63      0.73     31161

              precision    recall  f1-score   support

      France       0.54      0.63      0.58      3798
       Italy       0.

# Test models on training set

In [30]:
test_set = X_train
test_labels = y_train
list_of_predictions = []

LR_prediction    = LR_model.predict(test_set)
list_of_predictions.append(LR_prediction)

LDA_prediction   = LDA_model.predict(test_set)
list_of_predictions.append(LDA_prediction)

KNN_prediction   = KNN_model.predict(test_set)
list_of_predictions.append(KNN_prediction)

GNB_prediction   = GaussianNB_model.predict(test_set)
list_of_predictions.append(GNB_prediction)

DTree_prediction = DTree_model.predict(test_set)
list_of_predictions.append(DTree_prediction)

SVC_prediction   = SVC_model.predict(test_set)
list_of_predictions.append(SVC_prediction)

XGB_prediction   = xgb_model.predict(test_set)
list_of_predictions.append(XGB_prediction)

RandFC_prediction = RandFC_model.predict(test_set)
list_of_predictions.append(RandFC_prediction)

In [31]:
# Accuracy score is the simplest way to evaluate
for prediction in list_of_predictions:
    print(accuracy_score(prediction, test_labels))

0.6011938327808877
0.6015101709601551
0.5401955795177906
0.5974252822974404
0.6223472292901646
0.46266521792949783
0.6046047835834238
0.6093361024385547


In [32]:
y_test.value_counts()

US        18584
Italy      5668
France     4454
Spain      2455
Name: country, dtype: int64

In [33]:
# confusion_matrix for all predictions
for prediction in list_of_predictions:
    print(confusion_matrix(prediction, test_labels))

[[  183    59    11    27]
 [    0     0     0     0]
 [    0     0     0     0]
 [10148 13057  5694 43528]]
[[  242    90    23    63]
 [    0     0     0     0]
 [    0     0     0     0]
 [10089 13026  5682 43492]]
[[ 2503  1942   802  3967]
 [ 1593  2851  1021  4716]
 [  475   501   696  1646]
 [ 5760  7822  3186 33226]]
[[ 1004   708   209  1101]
 [   53    86    18   107]
 [    0     0     0     0]
 [ 9274 12322  5478 42347]]
[[ 1187   296   111   327]
 [  509  1576   221   871]
 [   42    59   242   113]
 [ 8593 11185  5131 42244]]
[[    0     0     0     0]
 [ 3430  3796  1568 13712]
 [    0     0     0     0]
 [ 6901  9320  4137 29843]]
[[  532   235    98   169]
 [   12    33     2     7]
 [    1     6    31    16]
 [ 9786 12842  5574 43363]]
[[  649   226   114   211]
 [  134   459    56   200]
 [   36    57   149    98]
 [ 9512 12374  5386 43046]]


In [159]:
# classification_report for all predictions
for prediction in list_of_predictions:
    print(classification_report(prediction, y_test))

  'recall', 'true', average, warn_for)


              precision    recall  f1-score   support

      France       0.01      0.68      0.03        65
       Italy       0.00      0.00      0.00         0
       Spain       0.00      0.00      0.00         0
          US       1.00      0.60      0.75     20709

   micro avg       0.60      0.60      0.60     20774
   macro avg       0.25      0.32      0.20     20774
weighted avg       1.00      0.60      0.75     20774

              precision    recall  f1-score   support

      France       0.02      0.56      0.04       102
       Italy       0.00      0.00      0.00         0
       Spain       0.00      0.00      0.00         0
          US       1.00      0.60      0.75     20672

   micro avg       0.60      0.60      0.60     20774
   macro avg       0.25      0.29      0.20     20774
weighted avg       0.99      0.60      0.75     20774

              precision    recall  f1-score   support

      France       0.18      0.27      0.22      1993
       Italy       0.