In [28]:
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [29]:
df = pd.read_csv("wine_quality.csv")

In [30]:
df.head()

Unnamed: 0,country,description,points,price,variety
0,US,This tremendous 100% varietal wine hails from ...,96,235.0,Cabernet Sauvignon
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",96,110.0,Tinta de Toro
2,US,Mac Watson honors the memory of a wine once ma...,96,90.0,Sauvignon Blanc
3,US,"This spent 20 months in 30% new French oak, an...",96,65.0,Pinot Noir
4,France,"This is the top wine from La Bégude, named aft...",95,66.0,Provence red blend


In [31]:
df.shape, df.dropna().shape

((103868, 5), (103868, 5))

In [32]:
df['variety_id'] = df['variety'].factorize()[0]

variety_id_df = df[['price', 'variety_id']].drop_duplicates().sort_values('variety_id')
variety_to_id = dict(variety_id_df.values)
id_to_variety = dict(variety_id_df[['variety_id', 'price']].values)

df.head()

Unnamed: 0,country,description,points,price,variety,variety_id
0,US,This tremendous 100% varietal wine hails from ...,96,235.0,Cabernet Sauvignon,0
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",96,110.0,Tinta de Toro,1
2,US,Mac Watson honors the memory of a wine once ma...,96,90.0,Sauvignon Blanc,2
3,US,"This spent 20 months in 30% new French oak, an...",96,65.0,Pinot Noir,3
4,France,"This is the top wine from La Bégude, named aft...",95,66.0,Provence red blend,4


In [33]:
# 100 gives 2363, 200 gives 1542, 500 gives 764, 1000 gives 416, 2000 gives 233
tfidf = TfidfVectorizer(min_df=2000, stop_words='english')
features = tfidf.fit_transform(df.description)
features.shape

(103868, 233)

In [35]:
sdf = pd.SparseDataFrame(features)

In [36]:
sdf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,223,224,225,226,227,228,229,230,231,232
0,,,,,,,,,,,...,,,,0.118902,,,,,0.48106,
1,,,,,,,,,,0.173599,...,,,,,,,,,,
2,,,,,,,,,,,...,,0.283014,,0.134655,,,,,,
3,,,,,,,,,,0.173996,...,,,,,,,,,,
4,0.206754,,,0.357698,,,,,,,...,,,,0.411407,,,0.347364,,,


In [37]:
sdf_filled = sdf.fillna(0)

In [38]:
df = df.join(sdf_filled)

In [39]:
df.head()

Unnamed: 0,country,description,points,price,variety,variety_id,0,1,2,3,...,223,224,225,226,227,228,229,230,231,232
0,US,This tremendous 100% varietal wine hails from ...,96,235.0,Cabernet Sauvignon,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.118902,0.0,0.0,0.0,0.0,0.48106,0.0
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",96,110.0,Tinta de Toro,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,US,Mac Watson honors the memory of a wine once ma...,96,90.0,Sauvignon Blanc,2,0.0,0.0,0.0,0.0,...,0.0,0.283014,0.0,0.134655,0.0,0.0,0.0,0.0,0.0,0.0
3,US,"This spent 20 months in 30% new French oak, an...",96,65.0,Pinot Noir,3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,France,"This is the top wine from La Bégude, named aft...",95,66.0,Provence red blend,4,0.206754,0.0,0.0,0.357698,...,0.0,0.0,0.0,0.411407,0.0,0.0,0.347364,0.0,0.0,0.0


# Pick out features X and labels y

In [40]:
X = df.drop(columns=['country', 'description', 'variety']).values
y = df['country']

In [41]:
X, y.head()

(array([[96, 235.0, 0, ..., 0.0, 0.48106018323469013, 0.0],
        [96, 110.0, 1, ..., 0.0, 0.0, 0.0],
        [96, 90.0, 2, ..., 0.0, 0.0, 0.0],
        ...,
        [91, 20.0, 34, ..., 0.0, 0.0, 0.0],
        [90, 52.0, 78, ..., 0.0, 0.0, 0.0],
        [90, 15.0, 32, ..., 0.0, 0.0, 0.0]], dtype=object), 0        US
 1     Spain
 2        US
 3        US
 4    France
 Name: country, dtype: object)

# Split dataset into test and train


In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

In [43]:
print(X_train)
y_train.head()

[[91 38.0 7 ... 0.0 0.0 0.0]
 [89 50.0 40 ... 0.35021814397275164 0.0 0.0]
 [87 38.0 93 ... 0.0 0.0 0.0]
 ...
 [83 14.0 210 ... 0.0 0.0 0.0]
 [86 30.0 36 ... 0.0 0.222452932550563 0.0]
 [83 38.0 12 ... 0.0 0.0 0.0]]


102104        US
39339         US
57198         US
28598      Italy
85027     France
Name: country, dtype: object

# Initialise models

In [44]:
# Initialise models
LR_model         = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial', max_iter=1000)
LDA_model        = LinearDiscriminantAnalysis()
KNN_model        = KNeighborsClassifier()
GaussianNB_model = GaussianNB()
DTree_model      = DecisionTreeClassifier()
SVC_model        = LinearSVC(multi_class='ovr')
xgb_model        = XGBClassifier(booster='gbtree', objective='multi:softprob', random_state=42, eval_metric="auc", num_class=4)
RandFC_model     = RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=0)

# Fit the models

In [45]:
LR_model.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=0, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [46]:
LDA_model.fit(X_train, y_train)

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)

In [47]:
KNN_model.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [48]:
GaussianNB_model.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [49]:
DTree_model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [50]:
SVC_model.fit(X_train, y_train)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [51]:
xgb_model.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, eval_metric='auc', gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, num_class=4, objective='multi:softprob',
       random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=None, subsample=1, verbosity=1)

In [52]:
RandFC_model.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

# Save predictions for evaluation


In [53]:
test_set = X_test
test_labels = y_test
list_of_predictions = []

LR_prediction    = LR_model.predict(test_set)
list_of_predictions.append(LR_prediction)

LDA_prediction   = LDA_model.predict(test_set)
list_of_predictions.append(LDA_prediction)

KNN_prediction   = KNN_model.predict(test_set)
list_of_predictions.append(KNN_prediction)

GNB_prediction   = GaussianNB_model.predict(test_set)
list_of_predictions.append(GNB_prediction)

DTree_prediction = DTree_model.predict(test_set)
list_of_predictions.append(DTree_prediction)

SVC_prediction   = SVC_model.predict(test_set)
list_of_predictions.append(SVC_prediction)

XGB_prediction   = xgb_model.predict(test_set)
list_of_predictions.append(XGB_prediction)

RandFC_prediction = RandFC_model.predict(test_set)
list_of_predictions.append(RandFC_prediction)

# Evaluation on test set

In [54]:
# Accuracy score is the simplest way to evaluate
for prediction in list_of_predictions:
    print(accuracy_score(prediction, test_labels))

0.8556849908539521
0.86049870029845
0.8297230512499599
0.7057860787522865
0.9070312249285967
0.8124578800423606
0.8585732165206508
0.7626199415936588


In [55]:
# confusion_matrix for all predictions
for prediction in list_of_predictions:
    print(confusion_matrix(prediction, test_labels))

[[ 3139   135    84   430]
 [  168  4690   172   526]
 [  103   181  1463   256]
 [ 1044   662   736 17372]]
[[ 3140   116    62   398]
 [  128  4569   105   385]
 [  167   240  1757   453]
 [ 1019   743   531 17348]]
[[ 2842   140   164   822]
 [  203  4737   246   854]
 [   78   111  1636   268]
 [ 1331   680   409 16640]]
[[ 3418   253   144  1777]
 [  251  4388   144  1397]
 [  479   722  2012  3235]
 [  306   305   155 12175]]
[[ 3726    86    41   587]
 [  101  5115    91   488]
 [   57    93  2111   197]
 [  570   374   212 17312]]
[[ 2997   242    62   291]
 [   48  2814    34   101]
 [  106   768  1562   248]
 [ 1303  1844   797 17944]]
[[ 2533    57    53   136]
 [  152  4563   157   371]
 [   81    98  1688   107]
 [ 1688   950   557 17970]]
[[  997     0     0     4]
 [   74  3677   108   142]
 [   16    21   662    10]
 [ 3367  1970  1685 18428]]


In [56]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

# classification_report for all predictions
for prediction in list_of_predictions:
    print(classification_report(prediction, test_labels))

              precision    recall  f1-score   support

      France       0.70      0.83      0.76      3788
       Italy       0.83      0.84      0.84      5556
       Spain       0.60      0.73      0.66      2003
          US       0.93      0.88      0.90     19814

   micro avg       0.86      0.86      0.86     31161
   macro avg       0.77      0.82      0.79     31161
weighted avg       0.87      0.86      0.86     31161

              precision    recall  f1-score   support

      France       0.70      0.84      0.77      3716
       Italy       0.81      0.88      0.84      5187
       Spain       0.72      0.67      0.69      2617
          US       0.93      0.88      0.91     19641

   micro avg       0.86      0.86      0.86     31161
   macro avg       0.79      0.82      0.80     31161
weighted avg       0.87      0.86      0.86     31161

              precision    recall  f1-score   support

      France       0.64      0.72      0.67      3968
       Italy       0.

# Test models on training set

In [30]:
test_set = X_train
test_labels = y_train
list_of_predictions = []

LR_prediction    = LR_model.predict(test_set)
list_of_predictions.append(LR_prediction)

LDA_prediction   = LDA_model.predict(test_set)
list_of_predictions.append(LDA_prediction)

KNN_prediction   = KNN_model.predict(test_set)
list_of_predictions.append(KNN_prediction)

GNB_prediction   = GaussianNB_model.predict(test_set)
list_of_predictions.append(GNB_prediction)

DTree_prediction = DTree_model.predict(test_set)
list_of_predictions.append(DTree_prediction)

SVC_prediction   = SVC_model.predict(test_set)
list_of_predictions.append(SVC_prediction)

XGB_prediction   = xgb_model.predict(test_set)
list_of_predictions.append(XGB_prediction)

RandFC_prediction = RandFC_model.predict(test_set)
list_of_predictions.append(RandFC_prediction)

In [31]:
# Accuracy score is the simplest way to evaluate
for prediction in list_of_predictions:
    print(accuracy_score(prediction, test_labels))

0.6011938327808877
0.6015101709601551
0.5401955795177906
0.5974252822974404
0.6223472292901646
0.46266521792949783
0.6046047835834238
0.6093361024385547


In [32]:
y_test.value_counts()

US        18584
Italy      5668
France     4454
Spain      2455
Name: country, dtype: int64

In [33]:
# confusion_matrix for all predictions
for prediction in list_of_predictions:
    print(confusion_matrix(prediction, test_labels))

[[  183    59    11    27]
 [    0     0     0     0]
 [    0     0     0     0]
 [10148 13057  5694 43528]]
[[  242    90    23    63]
 [    0     0     0     0]
 [    0     0     0     0]
 [10089 13026  5682 43492]]
[[ 2503  1942   802  3967]
 [ 1593  2851  1021  4716]
 [  475   501   696  1646]
 [ 5760  7822  3186 33226]]
[[ 1004   708   209  1101]
 [   53    86    18   107]
 [    0     0     0     0]
 [ 9274 12322  5478 42347]]
[[ 1187   296   111   327]
 [  509  1576   221   871]
 [   42    59   242   113]
 [ 8593 11185  5131 42244]]
[[    0     0     0     0]
 [ 3430  3796  1568 13712]
 [    0     0     0     0]
 [ 6901  9320  4137 29843]]
[[  532   235    98   169]
 [   12    33     2     7]
 [    1     6    31    16]
 [ 9786 12842  5574 43363]]
[[  649   226   114   211]
 [  134   459    56   200]
 [   36    57   149    98]
 [ 9512 12374  5386 43046]]


In [159]:
# classification_report for all predictions
for prediction in list_of_predictions:
    print(classification_report(prediction, y_test))

  'recall', 'true', average, warn_for)


              precision    recall  f1-score   support

      France       0.01      0.68      0.03        65
       Italy       0.00      0.00      0.00         0
       Spain       0.00      0.00      0.00         0
          US       1.00      0.60      0.75     20709

   micro avg       0.60      0.60      0.60     20774
   macro avg       0.25      0.32      0.20     20774
weighted avg       1.00      0.60      0.75     20774

              precision    recall  f1-score   support

      France       0.02      0.56      0.04       102
       Italy       0.00      0.00      0.00         0
       Spain       0.00      0.00      0.00         0
          US       1.00      0.60      0.75     20672

   micro avg       0.60      0.60      0.60     20774
   macro avg       0.25      0.29      0.20     20774
weighted avg       0.99      0.60      0.75     20774

              precision    recall  f1-score   support

      France       0.18      0.27      0.22      1993
       Italy       0.