In [20]:
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [10]:
# Load the wine review dataset; later cells read its `description` and `country` columns.
df = pd.read_csv("wine_quality.csv")

In [3]:
# Preview the first rows to check the columns loaded as expected.
df.head()

Unnamed: 0,country,description,points,price,variety
0,US,This tremendous 100% varietal wine hails from ...,96,235.0,Cabernet Sauvignon
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",96,110.0,Tinta de Toro
2,US,Mac Watson honors the memory of a wine once ma...,96,90.0,Sauvignon Blanc
3,US,"This spent 20 months in 30% new French oak, an...",96,65.0,Pinot Noir
4,France,"This is the top wine from La Bégude, named aft...",95,66.0,Provence red blend


In [5]:
# min_df drops terms that occur in fewer than that many descriptions.
# Number of feature columns observed for different min_df values:
#   100 -> 2363, 200 -> 1542, 500 -> 764, 1000 -> 416, 1500 -> 309, 2000 -> 233
tfidf = TfidfVectorizer(stop_words='english', min_df=1500)
features = tfidf.fit_transform(df['description'])
features.shape

(103868, 309)

In [6]:
# Wrap the scipy sparse TF-IDF matrix in a pandas frame so it can be joined
# with the label column. `pd.SparseDataFrame` was removed in pandas 1.0, so use
# the `DataFrame.sparse` accessor when the installed pandas provides it and
# fall back to the legacy constructor on older releases.
if hasattr(pd.DataFrame, "sparse"):
    sdf = pd.DataFrame.sparse.from_spmatrix(features)
else:
    sdf = pd.SparseDataFrame(features)

In [7]:
# Fill any NaN entries with 0 so every feature cell holds a number.
sdf_filled = sdf.fillna(0)

In [13]:
# Keep only the target column and attach the TF-IDF features (joined on the row index).
# Note: this rebinds `df`, so `description` and the other original columns are
# no longer available afterwards; reload the CSV before re-running earlier cells.
df = df[['country']].join(sdf_filled)

In [14]:
# Check the joined frame: `country` followed by the numbered feature columns.
df.head()

Unnamed: 0,country,0,1,2,3,4,5,6,7,8,...,299,300,301,302,303,304,305,306,307,308
0,US,0.27525,0.0,0.0,0.0,0.0,0.0,0.272659,0.0,0.0,...,0.0,0.0,0.0,0.0,0.378282,0.0,0.0,0.0,0.0,0.0
1,Spain,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,US,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,US,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,France,0.0,0.0,0.206754,0.0,0.0,0.0,0.0,0.357698,0.0,...,0.0,0.0,0.347364,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Separate the target labels from the numeric feature columns.
y = df['country']
X = df.drop('country', axis=1).values

In [16]:
# Inspect the feature array and the first few labels.
X, y.head()

(array([[0.27525047, 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.16626664, ..., 0.        , 0.        ,
         0.        ]]), 0        US
 1     Spain
 2        US
 3        US
 4    France
 Name: country, dtype: object)

In [17]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [18]:
# Sanity-check the training split: the feature rows and the first few labels.
print(X_train)
y_train.head()

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.20404022 ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


102104        US
39339         US
57198         US
28598      Italy
85027     France
Name: country, dtype: object

In [21]:
# Initialise models
LR_model         = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial', max_iter=1000)
LDA_model        = LinearDiscriminantAnalysis()
KNN_model        = KNeighborsClassifier()
GaussianNB_model = GaussianNB()
MNB_model        = MultinomialNB()
DTree_model      = DecisionTreeClassifier()
SVC_model        = LinearSVC(multi_class='ovr')
xgb_model        = XGBClassifier(booster='gbtree', objective='multi:softprob', random_state=42, eval_metric="auc", num_class=4)
RandFC_model     = RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=0)

In [22]:
# Fit the multinomial logistic regression on the training split.
LR_model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=0, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [23]:
# Fit linear discriminant analysis on the training split.
LDA_model.fit(X_train, y_train)

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)

In [24]:
# Fit the k-nearest-neighbours classifier (default settings) on the training split.
KNN_model.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [None]:
# Fit Gaussian naive Bayes on the training split.
GaussianNB_model.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [None]:
# Fit multinomial naive Bayes on the training split.
MNB_model.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [None]:
# Fit the decision tree on the training split.
DTree_model.fit(X_train, y_train)

In [None]:
# Fit the one-vs-rest linear SVM on the training split.
SVC_model.fit(X_train, y_train)

In [None]:
# Fit the gradient-boosted tree classifier on the training split.
# NOTE(review): y_train holds country names as strings; some xgboost releases
# expect integer-encoded class labels — confirm against the installed version.
xgb_model.fit(X_train,y_train)

In [None]:
# Fit the random forest (1000 trees, max depth 10) on the training split.
RandFC_model.fit(X_train,y_train)

In [None]:
# Predict labels for the held-out split with every fitted model.
test_set, test_labels = X_test, y_test

LR_prediction     = LR_model.predict(test_set)
LDA_prediction    = LDA_model.predict(test_set)
KNN_prediction    = KNN_model.predict(test_set)
GNB_prediction    = GaussianNB_model.predict(test_set)
MNB_prediction    = MNB_model.predict(test_set)
DTree_prediction  = DTree_model.predict(test_set)
SVC_prediction    = SVC_model.predict(test_set)
XGB_prediction    = xgb_model.predict(test_set)
RandFC_prediction = RandFC_model.predict(test_set)

# Collected in the same order the models were initialised.
list_of_predictions = [
    LR_prediction,
    LDA_prediction,
    KNN_prediction,
    GNB_prediction,
    MNB_prediction,
    DTree_prediction,
    SVC_prediction,
    XGB_prediction,
    RandFC_prediction,
]