# Wine Dataset

#### Importing and loading dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_wine
%matplotlib inline

In [2]:
wine = load_wine()
wine.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names'])

#### Dataset Details

In [3]:
wine["DESCR"]



In [4]:
data = pd.DataFrame(wine["data"], columns=wine["feature_names"])
data.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [5]:
data.shape

(178, 13)

In [6]:
target = pd.DataFrame(wine["target"])
target.head()

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0


### Splitting dataset

In [7]:
from sklearn.model_selection import train_test_split

X, X_test, Y, Y_test = train_test_split(wine["data"], wine["target"], test_size=0.2, random_state=10)
X.shape

(142, 13)

In [8]:
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=10)
X_train.shape

(113, 13)

### Selecting Model

In [9]:
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor

In [10]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [11]:
def model(name, model):
    cv_score = cross_val_score(model, X_train, Y_train, cv=5, scoring="accuracy")
    #scores.append(cv_score)
    #names.append(name)
    mod = "%s: %0.3f (%0.3f)" % (name, cv_score.mean(), cv_score.std())
    print(mod)

In [12]:
model('DTC', DecisionTreeRegressor())

DTC: 0.929 (0.044)


In [13]:
model('LOR', LogisticRegression(solver="liblinear", multi_class="auto"))

LOR: 0.973 (0.022)


In [14]:
model('KNN', KNeighborsClassifier())

KNN: 0.674 (0.067)


In [15]:
model("GNB", GaussianNB())

GNB: 0.991 (0.018)


In [16]:
model("LDA", LinearDiscriminantAnalysis())

LDA: 0.974 (0.035)


### Training Model

In [17]:
model = GaussianNB(var_smoothing=1e-7)
model.fit(X_train, Y_train)

GaussianNB(priors=None, var_smoothing=1e-07)

In [18]:
Tcvs = cross_val_score(model, X_train, Y_train, cv=2, scoring="accuracy")
Tcvs.mean()

0.9910714285714286

In [19]:
model.score(X_train, Y_train)

0.9911504424778761

In [20]:
cvp = cross_val_predict(model, X_train, Y_train, cv=2)
confusion_matrix(Y_train, cvp)

array([[34,  1,  0],
       [ 0, 46,  0],
       [ 0,  0, 32]], dtype=int64)

In [21]:
print(classification_report(Y_train, cvp))

              precision    recall  f1-score   support

           0       1.00      0.97      0.99        35
           1       0.98      1.00      0.99        46
           2       1.00      1.00      1.00        32

    accuracy                           0.99       113
   macro avg       0.99      0.99      0.99       113
weighted avg       0.99      0.99      0.99       113



### Tuning Model Parameteers

In [11]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

pipe = make_pipeline(GaussianNB())
parameter = {
    "gaussiannb__var_smoothing":[1e-09, 1e-08, 1e-07, 1e-06, 1e-05],
}

In [12]:
GSCV = GridSearchCV(pipe, param_grid=parameter, n_jobs=-1)
GSCV.fit(X_train, Y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('gaussiannb',
                                        GaussianNB(priors=None,
                                                   var_smoothing=1e-09))],
                                verbose=False),
             iid='warn', n_jobs=-1,
             param_grid={'gaussiannb__var_smoothing': [1e-09, 1e-08, 1e-07,
                                                       1e-06, 1e-05]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [13]:
print("Best parameter (CV score = %0.3f):" % GSCV.best_score_)
print(GSCV.best_params_)

Best parameter (CV score = 0.982):
{'gaussiannb__var_smoothing': 1e-09}


In [14]:
Tcvp = cross_val_predict(GSCV, X_train, Y_train, cv=2)
print(classification_report(Y_train, Tcvp))



              precision    recall  f1-score   support

           0       1.00      0.97      0.99        35
           1       0.96      1.00      0.98        46
           2       1.00      0.97      0.98        32

    accuracy                           0.98       113
   macro avg       0.99      0.98      0.98       113
weighted avg       0.98      0.98      0.98       113



In [15]:
Vcvp = cross_val_predict(GSCV, X_val, Y_val, cv=2)
confusion_matrix(Y_val, Vcvp)



array([[14,  0,  0],
       [ 0,  7,  0],
       [ 0,  0,  8]], dtype=int64)

In [16]:
print(classification_report(Y_val, Vcvp))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       1.00      1.00      1.00         7
           2       1.00      1.00      1.00         8

    accuracy                           1.00        29
   macro avg       1.00      1.00      1.00        29
weighted avg       1.00      1.00      1.00        29



### Testing model on new data

In [17]:
cvp = GSCV.predict(X_test)
confusion_matrix(Y_test, cvp)

array([[10,  0,  0],
       [ 0, 16,  2],
       [ 0,  0,  8]], dtype=int64)

In [18]:
accuracy_score(Y_test, cvp)

0.9444444444444444

In [19]:
print(classification_report(Y_test, cvp))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      0.89      0.94        18
           2       0.80      1.00      0.89         8

    accuracy                           0.94        36
   macro avg       0.93      0.96      0.94        36
weighted avg       0.96      0.94      0.95        36

