In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

#from lazypredict.Supervised import LazyClassifier, LazyRegressor
from sklearn.model_selection import train_test_split
from sklearn import datasets

red_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv', sep = ';')
white_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv', sep = ';')

# create a new variable 'wine_type'
red_wine['wine_type'] = 'red'
white_wine['wine_type'] = 'white'


# bucket wine quality scores into qualitative quality labels
red_wine['quality_label'] = red_wine['quality'].apply(lambda value: 'low'
if value <= 5 else 'medium'
if value <= 7 else 'high')

red_wine['quality_label'] = pd.Categorical(red_wine['quality_label'],
categories=['low', 'medium', 'high'])

white_wine['quality_label'] = white_wine['quality'].apply(lambda value: 'low'
if value <= 5 else 'medium'
if value <= 7 else 'high')

white_wine['quality_label'] = pd.Categorical(white_wine['quality_label'],
categories=['low', 'medium', 'high'])

wines = pd.concat([red_wine, white_wine])

In [2]:
wines.shape

(6497, 14)

In [4]:
wines.loc[0,:]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,wine_type,quality_label
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red,low
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6,white,medium


In [None]:
wines.isnull().sum()


In [None]:
wine_sales = pd.read_csv('wine_sales_data.csv')

In [6]:
wine_sales.isnull().sum()

Unnamed: 0         0
country            5
description        0
designation    45735
points             0
price          13695
province           5
region_1       25060
region_2       89977
variety            0
winery             0
dtype: int64

In [7]:
winw_sales_red = wine_sales[['country','province','points','price']]

In [8]:
winw_sales_red.isnull().sum()

country         5
province        5
points          0
price       13695
dtype: int64

In [9]:
winw_sales_red.shape


(150930, 4)

In [10]:
winw_sales_red.head

<bound method NDFrame.head of        country            province  points  price
0           US          California      96 235.00
1        Spain      Northern Spain      96 110.00
2           US          California      96  90.00
3           US              Oregon      96  65.00
4       France            Provence      95  66.00
...        ...                 ...     ...    ...
150925   Italy      Southern Italy      91  20.00
150926  France           Champagne      91  27.00
150927   Italy      Southern Italy      91  20.00
150928  France           Champagne      90  52.00
150929   Italy  Northeastern Italy      90  15.00

[150930 rows x 4 columns]>

In [11]:
port_vv = winw_sales_red[(winw_sales_red['country'] == 'Portugal') & (winw_sales_red['province'] == 'Vinho Verde')]

In [12]:
port_vv.shape

(396, 4)

In [13]:
port_vv.isnull().sum()

country      0
province     0
points       0
price       86
dtype: int64

In [14]:
port_vv

Unnamed: 0,country,province,points,price
277,Portugal,Vinho Verde,92,35.00
993,Portugal,Vinho Verde,85,10.00
994,Portugal,Vinho Verde,85,9.00
1378,Portugal,Vinho Verde,91,
1558,Portugal,Vinho Verde,84,10.00
...,...,...,...,...
141129,Portugal,Vinho Verde,83,9.00
141143,Portugal,Vinho Verde,82,8.00
141148,Portugal,Vinho Verde,82,6.00
143472,Portugal,Vinho Verde,86,9.00


In [15]:
port_vv_clean = port_vv.dropna()

In [16]:
port_vv_clean.isnull().sum()

country     0
province    0
points      0
price       0
dtype: int64

In [18]:
port_vv_clean.drop(['country', 'province'], axis = 1, inplace = True)

In [19]:
port_vv_clean.describe()

Unnamed: 0,points,price
count,310.0,310.0
mean,86.35,11.4
std,2.51,4.82
min,80.0,5.0
25%,84.0,8.0
50%,86.0,10.0
75%,88.0,13.0
max,92.0,45.0


In [20]:
price_irq = port_vv_clean[port_vv_clean['price'].between(port_vv_clean['price'].quantile(0.25), port_vv_clean['price'].quantile(0.75))]

In [21]:
price_irq.describe()

Unnamed: 0,points,price
count,198.0,198.0
mean,85.98,10.29
std,2.28,1.72
min,80.0,8.0
25%,84.0,9.0
50%,85.5,10.0
75%,88.0,12.0
max,91.0,13.0


In [23]:
wines.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,wine_type,quality_label
0,7.4,0.7,0.0,1.9,0.08,11.0,34.0,1.0,3.51,0.56,9.4,5,red,low
1,7.8,0.88,0.0,2.6,0.1,25.0,67.0,1.0,3.2,0.68,9.8,5,red,low
2,7.8,0.76,0.04,2.3,0.09,15.0,54.0,1.0,3.26,0.65,9.8,5,red,low
3,11.2,0.28,0.56,1.9,0.07,17.0,60.0,1.0,3.16,0.58,9.8,6,red,medium
4,7.4,0.7,0.0,1.9,0.08,11.0,34.0,1.0,3.51,0.56,9.4,5,red,low


In [24]:
wtp_features = wines.iloc[:,:-3]
wtp_feature_names = wtp_features.columns
wtp_class_labels = np.array(wines['wine_type'])

X_train, X_test, y_train, y_test = train_test_split(wtp_features, wtp_class_labels, test_size=.2, random_state=42)

In [25]:
clf = LazyClassifier(predictions=True)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

100%|██████████| 30/30 [00:04<00:00,  6.43it/s]


In [26]:
models

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ExtraTreesClassifier,1.0,0.99,,1.0,0.29
LGBMClassifier,1.0,0.99,,1.0,0.08
RandomForestClassifier,0.99,0.99,,0.99,0.53
LabelPropagation,0.99,0.99,,0.99,0.89
LinearDiscriminantAnalysis,0.99,0.99,,0.99,0.06
LabelSpreading,0.99,0.99,,0.99,1.18
XGBClassifier,0.99,0.99,,0.99,0.14
SVC,0.99,0.99,,0.99,0.08
RidgeClassifier,0.99,0.99,,0.99,0.04
RidgeClassifierCV,0.99,0.99,,0.99,0.03


In [2]:
models.type()

NameError: name 'models' is not defined

In [27]:
predictions

Unnamed: 0,AdaBoostClassifier,BaggingClassifier,BernoulliNB,CalibratedClassifierCV,DecisionTreeClassifier,DummyClassifier,ExtraTreeClassifier,ExtraTreesClassifier,GaussianNB,KNeighborsClassifier,...,PassiveAggressiveClassifier,Perceptron,QuadraticDiscriminantAnalysis,RandomForestClassifier,RidgeClassifier,RidgeClassifierCV,SGDClassifier,SVC,XGBClassifier,LGBMClassifier
0,white,white,white,white,white,white,white,white,white,white,...,white,white,white,white,white,white,white,white,white,white
1,red,red,white,red,red,red,red,red,red,red,...,red,red,red,red,red,red,red,red,red,red
2,white,white,white,white,white,white,white,white,white,white,...,white,white,white,white,white,white,white,white,white,white
3,white,white,white,white,white,white,white,white,white,white,...,white,white,white,white,white,white,white,white,white,white
4,white,white,white,white,white,white,white,white,white,white,...,white,white,white,white,white,white,white,white,white,white
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1295,red,red,red,red,red,white,red,red,red,red,...,red,red,red,red,red,red,red,red,red,red
1296,white,white,white,white,white,white,white,white,white,white,...,white,white,white,white,white,white,white,white,white,white
1297,white,white,white,white,white,white,white,white,white,white,...,white,white,white,white,white,white,white,white,white,white
1298,white,white,white,white,white,white,white,white,white,white,...,white,white,white,white,white,white,white,white,white,white
