In [1]:
# IPython.display is the public import location; importing `display` from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

# Inject a CSS rule that widens the notebook container to 93% of the page.
display(HTML("<style>.container { width:93% !important; }</style>"))

# visualizations
import matplotlib.pyplot as plt
import seaborn as sn

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier


# machine learning processing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pickle
from sklearn.preprocessing import LabelEncoder

#import metrics
import yellowbrick.classifier as ybc
from sklearn.model_selection import StratifiedKFold
from yellowbrick.model_selection import CVScores

# from scikitplot.metrics import plot_roc, plot_confusion_matrix

#basic imports
import pandas as pd
import numpy as np
import os



In [4]:
file = "../data/Clean_Wine_Data2.csv"
# The CSV carries a leftover index column ("Unnamed: 0"); drop it on load.
df = pd.read_csv(file).drop(columns="Unnamed: 0")

# A single encoder is refit for each column in turn, so after the loop it only
# remembers the mapping of the last column encoded (province).
labelencoder = LabelEncoder()
for label_col, source_col in [
    ("price_label", "Wine_Bins"),
    ("country_label", "country"),
    ("variety_label", "variety"),
    ("winery_label", "winery"),
    ("designation_label", "designation"),
    ("province_label", "province"),
]:
    # Store the integer codes in a new column next to the original text column.
    df[label_col] = labelencoder.fit_transform(df[source_col])
df.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery,Red?,wineType_encoded,Wine_Bins,price_label,country_label,variety_label,winery_label,designation_label,province_label
0,US,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,Cabernet Sauvignon,Heitz,True,1,Iconic: Over $50,0,37,20,6559,15420,50
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,Toro,Tinta de Toro,Bodega Carmen Rodríguez,True,1,Iconic: Over $50,0,34,145,1164,3975,251
2,US,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,Sauvignon Blanc,Macauley,False,0,Iconic: Over $50,0,37,123,8181,22767,50
3,US,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,Pinot Noir,Ponzi,True,1,Iconic: Over $50,0,37,102,10039,19979,259
4,Spain,"Deep, dense and pure from the opening bell, th...",Numanthia,95,73.0,Northern Spain,Toro,Toro,Tinta de Toro,Numanthia,True,1,Iconic: Over $50,0,34,145,9252,17071,251


In [5]:
# Price-bin name -> encoded price_label table. Each bin has one label, so the
# per-group mean only collapses the bin's identical labels into a single row.
wine_lookup = df.groupby("Wine_Bins")[["price_label"]].mean()
wine_lookup

Unnamed: 0_level_0,price_label
Wine_Bins,Unnamed: 1_level_1
Iconic: Over $50,0
Popular: $10-15,1
Premium: $15-20,2
Super Premium: $20-30,3
Ultra Premium: $30-50,4
Value: Under $10,5


#### Test wine 1: Bryn Mawr Vineyards 2019 Estate Riesling (Eola-Amity Hills) - a mid-priced wine picked at random from Wine Enthusiast Magazine's website. The cells below look up the encoded value for each of its features.

In [6]:
# Encoded label for country "US". All matching rows share one country_label,
# so the mean is that label as a float (NaN when nothing matches).
country_lookup = df.loc[df["country"] == "US", "country_label"].mean()
country_lookup

37.0

In [7]:
# Encoded label for variety "Riesling" (mean of identical labels; NaN when
# nothing matches).
variety_lookup = df.loc[df["variety"] == "Riesling", "variety_label"].mean()
variety_lookup

115.0

In [8]:
# Encoded label for winery "Bryn Mawr Vineyards" (mean of identical labels;
# NaN when nothing matches).
winery_lookup = df.loc[df["winery"] == "Bryn Mawr Vineyards", "winery_label"].mean()
winery_lookup

1562.0

In [9]:
# Encoded label for designation "Estate" (mean of identical labels; NaN when
# nothing matches).
designation_lookup = df.loc[df["designation"] == "Estate", "designation_label"].mean()
designation_lookup

8487.0

In [10]:
# Encoded label for province "Oregon" (mean of identical labels; NaN when
# nothing matches).
province_lookup = df.loc[df["province"] == "Oregon", "province_label"].mean()
province_lookup

259.0

In [11]:
# Single-sample feature matrix (1 row x 6 columns), built directly as 2-D.
# NOTE(review): the leading 0 is presumably the red/white flag (Riesling is a
# white wine) - confirm against the feature order used at training time.
data_test = np.array([[
    0,
    country_lookup,
    variety_lookup,
    winery_lookup,
    designation_lookup,
    province_lookup,
]])

In [16]:
# Load the pickled scaler and model from ../models.
# Context managers close both file handles; pickle.load(open(...)) left them
# open.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
with open('../models/finalized_scaler.sav', 'rb') as scaler_file:
    loaded_scaler = pickle.load(scaler_file)
with open('../models/finalized_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)



In [17]:
# Scale the single-row feature matrix with the unpickled scaler, then predict.
newData_scaled = loaded_scaler.transform(data_test)
value_test = loaded_model.predict(newData_scaled)
# Only one sample was passed in, so take the first (only) prediction.
predict_num = value_test[0]
predict_num

2

In [18]:
# Show the raw (unscaled) feature row that was fed to the scaler.
print(data_test)

[[   0.   37.  115. 1562. 8487.  259.]]


In [None]:
# Translate the predicted label back to its price-bin name via the lookup table.
label_matches = wine_lookup["price_label"] == predict_num
outcome = wine_lookup[label_matches].index[0]
outcome

In [None]:
# Class probabilities for the same scaled sample (one row per sample).
probs_test = loaded_model.predict_proba(newData_scaled)
probs_test

In [None]:
# Pair each price bin with its predicted probability for the single sample.
# NOTE(review): assumes predict_proba's column order matches the row order of
# wine_lookup - TODO confirm against loaded_model.classes_.
wines2 = wine_lookup.drop(columns=["price_label"]).assign(Probabilities=probs_test[0])
wines2

#### The actual price is 25 dollars, so the prediction is not too bad.

### Another one - let's try Barefoot: Barefoot NV Pinot Noir (California)

In [None]:
# Encoded label for country "US" (mean of identical labels; NaN when nothing
# matches).
country_lookup = df.loc[df["country"] == "US", "country_label"].mean()
country_lookup

In [None]:
# Encoded label for variety "Pinot Noir" (mean of identical labels; NaN when
# nothing matches).
variety_lookup = df.loc[df["variety"] == "Pinot Noir", "variety_label"].mean()
variety_lookup

In [None]:
# Encoded label for winery "Barefoot" (mean of identical labels; NaN when
# nothing matches).
winery_lookup = df.loc[df["winery"] == "Barefoot", "winery_label"].mean()
winery_lookup

In [None]:
# Inspect how often each designation occurs before choosing one to look up.
df["designation"].value_counts()

In [None]:
# Encoded label for designation "*Unknown Wine*" (mean of identical labels;
# NaN when nothing matches).
# NOTE(review): "*Unknown Wine*" looks like the placeholder used for a missing
# designation - confirm.
designation_lookup = df.loc[df["designation"] == "*Unknown Wine*", "designation_label"].mean()
designation_lookup

In [None]:
# Encoded label for province "California" (mean of identical labels; NaN when
# nothing matches).
province_lookup = df.loc[df["province"] == "California", "province_label"].mean()
province_lookup

In [None]:
# Single-sample feature matrix (1 row x 6 columns), built directly as 2-D.
# NOTE(review): the leading 1 is presumably the red/white flag (Pinot Noir is
# a red wine) - confirm against the feature order used at training time.
data_test = np.array([[
    1,
    country_lookup,
    variety_lookup,
    winery_lookup,
    designation_lookup,
    province_lookup,
]])
# Scale, predict, and keep the one prediction for the one sample.
newData_scaled = loaded_scaler.transform(data_test)
value_test = loaded_model.predict(newData_scaled)
predict_num = value_test[0]
# Translate the predicted label back to its price-bin name.
label_matches = wine_lookup["price_label"] == predict_num
outcome = wine_lookup[label_matches].index[0]
outcome

In [None]:
# Class probabilities for the scaled sample, paired with the price-bin names.
# NOTE(review): assumes predict_proba's column order matches the row order of
# wine_lookup - TODO confirm against loaded_model.classes_.
probs_test = loaded_model.predict_proba(newData_scaled)
wines2 = wine_lookup.drop(columns=["price_label"]).assign(Probabilities=probs_test[0])
wines2

### Another one: Arpepe 2009 Sassella Rocce Rosse (Valtellina Superiore)

In [None]:
# Encoded label for country "Italy" (mean of identical labels; NaN when
# nothing matches).
country_lookup = df.loc[df["country"] == "Italy", "country_label"].mean()
country_lookup

In [None]:
# Encoded label for variety "Nebbiolo" (mean of identical labels; NaN when
# nothing matches).
variety_lookup = df.loc[df["variety"] == "Nebbiolo", "variety_label"].mean()
variety_lookup

In [None]:
# Count how often each encoded winery appears among the Nebbiolo rows.
df.loc[df["variety"] == "Nebbiolo", "winery_label"].value_counts()

In [None]:
# NOTE(review): hard-coded winery_label, presumably picked from the Nebbiolo
# winery counts displayed by the previous cell - TODO confirm which winery
# 2333 encodes.
winery_lookup = 2333

In [None]:
# Encoded label for designation "*Unknown Wine*" (mean of identical labels;
# NaN when nothing matches).
# NOTE(review): "*Unknown Wine*" looks like the placeholder used for a missing
# designation - confirm.
designation_lookup = df.loc[df["designation"] == "*Unknown Wine*", "designation_label"].mean()
designation_lookup

In [None]:
# Encoded label for province "Lombardy" (mean of identical labels; NaN when
# nothing matches).
province_lookup = df.loc[df["province"] == "Lombardy", "province_label"].mean()
province_lookup

In [None]:
# Single-sample feature matrix (1 row x 6 columns), built directly as 2-D.
# NOTE(review): the leading 1 is presumably the red/white flag (Nebbiolo is a
# red wine) - confirm against the feature order used at training time.
data_test = np.array([[
    1,
    country_lookup,
    variety_lookup,
    winery_lookup,
    designation_lookup,
    province_lookup,
]])
# Scale, predict, and keep the one prediction for the one sample.
newData_scaled = loaded_scaler.transform(data_test)
value_test = loaded_model.predict(newData_scaled)
predict_num = value_test[0]
# Translate the predicted label back to its price-bin name.
label_matches = wine_lookup["price_label"] == predict_num
outcome = wine_lookup[label_matches].index[0]
outcome

In [None]:
# Class probabilities for the scaled sample, paired with the price-bin names.
# NOTE(review): assumes predict_proba's column order matches the row order of
# wine_lookup - TODO confirm against loaded_model.classes_.
probs_test = loaded_model.predict_proba(newData_scaled)
wines2 = wine_lookup.drop(columns=["price_label"]).assign(Probabilities=probs_test[0])
wines2

In [20]:
# Persist the price-bin -> price_label table; the Wine_Bins index is written
# as the first CSV column.
wine_lookup.to_csv(path_or_buf="../data/wine_lookup.csv")