In [5]:
# V 1.13
# Standard Imports
import pandas as pd
import numpy as np
from zipfile import ZipFile

# Plotting 
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from bokeh.plotting import figure, output_notebook, show, output_file
import seaborn as sns

# Configure Bokeh to render its output inline in the notebook.
output_notebook()
# IPython magic: draw matplotlib figures inline, below the cell that creates them.
%matplotlib inline

# Pandas options: silence the chained-assignment warning, show up to 999
# columns, and print floats with thousands separators and two decimals.
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.max_columns', 999)
pd.set_option('display.float_format', '{:,.2f}'.format)

# Colour palette
MC = '#242a49'  # Main
SC = '#ededed'  # Secondary
TC = 'red'  # Tertiary

# Ordered list of plot colours
colors = [
    '#202540',
    '#17b5ff',
    'red',
    'pink',
    '#443857',
    '#bed6f7',
    '#144c4a',
]

# Colormaps interpolating red -> white -> MC and white -> MC respectively
cmap = mcolors.LinearSegmentedColormap.from_list('n', ['red', 'white', MC])
cmap2 = mcolors.LinearSegmentedColormap.from_list('n', ['white', MC])

# Keyword arguments in the form accepted by matplotlib's boxplot
boxplot_params = {
    'patch_artist': True,
    'boxprops': {'facecolor': SC, 'color': MC},
    'capprops': {'color': MC, 'linewidth': 2},
    'whiskerprops': {'color': MC},
    'flierprops': {'color': MC, 'markeredgecolor': MC},
    'medianprops': {'color': 'red', 'linewidth': 2},
}

# Matplotlib defaults, set one rcParam at a time
plt.rcParams['font.family'] = 'monospace'
plt.rcParams['font.size'] = 10
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['grid.color'] = SC
plt.rcParams['axes.titlesize'] = 16.0
plt.rcParams['axes.facecolor'] = 'none'
# Print full tick values instead of labelling ticks relative to an offset
plt.rcParams['axes.formatter.useoffset'] = False


In [6]:
# Additional imports
from bokeh.layouts import layout, widgetbox
from bokeh.models import ColumnDataSource, HoverTool, BoxZoomTool, ResetTool, PanTool
from bokeh.models.widgets import Slider, Select, TextInput, Div
from bokeh.models import WheelZoomTool, SaveTool, LassoSelectTool
from bokeh.io import curdoc
from functools import lru_cache

In [8]:
# Read the reviews CSV straight out of the zip archive. The context managers
# close both the archive and the member stream as soon as pandas has parsed
# the file, so no file handle outlives this cell.
with ZipFile('data/wine.zip') as archive:
    with archive.open('winemag-data-130k-v2.csv') as csv_file:
        # The first CSV column is the row number; use it as the index.
        df = pd.read_csv(csv_file, index_col=0)

df.head(3)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm


In [9]:
# (rows, columns) of the loaded review table
df.shape

(129971, 13)

In [17]:
# Per-column dtype and non-null count, to spot columns with missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 129971 entries, 0 to 129970
Data columns (total 13 columns):
country                  129908 non-null object
description              129971 non-null object
designation              92506 non-null object
points                   129971 non-null int64
price                    120975 non-null float64
province                 129908 non-null object
region_1                 108724 non-null object
region_2                 50511 non-null object
taster_name              103727 non-null object
taster_twitter_handle    98758 non-null object
title                    129971 non-null object
variety                  129970 non-null object
winery                   129971 non-null object
dtypes: float64(1), int64(1), object(11)
memory usage: 18.9+ MB


In [15]:
# Number of reviews per country, most frequent first
df['country'].value_counts()

US                        54504
France                    22093
Italy                     19540
Spain                      6645
Portugal                   5691
Chile                      4472
Argentina                  3800
Austria                    3345
Australia                  2329
Germany                    2165
New Zealand                1419
South Africa               1401
Israel                      505
Greece                      466
Canada                      257
Hungary                     146
Bulgaria                    141
Romania                     120
Uruguay                     109
Turkey                       90
Slovenia                     87
Georgia                      86
England                      74
Croatia                      73
Mexico                       70
Moldova                      59
Brazil                       52
Lebanon                      35
Morocco                      28
Peru                         16
Ukraine                      14
Czech Re

In [20]:
# Fields shown, in this order, when the details of selected reviews are rendered
col_order = ['price', 'points', 'variety', 'province', 'description']

# Choices offered by the province dropdown; 'All' means no province filter
all_provinces = [
    'All',
    'South Australia',
    'Victoria',
    'Western Australia',
    'Australia Other',
    'New South Wales',
    'Tasmania',
]

# Status line describing the active filters
desc = Div(width=800, text='All provinces')

# Filter controls
province = Select(title='Province', value='All', options=all_provinces)
price_max = Slider(title='Maximum Price', start=0, end=900, value=200)
title = TextInput(title='Title Contains')

# Panel that receives the table of selected reviews
details = Div(width=800, text='Selection Details:')

In [21]:
# Data source shared by the plot; seeded with the full frame here and
# overwritten with the filtered rows each time update() runs.
source = ColumnDataSource(data=df)

In [22]:
# Tooltip for a hovered point: the review's title and variety columns
hover = HoverTool(
    tooltips=[('title', '@title'), ('variety', '@variety')],
)

# Tools passed to the figure
TOOLS = [
    hover,
    BoxZoomTool(),
    LassoSelectTool(),
    WheelZoomTool(),
    PanTool(),
    ResetTool(),
    SaveTool(),
]

In [26]:
# Scatter plot canvas: review points on x, price on y
p = figure(
    title='Australian Wine Analysis',
    plot_width=700,
    plot_height=600,
    x_axis_label='points',
    y_axis_label='price (USD)',
    tools=TOOLS,
    toolbar_location='above',
)

# One translucent circle per row of `source`. Colouring by a 'variety_color'
# column is left disabled: the loaded frame has no such column.
p.circle(
    x='points',
    y='price',
    source=source,
    size=7,
    alpha=0.4,
)


In [27]:
def select_reviews():
    """Return the rows of ``df`` that match the current widget selections.

    Reads three module-level widgets:

    * ``price_max`` -- keep rows whose ``price`` is less than or equal to the
      slider value (rows with a missing price never match).
    * ``province`` -- keep rows of that province, unless the value is
      ``'All'``, in which case no province filter is applied.
    * ``title`` -- when non-empty, keep rows whose ``title`` contains the
      text as a literal, case-sensitive substring.

    Side effect: rewrites ``desc.text`` to describe the province and price
    filters in use.

    Returns:
        pandas.DataFrame: the filtered subset of ``df``.
    """
    max_price = price_max.value
    province_val = province.value
    title_val = title.value

    keep = df.price <= max_price
    if province_val != 'All':
        keep = keep & (df.province == province_val)
    # NOTE(review): 'All' applies no province or country filter, so every
    # country present in df is returned -- confirm df is meant to be
    # restricted to Australian reviews before this point.
    selected = df[keep]

    if title_val:
        # regex=False: the text box holds free-form user input, so characters
        # such as '(' or '+' must be matched literally rather than compiled
        # as a regular expression. na=False drops rows with a missing title.
        matches = selected.title.str.contains(title_val, regex=False, na=False)
        selected = selected[matches]

    # The price filter is inclusive, so the label says "<=".
    desc.text = 'Province: {} and Price <= {}'.format(province_val, max_price)
    return selected

In [29]:
def update():
    """Re-apply the widget filters and load the matching rows into ``source``."""

    filtered = select_reviews()
    refreshed = ColumnDataSource(data=filtered)
    source.data = refreshed.data

In [39]:
def selection_change(attrname, old, new):
    """Show the details of the selected points below the graph.

    Registered as a change callback on ``source``; the three arguments are
    supplied by Bokeh and are not used. The selected row positions are read
    from ``source.selected`` and looked up in the currently filtered reviews.
    With a selection, ``details.text`` becomes an HTML table with one column
    per selected title and the ``col_order`` fields as rows; without one it
    is reset to a placeholder.
    """
    picked = source.selected['1d']['indices']

    # Same filters as the plotted data, so positions line up with `picked`
    active_reviews = select_reviews()

    if not picked:
        details.text = 'Selection details'
        return

    chosen = active_reviews.iloc[picked, :]
    by_title = chosen.set_index('title').T
    ordered = by_title.reindex(index=col_order)
    details.text = ordered.style.render()

In [40]:
# Filter widgets whose value changes should refresh the plot
controls = [province, price_max, title]


def _refresh_on_value_change(attr, old, new):
    """Change callback: ignore the event details and re-run update()."""
    update()


for widget in controls:
    widget.on_change('value', _refresh_on_value_change)

In [41]:
# Call selection_change whenever the `selected` property of the source changes
source.on_change('selected', selection_change)

In [42]:
# Fixed-size box holding the three filter controls
inputs = widgetbox(*controls, sizing_mode='fixed')

# Page rows: status line, then controls beside the plot, then the details panel
l = layout(
    [desc],
    [inputs, p],
    [details],
    sizing_mode='fixed',
)

In [43]:
# Load the initial filtered data into the source
update()

# Attach the layout to the current Bokeh document and name the page
document = curdoc()
document.add_root(l)
document.title = 'Australian Wine Analysis'