In [39]:
import functools
import math
import re
from collections import defaultdict, Counter
from typing import List, Dict, Tuple

import inflect
import ipywidgets as widgets
import matplotlib.patheffects as path_effects
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from wordcloud import WordCloud

import constants as co
import util as util

In [40]:
# Load the main wine table and its three companion tables from the project data directory.
# NOTE(review): the companion tables are presumably row-aligned with full_data (same row
# order) -- later cells assert equal lengths and look rows up by position; confirm.
full_data = pd.read_csv(co.data_dir / "full_wine_info_transformed.csv")
pairing_df = pd.read_csv(co.data_dir / "food_pairings.csv")
flavor_df = pd.read_csv(co.data_dir / "food_flavors.csv")
# NOTE: the variable name is misspelled ("adjedctives"); it is kept because a later cell uses it.
adjedctives_df = pd.read_csv(co.data_dir / "wine_description_words.csv")

In [41]:
# Shared inflect engine used to turn plural nouns into their singular form.
p = inflect.engine()

def convert_plurals_to_singulars(str_: str) -> str:
    """Singularize every '. '-separated item of ``str_``.

    Each item is passed to ``p.singular_noun``; when that returns a truthy
    value the item is replaced by it, otherwise the item is kept as is.
    Values for which an ``AttributeError`` is raised (e.g. objects without a
    ``split`` method, such as NaN floats) are returned unchanged.
    """
    try:
        singularized = [p.singular_noun(part) or part for part in str_.split('. ')]
    except AttributeError:
        return str_
    return '. '.join(singularized)

In [42]:
# Rewrite every pairing column so that plural food words become singular.
for column_name in pairing_df.columns:
    singularized_column = pairing_df[column_name].apply(convert_plurals_to_singulars)
    pairing_df[column_name] = singularized_column


In [43]:
# Sanity check: all four tables must have the same number of rows, because later cells
# pick rows of one table using the index of another (the `.iloc[...]` lookups below).
assert len(full_data) == len(pairing_df) == len(flavor_df) == len(adjedctives_df)

In [44]:
# Preview the first rows of the main wine table.
full_data.head()

Unnamed: 0,country,description,points,price,variety,province,winery,points_group,log10_price
0,US,This tremendous 100% varietal wine hails from ...,96,235.0,Cabernet Sauvignon,California,Heitz,95 to 100,2.371068
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",96,110.0,Tinta de Toro,Northern Spain,Bodega Carmen Rodríguez,95 to 100,2.041393
2,US,Mac Watson honors the memory of a wine once ma...,96,90.0,Sauvignon Blanc,California,Macauley,95 to 100,1.954243
3,US,"This spent 20 months in 30% new French oak, an...",96,65.0,Pinot Noir,Oregon,Ponzi,95 to 100,1.812913
4,France,"This is the top wine from La Bégude, named aft...",95,66.0,Provence red blend,Provence,Domaine de la Bégude,95 to 100,1.819544


In [45]:
# Where there is no country, replace NaN with "Unknown".
# The assignment goes through .loc on the frame itself rather than through an in-place
# method on the selected column: a chained in-place update triggers a pandas warning and
# no longer modifies the frame once Copy-on-Write is active.
mask = full_data[co.country].isna()
full_data.loc[mask, co.country] = 'Unknown'

In [46]:
# Create a new column in which rare countries are renamed to "Other".
country_dict = full_data[co.country].value_counts().to_dict()
# Countries with a small number of observations (<10) are grouped into one "Other" bucket.
list_of_other_countries = [k for k, v in country_dict.items() if v < 10]
mask = full_data[co.country].isin(list_of_other_countries)
# Plain column assignment instead of a chained `inplace=True` call, so the frame is
# updated regardless of pandas' Copy-on-Write mode.
full_data['filtered_country'] = full_data[co.country].mask(mask, 'Other')

In [47]:
# Create a new column in which rare wine varieties are replaced with "Other".
variety_dict = full_data[co.variety].value_counts().to_dict()
# Varieties with a small number of observations (<1000) are grouped into one "Other" bucket.
list_of_other_varieties = [k for k, v in variety_dict.items() if v < 1000]
mask = full_data[co.variety].isin(list_of_other_varieties)
# Plain column assignment instead of a chained `inplace=True` call, so the frame is
# updated regardless of pandas' Copy-on-Write mode.
full_data['filtered_variety'] = full_data[co.variety].mask(mask, 'Other')
full_data['filtered_variety'].value_counts()

Other                            25576
Pinot Noir                        9283
Chardonnay                        9163
Cabernet Sauvignon                8272
Red Blend                         6485
Bordeaux-style Red Blend          5176
Sauvignon Blanc                   4037
Syrah                             3663
Riesling                          3585
Merlot                            3178
Zinfandel                         2409
Sangiovese                        2153
Malbec                            1968
Rosé                              1910
White Blend                       1858
Tempranillo                       1622
Portuguese Red                    1441
Nebbiolo                          1339
Sparkling Blend                   1315
Shiraz                            1254
Corvina, Rondinella, Molinara     1118
Rhône-style Red Blend             1046
Name: filtered_variety, dtype: int64

In [48]:
# Options for the selectors: the distinct grouped values in sorted order,
# with the catch-all "All" entry placed first.
countries = ['All'] + sorted(full_data['filtered_country'].unique().tolist())
varieties = ['All'] + sorted(full_data['filtered_variety'].unique().tolist())

# Slider bounds are taken from the observed price and score columns.
min_price, max_price = full_data[co.price].min(), full_data[co.price].max()
min_score, max_score = full_data[co.points].min(), full_data[co.points].max()

In [49]:
def error_handling(func):
    """Decorator that turns a failure of ``func`` into an on-screen notice.

    The wrapped callable's own return value is discarded. The wrapper returns
    the string ``"ok"`` when ``func`` finished normally, and ``"error"`` when it
    raised an exception, after displaying a "no data" text box through
    ``util.format_text`` / ``util.display_text_box``.
    """
    @functools.wraps(func)  # keep the wrapped function's name and docstring
    def inner(*args, **kwargs):
        try:
            func(*args, **kwargs)
            return "ok"
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt and SystemExit still propagate to the kernel.
        except Exception:
            text = "There is no data with such parameters. Please try to adjust your choices."
            text = util.format_text(text)
            util.display_text_box(text, axis=[0, 20, 0, 20], center_v = 10, center_h = 10)
            return "error"
    return inner

In [50]:
@error_handling
def make_variety_piechart(df: pd.DataFrame, n: int = 5) -> None:
    """Draw a pie chart of the (at most) ``n`` most frequent wine varieties in ``df``.

    Args:
        df: frame containing the ``co.variety`` column.
        n: maximum number of varieties (slices) to show.

    Raises:
        ValueError: if ``df`` is empty (handled by the ``error_handling`` decorator).
    """
    if df.empty:
        raise ValueError()
    variety_count = df[co.variety].value_counts().head(n).to_dict()
    plt.figure(figsize=(8,6))
    colors = sns.color_palette('pastel')[:len(variety_count)]
    plt.pie(list(variety_count.values()), labels=list(variety_count.keys()), colors=colors, autopct='%.0f%%',
           textprops={'fontsize': 14})
    # Report the number of slices actually drawn: a narrow filter can leave fewer
    # than n varieties, in which case "n most popular" would be wrong.
    plt.title(f"{len(variety_count)} most popular wine varieties.")
    plt.show()

In [51]:
def maybe_replace_rare_countries_with_other(df: pd.DataFrame, threshold: int = 10,
                                            min_count: int = 1000) -> Dict[str, int]:
    """Count wines per country, merging rare countries when there are too many of them.

    Too many countries look messy on a bar plot. When more than ``threshold``
    distinct countries are present, every country with fewer than ``min_count``
    observations is relabelled "Other", the counts are recomputed, and the result
    is trimmed with ``util.return_first_n_dict_items(count, threshold)``.

    The relabelling is done on a derived Series, so ``df`` itself is not modified.

    Args:
        df: frame containing the ``co.country`` column.
        threshold: maximum number of distinct countries before grouping kicks in.
        min_count: countries with fewer observations than this are grouped into "Other".

    Returns:
        Mapping of country name to number of observations.
    """
    country_col = df[co.country]
    count = country_col.value_counts().to_dict()
    if len(count) > threshold:
        list_of_other_countries = [k for k, v in count.items() if v < min_count]
        # mask() returns a new Series; the caller's frame stays untouched.
        country_col = country_col.mask(country_col.isin(list_of_other_countries), 'Other')
        count = country_col.value_counts().to_dict()
        count = util.return_first_n_dict_items(count, threshold)
    return count

In [52]:
@error_handling
def make_country_barplot(df: pd.DataFrame, n: int = 5) -> None:
    """Draw a horizontal bar chart with the number of wines per country.

    Args:
        df: frame containing the ``co.country`` column.
        n: not used by the function; kept so existing calls stay valid.

    Raises:
        ValueError: if ``df`` is empty, matching the other plotting helpers so the
            ``error_handling`` decorator shows the "no data" notice.
    """
    if df.empty:
        raise ValueError()
    count = maybe_replace_rare_countries_with_other(df)
    bar = sns.barplot(y=list(count.keys()), x=list(count.values()), orient='h')
    bar.tick_params(labelsize=14)
    plt.title(f"Quantity of wine by first {len(count)} countries", size= 18)
    plt.show()

In [53]:
@error_handling
def display_common_info(df: pd.DataFrame, country_str: str) -> None:
    """Print summary lines and draw box plots of price and score for ``df``.

    Prints the number of rows and the given country string, then shows two
    stacked box plots (``co.price`` on top, ``co.points`` below).
    Raises ValueError for an empty frame.
    """
    if df.empty:
        raise ValueError()
    print(f"Number of observations: {len(df)}")
    print(f"Country: {country_str}")

    sns.set_style('whitegrid')
    sns.set_context('notebook', font_scale=2)
    figure, (price_ax, score_ax) = plt.subplots(2, 1, figsize=(20, 7))
    figure.tight_layout(pad=3)

    panels = (
        (price_ax, co.price, "How prices are distributed"),
        (score_ax, co.points, "How scores are distributed"),
    )
    for axis, column, title in panels:
        sns.boxplot(x=df[column], ax=axis)
        axis.set_title(title)
    plt.show()

In [54]:
def format_food_categories(food_categories: list) -> List[str]:
    """Turn raw category keys such as ``"FOOD_RED_MEAT"`` into labels such as ``"Red meat"``.

    Every occurrence of ``"FOOD_"`` is removed, remaining underscores become
    spaces, and the result is capitalized.

    Args:
        food_categories: raw category names.

    Returns:
        A new list with the formatted names; the input list is left unmodified.
    """
    return [
        cat.replace("FOOD_", "").replace("_", " ").capitalize()
        for cat in food_categories
    ]
        

In [55]:
def get_dict_from_df(df: pd.DataFrame) -> Dict[str, List[str]]:
    """Map each food/flavour category (column) of ``df`` to the list of its words.

    ``df`` is cleaned in place first: rows and columns that are entirely NaN
    are dropped and the remaining NaN cells are filled with the string 'None'.
    Each non-'None' cell is split on '.', and the pieces are lower-cased and
    stripped. The resulting dictionary is ordered by the number of words per
    category, largest first.
    """
    df.dropna(how='all', inplace=True)
    df.dropna(axis=1, how='all', inplace=True)
    df.fillna('None', inplace=True)

    words_by_category = {}
    for category in df.columns:
        words_by_category[category] = [
            piece.lower().strip()
            for cell in df[category]
            if cell != 'None'
            for piece in cell.split('.')
        ]

    by_size = sorted(words_by_category.items(), key=lambda pair: len(pair[1]), reverse=True)
    return dict(by_size)

In [56]:
def _extract_random_row(full_df: pd.DataFrame, feature_df: pd.DataFrame) -> Tuple[str, str, str, float, int, str]:
    """Pick one random row of ``feature_df`` and collect the matching wine details.

    The sampled row's index is used positionally (``iloc``) on ``full_df``.
    NOTE(review): this assumes the index labels of ``feature_df`` equal row
    positions in ``full_df`` (e.g. a default integer index) -- confirm.

    Returns:
        ``(country, variety, province, price, score, food_str)`` where
        ``province`` is ``''`` when it is missing, contains "other", or equals
        the country, and ``food_str`` is the comma-separated, lower-cased list
        of the row's non-'None' feature items.
    """
    feature_random_row = feature_df.sample()
    index = feature_random_row.index
    full_random_row = full_df.iloc[index]
    country = full_random_row[co.country].values[0]
    variety = full_random_row[co.variety].values[0]
    province = full_random_row[co.province].values[0]
    price = full_random_row[co.price].values[0]
    score = full_random_row[co.points].values[0]
    # A missing province is not a string (NaN is a float) and has no .lower();
    # treat it the same as an uninformative province instead of failing.
    if (not isinstance(province, str)
            or 'other' in province.lower()
            or country.lower() == province.lower()):
        province = ''
    feature_items = feature_random_row.values.tolist()[0]
    feature_items = [f.lower() for f in feature_items if f != 'None']
    # Cells may hold several '. '-separated items; flatten them into one list.
    food_str = '. '.join(feature_items).split('. ')
    food_str = ', '.join(food_str)
    return country, variety, province, price, score, food_str

In [57]:
@error_handling
def display_random_pairing_info(full_df: pd.DataFrame, pairing_df: pd.DataFrame) -> None:
    """Show a text box describing a randomly picked wine and the food it goes with.

    The province is mentioned only when ``_extract_random_row`` returns a
    non-empty one.
    """
    country, variety, province, price, score, food_str = _extract_random_row(full_df, pairing_df)
    message = (
        f"{variety} wine from {country}, {province} goes well with {food_str}.\nPrice: ${price},\nScore: {score} points"
        if province else
        f"{variety} wine from {country} goes well with {food_str}.\nPrice: ${price},\nScore: {score} points"
    )
    util.display_text_box('Random pick:\n' + util.format_text(message))

In [58]:
@error_handling
def display_random_flavor_info(full_df: pd.DataFrame, flavor_df: pd.DataFrame) -> None:
    """Show a text box describing a randomly picked wine and its flavour profile.

    The province is mentioned only when ``_extract_random_row`` returns a
    non-empty one.
    """
    country, variety, province, price, score, flavors_str = _extract_random_row(full_df, flavor_df)
    message = (
        f"{variety} wine from {country}, {province} with following flavour profile: {flavors_str}.\nPrice: ${price},\nScore: {score} points"
        if province else
        f"{variety} wine from {country} with following flavour profile: {flavors_str}.\nPrice: ${price},\nScore: {score} points"
    )
    util.display_text_box('Random pick:\n' + util.format_text(message))

In [59]:
def get_data_from_df_based_on_index_df(df: pd.DataFrame, index_df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
    """Select the rows of ``df`` at the positions given by ``index_df.index``.

    Returns a copy of the selected rows together with the category dictionary
    built from that copy by ``get_dict_from_df`` (which also cleans the copy
    in place).
    """
    selected = df.iloc[index_df.index].copy()
    return selected, get_dict_from_df(selected)

In [60]:
@error_handling
def dipslay_word_cloud(dict_: dict) -> None:
    """Render a word cloud from the words of the first three categories of ``dict_``.

    ``dict_`` maps category names to lists of words; it is trimmed with
    ``util.return_first_n_dict_items(dict_, 3)``. The kept category names are
    printed (formatted) above the picture.

    Any exception raised while building the cloud propagates unchanged, so the
    ``error_handling`` decorator deals with it and the original error message
    is not lost.
    """
    dict_ = util.return_first_n_dict_items(dict_, 3)
    # One space-separated string containing every word of every kept category.
    words_str = ''.join(' '.join(words) + ' ' for words in dict_.values())
    stop_words = ["flavor", "flavors", "flavored", "aroma", "aromas", "the", "and", "a", "an"]

    wordcloud = WordCloud(background_color = "white",
                          stopwords=stop_words,
                          height=1000, width=1600,
                          max_words=100).generate(words_str)
    print(f"Main flavor categories are: {', '.join(format_food_categories(list(dict_.keys())))}")
    plt.figure(figsize=(10,8))
    plt.imshow(wordcloud, interpolation="lanczos")
    plt.axis("off")
    plt.show()

In [61]:
@error_handling
def display_pairings_text(dict_: dict) -> None:
    """Show a text box listing the top food pairings per category.

    For the first three categories of ``dict_`` (as returned by
    ``util.return_first_n_dict_items``) the three most common pairings are
    listed. When the 'FOOD_GENERAL_DISHES' category exists but is not among
    those three, its three most common dishes are appended as well.
    Raises ValueError when ``dict_`` is empty.
    """
    if not len(dict_):
        raise ValueError()

    def three_most_common(items):
        # Most frequent values first, in the order Counter.most_common yields them.
        return [value for value, _ in Counter(items).most_common(3)]

    general_key = 'FOOD_GENERAL_DISHES'
    most_popular = util.return_first_n_dict_items(dict_, 3)

    text = "Such a wine goes best with"
    for raw_category, pairings in most_popular.items():
        label = format_food_categories([raw_category])[0]
        if label == 'Other':
            label = 'Snacks/miscellaneous'
        text += f"\n{label}: {', '.join(three_most_common(pairings))}"
    text += '.'

    if general_key not in most_popular and general_key in dict_:
        text += f"\nAlso, following dishes would go well: {', '.join(three_most_common(dict_[general_key]))}."

    util.display_text_box(util.format_text(text))


In [62]:
# Multi-select list boxes for country and wine variety; the "All" entry is pre-selected.
country_select = widgets.SelectMultiple(options=countries,
                                        value=['All'],
                                        continuous_update=False)
variety_select = widgets.SelectMultiple(options=varieties,
                                        value=['All'],
                                        continuous_update=False)
# Range sliders initialised to the full observed price / score range.
# NOTE(review): min_price / max_price come straight from the price column and may be
# floats, while IntRangeSlider is an integer slider -- confirm the bounds are integral.
price_select = widgets.IntRangeSlider(value=[min_price, max_price],
                                      min=min_price,
                                      max=max_price,
                                      continuous_update=False,)
score_select = widgets.IntRangeSlider(value=[min_score, max_score],
                                      min=min_score,
                                      max=max_score,
                                      continuous_update=False,)

In [63]:
# One Output area per dashboard panel; common_filter() clears and refills all of them.
(variety_output,
 country_output,
 info_output,
 flavor_output,
 random_flavor_output,
 food_pairings_output,
 random_pairing_output) = (widgets.Output() for _ in range(7))

def common_filter(countries, price_range, score_range, varieties):
    """Filter the wine data by the given selections and redraw every dashboard panel.

    Args:
        countries: selected values of ``filtered_country``; "All" disables the filter.
        price_range: ``(low, high)`` price bounds, or a dict holding them under 'value'.
        score_range: ``(low, high)`` score bounds, or a dict holding them under 'value'.
        varieties: selected values of ``filtered_variety``; "All" disables the filter.

    All output areas are cleared first. The flavour / pairing "random pick"
    panels are only filled when the corresponding main panel did not report
    an error.
    """
    common_df = full_data.copy()
    for output in (variety_output, country_output, info_output, flavor_output,
                   random_flavor_output, food_pairings_output, random_pairing_output):
        output.clear_output()

    country_str = ", ".join(countries)

    # The ranges may arrive wrapped in a dict; the bounds are then under 'value'.
    if isinstance(price_range, dict):
        price_range = price_range['value']
    if isinstance(score_range, dict):
        score_range = score_range['value']

    if "All" not in countries:
        common_df = common_df[common_df['filtered_country'].isin(countries)]
    if "All" not in varieties:
        common_df = common_df[common_df['filtered_variety'].isin(varieties)]
    # between() is inclusive on both ends, so wines at the upper bound (e.g. the
    # maximum score) and wines with non-integer prices inside the range are kept;
    # membership in range(low, high) dropped both.
    common_df = common_df[common_df[co.price].between(price_range[0], price_range[1])]
    common_df = common_df[common_df[co.points].between(score_range[0], score_range[1])]

    with variety_output:
        make_variety_piechart(common_df)
    with country_output:
        make_country_barplot(common_df)
    with info_output:
        display_common_info(common_df, country_str)

    flavors_df, flavors_dict = get_data_from_df_based_on_index_df(flavor_df, common_df)
    with flavor_output:
        status = dipslay_word_cloud(flavors_dict)
    if status != "error":
        with random_flavor_output:
            display_random_flavor_info(full_data, flavors_df)

    pairings_df, pairings_dict = get_data_from_df_based_on_index_df(pairing_df, common_df)
    with food_pairings_output:
        status = display_pairings_text(pairings_dict)
    if status != "error":
        with random_pairing_output:
            display_random_pairing_info(full_data, pairings_df)


In [64]:
def price_handler(change):
    """Redraw the dashboard using the new price range and the other widgets' current values."""
    common_filter(countries=country_select.value,
                  price_range=change.new,
                  score_range=score_select.value,
                  varieties=variety_select.value)

def score_handler(change):
    """Redraw the dashboard using the new score range and the other widgets' current values."""
    common_filter(countries=country_select.value,
                  price_range=price_select.value,
                  score_range=change.new,
                  varieties=variety_select.value)

def country_handler(change):
    """Redraw the dashboard using the new country selection and the other widgets' current values."""
    common_filter(countries=change.new,
                  price_range=price_select.value,
                  score_range=score_select.value,
                  varieties=variety_select.value)

def variety_handler(change):
    """Redraw the dashboard using the new variety selection and the other widgets' current values."""
    common_filter(countries=country_select.value,
                  price_range=price_select.value,
                  score_range=score_select.value,
                  varieties=change.new)

# Re-run the filter whenever the value of any input widget changes.
country_select.observe(country_handler, names='value')
price_select.observe(price_handler, names='value')
score_select.observe(score_handler, names='value')
variety_select.observe(variety_handler, names='value')

In [65]:
# Shared bottom margin for the input area and the tab container.
layout = widgets.Layout(margin='0 0 50px 0')
sliders = widgets.VBox([widgets.HBox([widgets.Label("Price range (USD): "), price_select]),
                        widgets.HBox([widgets.Label("Score range: "), score_select])],
                      layout=widgets.Layout(margin='0 0 0 0'))

# Label text fixed: it previously read "Select coutry: ".
selectors = widgets.HBox([widgets.HBox([widgets.Label("Select country: "), country_select]),
                          widgets.HBox([widgets.Label("Select wine variety: "), variety_select],
                                      layout=widgets.Layout(margin='0 0 0 50px'))],
                         layout=widgets.Layout(margin='0 0 20px 0'))

input_widgets = widgets.VBox([selectors, sliders], layout=layout)

# Each of these tabs shows the main panel next to its "random pick" panel.
flavors_output = widgets.HBox([flavor_output, random_flavor_output])
pairings_output = widgets.HBox([food_pairings_output, random_pairing_output])

tab = widgets.Tab([info_output, variety_output, country_output, flavors_output, pairings_output], layout=layout)
# Titles are listed in the same order as the tab children above.
for position, title in enumerate(("Common info", "Wine varieties", "Countries", "Flavors", "Food pairings")):
    tab.set_title(position, title)

dashboard = widgets.VBox([input_widgets, tab])
display(dashboard)

VBox(children=(VBox(children=(HBox(children=(HBox(children=(Label(value='Select coutry: '), SelectMultiple(ind…