In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 1. Data preparation

### load data

In [157]:
df = pd.read_csv('./data/tasting_notes.csv')

In [37]:
df.head()

Unnamed: 0,Country,Company,Name,Full Name,Aroma,Flavor,Appearance,Body
0,Germany,Spaten,Munich Helles Premium Larger,Spaten Munich Helles Premium Lager,"toasty, hop","bitter, sweet, pomaceous fruit, floral",straw,3.0
1,Germany,Spaten,Oktoberfestbier,Spaten Oktoberfestbier,"sweet, caramel","citrus fruit, nutty",straw,4.0
2,Germany,Bitburger,Premium Pils,Bitbruger Premium Pils,"grassy, pomaceous fruit, fresh","neat, fine malt, fresh hop, nutty, malty",gold,2.0
3,Germany,Hofbr?u M?nchen,Original Larger,Hofbr?u M?nchen Original Larger,"floral, sweet, malty","nutty, malty, bitter, sweet",gold,3.0
4,Germany,Weltenburger,Barock Dunkel,Weltenburger Barock Dunkel,"roasty, malty, hop","sweet, toasty",brown,3.0


### drop all rows with NaN in 'Body' column

In [165]:
df = df[df['Body'].notna()]

In [166]:
len(df)

92

### split string in ['Aroma', 'Flavor'] into words

In [167]:
data = {'Aroma': [], 'Flavor': []}
for row in df[data.keys()].iterrows():
    _, (aroma, flavor) = row
    data['Aroma'] += [x.strip().replace(' ', '_') for x in aroma.split(',')]
    data['Flavor'] += [x.strip().replace(' ', '_') for x in flavor.split(',')]

### remove words with less than 3 counts

In [168]:
# convert list to Pandas Series and count unique values
data = {k: pd.Series(v).value_counts() for k, v in data.items()}

# remove words with less than 3 counts
data = {k: v.loc[v >= 3].index.values for k, v in data.items()}

### add two characteristics (Color, Body)

In [169]:
data['Color'] = ['straw', 'gold', 'amber', 'brown', 'black']
data['Body'] = ['light', 'light_medium', 'medium', 'medium_heavy', 'heavy']

In [171]:
data

{'Aroma': array(['sweet', 'citrus_fruit', 'spicy', 'pomaceous_fruit', 'fruity',
        'banana', 'caramel', 'dried_fruit', 'hop', 'malty', 'roasty',
        'fresh', 'citrus', 'berry', 'grassy', 'floral', 'toasty',
        'tropical_fruit', 'smoky', 'herb', 'bitter', 'smooth',
        'white_wine', 'nutty'], dtype=object),
 'Flavor': array(['sweet', 'citrus_fruit', 'bitter', 'nutty', 'caramel', 'citrus',
        'spicy', 'hop', 'fresh', 'roasty', 'pomaceous_fruit', 'fruity',
        'smoky', 'toasty', 'dried_fruit', 'bready', 'malty', 'berry',
        'banana', 'tropical_fruit', 'floral', 'herb'], dtype=object),
 'Color': ['straw', 'gold', 'amber', 'brown', 'black'],
 'Body': ['light', 'light_medium', 'medium', 'medium_heavy', 'heavy']}

### concat all characteristics into one list

In [175]:
columns = []
for k, v in data.items():
    k = k.lower()
    columns += [f'{k}_{x}' for x in list(v)]
columns = np.array(columns)

### make new DataFrame
(index: beer names, rows: binary values, columns: characteristics)

In [193]:
mat = []
for _, row in df[['Aroma', 'Flavor', 'Appearance', 'Body']].iterrows():
    new_row = np.zeros(len(columns), dtype=np.int32)
    
    targets = []
    for k, v in row.iteritems():
        k = k.lower()
        if k == 'aroma' or k == 'flavor':
            targets += [f"{k}_{x.strip().replace(' ', '_')}" for x in v.split(',')]
        elif k == 'appearance':
            targets += [f'{k}_{v.strip()}']
        else:
            v = data['Body'][int(v) - 1]
            targets += [f'{k}_{v}']
    
    for t in targets:
        target_idx = np.where(columns == t)[0]
        new_row[target_idx] = 1
    
    mat.append(new_row)
    
mat = np.array(mat, dtype=np.int32)

In [201]:
df2 = pd.DataFrame(data=mat, columns=columns, index=pd.Index(df['Full Name'], name='Name'))

In [202]:
df2

Unnamed: 0_level_0,aroma_sweet,aroma_citrus_fruit,aroma_spicy,aroma_pomaceous_fruit,aroma_fruity,aroma_banana,aroma_caramel,aroma_dried_fruit,aroma_hop,aroma_malty,...,color_straw,color_gold,color_amber,color_brown,color_black,body_light,body_light_medium,body_medium,body_medium_heavy,body_heavy
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Spaten Munich Helles Premium Lager,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
Spaten Oktoberfestbier,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
Bitbruger Premium Pils,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
Hofbr?u M?nchen Original Larger,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
Weltenburger Barock Dunkel,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Cerveceria Modelo Corona Extra,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
Cerveceria Modelo Negra Modelo,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
Tsingtao,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
Lion Stout,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
