# Beer Hops Data: Classification Model with Extreme-Gradient Boosting Algorithm (XG-Boost)

**Data Files:** *cln_hops_profile.csv, cln_hops_brewvalues.csv*

**Original Source:** *https://beermaverick.com/hops/*  (Data retrieved via web-scraping)

------------------------------------------------------------

### Setup

**Objective:** Import necessary modules for machine-learning models & visualization and read in CSV files into local dataframes for easier access.

In [5]:
# Import necessary packages
import numpy as np
import pandas as pd
import itertools
import folium
import functools
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [13]:
# Read in raw CSV data into local dataframes
CLEAN_HOPS_PATH = './clean_data/cln_hops_brewvalues.csv'  
CLEAN_HOPS_PROFILE_PATH = './clean_data/cln_hops_profile.csv'
hop_values_df = pd.read_csv(CLEAN_HOPS_PATH, index_col='Hop Name')
hop_profile_df = pd.read_csv(CLEAN_HOPS_PROFILE_PATH, index_col='Hop Name')

# Create a master dataframe indexed on hop name
master_df = hop_values_df.merge(hop_profile_df, left_index=True, right_index=True)

master_df

Unnamed: 0_level_0,Alpha Acid % - Min,Alpha Acid % - Max,Alpha Acid % - Avg,Beta Acid % - Min,Beta Acid % - Max,Beta Acid % - Avg,Alpha-Beta Ratio - Min,Alpha-Beta Ratio - Max,Alpha-Beta Ratio - Avg,Co-Humulone as % of Alpha - Min,...,violet,watermelon,whiskey,white_grape,white_wine,wild,wine,woody,yogurt,zest
Hop Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Astra,7.0,10.0,8.5,4.0,6.0,5.0,1.0,3.0,2.0,26.0,...,False,False,False,False,True,False,False,False,False,False
Eclipse,15.7,19.0,17.4,5.9,9.0,7.5,2.0,3.0,2.0,33.0,...,False,False,False,False,False,False,False,False,False,False
Ella,13.3,19.2,16.3,4.0,7.8,5.9,2.0,5.0,3.0,33.0,...,False,False,False,False,False,False,False,False,False,False
Enigma,13.5,19.4,16.5,4.5,7.1,5.8,2.0,4.0,3.0,37.0,...,False,False,False,False,True,False,False,False,False,False
Feux-Coeur Francais,12.0,16.0,14.0,3.1,6.0,4.6,2.0,5.0,4.0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zappa,6.0,9.0,7.5,8.0,9.0,8.5,1.0,1.0,1.0,40.0,...,False,False,False,False,False,False,False,False,False,False
Zenia,11.0,14.0,12.5,4.4,5.1,4.8,2.0,3.0,3.0,,...,False,False,False,False,False,False,False,False,False,False
Zenith,9.0,11.0,10.0,,,3.0,inf,4.0,inf,,...,False,False,False,False,False,False,False,False,False,False
Zeus,13.0,17.5,15.3,4.0,6.5,5.3,2.0,4.0,3.0,28.0,...,False,False,False,False,False,False,False,False,False,False


### Pre-Processing

**Objective:** Prepare dataframe to be fed into boosting model with desired predictor variables of interest in the necessary formats.

In [29]:
df = hop_values_df.copy()
regions = {
'Australia': 'Australia',
'Canada': 'North America',
'China': 'Asia',
'Czech Republic': 'Europe',
'France': 'Europe',
'Germany': 'Europe',
'Japan': 'Asia',
'New Zealand': 'Australia',
'Poland': 'Europe',
'Slovenia': 'Europe',
'South Africa': 'Africa',
'Ukraine': 'Europe',
'United Kingdom': 'Europe',
'United States of America': 'North America'
}
df['Region'] = hop_profile_df.Country.map(lambda x: regions[x])

# Remove records of Asia (EDA script showed low amount of hops from China/Japan relative to other countries)
df = df[df['Region'] != 'Asia']

df.replace(float('inf'), np.nan, inplace=True)
df.isna().sum()

df.drop(columns=[
    'Alpha Acid % - Min',
    'Alpha Acid % - Max',
#     'Alpha Acid % - Avg',
    'Beta Acid % - Min',
    'Beta Acid % - Max',
#     'Beta Acid % - Avg',
    'Alpha-Beta Ratio - Min',
    'Alpha-Beta Ratio - Max',
#     'Alpha-Beta Ratio - Avg',
    'Co-Humulone as % of Alpha - Min',
    'Co-Humulone as % of Alpha - Max',
#     'Co-Humulone as % of Alpha - Avg',
    'Total Oils (mL/100g) - Min',
    'Total Oils (mL/100g) - Max',
#     'Total Oils (mL/100g) - Avg',
    'Myrcene - Min',
    'Myrcene - Max',
#     'Myrcene - Avg',
    'Humulene - Min',
    'Humulene - Max',
#     'Humulene - Avg',
    'Caryophyllene - Min',
    'Caryophyllene - Max',
#     'Caryophyllene - Avg',
    'Farnesene - Min',
    'Farnesene - Max',
#     'Farnesene - Avg',
    'Other Oils - Min',
    'Other Oils - Max'
], inplace=True)

for i in df.columns:
    print(i)
    print(df.groupby('Region')[i].describe())
    print('----------------------------')

Alpha Acid % - Avg
               count       mean       std  min     25%   50%     75%   max
Region                                                                    
Africa          10.0  12.280000  2.880895  5.5  12.025  12.5  13.875  15.4
Australia       38.0  11.371053  4.435843  3.0   7.625  11.6  14.300  18.5
Europe         109.0   7.831193  3.451964  2.2   5.000   7.3  10.000  20.5
North America  142.0   9.772535  4.062214  1.2   6.500   9.7  13.000  18.5
----------------------------
Beta Acid % - Avg
               count      mean       std  min   25%  50%    75%   max
Region                                                               
Africa          10.0  5.550000  1.162612  3.8  4.95  5.4  5.800   8.1
Australia       37.0  6.186486  1.408538  3.5  5.00  6.0  7.300   9.0
Europe         107.0  4.876636  1.909884  1.8  3.80  4.5  5.500  12.5
North America  138.0  5.346377  1.723185  1.5  4.00  5.1  6.275  10.5
----------------------------
Alpha-Beta Ratio - Avg
            

In [4]:
# Separate the dependent variable (outcome we want to predict) from the independent predictors
# X_data = master_df.copy()
master_df.dropna(inplace=True)
Y_data = master_df.Country.copy()
X_data = master_df.select_dtypes(exclude="number").copy()
X_data.drop(columns=['Country', 'Purpose'], inplace=True)
X_data = X_data.astype(int)

# Keep only one metric for each brew value (avg)
X_data.drop(columns=[
#     'Country',  # dropping the outcome variable 
#     'Alpha Acid % - Min',
#     'Alpha Acid % - Max',
#     'Alpha Acid % - Avg',
#     'Beta Acid % - Min',
#     'Beta Acid % - Max',
#     'Beta Acid % - Avg',
#     'Alpha-Beta Ratio - Min',
#     'Alpha-Beta Ratio - Max',
#     'Alpha-Beta Ratio - Avg',
#     'Co-Humulone as % of Alpha - Min',
#     'Co-Humulone as % of Alpha - Max',
#     'Co-Humulone as % of Alpha - Avg',
#     'Total Oils (mL/100g) - Min',
#     'Total Oils (mL/100g) - Max',
#     'Total Oils (mL/100g) - Avg',
#     'Myrcene - Min',
#     'Myrcene - Max',
#     'Myrcene - Avg',
#     'Humulene - Min',
#     'Humulene - Max',
#     'Humulene - Avg',
#     'Caryophyllene - Min',
#     'Caryophyllene - Max',
#     'Caryophyllene - Avg',
#     'Farnesene - Min',
#     'Farnesene - Max',
#     'Farnesene - Avg',
#     'Other Oils - Min',
#     'Other Oils - Max'
], inplace=True)

In [5]:
X_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 183 entries, Eclipse to Zythos
Columns: 169 entries, alfalfa to zest
dtypes: int64(169)
memory usage: 243.0+ KB


In [6]:
X_data.fillna(np.nan, inplace=True)
X_data.reset_index(drop=True)
Y_data.reset_index(drop=True)

0                     Australia
1                     Australia
2                     Australia
3                     Australia
4                     Australia
                 ...           
178    United States of America
179    United States of America
180    United States of America
181    United States of America
182    United States of America
Name: Country, Length: 183, dtype: object

### Data-Partitioning

**Objective:** Split dataset to prepare a training set and a testing set to be able to fit a model and evaluate its performance.

In [7]:
# Split X & Y data as per desired specifications
X_train, X_test, y_train, y_test = train_test_split(X_data, Y_data, test_size=0.2, random_state=123)

print(len(X_train))
print(len(X_test))
print(len(y_train))
print(len(y_test))

146
37
146
37


### Model Fitting

**Objective:** Execute boosting algorithm to train model & predict the country categorical variable on the test dataset. 

In [107]:
# Instantiate classifier object and fit the training data
model = XGBClassifier()
model.fit(X_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=12,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [108]:
# Apply model on test set to make country predictions
y_pred = model.predict(X_test)
predictions = [value for value in y_pred]

### Model Evaluation

**Objective:** Evaluate model based on the test data set results.

In [109]:
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 29.73%


In [None]:
#########################################################################################

In [8]:
regions = {
'Australia': 'Australia',
'Canada': 'North America',
'China': 'Asia',
'Czech Republic': 'Europe',
'France': 'Europe',
'Germany': 'Europe',
'Japan': 'Asia',
'New Zealand': 'Austrailia',
'Poland': 'Europe',
'Slovenia': 'Europe',
'South Africa': 'Africa',
'Ukraine': 'Europe',
'United Kingdom': 'Europe',
'United States of America': 'North America'
}

prof_df = hop_profile_df.copy()
prof_df.dropna(inplace=True)
countries = prof_df.Country.copy()
prof_df.drop(columns=['Country', 'Purpose'], inplace=True)
prof_df = prof_df.astype(int)
prof_df['Country'] = countries

master_df = hop_values_df.merge(prof_df, left_index=True, right_index=True)
master_df.Country = master_df.Country.map(lambda x: regions[x])

In [9]:
# Separate the dependent variable (outcome we want to predict) from the independent predictors
master_df.dropna(inplace=True)
X_data = master_df.copy()
Y_data = master_df.Country.copy()
# X_data = master_df.select_dtypes(include="number").copy()



# Keep only one metric for each brew value (avg)
X_data.drop(columns=[
    'Country',  # dropping the outcome variable 
#     'Alpha Acid % - Min',
#     'Alpha Acid % - Max',
#     'Alpha Acid % - Avg',
#     'Beta Acid % - Min',
#     'Beta Acid % - Max',
#     'Beta Acid % - Avg',
#     'Alpha-Beta Ratio - Min',
#     'Alpha-Beta Ratio - Max',
#     'Alpha-Beta Ratio - Avg',
#     'Co-Humulone as % of Alpha - Min',
#     'Co-Humulone as % of Alpha - Max',
#     'Co-Humulone as % of Alpha - Avg',
#     'Total Oils (mL/100g) - Min',
#     'Total Oils (mL/100g) - Max',
#     'Total Oils (mL/100g) - Avg',
#     'Myrcene - Min',
#     'Myrcene - Max',
#     'Myrcene - Avg',
#     'Humulene - Min',
#     'Humulene - Max',
#     'Humulene - Avg',
#     'Caryophyllene - Min',
#     'Caryophyllene - Max',
#     'Caryophyllene - Avg',
#     'Farnesene - Min',
#     'Farnesene - Max',
#     'Farnesene - Avg',
#     'Other Oils - Min',
#     'Other Oils - Max'
], inplace=True)

In [10]:
X_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 183 entries, Eclipse to Zythos
Columns: 198 entries, Alpha Acid % - Min to zest
dtypes: float64(29), int64(169)
memory usage: 284.5+ KB


In [11]:
# X_data.fillna(np.nan, inplace=True)
X_data.reset_index(drop=True)
Y_data.reset_index(drop=True)

0          Australia
1          Australia
2          Australia
3          Australia
4          Australia
           ...      
178    North America
179    North America
180    North America
181    North America
182    North America
Name: Country, Length: 183, dtype: object

### Data-Partitioning

**Objective:** Split dataset to prepare a training set and a testing set to be able to fit a model and evaluate its performance.

In [12]:
# Split X & Y data as per desired specifications
X_train, X_test, y_train, y_test = train_test_split(X_data, Y_data, test_size=0.2, random_state=123)

print(len(X_train))
print(len(X_test))
print(len(y_train))
print(len(y_test))

146
37
146
37


### Model Fitting

**Objective:** Execute boosting algorithm to train model & predict the country categorical variable on the test dataset. 

In [13]:
# Instantiate classifier object and fit the training data
model = XGBClassifier()
model.fit(X_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=12,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [14]:
# Apply model on test set to make country predictions
y_pred = model.predict(X_test)
predictions = [value for value in y_pred]

### Model Evaluation

**Objective:** Evaluate model based on the test data set results.

In [15]:
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 59.46%
