Beer Type Predictor Based on ABV and IBU

We will use machine learning to predict each beer's style from its characteristics, specifically its alcohol by volume (ABV) and its bitterness in International Bitterness Units (IBU).

The data we'll use here comes from a publicly-available [Kaggle dataset on craft beer](https://www.kaggle.com/nickhould/craft-cans).

In [None]:
# Update scipy to be compatible with numpy and avoid warnings 
!pip install --upgrade scipy

Defaulting to user installation because normal site-packages is not writeable
Collecting scipy
  Downloading scipy-1.13.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)
     |████████████████████████████████| 38.6 MB 15.9 MB/s
Installing collected packages: scipy
Successfully installed scipy-1.13.1


In [None]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_fscore_support


In [4]:
# Sanity check: every name brought in by the import cell is bound and truthy.
assert all((
    pd,
    np,
    SVC,
    confusion_matrix,
    classification_report,
    precision_recall_fscore_support,
))

In [None]:
# Read the two raw tables (breweries and beers) from the local data directory.
breweries, beers = map(pd.read_csv, ('data/breweries.csv', 'data/beers.csv'))

In [6]:
# Both raw tables must have the expected (rows, columns) dimensions.
assert (breweries.shape, beers.shape) == ((558, 4), (2410, 8))

In [7]:
# Preview the first five brewery rows.
breweries.head(n=5)

Unnamed: 0.1,Unnamed: 0,name,city,state
0,0,NorthGate Brewing,Minneapolis,MN
1,1,Against the Grain Brewery,Louisville,KY
2,2,Jack's Abby Craft Lagers,Framingham,MA
3,3,Mike Hess Brewing Company,San Diego,CA
4,4,Fort Point Beer Company,San Francisco,CA


In [8]:
# Preview the first five beer rows.
beers.head(n=5)

Unnamed: 0.1,Unnamed: 0,abv,ibu,id,name,style,brewery_id,ounces
0,0,0.05,,1436,Pub Beer,American Pale Lager,408,12.0
1,1,0.066,,2265,Devil's Cup,American Pale Ale (APA),177,12.0
2,2,0.071,,2264,Rise of the Phoenix,American IPA,177,12.0
3,3,0.09,,2263,Sinister,American Double / Imperial IPA,177,12.0
4,4,0.075,,2262,Sex and Candy,American IPA,177,12.0


In [9]:
# Per-column count of missing values in the beers table.
null_beers = beers.isna().sum()

In [11]:
# Keep only beers that have a style, an ABV and an IBU value.
required_present = beers[['style','abv', 'ibu']].notna().all(axis=1)
beers = beers[required_present]

In [34]:
# Attach brewery details to every beer.
# Without explicit keys, pd.merge joins on all shared column names
# ('Unnamed: 0' and 'name'), which compares beer names with brewery names.
# NOTE(review): this assumes breweries['Unnamed: 0'] is the brewery id that
# beers['brewery_id'] refers to — confirm against the dataset description.
beer_df = pd.merge(
    beers,
    breweries.rename(columns={'Unnamed: 0': 'brewery_id', 'name': 'brewery_name'}),
    on='brewery_id',
    how="left",
    validate='many_to_one',  # fail loudly if a brewery id is duplicated
)

In [36]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
beer_df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0.1,Unnamed: 0,abv,ibu,id,brewery_id,ounces
count,1403.0,1403.0,1403.0,1403.0,1403.0,1403.0
mean,1241.128297,0.059919,42.739843,1413.88881,223.375624,13.510264
std,691.675612,0.013585,25.962692,757.572191,150.38751,2.254112
min,14.0,0.027,4.0,1.0,0.0,8.4
25%,681.5,0.05,21.0,771.0,95.5,12.0
50%,1228.0,0.057,35.0,1435.0,198.0,12.0
75%,1864.5,0.068,64.0,2068.5,350.0,16.0
max,2408.0,0.125,138.0,2692.0,546.0,32.0


In [37]:
# Number of beers per style, most frequent first.
beer_counts = beer_df['style'].value_counts(sort=True, ascending=False)
print(beer_counts)

style
American IPA                          301
American Pale Ale (APA)               153
American Amber / Red Ale               77
American Double / Imperial IPA         75
American Blonde Ale                    61
                                     ... 
Roggenbier                              1
Smoked Beer                             1
Euro Pale Lager                         1
Other                                   1
American Double / Imperial Pilsner      1
Name: count, Length: 90, dtype: int64


In [39]:
# Predict only the four most common beer styles.
# value_counts() sorts by frequency, so the first four index labels are the
# four most common styles.
styles = beer_df['style'].value_counts().index[:4].tolist()
# Filter with `styles` (the list defined above); the previous `style` was an
# undefined name and raised NameError on a fresh kernel.
beer_df = beer_df[beer_df['style'].isin(styles)]

In [41]:
# 80% of the rows (rounded down) go to training; the remainder is the test set.
num_training = int(0.8 * beer_df.shape[0])
num_testing = beer_df.shape[0] - num_training

In [None]:
# Predictors (ABV and IBU) and the outcome (style labels).
# NOTE(review): the next cell performs the same two assignments; one of the
# two cells can be removed.
beer_X = beer_df.loc[:, ['abv', 'ibu']]
beer_Y = beer_df.loc[:, 'style'].values

In [43]:
# beer_X holds the predictors (ABV, IBU); beer_Y holds the outcome variable (style).
beer_X, beer_Y = beer_df[['abv', 'ibu']], beer_df['style'].values

In [45]:
# Split predictors and labels into training and test sets: the first
# num_training rows train the model, all remaining rows test it.
# NOTE(review): the split is positional with no shuffling — confirm the row
# order is not correlated with beer style.
beer_train_X, beer_test_X = beer_X.iloc[:num_training], beer_X.iloc[num_training:]
beer_train_Y, beer_test_Y = beer_Y[:num_training], beer_Y[num_training:]

In [47]:
# Support-vector classifier helper (linear kernel unless told otherwise).
def train_SVM(X, y, kernel='linear'):
    """Fit a support-vector classifier and return it.

    Parameters
    ----------
    X : predictors passed to ``SVC.fit``.
    y : target labels passed to ``SVC.fit``.
    kernel : kernel name forwarded to ``SVC`` (default ``'linear'``).

    Returns
    -------
    The fitted ``SVC`` instance.
    """
    # fit() returns the estimator itself, so construction and fitting chain.
    return SVC(kernel=kernel).fit(X, y)

In [None]:
# Fit the classifier on the training split.
# NOTE(review): the next cell repeats this call; one of the two cells can be removed.
beer_clf = train_SVM(X=beer_train_X, y=beer_train_Y)

In [48]:
# Fit the SVM on the training predictors and labels (linear kernel, the helper's default).
beer_clf = train_SVM(beer_train_X, beer_train_Y, kernel='linear')

In [50]:
# Predicted styles for the training rows and for the held-out test rows.
beer_predicted_train_Y, beer_predicted_test_Y = map(
    beer_clf.predict, (beer_train_X, beer_test_X)
)

Model Assessment

In [52]:
# Per-style precision / recall / F1 on the training data.
class_report_train = classification_report(
    y_true=beer_train_Y,
    y_pred=beer_predicted_train_Y,
)
print(class_report_train)

                                precision    recall  f1-score   support

      American Amber / Red Ale       0.82      0.45      0.58        69
American Double / Imperial IPA       0.76      0.25      0.37        53
                  American IPA       0.69      0.84      0.76       236
       American Pale Ale (APA)       0.57      0.64      0.60       126

                      accuracy                           0.67       484
                     macro avg       0.71      0.54      0.58       484
                  weighted avg       0.69      0.67      0.65       484



In [54]:
# Per-style precision / recall / F1 on the held-out test data.
class_report_test = classification_report(
    y_true=beer_test_Y,
    y_pred=beer_predicted_test_Y,
)
print(class_report_test)

                                precision    recall  f1-score   support

      American Amber / Red Ale       0.62      0.62      0.62         8
American Double / Imperial IPA       0.78      0.32      0.45        22
                  American IPA       0.70      0.72      0.71        65
       American Pale Ale (APA)       0.55      0.78      0.65        27

                      accuracy                           0.66       122
                     macro avg       0.66      0.61      0.61       122
                  weighted avg       0.68      0.66      0.64       122



In [56]:
# Confusion matrix for the training data: shows which styles the model mixes up.
conf_mat_train = confusion_matrix(
    y_true=beer_train_Y,
    y_pred=beer_predicted_train_Y,
)
print(conf_mat_train)

[[ 31   1  10  27]
 [  0  13  40   0]
 [  0   3 198  35]
 [  7   0  38  81]]


In [58]:
# Confusion matrix for the testing data.
conf_mat_test = confusion_matrix(
    y_true=beer_test_Y,
    y_pred=beer_predicted_test_Y,
)
print(conf_mat_test)

[[ 5  0  2  1]
 [ 1  7 14  0]
 [ 0  2 47 16]
 [ 2  0  4 21]]
