In [1]:
from sklearn import tree
import pandas as pd
import os
import numpy as np
import tensorflow as tf
import unicodedata
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical
from sklearn.ensemble import RandomForestClassifier

In [2]:
### Import the data and do preliminary cleaning
df = pd.read_csv("../beer_reviews.csv")
parsed_data = df.drop_duplicates()
parsed_data = parsed_data.dropna()
parsed_data = parsed_data.reset_index()
parsed_data.head()

Unnamed: 0,index,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986
1,1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213
2,2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215
3,3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969
4,4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883


In [3]:
### Removing accent marks from beer styles
def strip_accents(text):
    try:
        text = unicode(text, 'utf-8')
    except NameError: 
        pass
    text = unicodedata.normalize('NFD', text)\
           .encode('ascii', 'ignore')\
           .decode("utf-8")
    return str(text)

In [4]:
### Use apply function
parsed_data["beer_style"] = parsed_data["beer_style"].apply(strip_accents)
parsed_data.head()

Unnamed: 0,index,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986
1,1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213
2,2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215
3,3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969
4,4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883


In [5]:
### Get style names to convert and replace them in the table
beer_types = pd.read_csv("Beer_Styles_All_Groups.csv")
# beer_types.head()

for name in beer_types.index:
    parsed_data = parsed_data.replace(to_replace=beer_types.iloc[name]['beer_style'], value=beer_types.loc[name]['subgroup_5'])

In [6]:
parsed_data.head()

Unnamed: 0,index,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Pale Ale,1.5,1.5,Sausa Weizen,5.0,47986
1,1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,Pale Ale,3.0,3.0,Red Moon,6.2,48213
2,2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Dark Ale,3.0,3.0,Black Horse Black Beer,6.5,48215
3,3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,Pale Lager,2.5,3.0,Sausa Pils,5.0,47969
4,4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,Pale Ale,4.0,4.5,Cauldron DIPA,7.7,64883


In [7]:
### Remove some styles to increase accuracy
# parsed_data = parsed_data.loc[parsed_data["beer_style"] != "Other"]
parsed_data = parsed_data.loc[parsed_data["beer_style"] != "Specialty"]
parsed_data = parsed_data.loc[parsed_data["beer_style"] != "Sour Ale"]
parsed_data.head()

Unnamed: 0,index,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Pale Ale,1.5,1.5,Sausa Weizen,5.0,47986
1,1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,Pale Ale,3.0,3.0,Red Moon,6.2,48213
2,2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Dark Ale,3.0,3.0,Black Horse Black Beer,6.5,48215
3,3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,Pale Lager,2.5,3.0,Sausa Pils,5.0,47969
4,4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,Pale Ale,4.0,4.5,Cauldron DIPA,7.7,64883


In [8]:
### Save newly categorized data
# parsed_data.to_csv("Beers_Pale_Dark_Ale_Lager.csv")

In [9]:
### Get feature data
# data = parsed_data[["review_overall", "review_aroma", "review_appearance", "review_palate","review_taste"]]
data = parsed_data[["review_overall", "review_aroma", "review_appearance", "review_palate","review_taste","beer_abv"]]
feature_names = data.columns
data.head()

Unnamed: 0,review_overall,review_aroma,review_appearance,review_palate,review_taste,beer_abv
0,1.5,2.0,2.5,1.5,1.5,5.0
1,3.0,2.5,3.0,3.0,3.0,6.2
2,3.0,2.5,3.0,3.0,3.0,6.5
3,3.0,3.0,3.5,2.5,3.0,5.0
4,4.0,4.5,4.0,4.0,4.5,7.7


In [10]:
### Get target names
target = parsed_data[["beer_style"]]
target_names = parsed_data["beer_style"].unique()
target_names
# target.to_csv("Beer_Styles.csv")

array(['Pale Ale', 'Dark Ale', 'Pale Lager', 'Dark Lager'], dtype=object)

In [11]:
### Split training and testing data
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [12]:
### Scale the data
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [13]:
### Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

  return f(**kwargs)


In [14]:
### Convert encoded labels to one-hot-encoding
y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

In [15]:
### Try Decision Tree Model
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train_scaled, y_train_categorical)
clf.score(X_test_scaled, y_test_categorical)

0.47015687264663586

In [16]:
### Try Random Forest
rf = RandomForestClassifier(n_estimators=100)
rf = rf.fit(X_train_scaled, y_train_categorical)
rf.score(X_test_scaled, y_test_categorical)

0.48514979197084296

In [17]:
### Check feature importances
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.5787832460682426, 'beer_abv'),
 (0.09337879773474768, 'review_aroma'),
 (0.08759631615517655, 'review_overall'),
 (0.08248460509420603, 'review_appearance'),
 (0.0790849433444624, 'review_taste'),
 (0.07867209160316481, 'review_palate')]

# Pickle Model

In [18]:
# import pickle

In [19]:
# # Save to file in the current working directory
# pickle_file = "pickle_rf.pkl"
# pickle_model = None
# with open(pickle_file, 'wb') as file:
#     pickle.dump(rf, file)  
    

In [20]:
# # Load from file
# with open(pickle_file, 'rb') as file:
#     pickle_model = pickle.load(file)

In [21]:
# # Convert on-hot encoded vectors to single digit
# import numpy as np
# rounded_y_train=np.argmax(y_train_categorical, axis=1)
# rounded_y_train[1]

In [22]:
# # Calculate the accuracy score and predict target values
# score = pickle_model.score()
# print("Test score: {0:.2f} %".format(100 * score))
# Ypredict = pickle_model.predict(X_test)

# Pickle Model with joblib

In [23]:
import joblib

In [24]:
### Save to file in the current working directory
joblib_model = None
joblib_file = "joblib_model.pkl"
joblib.dump(rf, joblib_file)

['joblib_model.pkl']

In [25]:
### Load from file
joblib_model = joblib.load(joblib_file)

In [26]:
### Calculate the accuracy and predictions
score = joblib_model.score(X_train_scaled, y_train_categorical)
print("Test score: {0:.2f} %".format(100 * score))
# Ypredict = joblib_model.predict(X_test)

Test score: 57.51 %
