In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import os
from sklearn.preprocessing import LabelBinarizer
from config import pw

In [None]:
# Load the raw review table and preview its first ten rows.
beer = pd.read_csv(filepath_or_buffer='data/beer_reviews.csv')
beer.head(n=10)

In [None]:
def plot_corr(beer, size=11):
    """
    Plot a colour-coded correlation matrix for the numeric columns of a DataFrame.

    Input:
        beer: pandas DataFrame; non-numeric (e.g. text) columns are left out
        size: width and height of the square figure, in inches

    Displays:
        One cell per pair of numeric columns, coloured by their correlation
        (range -1 to 1) using matplotlib's default colormap.  The diagonal from
        top left to bottom right is each column against itself (correlation 1).
    """

    # Restrict to numeric/bool columns explicitly: pandas >= 2.0 no longer
    # drops text columns silently in DataFrame.corr() and raises instead.
    corr = beer.select_dtypes(include=["number", "bool"]).corr()

    fig, ax = plt.subplots(figsize=(size, size))
    ax.matshow(corr)   # colour each cell by its correlation value

    tick_positions = range(len(corr.columns))
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(corr.columns, rotation=90)  # vertical so long names do not overlap
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(corr.columns)

In [None]:
# Draw the correlation matrix of the raw review table at the default figure size.
plot_corr(beer)

In [None]:
# Correlation table for the numeric columns only. The frame also holds text
# columns (brewery_name, beer_style, ...), which DataFrame.corr() rejects on
# pandas >= 2.0 instead of silently skipping them.
beer.select_dtypes(include=["number", "bool"]).corr()

In [None]:
# Mean of every review dimension per beer name, computed in a single groupby
# pass instead of five separate ones. The five Series keep their names, index
# (beer_name) and values.
_review_means = beer.groupby("beer_name")[
    ["review_overall", "review_taste", "review_aroma", "review_appearance", "review_palate"]
].mean()
overall_review = _review_means["review_overall"]
taste_review = _review_means["review_taste"]
aroma_review = _review_means["review_aroma"]
appearance_review = _review_means["review_appearance"]
palate_review = _review_means["review_palate"]

In [7]:
# Collect the per-beer mean Series into one frame, one column per review dimension.
_mean_columns = {
    "overall_review": overall_review,
    "taste_review": taste_review,
    "aroma_review": aroma_review,
    "appearance_review": appearance_review,
    "palate_review": palate_review,
}
beer_df_updated = pd.DataFrame(_mean_columns)


In [8]:
# Left-join the per-beer means onto every review row, matching on beer_name.
merged_df = beer.merge(beer_df_updated, how="left", on="beer_name")
merged_df.head(n=10)

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid,overall_review,taste_review,aroma_review,appearance_review,palate_review
0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986,1.5,1.5,2.0,2.5,1.5
1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213,3.0,3.0,2.5,3.0,3.0
2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215,3.0,3.0,2.5,3.0,3.0
3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969,3.0,3.0,3.0,3.5,2.5
4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883,4.0,4.5,4.5,4.0,4.0
5,1075,Caldera Brewing Company,1325524659,3.0,3.5,3.5,oline73,Herbed / Spiced Beer,3.0,3.5,Caldera Ginger Beer,4.7,52159,3.833333,3.777778,3.722222,3.666667,3.277778
6,1075,Caldera Brewing Company,1318991115,3.5,3.5,3.5,Reidrover,Herbed / Spiced Beer,4.0,4.0,Caldera Ginger Beer,4.7,52159,3.833333,3.777778,3.722222,3.666667,3.277778
7,1075,Caldera Brewing Company,1306276018,3.0,2.5,3.5,alpinebryant,Herbed / Spiced Beer,2.0,3.5,Caldera Ginger Beer,4.7,52159,3.833333,3.777778,3.722222,3.666667,3.277778
8,1075,Caldera Brewing Company,1290454503,4.0,3.0,3.5,LordAdmNelson,Herbed / Spiced Beer,3.5,4.0,Caldera Ginger Beer,4.7,52159,3.833333,3.777778,3.722222,3.666667,3.277778
9,1075,Caldera Brewing Company,1285632924,4.5,3.5,5.0,augustgarage,Herbed / Spiced Beer,4.0,4.0,Caldera Ginger Beer,4.7,52159,3.833333,3.777778,3.722222,3.666667,3.277778


In [9]:
# Keep only the first row seen for each beer_name.
beer_data_df = merged_df.drop_duplicates(subset=["beer_name"], keep="first")
beer_data_df.head(n=7)

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid,overall_review,taste_review,aroma_review,appearance_review,palate_review
0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986,1.5,1.5,2.0,2.5,1.5
1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213,3.0,3.0,2.5,3.0,3.0
2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215,3.0,3.0,2.5,3.0,3.0
3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969,3.0,3.0,3.0,3.5,2.5
4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883,4.0,4.5,4.5,4.0,4.0
5,1075,Caldera Brewing Company,1325524659,3.0,3.5,3.5,oline73,Herbed / Spiced Beer,3.0,3.5,Caldera Ginger Beer,4.7,52159,3.833333,3.777778,3.722222,3.666667,3.277778
10,163,Amstel Brouwerij B. V.,1010963392,3.0,2.0,3.0,fodeeoz,Light Lager,2.5,2.5,Amstel Light,3.5,436,2.719355,2.363441,2.231183,2.568817,2.450538


In [10]:
# Drop the per-review columns (timestamp and reviewer name) that are not needed.
_per_review_columns = ["review_time", "review_profilename"]
clean_beer = beer_data_df.drop(columns=_per_review_columns)

In [11]:
# One-hot encode beer_style (one indicator column per style) for the cluster analysis.
style_cols = pd.get_dummies(beer_data_df["beer_style"])

In [12]:
# Append the style indicator columns to the cleaned per-beer frame (aligned on the row index).
beer_df = pd.concat(objs=[clean_beer, style_cols], axis="columns")

In [13]:
# Set the identifier and text columns aside so they are not used as model features.
_non_feature_columns = ["brewery_name", "beer_name", "beer_style", "brewery_id", "beer_beerid"]
dropped_df = beer_df.drop(columns=_non_feature_columns)

In [14]:
# Replace every missing feature value with 0 so the estimators below receive no NaNs.
new_beer_df = dropped_df.fillna(value=0)

In [15]:
# Feature matrix: the per-beer frame with identifier/text columns dropped and NaNs replaced by 0.
X = new_beer_df

In [16]:
# Target labels: the beer id of each de-duplicated row.
# NOTE(review): the rows were de-duplicated by beer_name, so presumably almost
# every beer_beerid occurs only once; the KNN run below reports a testing score
# of 0.0, which would be consistent with that. Confirm this is the intended target.
Y = beer_df['beer_beerid']

In [17]:
# Sanity check: (rows, feature columns) of the feature matrix.
X.shape

(37770, 115)

In [18]:
# Sanity check: the target must have as many entries as X has rows.
Y.shape

(37770,)

## K Nearest Neighbors 

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

In [20]:
def k_nearest_neighbors(x_train, y_train, n_neighbors=100):
    """
    Fit a K-nearest-neighbours classifier on the training data.

    Input:
        x_train: training features
        y_train: training labels, one per row of x_train
        n_neighbors: number of neighbours consulted per prediction
                     (defaults to 100, the value previously hard-coded)

    Returns:
        the fitted KNeighborsClassifier
    """
    classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
    classifier.fit(x_train, y_train)

    return classifier

In [21]:
# Hold out 20% of the rows for testing. The fixed seed keeps the split, and
# therefore the reported scores, reproducible across kernel restarts.
x_train,x_test,y_train,y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [22]:
def build_and_train_classifier(x_train, y_train, classification_fn, x_eval=None, y_eval=None):
    """
    Train a classifier and print its accuracy on the training and test data.

    Input:
        x_train: training features
        y_train: training labels
        classification_fn: callable taking (x_train, y_train) and returning a
                           fitted model that provides predict() and score()
        x_eval: optional test features; defaults to the notebook-level x_test
        y_eval: optional test labels; defaults to the notebook-level y_test

    Displays:
        "Training score" (model.score on the training data) and
        "Testing score" (accuracy_score of the predictions on the test data).
    """
    # Fall back to the notebook's hold-out split so existing three-argument
    # calls behave exactly as before; passing the sets explicitly removes the
    # hidden dependency on global state.
    if x_eval is None:
        x_eval = x_test
    if y_eval is None:
        y_eval = y_test

    model = classification_fn(x_train, y_train)
    y_pred = model.predict(x_eval)
    train_score = model.score(x_train, y_train)
    test_score = accuracy_score(y_eval, y_pred)

    print("Training score: " , train_score)
    print("Testing score: " , test_score)

In [23]:
# Fit the KNN classifier on the training split and print its train/test scores.
build_and_train_classifier(x_train,y_train,k_nearest_neighbors)

Training score:  0.007314005824728621
Testing score:  0.0


## K Means Clustering

In [33]:
from sklearn.cluster import KMeans
from sklearn.model_selection import ParameterGrid
from sklearn import metrics
from sklearn.cluster import DBSCAN
from sklearn.cluster import MeanShift


In [25]:
# sse_error = []
# for n_clusters in range(1000,1500):
#     kmeans = KMeans(n_clusters)
#     kmeans.fit(new_beer_df)
#     sse_error.append(kmeans.inertia_)

In [26]:

# fig = plt.figure()
# ax = fig.add_axes([0, 0, 1, 1])
# ax.plot(range(1000,1500), sse_error, marker='o')
# ax.set_xlabel('n_clusters')
# ax.set_ylabel('SSE')

In [27]:
# # set to 2000 based off functionality and final entry total
# kmeans = KMeans(n_clusters=2000)

In [28]:
# # Fit the model to the data
# kmeans.fit(new_beer_df)

In [29]:
# kmeans.labels_

# Silhouette Score 

The silhouette score does not require labelled data. It measures how similar an object is to the other objects in its own cluster compared with how different it is from the objects in other clusters. The overall silhouette score is the mean of the silhouette coefficients of all samples. It is bounded between -1 and +1: +1 indicates perfect clustering, -1 indicates incorrect clustering, and 0 indicates overlapping clusters. The score tends to be higher for dense, well-separated clusters.


In [30]:
# from sklearn.metrics import silhouette_score
# score = silhouette_score(new_beer_df,kmeans.labels_)
# print("Score = ",score)

In [37]:
# Candidate cluster counts to try in the KMeans search below.
_cluster_counts = [2500, 2700, 3000, 3200, 3500, 3700, 4000]
parameters = {'n_clusters': _cluster_counts}
parameter_grid = ParameterGrid(parameters)

In [38]:
# Show every parameter combination the search will try.
list(parameter_grid)

[{'n_clusters': 2500},
 {'n_clusters': 2700},
 {'n_clusters': 3000},
 {'n_clusters': 3200},
 {'n_clusters': 3500},
 {'n_clusters': 3700},
 {'n_clusters': 4000}]

In [39]:
# Search state: -1 is the lowest possible silhouette score, so any real fit
# replaces it. best_grid starts as None so it is defined even before the search runs.
best_score = -1
best_grid = None
# Fixed seed so the KMeans initialisation, and the scores compared in the
# search, are reproducible across kernel restarts.
model = KMeans(random_state=42)

In [None]:
# Grid-search the number of clusters, scoring each fit by its silhouette score.
# Reset the trackers here so re-running this cell starts a fresh search instead
# of comparing against a best_score left over from a previous run.
best_score = -1
best_grid = None
best_labels = None
for g in parameter_grid:
    model.set_params(**g)
    model.fit(new_beer_df)

    ss = metrics.silhouette_score(new_beer_df, model.labels_)
    print('Parameter: ', g, 'Score: ', ss)
    if ss > best_score:
        best_score = ss
        best_grid = g
        # `model` is refit on every iteration, so keep the winning assignment;
        # after the loop `model` only holds the last configuration tried.
        best_labels = model.labels_.copy()

Parameter:  {'n_clusters': 2500} Score:  0.10686654395627353
Parameter:  {'n_clusters': 2700} Score:  0.10908330407434084


In [None]:
# range_n_clusters = list (range(2000,2020))
# for n_clusters in range_n_clusters:
#     clusterer = KMeans(n_clusters=n_clusters)
#     preds = clusterer.fit_predict(new_beer_df)
#     centers = clusterer.cluster_centers_

#     score = silhouette_score(new_beer_df, preds)
#     print("For n_clusters = {}, silhouette score is {})".format(n_clusters, score))

In [None]:
# Predict the clusters
# `kmeans` is only created in commented-out cells, so on a fresh kernel this
# cell used to fail with a NameError. Build and fit the final model here with
# the best setting found by the grid search above.
kmeans = KMeans(random_state=42, **best_grid)
kmeans.fit(new_beer_df)
predicted_clusters = kmeans.predict(new_beer_df)
predicted_clusters

In [None]:
# Attach the cluster labels to the full per-beer frame (the one that still has
# the name/id columns). This adds a "cluster" column to beer_df in place.
# The labels were predicted from new_beer_df, which was derived from beer_df by
# drop/fillna only, so they are assumed to be in the same row order.
beer_df["cluster"] = predicted_clusters

In [None]:
# Keep only the identifying columns, the cluster label, ABV and the per-beer
# mean scores, in presentation order.
_ordered_columns = [
    'beer_name', 'beer_beerid', 'brewery_name', 'brewery_id', 'beer_style',
    'cluster', 'beer_abv',
    'overall_review', 'aroma_review', 'appearance_review', 'palate_review', 'taste_review',
]
data_df = beer_df[_ordered_columns]

In [None]:
# Give the id and cluster columns their final names and preview the result.
_column_renames = {'beer_beerid': 'beer_id', 'cluster': 'cluster_group'}
beer_data = data_df.rename(columns=_column_renames)
beer_data.head()