<h1>Import</h1>

In [None]:
import numpy as np
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings raised by the plotting libraries.
warnings.simplefilter("ignore")

<h1>Data and visualization libraries</h1>

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

<h1>Loading the dataset</h1>

In [None]:
# Load the beer review table from the CSV file in the input directory.
data = pd.read_csv("../input/beer_reviews.csv")

In [None]:
# Preview the first 10 rows.
data.head(10)

In [None]:
# Preview the last rows.
data.tail()

In [None]:
# Print the column names, dtypes and non-null counts.
data.info()

In [None]:
# Drop the columns the analysis below does not use, then discard every row
# that still contains a missing value.
data = data.drop(
    columns=["brewery_id", "review_time", "review_profilename", "beer_beerid"]
).dropna()
data.head()

In [None]:
# Summary statistics of the remaining columns.
data.describe()

<h1>Arrange the columns and sort by brewery name</h1>

In [None]:
# Column glossary:
#   brewery_name      - name of the brewery
#   beer_style        - style of the beer
#   beer_name         - name of the beer
#   beer_abv          - alcohol level
#   review_appearance - review of the appearance
#   review_aroma      - review of the aroma
#   review_palate     - review of the taste (palate)
#   review_taste      - review of the taste (taste)
#   review_overall    - overall review
data = (
    data[[
        'brewery_name', 'beer_style', 'beer_name', 'beer_abv',
        'review_appearance', 'review_aroma', 'review_palate',
        'review_taste', 'review_overall',
    ]]
    .sort_values(by=['brewery_name', 'beer_style', 'beer_name', 'beer_abv', 'review_overall'])
    # Renumber the rows 0..n-1 after sorting; the old index is discarded.
    .reset_index(drop=True)
)
data.head()

In [None]:
# Preview the last rows of the sorted table.
data.tail()

<h1>New column review_average</h1>

In [None]:
# review_average is the arithmetic mean of the five review scores.
# It is computed with column arithmetic (one vectorized pass) instead of a
# row-wise apply(); the operands are added in the same order as before, so
# the floating-point results are unchanged.
data['review_average'] = (
    data["review_overall"] + data["review_aroma"] + data["review_appearance"]
    + data["review_palate"] + data["review_taste"]
) / 5

# Drop the rows whose average falls outside the 1-5 range.
out_of_range = (data["review_average"] < 1) | (data["review_average"] > 5)
data = data.drop(index=data.index[out_of_range.values])
data.head()

<h1>Visualization of Top 15</h1>

In [None]:
def _truncate_yticklabels(ax, max_len=17):
    """Cut each y tick label of ``ax`` to ``max_len`` characters plus "..."."""
    ax.set_yticklabels([
        text[:max_len] + "..." if len(text) > max_len else text
        for text in (tick.get_text() for tick in ax.get_yticklabels())
    ])


def visualization_function_1(name, condition, ax_left, ax_right, top_n=15):
    """Draw two horizontal bar charts for the groups of column ``name``.

    Reads the notebook-level ``data`` frame and groups it by ``name``.

    Parameters
    ----------
    name : str
        Column of ``data`` to group by.
    condition : int
        A group is ranked by its mean ``review_average`` only when it has
        more than ``condition`` rows; otherwise its score is set to 1.
    ax_left : matplotlib Axes
        Receives the ``top_n`` groups with the highest mean ``review_average``.
    ax_right : matplotlib Axes
        Receives the ``top_n`` groups with the most rows; titled ``name``.
    top_n : int, default 15
        Number of bars drawn on each axes.
    """
    # sort=False keeps the groups in order of first appearance, which is the
    # order data[name].unique() would give; ties in the rankings below
    # therefore resolve the same way as with a row-by-row accumulation.
    per_group = data.groupby(name, sort=False)["review_average"]
    mean_score = per_group.mean()
    unique_names = mean_score.index.values
    count = per_group.size().values

    # Groups with too few rows get the score 1 so they sort to the bottom.
    rev_aver = np.where(count > condition, mean_score.values, 1.0)

    # sorted() is stable, so equal values keep their first-appearance order.
    best_scored = sorted(zip(unique_names, rev_aver),
                         key=lambda pair: pair[1], reverse=True)[:top_n]
    most_rows = sorted(zip(unique_names, count),
                       key=lambda pair: pair[1], reverse=True)[:top_n]

    # x/y are passed by keyword: recent seaborn releases accept only `data`
    # positionally.
    sns.barplot(x=[score for _, score in best_scored],
                y=[label for label, _ in best_scored], ax=ax_left)
    ax_left.set_xlim(3, 5)
    ax_left.set_xlabel("Scores of review_average")
    # limit the length of names
    _truncate_yticklabels(ax_left)

    sns.barplot(x=[total for _, total in most_rows],
                y=[label for label, _ in most_rows], ax=ax_right).set_title(name)
    ax_right.set_xlabel("Total number of drinks")
    # limit the length of names
    _truncate_yticklabels(ax_right)

sns.set(rc={"axes.grid": True})
fig, axs = plt.subplots(ncols=2, nrows=3, figsize=[16, 24])
fig.subplots_adjust(wspace=0.4)

# One figure row per grouping column: (column, left title, right title).
# The titles are set after the helper runs, so they replace the column-name
# title it puts on the right-hand axes.
panels = [
    ("brewery_name", "The best quality Breweries", "Breweries that produce the most drinks"),
    ("beer_style", "The best styles of beer", "The most popular styles of beer"),
    ("beer_name", "The best quality beer", "The most popular beer"),
]
for (ax_score, ax_count), (column, score_title, count_title) in zip(axs, panels):
    visualization_function_1(column, 5, ax_score, ax_count)
    ax_score.set_title(score_title)
    ax_count.set_title(count_title)

<h1>Distribution by beer_abv</h1>

In [None]:
# Remove the rows with beer_abv above 20; the plots below cover 0-20.
data = data.drop(index=data.index[(data["beer_abv"] > 20).values])

sns.set(rc={"axes.grid": True})
fig, axs = plt.subplots(nrows=2, figsize=[16, 12])
plt.setp(axs, xticks=range(0, 21, 2))

# histplot / bw_method / fill replace the deprecated distplot / bw / shade
# (requires seaborn >= 0.11).
# Top: histogram of counts. Bottom: KDE with the default bandwidth and a
# second KDE with bandwidth factor 0.3.
sns.histplot(data=data, x="beer_abv", bins=50, ax=axs[0], color="m")
sns.kdeplot(data=data, x="beer_abv", ax=axs[1], color="m", label="Original distribution")
sns.kdeplot(data=data, x="beer_abv", ax=axs[1], color="r", bw_method=0.3, fill=True,
            label="Smoothed distribution")
# The curve labels are only displayed once a legend is requested.
axs[1].legend()

axs[0].set_xlim(0, 20)
axs[0].set_ylabel("Total number of drinks")

axs[1].set_xlim(0, 20)
axs[1].set_xlabel("beer_abv")
axs[1].set_ylabel("Distribution");

<h1>Distribution of beer_abv for each review score</h1>

In [None]:
# Every review score column, including the derived average.
score_columns = ["review_appearance", "review_aroma", "review_palate",
                 "review_taste", "review_overall", "review_average"]

# Round off to integers and store as int.
# np.round sends exact halves to the nearest even value (2.5 -> 2, 3.5 -> 4).
for column in score_columns:
    data[column] = np.round(data[column]).astype(int)

# Remove possible anomalous values: drop every row in which any score
# falls outside the 1-5 range.
out_of_range = (data[score_columns] < 1) | (data[score_columns] > 5)
data = data.drop(index=data.index[out_of_range.any(axis=1).values])

In [None]:
sns.set(rc={"axes.grid": True})
fig, axs = plt.subplots(nrows=3, ncols=2, figsize=[18, 20])
plt.setp(axs, yticks=range(0, 21, 2))

# Score column shown in each cell of the 3x2 grid, laid out like the figure.
x_str = [['review_appearance', 'review_aroma'],
         ['review_taste',      'review_palate'],
         ['review_overall',    'review_average']]

# Walk the axes grid and the column grid together, row by row.
for ax, column in zip(axs.flat, (col for row in x_str for col in row)):
    sns.violinplot(x=column, y='beer_abv', data=data, ax=ax,
                   saturation=0.9, width=0.9)
    ax.set_ylim(0, 20)

<h1>Number of drinks for each review score</h1>

In [None]:
# Bar chart of how many rows fall on each rounded review_average value.
plt.figure(figsize=[8, 6])
ax = sns.countplot(x='review_average', data=data, saturation=0.9)
ax.set(xlabel="review_average", ylabel="Total number of drinks");