In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Load the beer reviews CSV from the Kaggle input directory into a DataFrame.
reviews_csv_path = '/kaggle/input/beerreviews/beer_reviews.csv'
df = pd.read_csv(reviews_csv_path)

In [None]:
### Exploring the Beer Reviews dataset
# Show the first five rows to get a feel for the available columns.
df.head(n=5)

In [None]:
### How many reviews are in the dataset?
# The number of rows in the frame is taken as the number of reviews.
len(df)

In [None]:
### What are the top ten breweries by overall rating?
# Mean overall rating per brewery, highest first; no minimum review count is applied here.
brewery_mean_overall = df.groupby('brewery_name')['review_overall'].mean()
brewery_mean_overall.sort_values(ascending=False).head(10)

In [None]:
### Overall review rating might not be enough. How many breweries have an average overall rating of 5?
# Named aggregation produces the final flat column names directly, so there is
# no MultiIndex to drop and no manual column renaming afterwards.
overall_reviews = (
    df.groupby('brewery_name')
    .agg(
        review_overall=('review_overall', 'mean'),   # mean overall rating per brewery
        review_count=('review_overall', 'count'),    # number of non-null overall ratings
    )
    .reset_index()
)

# Breweries whose mean overall rating is exactly 5, most-reviewed first.
overall_reviews[overall_reviews['review_overall'] == 5].sort_values('review_count', ascending=False)

In [None]:
### Perfect ratings don't seem to be a very good indication of a top brewery,
### as these breweries have very few ratings.
### What are the highest rated breweries if the number of reviews is taken into account?
### Let's look at the breweries that have at least 1,000 reviews
# Keep only breweries with at least 1,000 reviews, then rank them by mean overall rating.
has_enough_reviews = overall_reviews['review_count'] >= 1000
most_rated = overall_reviews.loc[has_enough_reviews].sort_values('review_overall', ascending=False)
most_rated.head(10)

In [None]:
### What are the most common styles that these breweries make?

# Restrict to reviews of the ten best-rated breweries (among those in most_rated).
top_brewery_names = most_rated['brewery_name'].head(10)
top_brewery_reviews = df[df['brewery_name'].isin(top_brewery_names)]

# NOTE: this counts rows per style, i.e. reviews rather than distinct beers,
# so "most common" here means "most reviewed" for these breweries.
style_review_counts = top_brewery_reviews.groupby('beer_style')['brewery_name'].count()
top_styles = style_review_counts.sort_values(ascending=False).head(10)
top_styles

In [None]:
### What is the average ABV (alcohol content) of beers in these styles and how does this compare to the average ABV?

# Flag every review whose style is one of the ten styles in top_styles.
# This adds the 'is_top_style' column to df itself; later cells see it too.
df['is_top_style'] = df['beer_style'].isin(top_styles.index)

# Compare the ABV distribution of flagged vs. unflagged reviews; outliers are hidden.
sns.boxplot(data=df, x='is_top_style', y='beer_abv', showfliers=False)

In [None]:
### It looks like the styles most reviewed for the top breweries have a higher ABV than the other styles.
### Is there a correlation between ABV and beer rating?

# Reviews belonging to the breweries with at least 1,000 reviews (most_rated).
abv_reviews = df[df['brewery_name'].isin(most_rated['brewery_name'])]

# One point per beer. Group by brewery AND beer name: a beer name on its own is
# not guaranteed to identify a single beer, and grouping by name alone would
# average together same-named beers from different breweries.
abv_by_beer = (
    abv_reviews
    .groupby(['brewery_name', 'beer_name'])
    .agg({'review_overall': 'mean', 'beer_abv': 'mean'})
    .reset_index()
)

# Scatter plot of mean rating against mean ABV with a fitted regression line.
sns.lmplot(data=abv_by_beer, x="beer_abv", y="review_overall", height=6, aspect=1.5)

In [None]:
### There doesn't seem to be a strong correlation between beer ABV and rating
### Are there any other features that may have a correlation?

# Reviews belonging to the breweries with at least 1,000 reviews (most_rated).
corr_reviews = df[df['brewery_name'].isin(most_rated['brewery_name'])]

# Select numeric/bool columns explicitly before correlating. Since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises on
# string columns such as 'brewery_name'; this form works on old and new pandas.
corr_reviews.select_dtypes(include=['number', 'bool']).corr()

In [None]:
### Unsurprisingly, beer taste has the strongest correlation with the overall review, followed by palate and aroma
# Reviews belonging to the breweries with at least 1,000 reviews (most_rated).
facet_reviews = df[df['brewery_name'].isin(most_rated['brewery_name'])]

# Mean of each review dimension per beer. Group by brewery AND beer name so that
# same-named beers from different breweries are not averaged into one row.
corr_df = facet_reviews.groupby(['brewery_name', 'beer_name']).agg({
    'review_overall': 'mean',
    'review_aroma': 'mean',
    'review_taste': 'mean',
    'review_palate': 'mean'}).reset_index()

# Long format: one row per (beer, review dimension). The remaining columns
# (aroma, taste, palate) become 'variable' / 'value'; brewery_name must be an
# id column so that it is not melted as if it were a measurement.
corr_df = pd.melt(corr_df, id_vars=['review_overall', 'brewery_name', 'beer_name'])

# One regression panel per review dimension against the overall rating.
sns.lmplot(data=corr_df, x='value', y='review_overall', col='variable')