## Beer Reviews Data Analysis using Machine Learning and Exploratory Data Visualization and Analysis Techniques

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.graph_objs as plgo
import plotly.figure_factory as plff

In [None]:
# Load the raw beer-review records and preview the first few rows.
csv_path = '../input/beerreviews/beer_reviews.csv'
reviews = pd.read_csv(csv_path)
reviews.head()

### About Dataset

In [None]:
# (number of rows, number of columns) of the raw review table.
(len(reviews.index), len(reviews.columns))

The dataset has almost 1.5 million beer-review entries, each described by 13 attributes.

In [None]:
# Column labels of the raw review table, i.e. the attributes available for analysis.
reviews.columns

In [None]:
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called on its own instead of being wrapped in print() (which
# would append a stray "None" line after the summary).
print('Information about the columns; type,entries,missing value info:  \n')
reviews.info()

In [None]:
## Counting uniqueness by Brewery/Company
# Names and ids are counted separately so the two totals can be compared.
brewery_name_count = reviews['brewery_name'].nunique()
brewery_id_count = reviews['brewery_id'].nunique()
print('No of unique brewery by name:', brewery_name_count)
print('No of unique brewery by ids:', brewery_id_count)

In [None]:
## Counting uniqueness by Beer
# Names and ids are counted separately so the two totals can be compared.
beer_name_count = reviews['beer_name'].nunique()
beer_id_count = reviews['beer_beerid'].nunique()
print('No of unique beer by name:', beer_name_count)
print('No of unique beer by ids:', beer_id_count)

In [None]:
## Counting the number of distinct users who reviewed a beer
reviewer_count = reviews['review_profilename'].nunique()
print('No of unique users, reviewing the given beers: ', reviewer_count)

In a nutshell, this dataset contains about 1.5 million reviews of more than 60 thousand beers, produced by more than 5 thousand breweries.

### Data Preprocessing

Handling Missing values

In [None]:
# Per-column count of missing values. The header ends with a real newline
# escape ("\n"); the previous "/n" was printed literally.
print("Missing values in Dataset: \n", reviews.isnull().sum())

In [None]:
# Drop every row that has at least one missing value.
reviews = reviews.dropna(axis=0)

# DataFrame.info() prints its own summary and returns None, so the header is
# printed first and info() is called separately. Passing info() to print()
# emitted the summary *before* the header and then printed "None".
# The header also uses a real newline escape ("\n") instead of the literal "/n".
print("After removing the missing entries in dataset: \n")
reviews.info()

Handling Duplicate values

In [None]:
# Flag every row whose (reviewer, beer name) pair occurs more than once;
# keep=False marks all members of a duplicate group, not only the repeats.
dup_by_name = reviews.duplicated(['review_profilename', 'beer_name'], keep=False)
print('User reviewed one beer more than ones,categorised by beer name: \n',
      reviews.loc[dup_by_name])

In [None]:
# Flag every row whose (reviewer, beer id) pair occurs more than once;
# keep=False marks all members of a duplicate group, not only the repeats.
dup_by_id = reviews.duplicated(['review_profilename', 'beer_beerid'], keep=False)
print('User reviewed one beer more than ones,categorised by beer ids: \n',
      reviews.loc[dup_by_id])

In [None]:
## Order rows from highest to lowest overall rating, so that when duplicate
## reviews are dropped later with keep='first', the surviving row for each
## user/beer pair is the one with the highest rating.
reviews = reviews.sort_values(by=['review_overall'], ascending=False)

In [None]:
## Removing the duplicates
# Two passes: first one row per (reviewer, beer name), then one row per
# (reviewer, beer id). keep='first' retains the first row in the current order.
# NOTE(review): the name-based pass also drops reviews of different beers that
# happen to share a name — confirm this is intended.
reviews = (
    reviews
    .drop_duplicates(subset=['review_profilename', 'beer_name'], keep='first')
    .drop_duplicates(subset=['review_profilename', 'beer_beerid'], keep='first')
)
reviews.shape

In [None]:
# Summary statistics, rounded to whole numbers for readability.
reviews.describe().round()

In the dataset, the `review_time` column has the int64 datatype (seconds since the Unix epoch), so we convert it to datetime format.

In [None]:
# Interpret review_time as seconds since the epoch and store it as timestamps,
# then list the column dtypes to confirm the conversion.
reviews['review_time'] = pd.to_datetime(reviews['review_time'], unit='s')
reviews.dtypes

### Data visualization and analysis

In [None]:
# Histograms of the dataset's columns, drawn on a single large figure.
hist_size = (20, 20)
reviews.hist(figsize=hist_size)

### Top 10 popular breweries by review count

In [None]:
# Number of reviews per brewery, largest first; the first ten are the most reviewed.
popular_brewery = (
    reviews
    .groupby('brewery_name')['brewery_name']
    .count()
    .sort_values(ascending=False)
)
popular_breweries = popular_brewery.iloc[:10]
popular_breweries

In [None]:
# Present the top-10 review counts as a one-column table with a readable header.
Top_10_popular_breweries = popular_breweries.to_frame(
    name='Top Brewery Name wrt to its review counts'
)
Top_10_popular_breweries

### Top 10 Breweries with most beer types present

In [None]:
# Count the distinct beer names reviewed for each brewery, then keep the ten
# breweries with the most distinct beers as a one-column table.
beers_per_brewery = reviews.groupby(['brewery_name'])['beer_name'].nunique()
top_10_breweries_by_beertype = (
    beers_per_brewery
    .sort_values(ascending=False)
    .iloc[:10]
    .to_frame(name='Top Breweries wrt to Beer type presence')
)
top_10_breweries_by_beertype

### Top 10 popular Beers wrt reviews counts

In [None]:
# Number of reviews per beer name; keep the ten most-reviewed names.
reviews_per_beer = reviews.groupby('beer_name')['beer_name'].count()
top_10_popular_beers = (
    reviews_per_beer
    .sort_values(ascending=False)
    .iloc[:10]
    .to_frame(name='Beer Names wrt Reviews counts')
)
top_10_popular_beers

### Top 10 beers with Highest Ratings

In [None]:
# Mean overall rating per beer name, highest first; keep the top ten.
# NOTE(review): no minimum review count is applied here, so a beer with a
# single perfect review can rank first — confirm that is acceptable.
mean_rating_by_beer = reviews.groupby('beer_name')['review_overall'].mean()
Top_beer_ratings = mean_rating_by_beer.sort_values(ascending=False).iloc[:10]
Top_beer_ratings.reset_index()

### Top 10 popular Beer styles

In [None]:
# Number of reviews per beer style; keep the ten most-reviewed styles.
reviews_per_style = reviews.groupby('beer_style')['beer_style'].count()
popular_beer_styles = (
    reviews_per_style
    .sort_values(ascending=False)
    .iloc[:10]
    .to_frame(name='Beer Styles wrt review counts')
)
popular_beer_styles

In [None]:
## Beer Styles wrt top ratings

# Mean overall rating per beer style, highest first; keep the top ten as a table.
mean_rating_by_style = reviews.groupby('beer_style')['review_overall'].mean()
Top_rating_beer_styles = (
    mean_rating_by_style
    .sort_values(ascending=False)
    .iloc[:10]
    .to_frame()
)
Top_rating_beer_styles

### Beers categorised as highly rated with respect to appearance, aroma and taste

In [None]:
## Aroma, taste and appearance

# Keep only the beer name and the three sensory rating columns.
sensory_columns = ['beer_name', 'review_aroma', 'review_appearance', 'review_taste']
beers_looks_smell = reviews[sensory_columns]

In [None]:
# Best (maximum) aroma / appearance / taste rating received by each beer,
# ordered so the beers with the highest ratings come first.
#
# Fix: the previous version mixed up `beer_looks_smell` (the per-beer result)
# and `beers_looks_smell` (the raw column selection). The grouped result was
# overwritten by a sort of the raw frame, and the cell then displayed the first
# 15 raw rows, so neither the grouping nor the sort ever reached the output.
# `.max()` is used instead of `.aggregate(['max'])` so the columns stay flat
# and can be sorted by their plain names.
beer_looks_smell = (
    beers_looks_smell
    .groupby('beer_name')
    .max()
    .sort_values(by=['review_aroma', 'review_appearance', 'review_taste'],
                 ascending=False)
)
beer_looks_smell.iloc[0:15]

### Top Breweries/beers with highest ratings and great taste

In [None]:
# Keep the brewery, the beer, and the overall and taste ratings.
rating_taste_columns = ['brewery_name', 'beer_name', 'review_overall', 'review_taste']
top_ratings_taste = reviews[rating_taste_columns]

In [None]:
# Maximum overall and taste rating for each (brewery, beer) pair.
#
# Fix: the grouped result was previously displayed in group-key (alphabetical)
# order, so the "top 20" were just the first 20 brewery/beer names. It is now
# sorted by best overall rating, then best taste rating, both descending.
# aggregate(['max']) yields two-level column labels, hence the tuple sort keys.
top_ratings_taste = (
    top_ratings_taste
    .groupby(['brewery_name', 'beer_name'])
    .aggregate(['max'])
    .sort_values(by=[('review_overall', 'max'), ('review_taste', 'max')],
                 ascending=False)
)
top_ratings_taste.iloc[0:20]

In [None]:
# Review count and mean overall rating for every beer name.
reshape = reviews.groupby('beer_name')[['review_overall']].agg(['count', 'mean'])

# The aggregated columns carry two-level labels: ('review_overall', <stat>).
mean_rating = reshape[('review_overall', 'mean')]
review_count = reshape[('review_overall', 'count')]

print('Beer with review_overall more than 4: \n', reshape[mean_rating > 4])
print('Beer with review_overall more than 4 and review counts are greater than 200: \n')
# Keep only the beers that are both well rated and frequently reviewed.
Top_beers = pd.DataFrame(reshape[(mean_rating > 4) & (review_count > 200)])
Top_beers

In [None]:
## Now getting the Top 20 most reviewed and most highly rated beers
# Order by review count first, then by mean rating, both descending.
sort_keys = [('review_overall', 'count'), ('review_overall', 'mean')]
Top_beers = Top_beers.sort_values(by=sort_keys, ascending=False).iloc[:20]
Top_beers

The 20 beers above are the most reviewed and among the most highly rated, so they are good recommendations for users.