## Importing the Data...

In [None]:
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
# Capture seaborn's current colour palette (not referenced in the cells shown here).
color = sns.color_palette()
# Switch seaborn to its 'darkgrid' style for the figures drawn below.
sns.set_style('darkgrid')

In [None]:
# Read the beer table and discard 'Unnamed: 0', the index column that was
# written into the original .csv data.
beer = pd.read_csv('../input/beers.csv').drop(columns=['Unnamed: 0'])

## Engineer the Data...

### View and Explore

In [None]:
# Peek at the first five rows (five is the default for head()).
beer.head()

In [None]:
# List the column labels that remain after dropping the old index column.
beer.columns

In [None]:
# Print the (rows, columns) dimensions of the frame.
print(beer.shape)

### Address Gaps

In [None]:
# Percentage of missing values per column: the mean of the boolean null mask
# down the rows is the null fraction, scaled here to percent.
beer.isna().mean(axis=0) * 100

In [None]:
# Remove the 'ibu' column. The membership guard makes the cell safe to re-run:
# a bare `del` raises KeyError the second time because the column is gone.
if 'ibu' in beer.columns:
    del beer['ibu']

## Analyze the Data...

### Most Popular Craft Beer Styles

In [None]:
# Count beers per style (non-null 'id' values), most common first.
style = (
    beer.groupby('style')['id']
    .count()
    .sort_values(ascending=False)
)
# Positional slice: keep the 25 styles with the highest counts.
top25_style = style.iloc[:25]

In [None]:
# Bar chart of the 25 most common beer styles.
fig, ax = plt.subplots(figsize=(15, 12))
# Pass both vectors by keyword: seaborn >= 0.12 treats the first positional
# argument as `data`, so a positional x vector no longer works.
sns.barplot(x=top25_style.index, y=top25_style.values, ax=ax)
# Rotation must be a number ('90' as a string is rejected by current matplotlib).
# It is applied after plotting so it affects the category ticks seaborn creates.
ax.tick_params(axis='x', labelrotation=90)
ax.set_xlabel('Styles', fontsize=15)
ax.set_ylabel('Count', fontsize=15)
# Trailing semicolon hides the Text(...) repr in the cell output.
ax.set_title('Number of Craft Beers by Style', fontsize=15);

### Naming a Craft Beer by Style

In [None]:
# One row per style, with all of that style's beer names collected into a list.
names_by_style = (
    beer.groupby('style')['name']
    .apply(list)
    .reset_index()
)

In [None]:
# Number of distinct styles (one row per style).
names_by_style.shape[0]

In [None]:
# Attribute access `names_by_style.style` resolves to the DataFrame Styler
# accessor, not the 'style' column, so bracket indexing is required here.
names_by_style['style']

In [None]:
# Record how many beer names each style's list holds.
names_by_style['num_names'] = names_by_style['name'].apply(len)

In [None]:
# Order the styles from most to fewest beer names.
names = names_by_style.sort_values('num_names', ascending=False)

In [None]:
# Show the ten styles with the most beer names (frame is sorted descending).
names.head(10)

In [None]:
# Median number of beer names per style.
names['num_names'].median()

In [None]:
# Keep only the styles that have more than 50 named beers.
has_enough_names = names['num_names'] > 50
names = names[has_enough_names]

In [None]:
# Number of styles that survived the filter.
names.shape[0]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
def get_names(x):
    """Return the ten most frequent words found in a collection of beer names.

    Parameters
    ----------
    x : iterable of str
        Beer names to tokenize (here: all names belonging to one style).

    Returns
    -------
    str
        The string form of a list holding up to ten tokens, ordered from
        most to least frequent. Ties keep the vocabulary order produced by
        the vectorizer because the sort is stable.

    Notes
    -----
    Style words, years and other stray numbers are passed to the vectorizer
    as stop words so they are left out of the counts.
    """
    stop_words = ['american', 'pale', 'ipa', 'ale', 'apa','red', 'amber', 'blonde', 'double', 'imperial', 'beer',
                 'wheat', 'brown', 'porter', 'saison', 'farmhouse', 'witbier', 'the', 'a', 'an', 'is', 'am', 'be',
                 'rye', 'india', 'session', 'extra', 'of', 'wit', 'style','2010', '2011', '2012', '2013', '2004',
                 '2014', 'on', '1881', '2009', '2006', '12', '16', '1335', '413', '88', '2007','2015','805',
                 'hop', 'hopped','full', 'farm', 'white','belgian']
    cv = CountVectorizer(stop_words=stop_words)
    cv_fit = cv.fit_transform(x)
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on releases that predate it.
    if hasattr(cv, 'get_feature_names_out'):
        vocab = cv.get_feature_names_out()
    else:
        vocab = cv.get_feature_names()
    # Column sums of the document-term matrix give each token's total count.
    counts = cv_fit.toarray().sum(axis=0)
    top10 = sorted(zip(vocab, counts), key=lambda pair: pair[1], reverse=True)[:10]
    # Cast to plain str so numpy string reprs never leak into the output text.
    return str([str(term) for term, _ in top10])
# Build a new frame instead of assigning a column onto the filtered slice:
# this avoids pandas' SettingWithCopyWarning and keeps the cell re-runnable.
names = names.assign(feat_counts=names['name'].apply(get_names))

In [None]:
# Show the full token lists without truncation. `None` means "no limit";
# the old `-1` sentinel is deprecated and rejected by recent pandas releases.
pd.set_option('display.max_colwidth', None)
names[['style', 'feat_counts']]

### 1. American IPA
   * The Big & Green, High Seas
   * Fistful of Grapefruit
   
### 2. American Pale Ale
   * Caldera Trail
   * Bitter River
   * Oatmeal Harvest
   
### 3. American Amber / Red Ale
   * Big Atlantic
   * Ashyard
   * Fat Mountain
   
### 4. American Blonde Ale
   * Summer Beach
   * The Great Carolina Farmer
   * Black River
   
### 5. American Double / Imperial IPA
   * The Knight Crusher
   * The Knight Slayer
   * Cockaded Biker Named Oz

### 6. American Pale Wheat Ale
   * Nonstop Nude
   * Wicked Pete
   * Nude Beach
   
### 7. American Brown Ale
   * Moose Drool
   * Big Black Bear
   * Coffee Nut

### 8. American Porter
   * Black Panther
   * Bourbon Barrel Cowbell
   * Coconut Crescent
   
### 9. Saison / Farmhouse Ale
   * Acidulated River
   * Barn Alter
   * Walloon House
   
### 10. Witbier
   * Zombie Patrol
   * Orange Summer Moon
   * Moon Archer
   