## NLP

### The original dataset is available for download on [Kaggle](https://www.kaggle.com/ehallmar/beers-breweries-and-beer-reviews#reviews.csv). 

### The following steps demonstrate the data cleaning process for NLP use. Only entries with text reviews were preserved.

In [1]:
import pandas as pd
import numpy as np
import tables
import os

In [2]:
df = pd.read_csv('../data/csv/reviews.csv')

# Displays the number of rows in df
print('The number of rows are: ' + str(len(df.index)))

The number of rows are: 9073128


In [3]:
df.head(5)

Unnamed: 0,beer_id,username,date,text,look,smell,taste,feel,overall,score
0,271781,bluejacket74,2017-03-17,"750 ml bottle, 2016 vintage, bottle #304 of...",4.0,4.0,4.0,4.25,4.0,4.03
1,125646,_dirty_,2017-12-21,,4.5,4.5,4.5,4.5,4.5,4.5
2,125646,CJDUBYA,2017-12-21,,4.75,4.75,4.75,4.75,4.75,4.75
3,125646,GratefulBeerGuy,2017-12-20,0% 16 oz can. Funny story: As I finally wal...,4.75,4.75,4.5,4.5,4.5,4.58
4,125646,LukeGude,2017-12-20,Classic TH NEIPA. Overflowing head and bouq...,4.25,4.5,4.25,4.25,4.25,4.31


In [4]:
mid_df = df.drop(columns=['date','look','smell','taste','feel','overall','score'])

In [5]:
# list comprehension to strip each row
mid_df['og_review'] = [x.strip(" \xa0") for x in mid_df['text']]

In [6]:
# Remove column with extraneous string characters
mid_df = mid_df.drop(['text'], axis = 1)
mid_df.head()

Unnamed: 0,beer_id,username,og_review
0,271781,bluejacket74,"750 ml bottle, 2016 vintage, bottle #304 of 36..."
1,125646,_dirty_,
2,125646,CJDUBYA,
3,125646,GratefulBeerGuy,0% 16 oz can. Funny story: As I finally walked...
4,125646,LukeGude,Classic TH NEIPA. Overflowing head and bouquet...


In [7]:
# Extract rows with reviews
final_df = mid_df[mid_df['og_review'] != '']
final_df.head()

Unnamed: 0,beer_id,username,og_review
0,271781,bluejacket74,"750 ml bottle, 2016 vintage, bottle #304 of 36..."
3,125646,GratefulBeerGuy,0% 16 oz can. Funny story: As I finally walked...
4,125646,LukeGude,Classic TH NEIPA. Overflowing head and bouquet...
7,125646,MFMB,Pours a creamy opaque light straw yellow with ...
13,125646,jngrizzaffi,Pours a cloudy yellow color with a thin foamy ...


### Final data to contain beer_id, username, review

In [8]:
# List all possible punctuations
punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''

# For each row, remove punctuations, handle case and split on spaces to convert individual words to elements
final_df['review'] = [''.join(character for character in strings if character not in punctuations).lower().split(" ") for strings in final_df['og_review']]

# Remove column with extraneous string characters
final_df = final_df.drop(['og_review'], axis = 1)
final_df.head(20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['review'] = [''.join(character for character in strings if character not in punctuations).lower().split(" ") for strings in final_df['og_review']]


Unnamed: 0,beer_id,username,review
0,271781,bluejacket74,"[750, ml, bottle, 2016, vintage, bottle, 304, ..."
3,125646,GratefulBeerGuy,"[0, 16, oz, can, funny, story, as, i, finally,..."
4,125646,LukeGude,"[classic, th, neipa, overflowing, head, and, b..."
7,125646,MFMB,"[pours, a, creamy, opaque, light, straw, yello..."
13,125646,jngrizzaffi,"[pours, a, cloudy, yellow, color, with, a, thi..."
15,125646,PDOR1960,"[another, great, brew, from, treehouse]"
19,125646,Lucular,"[pours, with, a, cloudy, yelloworange, haze, w..."
28,205644,Brutaltruth,"[from, the, tall, boy, can, and, enjoyed, in, ..."
32,205644,secondtooth,"[appearance, pours, a, deep, black, with, tan,..."
36,150672,Derek,"[beautiful, crystal, clear, pour, with, a, nic..."


In [9]:
print('The number of rows are: ' + str(len(final_df.index)))

The number of rows are: 2987991


In [10]:
final_df.to_csv('../data/csv/cleaned_reviews.csv', index = False)

In [12]:
final_df.to_hdf('../data/csv/cleaned_reviews.hdf', key = 'df', mode = 'w')

### Create document for each beer_id

In [74]:
# Sample 2000 beers, drop column username and sort by beer_id
df = pd.read_csv('../data/csv/cleaned_reviews.csv').head(2000).drop(['username'], axis =1 ).sort_values('beer_id')
df

Unnamed: 0,beer_id,review
325,47604,"['hazy', 'amber', 'with', 'a', 'goodsize', 'tw..."
395,47604,"['a=a', 'light', 'golden', 'color', 'with', 'f..."
396,47604,"['a', 'a', 'nice', 'copperyorange', 'color', '..."
397,47604,"['i', 'brought', 'this', 'to', 'the', 'inaugur..."
398,47604,"['rye', 'ipa', 'hopped', 'with', 'citra', 'pou..."
...,...,...
487,367068,"['saisons', 'can', 'encompass', 'a', 'wide', '..."
238,367910,"['lhazy', 'nectar12', 'inch', 'head', 'good', ..."
1856,368662,"['had', 'on', 'draft', 'at', 'breweries', 'var..."
486,369858,"['tasted', 'in', 'a', 'tall', 'tulip', 'from',..."


In [79]:
print(str(df['beer_id'].nunique()) + ' unique beers')

232 unique beers


In [80]:
df.iloc[0:7]

Unnamed: 0,beer_id,review
325,47604,"['hazy', 'amber', 'with', 'a', 'goodsize', 'tw..."
395,47604,"['a=a', 'light', 'golden', 'color', 'with', 'f..."
396,47604,"['a', 'a', 'nice', 'copperyorange', 'color', '..."
397,47604,"['i', 'brought', 'this', 'to', 'the', 'inaugur..."
398,47604,"['rye', 'ipa', 'hopped', 'with', 'citra', 'pou..."
399,47604,"['bottle', 'into', 'a', 'pint', 'glass', 'got'..."
400,47604,"['been', 'seeing', 'this', 'beer', 'on', 'shel..."


In [81]:
df['review'].iloc[0]

"['hazy', 'amber', 'with', 'a', 'goodsize', 'two', 'inch', 'plus', 'head', 'on', 'the', 'top', 'decent', 'retention', 'as', 'it', 'stayed', 'around', 'bitter', 'citrus', 'hops', 'tones', 'of', 'grapefruit', 'and', 'tangerine', 'dominated', 'the', 'brew', 'not', 'much', 'room', 'for', 'anything', 'else', 'in', 'there', 'a', 'bit', 'of', 'sugar', 'orange', 'peel', 'huge', 'hoppy', 'bitter', 'bite', 'right', 'out', 'of', 'the', 'gate', 'good', 'dry', 'finish', 'lots', 'of', 'resiney', 'grapefruit', 'tones', 'throughout', 'a', 'touch', 'of', 'sugar', 'trying', 'to', 'balance', 'out', 'the', 'hops', 'but', 'it', 'was', 'not', 'really', 'going', 'anywhere', 'huge', 'hops', 'huge', 'bitter', 'big', 'ipa']"

In [82]:
# Concatenate the rows by beer_id
df = df.groupby(['beer_id'])['review'].apply(lambda x: ','.join(x))
df

beer_id
47604     ['hazy', 'amber', 'with', 'a', 'goodsize', 'tw...
47606     ['had', 'some', 'of', 'this', 'on', 'tap', 'la...
47630     ['day', 'two', 'at', 'nerax', '2009', 'from', ...
47655     ['a', 'this', 'poured', 'a', 'clear', 'straw',...
47658     ['very', 'dark', 'appearance', 'with', 'tan', ...
                                ...                        
367068    ['saisons', 'can', 'encompass', 'a', 'wide', '...
367910    ['lhazy', 'nectar12', 'inch', 'head', 'good', ...
368662    ['had', 'on', 'draft', 'at', 'breweries', 'var...
369858    ['tasted', 'in', 'a', 'tall', 'tulip', 'from',...
370956    ['taste', 'crisp', 'semi', 'sweet', 'caramel',...
Name: review, Length: 232, dtype: object

In [85]:
# Verify the concatenation
df.loc[47604]

"['hazy', 'amber', 'with', 'a', 'goodsize', 'two', 'inch', 'plus', 'head', 'on', 'the', 'top', 'decent', 'retention', 'as', 'it', 'stayed', 'around', 'bitter', 'citrus', 'hops', 'tones', 'of', 'grapefruit', 'and', 'tangerine', 'dominated', 'the', 'brew', 'not', 'much', 'room', 'for', 'anything', 'else', 'in', 'there', 'a', 'bit', 'of', 'sugar', 'orange', 'peel', 'huge', 'hoppy', 'bitter', 'bite', 'right', 'out', 'of', 'the', 'gate', 'good', 'dry', 'finish', 'lots', 'of', 'resiney', 'grapefruit', 'tones', 'throughout', 'a', 'touch', 'of', 'sugar', 'trying', 'to', 'balance', 'out', 'the', 'hops', 'but', 'it', 'was', 'not', 'really', 'going', 'anywhere', 'huge', 'hops', 'huge', 'bitter', 'big', 'ipa'],['a=a', 'light', 'golden', 'color', 'with', 'faint', 'amber', 'hues', 'one', 'finger', 'head', 'with', 'subpar', 'retention', 'and', 'lacing', 's=subdued', 'caramel', 'malt', 'and', 'rye', 'not', 'strong', 'at', 'all', 'definitely', 'the', 'low', 'point', 'of', 'this', 'beer', 'the', 'rye', 

In [86]:
df.to_csv('../data/csv/beer_docs.csv', index = False)