In [189]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
from wordcloud import WordCloud
from wordcloud import STOPWORDS
from langdetect import detect_langs
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv('ratebeer.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
df.head()

Unnamed: 0,name,id,brewerID,abv,style,appearance,aroma,palate,taste,overall,time,profileName,reviewtext
0,John Harvards Simcoe IPA,63836,8481,5.4,India Pale Ale &#40;IPA&#41;,5-Apr,10-Jun,5-Mar,10-Jun,13/20,1157587200,hopdog,"On tap at the Springfield, PA location. Poured..."
1,John Harvards Simcoe IPA,63836,8481,5.4,India Pale Ale &#40;IPA&#41;,5-Apr,10-Jun,5-Apr,10-Jul,13/20,1157241600,TomDecapolis,On tap at the John Harvards in Springfield PA....
2,John Harvards Cristal Pilsner,71716,8481,5.0,Bohemian Pilsener,5-Apr,10-May,5-Mar,10-Jun,14/20,958694400,PhillyBeer2112,"UPDATED: FEB 19, 2003 Springfield, PA. I've ne..."
3,John Harvards Fancy Lawnmower Beer,64125,8481,5.4,Kölsch,5-Feb,10-Apr,5-Feb,10-Apr,20-Aug,1157587200,TomDecapolis,On tap the Springfield PA location billed as t...
4,John Harvards Fancy Lawnmower Beer,64125,8481,5.4,Kölsch,5-Feb,10-Apr,5-Feb,10-Apr,20-Aug,1157587200,hopdog,"On tap at the Springfield, PA location. Poured..."


In [6]:
df_1 = df.sample(frac=0.25)

In [7]:
df_1.shape

(262144, 13)

In [9]:
df_2 = df.sample(frac=0.05)
df_2.shape

(52429, 13)

In [11]:
df_3 = df_2[['name', 'reviewtext']]

In [12]:
df_3.head()

Unnamed: 0,name,reviewtext
846466,Hydes Doctors Orders,"Cask conditioned @ Crispin, Wokingham. Dr No i..."
957132,McKenzie Doppelbock,"Enjoyed @ the pub. Poured a small tan head, fa..."
110111,Weeping Radish Fest Amber Lager,Oct. 2005 - A very ordinary fest. Pour was red...
462983,AleSmith Decadence 2010,Bottle poured reddish brown with small off whi...
127916,Bar Harbor True Blue Blueberry Ale,Sample from a bomber. Pours copper with a tan...


In [13]:
df_3 = df_3.rename(columns={'reviewtext': 'review'})

In [14]:
# length of the reviews
length_reviews = df_3.review.str.len()

In [15]:
df_3['length'] = length_reviews

In [16]:
df_3.head()

Unnamed: 0,name,review,length
846466,Hydes Doctors Orders,"Cask conditioned @ Crispin, Wokingham. Dr No i...",152.0
957132,McKenzie Doppelbock,"Enjoyed @ the pub. Poured a small tan head, fa...",424.0
110111,Weeping Radish Fest Amber Lager,Oct. 2005 - A very ordinary fest. Pour was red...,232.0
462983,AleSmith Decadence 2010,Bottle poured reddish brown with small off whi...,232.0
127916,Bar Harbor True Blue Blueberry Ale,Sample from a bomber. Pours copper with a tan...,197.0


In [28]:
avg_length = df_3.length.mean()
print('Average Length: {}'.format(avg_length))

Average Length: 309.505244254246


In [22]:
longest_review = df_3['length'].max()
shortest_review = df_3['length'].min()
print('Longest Review: {}'.format(longest_review))
print('Shortest Review: {}'.format(shortest_review))

Longest Review: 7656.0
Shortest Review: 4.0


In [27]:
df_4 = df_3.sort_values(by=['length'], ascending=False)
df_4.head()

Unnamed: 0,name,review,length
425780,Gordon Biersch Hefeweizen,A - poured a hazy golden color\t\tS - Dominant...,7656.0
788588,Orval,"UPDATED: MAR 28, 2008 Rerate: 26/03/2008, bott...",5369.0
225782,Allagash Black,The Allagash Brewery is a unique artisan brewh...,3831.0
323862,Snoqualmie Falls Summer Beer,"UPDATED: MAY 15, 2010 Snoqualmie Falls Brewing...",3688.0
1013377,North Coast Old Stock Ale,"UPDATED: JAN 18, 2008 2006 bottle, probably 6m...",3361.0


In [29]:
# Detect the language of the review

In [56]:
reviews = df_4.sample(frac=0.50)
reviews.head()

Unnamed: 0,name,review,length
389032,Shipyard Summer Ale,Doesnt look or smell like a summer ale. Too d...,265.0
913387,Molson Golden,"Not for me. Went down OK, but nice and fast. N...",140.0
884908,21st Amendment Bitter American,"12oz canned February 15th, 2011. Pours lightl...",485.0
27811,Warburger Export,Bottle 50 cl. Courtesy of fonefan. Pours golde...,182.0
662568,Cigar City Humidor Series Jai Alai Cedar Aged ...,"750mL bottle, courtesy of The Beer Wench. Than...",605.0


In [57]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26214 entries, 389032 to 821143
Data columns (total 3 columns):
name      26214 non-null object
review    26163 non-null object
length    26163 non-null float64
dtypes: float64(1), object(2)
memory usage: 819.2+ KB


In [58]:
review = str(reviews.review)

In [59]:
languages = []

for r in range(len(reviews)):
    languages.append(detect_langs(reviews.iloc[r, 0]))

In [60]:
languages = [str(lang).split(':')[0][1] for lang in languages]

In [61]:
reviews['lang'] = languages

In [62]:
reviews.head()

Unnamed: 0,name,review,length,lang
389032,Shipyard Summer Ale,Doesnt look or smell like a summer ale. Too d...,265.0,e
913387,Molson Golden,"Not for me. Went down OK, but nice and fast. N...",140.0,d
884908,21st Amendment Bitter American,"12oz canned February 15th, 2011. Pours lightl...",485.0,n
27811,Warburger Export,Bottle 50 cl. Courtesy of fonefan. Pours golde...,182.0,d
662568,Cigar City Humidor Series Jai Alai Cedar Aged ...,"750mL bottle, courtesy of The Beer Wench. Than...",605.0,e


In [144]:
# Removing non english reviews and Keeping only reviews in english

reviews_e = reviews.loc[lambda reviews: reviews['lang'] == 'e']
reviews_e

Unnamed: 0,name,review,length,lang
389032,Shipyard Summer Ale,Doesnt look or smell like a summer ale. Too d...,265.0,e
662568,Cigar City Humidor Series Jai Alai Cedar Aged ...,"750mL bottle, courtesy of The Beer Wench. Than...",605.0,e
824243,New Glarus Raspberry Tart,Grape juice was the first thing I thought when...,205.0,e
669903,Cigar City Cubano-Style Espresso Brown Ale,"Draught at Capones: Aroma of chocolate, malt,...",466.0,e
31049,Archibald Orchards Spiced Winter Apple,"UPDATED: AUG 13, 2008 [1328-20080504] Bottle. ...",702.0,e
...,...,...,...,...
340233,Grand Ridge Black & Tan,"UPDATED: MAY 6, 2004 this black and tan pours ...",401.0,e
589323,Westmalle Tripel,"UPDATED: DEC 12, 2006 Pours a hazy orange-yell...",303.0,e
133793,Badger Tanglefoot &#40;Bottle/Can&#41;,Bottle from Sainsburys. Amber. Aroma of citrus...,116.0,e
282536,High Water No Boundary IPA,"Bomber from The Davis Beer Shoppe in Davis, CA...",255.0,e


In [145]:
avg_length_e = reviews_e.length.mean()
longest_review_e = reviews_e['length'].max()
shortest_review_e = reviews_e['length'].min()
print('Average Length: {}'.format(avg_length_e))
print('Longest Review: {}'.format(longest_review_e))
print('Shortest Review: {}'.format(shortest_review_e))

Average Length: 321.94076680595265
Longest Review: 2780.0
Shortest Review: 7.0


In [148]:
reviews_e['review'] = reviews_e['review'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [161]:
reviews_e['sentiment'] = round(reviews_e['review'].apply(lambda x: TextBlob(x).sentiment[0]),2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [162]:
reviews_e

Unnamed: 0,name,review,length,lang,sentiment
389032,Shipyard Summer Ale,Doesnt look or smell like a summer ale. Too d...,265.0,e,0.14
662568,Cigar City Humidor Series Jai Alai Cedar Aged ...,"750mL bottle, courtesy of The Beer Wench. Than...",605.0,e,0.26
824243,New Glarus Raspberry Tart,Grape juice was the first thing I thought when...,205.0,e,0.37
669903,Cigar City Cubano-Style Espresso Brown Ale,"Draught at Capones: Aroma of chocolate, malt,...",466.0,e,0.16
31049,Archibald Orchards Spiced Winter Apple,"UPDATED: AUG 13, 2008 [1328-20080504] Bottle. ...",702.0,e,0.20
...,...,...,...,...,...
340233,Grand Ridge Black & Tan,"UPDATED: MAY 6, 2004 this black and tan pours ...",401.0,e,0.14
589323,Westmalle Tripel,"UPDATED: DEC 12, 2006 Pours a hazy orange-yell...",303.0,e,0.06
133793,Badger Tanglefoot &#40;Bottle/Can&#41;,Bottle from Sainsburys. Amber. Aroma of citrus...,116.0,e,0.22
282536,High Water No Boundary IPA,"Bomber from The Davis Beer Shoppe in Davis, CA...",255.0,e,0.39


In [178]:
reviews_e['sentiment'] = pd.cut(reviews_e['sentiment'], bins=3, labels=[1,0,2])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [179]:
reviews_e

Unnamed: 0,name,review,length,lang,sentiment
389032,Shipyard Summer Ale,Doesnt look or smell like a summer ale. Too d...,265.0,e,0
662568,Cigar City Humidor Series Jai Alai Cedar Aged ...,"750mL bottle, courtesy of The Beer Wench. Than...",605.0,e,0
824243,New Glarus Raspberry Tart,Grape juice was the first thing I thought when...,205.0,e,2
669903,Cigar City Cubano-Style Espresso Brown Ale,"Draught at Capones: Aroma of chocolate, malt,...",466.0,e,0
31049,Archibald Orchards Spiced Winter Apple,"UPDATED: AUG 13, 2008 [1328-20080504] Bottle. ...",702.0,e,0
...,...,...,...,...,...
340233,Grand Ridge Black & Tan,"UPDATED: MAY 6, 2004 this black and tan pours ...",401.0,e,0
589323,Westmalle Tripel,"UPDATED: DEC 12, 2006 Pours a hazy orange-yell...",303.0,e,0
133793,Badger Tanglefoot &#40;Bottle/Can&#41;,Bottle from Sainsburys. Amber. Aroma of citrus...,116.0,e,0
282536,High Water No Boundary IPA,"Bomber from The Davis Beer Shoppe in Davis, CA...",255.0,e,2


In [180]:
reviews_e['sentiment'].value_counts()

0    12024
2     1576
1       66
Name: sentiment, dtype: int64

In [181]:
sentiment = reviews_e.sentiment

In [195]:
review = reviews_e.review
positive = []

for i in range(len(reviews_e.sentiment)):
    if reviews_e.sentiment[i] == 2:
        positive.append(review[i])

KeyError: 0

In [190]:
vectorizer = TfidfVectorizer()
vectorizer.fit(reviews_e.review)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [191]:
reviews_bow = vectorizer.transform(reviews_e.review)