## NLP

### The original dataset is available for download on [Kaggle](https://www.kaggle.com/ehallmar/beers-breweries-and-beer-reviews#reviews.csv). 

### The following steps demonstrate the data cleaning process for NLP use. Only entries with text reviews were preserved.

In [11]:
import pandas as pd
import numpy as np
import os
import platform
import ipywidgets
import re
import sklearn
from sklearn.feature_extraction import stop_words
from scipy.sparse import vstack
from sklearn.feature_extraction.text import HashingVectorizer
import joblib

In [20]:
print("System")
print("OS name: %s" % os.name)
print("System: %s" % platform.system())
print("Release: %s" % platform.release())
print()
print("Python")
print("version: %s" % python_version())
print()
print("Python Packages")
print("pandas == %s" % pd.__version__)
print("numpy == %s" % np.__version__)
print("sklearn == %s" % sklearn.__version__)
print("jupyter == 1.0.0")

System
OS name: posix
System: Darwin
Release: 17.7.0

Python
version: 3.8.2

Python Packages
pandas == 1.0.3
numpy == 1.18.3
sklearn == 0.22.2.post1
jupyter == 1.0.0


## Data Cleaning

In [2]:
df = pd.read_csv('../data/csv/reviews.csv')

# Displays the number of rows in df
print('The number of rows are: ' + str(len(df.index)))

The number of rows are: 9073128


In [3]:
df.head(5)

Unnamed: 0,beer_id,username,date,text,look,smell,taste,feel,overall,score
0,271781,bluejacket74,2017-03-17,"750 ml bottle, 2016 vintage, bottle #304 of...",4.0,4.0,4.0,4.25,4.0,4.03
1,125646,_dirty_,2017-12-21,,4.5,4.5,4.5,4.5,4.5,4.5
2,125646,CJDUBYA,2017-12-21,,4.75,4.75,4.75,4.75,4.75,4.75
3,125646,GratefulBeerGuy,2017-12-20,0% 16 oz can. Funny story: As I finally wal...,4.75,4.75,4.5,4.5,4.5,4.58
4,125646,LukeGude,2017-12-20,Classic TH NEIPA. Overflowing head and bouq...,4.25,4.5,4.25,4.25,4.25,4.31


In [4]:
# Drop columns
df = df.drop(columns = ['date','username','look','smell','taste','feel','overall','score'])
df.head()

Unnamed: 0,beer_id,text
0,271781,"750 ml bottle, 2016 vintage, bottle #304 of..."
1,125646,
2,125646,
3,125646,0% 16 oz can. Funny story: As I finally wal...
4,125646,Classic TH NEIPA. Overflowing head and bouq...


In [5]:
# list comprehension to strip each row of extraneous characters
df['og_review'] = [x.strip(" \xa0") for x in df['text']]

In [6]:
# drop unwanted column
df = df.drop(columns=['text'])
df.head()

Unnamed: 0,beer_id,og_review
0,271781,"750 ml bottle, 2016 vintage, bottle #304 of 36..."
1,125646,
2,125646,
3,125646,0% 16 oz can. Funny story: As I finally walked...
4,125646,Classic TH NEIPA. Overflowing head and bouquet...


In [7]:
# Extract rows with reviews
df = df[df['og_review'] != '']
df.head()

Unnamed: 0,beer_id,og_review
0,271781,"750 ml bottle, 2016 vintage, bottle #304 of 36..."
3,125646,0% 16 oz can. Funny story: As I finally walked...
4,125646,Classic TH NEIPA. Overflowing head and bouquet...
7,125646,Pours a creamy opaque light straw yellow with ...
13,125646,Pours a cloudy yellow color with a thin foamy ...


In [9]:
# Rename column
df = df.rename(columns={"og_review": "text"})
df.head()

Unnamed: 0,beer_id,text
0,271781,"750 ml bottle, 2016 vintage, bottle #304 of 36..."
3,125646,0% 16 oz can. Funny story: As I finally walked...
4,125646,Classic TH NEIPA. Overflowing head and bouquet...
7,125646,Pours a creamy opaque light straw yellow with ...
13,125646,Pours a cloudy yellow color with a thin foamy ...


## Grouping

In [10]:
# Group by beer_id
agg_df = df.groupby("beer_id")["text"].sum()
agg_df

beer_id
3         The label is very informative, except it didn'...
4         No dating. Had this one in the fridge, then on...
5         Beautiful beer. Light and tasty.I travel all o...
6         great brown ale...one of my favorites.Might ha...
7         The labeling with the purple haze guy with the...
                                ...                        
373112    kind of thin(ner).... but good! actually taste...
373116                                             A11, A12
373121                                                  A14
373122    creamy head. fruits in the aroma. slightly ora...
373128    Buried deep on Sleeman’s website. Actually not...
Name: text, Length: 210311, dtype: object

In [46]:
print(str(len(agg_df.index)) + ' unique beers')

210311 unique beers


## Save df aggregated by beer_id

In [11]:
# agg_df = agg_df.reset_index()
# agg_df.to_csv('../data/csv/agg_beer_docs.csv', index = False)

In [3]:
# df = pd.read_csv('../data/csv/agg_beer_docs.csv')
# df

Unnamed: 0,beer_id,text
0,3,"The label is very informative, except it didn'..."
1,4,"No dating. Had this one in the fridge, then on..."
2,5,Beautiful beer. Light and tasty.I travel all o...
3,6,great brown ale...one of my favorites.Might ha...
4,7,The labeling with the purple haze guy with the...
...,...,...
210306,373112,kind of thin(ner).... but good! actually taste...
210307,373116,"A11, A12"
210308,373121,A14
210309,373122,creamy head. fruits in the aroma. slightly ora...


## Parallel Hashing Vectorizer

In [5]:
class ParallelHashingVectorizer(HashingVectorizer):
    
    def __init__(self, n_jobs=1, **kwargs):
        super().__init__(**kwargs)
        self.n_jobs = n_jobs
    
    def transform(self, X, y=None, **fit_params):
        
        delayed_hashing_vectorizer = delayed(super().transform)
        
        X_parts = np.array_split(X, effective_n_jobs(self.n_jobs))
        
        X_parts_transformed = Parallel(n_jobs=effective_n_jobs(self.n_jobs))(delayed_hashing_vectorizer(X_part) for X_part in X_parts)
        
        X_transformed = vstack(X_parts_transformed)
        
        return X_transformed

In [6]:
%%time

hv = ParallelHashingVectorizer(
    n_jobs = -1,
    alternate_sign = False
)

df_iter = pd.read_csv('../data/csv/agg_beer_docs.csv', chunksize = 10000)

X_vects = []

for df in df_iter:
    X = df['text'].values
    X_vect = hv.fit_transform(X)
    X_vects.append(X_vect)

X_vect_all = vstack(X_vects)

CPU times: user 25.8 s, sys: 12.4 s, total: 38.3 s
Wall time: 4min 39s


In [7]:
print(X_vect_all)

  (0, 1907)	0.026108681666265243
  (0, 5230)	0.026108681666265243
  (0, 15860)	0.026108681666265243
  (0, 25602)	0.026108681666265243
  (0, 39493)	0.052217363332530485
  (0, 44771)	0.026108681666265243
  (0, 65517)	0.026108681666265243
  (0, 70406)	0.13054340833132622
  (0, 77592)	0.026108681666265243
  (0, 97469)	0.07832604499879572
  (0, 127771)	0.026108681666265243
  (0, 129237)	0.026108681666265243
  (0, 139455)	0.026108681666265243
  (0, 142771)	0.052217363332530485
  (0, 143803)	0.026108681666265243
  (0, 144749)	0.1827607716638567
  (0, 154172)	0.026108681666265243
  (0, 170062)	0.10443472666506097
  (0, 174171)	0.10443472666506097
  (0, 174974)	0.026108681666265243
  (0, 179826)	0.026108681666265243
  (0, 180233)	0.026108681666265243
  (0, 180525)	0.3655215433277134
  (0, 186465)	0.026108681666265243
  (0, 204878)	0.026108681666265243
  :	:
  (210307, 193869)	0.7071067811865475
  (210308, 167377)	1.0
  (210309, 154172)	0.2581988897471611
  (210309, 240984)	0.2581988897471611
  

In [10]:
# Save using joblib
joblib.dump(X_vect_all, '../data/nlp_beer.pk1', compress =1)

['../data/nlp_beer.pk1']

In [44]:
# cv = make_pipeline(
#     CountVectorizer(
#             ngram_range=(3, 7),
#             analyzer="char"
#         ),
#     Normalizer()
# )
# cv.fit(X_sparse)
# X = cv.transform(X_sparse)

In [33]:
# Use cosine similarity to compare every beer to each other
# def search(term):
#     X_term = cv.transform([term])
#     simularities = cosine_similarity(X_term, X)
#     idxmax = np.argmax(simularities[0])
#     return df.loc[idxmax]

In [34]:
# if __name__ == "__main__":
#     term = "pine needles"
#     print(search(term))