Picking up from Sebastian's scraped code output.

In [1]:
# Library Imports
import spacy
import en_core_web_lg
import pandas as pd
import nltk
from nltk.corpus import stopwords
from string import punctuation
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
nlp = en_core_web_lg.load() #just import once or will take some time
!pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()



## PART B

In [2]:
# Smoke test: run the loaded spaCy pipeline on a noisy sample string and display the resulting Doc.
doc = nlp("Whats up , MSBA rocks!!!>cld??.")
doc

Whats up , MSBA rocks!!!>cld??.

In [3]:
# Read the scraped reviews and discard the 'Unnamed: 0' column
# (presumably a row index saved by an earlier export - confirm against the scraper).
raw = pd.read_csv(
    "https://raw.githubusercontent.com/coldonline/TextAnalytics/master/HW2/reviews_text.csv"
).drop(columns=['Unnamed: 0'])
# Keep a local snapshot of the scrape, then preview it.
raw.to_csv("raw_scrape.csv")
raw.head()

Unnamed: 0,product_name,product_review,user_rating
0,Kentucky Brunch Brand Stout,"2016 Silver Wax. Aroma has whiskey, maple, tof...",4.8
1,Kentucky Brunch Brand Stout,The beer pours Pitch Black with a frothy tan h...,4.74
2,Kentucky Brunch Brand Stout,Probably the smoothest beer I have ever had. S...,4.68
3,Kentucky Brunch Brand Stout,"Dark black, very thick, a little bit of tan he...",5.0
4,Kentucky Brunch Brand Stout,Poured black as ink with thin ruby edges at 58...,4.97


In [4]:
# DATA CLEANSING FUNCTIONS

def remove_space(s):
    """Return ``s`` with every newline character replaced by a single space."""
    pieces = s.split("\n")
    return " ".join(pieces)

# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuation)

def removepunc(item):
    """Delete all ASCII punctuation from ``item`` and strip leading whitespace.

    Punctuation is removed first and the leading whitespace is stripped
    afterwards, so whitespace that only becomes "leading" once a punctuation
    character is deleted (e.g. ``"~ hi"``) is removed as well. Trailing
    whitespace is left untouched.

    Args:
        item: the string to clean.

    Returns:
        The cleaned string. Characters are deleted, not replaced by a space,
        so text such as ``"caramel/maple"`` becomes ``"caramelmaple"``.
    """
    return item.translate(_PUNCTUATION_TABLE).lstrip()

def lowerize(x):
    """Return the lower-cased form of ``x`` by calling ``x.lower()``."""
    lowered = x.lower()
    return lowered

# NLTK's English stop-word list, stored as a set for membership tests.
stop_words = set(stopwords.words('english'))

def remove_stopwords(s):
    """Return the items of ``s`` that are not in ``stop_words``, in their original order."""
    kept = []
    for w in s:
        if w in stop_words:
            continue
        kept.append(w)
    return kept

def lematize(l, lemmatizer=None):
    """Lemmatize every word in ``l``.

    Args:
        l: iterable of word strings.
        lemmatizer: any object exposing ``lemmatize(word)``. Defaults to a new
            ``WordNetLemmatizer``, so the function no longer depends on the
            ``wordnet_lemmatizer`` global that is only created in a later cell
            (calling it before that cell ran raised NameError).

    Returns:
        A list with the lemmatized form of each word, in input order.
    """
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in l]

In [5]:
# Work on a copy so the raw scrape stays untouched.
df = raw.copy()
print(df.shape)

# Remove incomplete rows; comparing the two printed shapes shows how many were lost.
df = df.dropna()
print(df.shape)

# Normalise the review text: newlines -> spaces, punctuation removed, lower-cased.
df['product_review'] = df['product_review'].apply(
    lambda review: lowerize(removepunc(remove_space(review)))
)
wordnet_lemmatizer = WordNetLemmatizer()
# Distinct non-stop-words of each review, used for the word-frequency count.
df['review_words'] = df['product_review'].str.split().apply(
    lambda words: remove_stopwords(list(set(words)))
)
df.head()

(6214, 3)
(6214, 3)


Unnamed: 0,product_name,product_review,user_rating,review_words
0,Kentucky Brunch Brand Stout,2016 silver wax aroma has whiskey maple toffee...,4.8,"[umami, maple, aroma, wood, silver, coffee, bi..."
1,Kentucky Brunch Brand Stout,the beer pours pitch black with a frothy tan h...,4.74,"[maple, pours, syrup, tan, tastes, coffee, smo..."
2,Kentucky Brunch Brand Stout,probably the smoothest beer i have ever had sm...,4.68,"[caramelmaple, maple, overall, mostly, tasted,..."
3,Kentucky Brunch Brand Stout,dark black very thick a little bit of tan head...,5.0,"[letdown, mouthfeel, maple, best, bad, tan, am..."
4,Kentucky Brunch Brand Stout,poured black as ink with thin ruby edges at 58...,4.97,"[lasting, cherries, full, syrup, coffee, ink, ..."


In [6]:
# Flatten the per-review word lists into one bag of words.
# Iterating over the column's values (instead of looking rows up by label with
# df['review_words'][i]) does not require the index to be exactly 0..n-1, so
# this keeps working when dropna() has removed rows.
word_bag = [word for words in df['review_words'] for word in words]

# Each review's list was de-duplicated when it was built, so a word's count
# is the number of reviews that mention it.
word_freq = nltk.FreqDist(word_bag)

In [7]:
len(word_freq) # number of distinct words (vocabulary size) counted across all reviews

23894

In [8]:
# Tabulate every word with its count, most frequent first. most_common() with no
# argument returns all entries; the previous hard-coded limit of 22337 was below
# the vocabulary size (23894 in the recorded run) and silently dropped the rarest words.
rslt = pd.DataFrame(word_freq.most_common(),
                    columns=['Word', 'Frequency'])
rslt.to_csv("beer_review_wordcount.csv")
#some words like 'balanced', 'crisp' , 'robust' seem to be mixing with other words in the file, but that doesn't bother us 
#since we care only about existence than perfect counts
# Preview goes last so the notebook actually displays it.
rslt.head()

# PART C

In [9]:
# Chosen attribute words: 'balanced', 'crisp' and 'robust' - they seem fairly prevalent in the data

#spacy implementation


def process_text(text):
    """Normalise ``text`` before similarity scoring.

    The text is lower-cased and run through the loaded spaCy pipeline ``nlp``.
    Tokens whose text is in ``nlp.Defaults.stop_words`` and tokens flagged as
    punctuation are dropped; the lemmas of the remaining tokens are returned
    joined by single spaces. Pronoun lemmas are not filtered out.
    """
    spacy_stop_words = nlp.Defaults.stop_words
    lemmas = [
        token.lemma_
        for token in nlp(text.lower())
        if token.text not in spacy_stop_words and not token.is_punct
    ]
    return " ".join(lemmas)

def calculate_similarity(text1, text2):
    """Return the spaCy similarity of two texts after cleaning both with ``process_text``."""
    base, compare = (nlp(process_text(piece)) for piece in (text1, text2))
    return base.similarity(compare)


In [11]:
# Similarity of every review to each attribute word. This chunk might take some time.
# The attribute docs are built once and each review is parsed once; the original
# loop re-parsed the review three times and rebuilt the attribute docs on every row.
attribute_docs = {
    'similarity_crisp': nlp("crisp"),
    'similarity_bal': nlp("balanced"),
    'similarity_rob': nlp("robust"),
}

similarity_scores = {column: [] for column in attribute_docs}
# Iterate over the values rather than index labels, so a non-contiguous index
# (e.g. after dropna() removed rows) cannot raise KeyError.
for review in df['product_review']:
    review_doc = nlp(process_text(review))
    for column, attribute_doc in attribute_docs.items():
        similarity_scores[column].append(review_doc.similarity(attribute_doc))

# Assigning whole lists creates float columns directly, instead of int columns
# initialised to 0 that are then overwritten cell by cell with floats.
for column, scores in similarity_scores.items():
    df[column] = scores
df.head()

Unnamed: 0,product_name,product_review,user_rating,review_words,similarity_crisp,similarity_bal,similarity_rob
0,Kentucky Brunch Brand Stout,2016 silver wax aroma has whiskey maple toffee...,4.8,"[umami, maple, aroma, wood, silver, coffee, bi...",0.539329,0.418854,0.312164
1,Kentucky Brunch Brand Stout,the beer pours pitch black with a frothy tan h...,4.74,"[maple, pours, syrup, tan, tastes, coffee, smo...",0.598273,0.440048,0.340116
2,Kentucky Brunch Brand Stout,probably the smoothest beer i have ever had sm...,4.68,"[caramelmaple, maple, overall, mostly, tasted,...",0.541582,0.539612,0.449008
3,Kentucky Brunch Brand Stout,dark black very thick a little bit of tan head...,5.0,"[letdown, mouthfeel, maple, best, bad, tan, am...",0.55154,0.486125,0.384138
4,Kentucky Brunch Brand Stout,poured black as ink with thin ruby edges at 58...,4.97,"[lasting, cherries, full, syrup, coffee, ink, ...",0.615404,0.516429,0.422285


In [12]:
# Combined preference score: the plain average of the three attribute similarities.
# The original cell computed this row mean but discarded the result.
df['similarity_score'] = (df['similarity_crisp'] + df['similarity_bal'] + df['similarity_rob']) / 3

# review_words was only needed for the word-frequency step. Rebinding with
# errors='ignore' keeps this cell re-runnable once the column is already gone.
df = df.drop(columns=['review_words'], errors='ignore')

In [None]:
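# NOTE: this export is disabled and, as written, would not produce the next cell's input:
# `data` is not defined until the next cell, which reads 'output_partC.xlsx' rather than this CSV.
# data.to_csv('output_partC.csv')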

## PART D

In [13]:
# Load the reduced review set (per the original note: the 300 reviews with the
# highest similarity scores) and discard its 'Unnamed: 0' column.
data = pd.read_excel('output_partC.xlsx').drop(columns=['Unnamed: 0'])
data.head()

Unnamed: 0,product_name,product_review,similarity_score
0,Breakfast Stout,good color no head clean and balanced with an ...,0.615617
1,Flora Plum,2018 releasea pours a very light pale hazy kin...,0.611942
2,Abricot Du Fermier,this is the best sour saison i have ever had n...,0.608264
3,King Sue,the emphasis in the beer is a strong sharp hop...,0.599622
4,Flora,flora is clean crisp light and tart with smoot...,0.599248


In [14]:
def sentiment_analyzer_scores(sentence):
    """Print ``sentence`` padded with dashes to 40 characters, followed by its polarity scores.

    The scores are the dict returned by ``analyser.polarity_scores``. Nothing is returned.
    """
    polarity = analyser.polarity_scores(sentence)
    line_template = "{:-<40} {}"
    print(line_template.format(sentence, str(polarity)))

In [15]:
# Sanity-check the analyser on a negative, a positive and a lukewarm sentence.
for sample_sentence in (
    "I hate this product.",
    "I love this product!",
    "I think this product is okay.",
):
    sentiment_analyzer_scores(sample_sentence)

I hate this product.-------------------- {'neg': 0.649, 'neu': 0.351, 'pos': 0.0, 'compound': -0.5719}
I love this product!-------------------- {'neg': 0.0, 'neu': 0.308, 'pos': 0.692, 'compound': 0.6696}
I think this product is okay.----------- {'neg': 0.0, 'neu': 0.678, 'pos': 0.322, 'compound': 0.2263}


In [16]:
# Score each review once and fan the result out into four columns; the original
# re-ran the analyser on every review four times, once for each key.
polarity = data['product_review'].apply(analyser.polarity_scores)
for key in ('neg', 'neu', 'pos', 'compound'):
    # key=key binds the current key to the lambda at definition time.
    data['sentiment_' + key] = polarity.apply(lambda scores, key=key: scores[key])

In [17]:
# Preview the first rows, now including the four sentiment_* columns.
data.head()

Unnamed: 0,product_name,product_review,similarity_score,sentiment_neg,sentiment_neu,sentiment_pos,sentiment_compound
0,Breakfast Stout,good color no head clean and balanced with an ...,0.615617,0.053,0.413,0.534,0.9628
1,Flora Plum,2018 releasea pours a very light pale hazy kin...,0.611942,0.023,0.815,0.162,0.923
2,Abricot Du Fermier,this is the best sour saison i have ever had n...,0.608264,0.0,0.728,0.272,0.9565
3,King Sue,the emphasis in the beer is a strong sharp hop...,0.599622,0.0,0.848,0.152,0.7906
4,Flora,flora is clean crisp light and tart with smoot...,0.599248,0.0,0.909,0.091,0.4019


## PART E

In [18]:
# Average every numeric column per beer. numeric_only=True skips the free-text
# product_review column, which pandas >= 2.0 otherwise refuses to average (TypeError).
recomdat = data.groupby(by = 'product_name').mean(numeric_only=True)

# Beer with the highest mean compound sentiment.
recbestpos = recomdat.sort_values(by = ['sentiment_compound'], axis = 0, ascending = False)[:1]
# Beer with the highest mean positive sentiment. Despite the variable's name,
# this ranks by sentiment_pos (descending), not by sentiment_neg.
reclowestneg = recomdat.sort_values(by = ['sentiment_pos'], axis = 0, ascending = False)[:1]
# Beer with the highest mean similarity score.
preferencebeer = recomdat.sort_values(by = ['similarity_score'], axis = 0, ascending = False)[:1]

In [19]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the three
# one-row frames in the same order as the original chained appends.
Recommended = pd.concat([recbestpos, reclowestneg, preferencebeer])
Recommended

Unnamed: 0_level_0,similarity_score,sentiment_neg,sentiment_neu,sentiment_pos,sentiment_compound
product_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Black & Wild,0.55872,0.006,0.724,0.271,0.9975
Chemtrailmix,0.557522,0.0,0.547,0.453,0.948
Melcher Street (Double Dry-Hopped),0.597064,0.042,0.807,0.151,0.9525


Our recommended beers are the beers with the highest average compound sentiment, the highest average positive sentiment and the highest combined similarity score of the three important attributes. This provides recommendations to try the two best rated beers by compound sentiment and positive sentiment for someone seeking the best tasting beers in this category, as well as a recommendation for the person seeking a beer with the highest score against the predefined attributes, one that focuses on meeting their preferences for crisp, balanced and robust beer.

## PART F

#### Check top rated beers

In [20]:
# Mean user rating per beer, sorted from best to worst; show the top three.
(
    df.groupby("product_name")["user_rating"]
    .mean()
    .sort_values(ascending=False)
    .head(3)
)

product_name
Kentucky Brunch Brand Stout    4.812000
Chemtrailmix                   4.811176
Barrel-Aged Abraxas            4.796400
Name: user_rating, dtype: float64

In [42]:
# Combined score: the simple average of the three attribute similarities.
crisp, balanced, robust = (
    df[name] for name in ("similarity_crisp", "similarity_bal", "similarity_rob")
)
df["similarity_score"] = (crisp + balanced + robust) / 3

In [47]:
#create mask for top 3 beers by average score and calc their similarity scores
# The mask is built from df only. The original mixed df and raw columns, which
# is only correct while both frames share exactly the same index.
top_rated_beers = ["Kentucky Brunch Brand Stout", "Chemtrailmix", "Barrel-Aged Abraxas"]
mask = df["product_name"].isin(top_rated_beers)
df[mask].groupby(["product_name"])["similarity_score"].mean().sort_values(ascending=False)

product_name
Chemtrailmix                   0.502364
Barrel-Aged Abraxas            0.495141
Kentucky Brunch Brand Stout    0.476202
Name: similarity_score, dtype: float64

#### Check if they appear in top 300 reviews

In [59]:
# Mean similarity score of Chemtrailmix among the reviews in `data`.
mask = data["product_name"].eq("Chemtrailmix")
data.loc[mask].groupby("product_name")["similarity_score"].mean()

product_name
Chemtrailmix    0.557522
Name: similarity_score, dtype: float64

In [60]:
# Mean similarity score of Barrel-Aged Abraxas among the reviews in `data`.
mask = data["product_name"].eq("Barrel-Aged Abraxas")
data.loc[mask].groupby("product_name")["similarity_score"].mean()

product_name
Barrel-Aged Abraxas    0.582997
Name: similarity_score, dtype: float64

In [61]:
# Mean similarity score of Kentucky Brunch Brand Stout among the reviews in `data`
# (an empty result means the beer has no review in that set).
mask = data["product_name"].eq("Kentucky Brunch Brand Stout")
data.loc[mask].groupby("product_name")["similarity_score"].mean()

Series([], Name: similarity_score, dtype: float64)

#### Compare missing beer with lowest attribute scores of beers in top 300 reviews across the entire scraped set of reviews

In [62]:
#compare with lowest similarity score from above analysis
# Stored under its own name: reusing `preferencebeer` here would overwrite the
# highest-similarity pick from PART E, so re-running the recommendation cell
# afterwards would silently recommend the lowest-scoring beer instead.
lowest_similarity_beer = recomdat.sort_values(by = ['similarity_score'], axis = 0, ascending = True)[:1]
lowest_similarity_beer

Unnamed: 0_level_0,similarity_score,sentiment_neg,sentiment_neu,sentiment_pos,sentiment_compound
product_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3rd Anniversary Imperial IPA,0.556869,0.0,0.716,0.284,0.9337


In [63]:
# Mean similarity score of 3rd Anniversary Imperial IPA across all reviews in `df`.
mask = df["product_name"].eq("3rd Anniversary Imperial IPA")
df.loc[mask].groupby("product_name")["similarity_score"].mean()

product_name
3rd Anniversary Imperial IPA    0.49047
Name: similarity_score, dtype: float64

#### Conclusion

Two of the three top-rated beers, Chemtrailmix and Barrel-Aged Abraxas, appear in the top 300 reviews based on similarity scores. Chemtrailmix, which has the second-highest average rating across the scraped data, is also one of the beers recommended in PART E. Therefore these two beers do meet user requirements.

Kentucky Brunch Brand Stout does not meet the user requirements: it does not appear in the top 300 reviews, and its average similarity score across all scraped reviews (0.476) is below that of the lowest-scoring beer from the 300 reviews (3rd Anniversary Imperial IPA, 0.490 across all scraped reviews).