Picking up from Sebastian's scraped code output.

In [1]:
# Library Imports
import spacy
import en_core_web_lg
import pandas as pd
import nltk
from nltk.corpus import stopwords
from string import punctuation
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
nlp = en_core_web_lg.load() #just import once or will take some time

In [2]:
doc = nlp("Whats up , MSBA rocks!!!>cld??.")
doc

Whats up , MSBA rocks!!!>cld??.

In [3]:
raw = pd.read_csv("https://raw.githubusercontent.com/coldonline/TextAnalytics/master/HW2/reviews_text.csv")
raw.drop(['Unnamed: 0'], axis=1,inplace=True)
raw.to_csv("raw_scrape.csv")
raw.head()

Unnamed: 0,product_name,product_review,user_rating
0,Kentucky Brunch Brand Stout,"2016 Silver Wax. Aroma has whiskey, maple, tof...",4.8
1,Kentucky Brunch Brand Stout,The beer pours Pitch Black with a frothy tan h...,4.74
2,Kentucky Brunch Brand Stout,Probably the smoothest beer I have ever had. S...,4.68
3,Kentucky Brunch Brand Stout,"Dark black, very thick, a little bit of tan he...",5.0
4,Kentucky Brunch Brand Stout,Poured black as ink with thin ruby edges at 58...,4.97


In [4]:
# DATA CLEANSING FUNCTIONS

def remove_space(s):
    return s.replace("\n"," ")

def removepunc(item):
    for p in punctuation:
        item = item.lstrip().replace(p,'')
    return item

def lowerize(x):
    return x.lower()

stop_words = set(stopwords.words('english'))

def remove_stopwords(s):
    return [w for w in s if not w in stop_words] 

def lematize(l):
    s=[]
    for i in l:
        s.append(wordnet_lemmatizer.lemmatize(i))
    return s

In [5]:
data = raw.copy()
print(data.shape)

data = data.dropna()
print(data.shape) #finding if there is any major loss due to this

#Cleaning reviews for calculating word freq
data['product_review'] = data['product_review'].apply(remove_space).apply(removepunc).apply(lowerize) 
wordnet_lemmatizer = WordNetLemmatizer()
data['review_words'] = data['product_review'].str.split().apply(set).apply(list).apply(remove_stopwords)
data.head()

(6214, 3)
(6214, 3)


Unnamed: 0,product_name,product_review,user_rating,review_words
0,Kentucky Brunch Brand Stout,2016 silver wax aroma has whiskey maple toffee...,4.8,"[silver, whiskey, taste, aftertaste, maple, 20..."
1,Kentucky Brunch Brand Stout,the beer pours pitch black with a frothy tan h...,4.74,"[smells, bottle, tan, bourbon, creams, frothy,..."
2,Kentucky Brunch Brand Stout,probably the smoothest beer i have ever had sm...,4.68,"[smoothest, taste, alcohol, smelled, bourbon, ..."
3,Kentucky Brunch Brand Stout,dark black very thick a little bit of tan head...,5.0,"[prepared, smells, chocolate, tan, fully, alco..."
4,Kentucky Brunch Brand Stout,poured black as ink with thin ruby edges at 58...,4.97,"[retention, complimenting, 58, maple, awesome,..."


In [6]:
word_bag = []
for i in range(data.shape[0]):
    word_bag += data['review_words'][i]

#word_bag

word_freq = nltk.FreqDist(word_bag)

In [7]:
len(word_freq) # number of words

23894

In [17]:
rslt = pd.DataFrame(word_freq.most_common(22337),              
                    columns=['Word', 'Frequency'])
rslt.head()
rslt.to_csv("beer_review_wordcount.csv")
#some words like 'balanced', 'crisp' , 'robust' seem to be mixing with other words in the file, but that doesn't bother us 
#since we care only about existence than perfect counts

Unnamed: 0,Word,Frequency
0,head,3207
1,beer,2748
2,taste,2097
3,sweet,1624
4,dark,1591


# PART C

In [8]:
# CHOOSING WORDS 'balanced' , 'crisp' , 'robust' they seem pretty prevelant in the data

#spacy implementation


def process_text(text):
    doc = nlp(text.lower())
    result = []
    for token in doc:
        if token.text in nlp.Defaults.stop_words:
            continue
        if token.is_punct:
            continue
#         if token.lemma_ == '-PRON-'
#             continue
        result.append(token.lemma_)
    return " ".join(result)

def calculate_similarity(text1, text2):
    base = nlp(process_text(text1))
    compare = nlp(process_text(text2))
    return base.similarity(compare)


In [None]:
# Not the cleanest piece of code, but can clean similarity calculation with above function later
# This chunk might take some time
data['similarity_crisp'] = 0
data['similarity_bal'] = 0
data['similarity_rob'] = 0

for i in range(data.shape[0]):
    data.loc[i,'similarity_crisp'] = nlp(process_text(data.product_review[i])).similarity(nlp("crisp"))
    data.loc[i,'similarity_bal'] = nlp(process_text(data.product_review[i])).similarity(nlp("balanced"))
    data.loc[i,'similarity_rob'] = nlp(process_text(data.product_review[i])).similarity(nlp("robust"))
data.head()

In [42]:
data[['similarity_crisp', 'similarity_bal','similarity_rob']].mean(axis=1)
data.drop(['review_words'], axis=1, inplace=True)

In [43]:
data.to_csv('output_partC.csv')