In [1]:
import pandas as pd
import sqlalchemy as sql
import spacy
from sql_functions import get_dataframe

### Keyword analysis with keyBERT

In [None]:
from keybert import KeyBERT

In [None]:
model = KeyBERT()

In [None]:
model.extract_keywords(text)

In [None]:
# Work with bigrams and trigrams

model.extract_keywords(text, keyphrase_ngram_range=(1, 2))

In [None]:
# Remove stop words from the text. Stop words are words that carry little meaning and are sometimes so ubiquitous in a text that they distort the results.

model.extract_keywords(text, keyphrase_ngram_range=(1, 2), stop_words='english')

In [None]:
# Highlight all the important words in a document

model.extract_keywords(text, highlight=True)

In [None]:
# Print the first 11 texts (indices 0-10) together with their extracted keyphrases
# of one to six words; stop words are kept (stop_words=None).
# NOTE(review): `t` is not defined in any cell visible here — confirm where it is loaded.
for index, text in enumerate(t['text']):
    if index > 10:
        break
    keywords = model.extract_keywords(text, keyphrase_ngram_range=(1, 6), stop_words=None)
    print(text, keywords)

### Sentiment analysis with spacytextblob plus keyword analysis with rake-spacy

https://importsem.com/evaluate-sentiment-analysis-in-bulk-with-spacy-and-python/

https://www.section.io/engineering-education/sentiment-analysis-with-spacy-and-scikit-learn/

https://spacy.io/universe/project/spacy-textblob

In [None]:
import pandas as pd

import spacy
from spacytextblob.spacytextblob import SpacyTextBlob

nlp = spacy.load('en_core_web_md')
nlp.add_pipe('spacytextblob')  # the `doc._.blob` attribute used below is read after this component is added
text = 'I had a really horrible day. It was the worst day ever!'

doc = nlp(text)
# doc._.blob.polarity                            # Polarity: 1.0 is very positive and -1.0 is very negative
# doc._.blob.subjectivity                        # Subjectivity: 0.0 is very objective and 1.0 is very subjective
doc._.blob.sentiment_assessments.assessments   # Assessments: one tuple per scored phrase, e.g. (['really', 'horrible'], -1.0, 1.0, None)
# doc._.blob.ngrams()

In [None]:
import pandas as pd

import spacy
from spacytextblob.spacytextblob import SpacyTextBlob

reviews = pd.read_csv('data/review_test.csv')

nlp = spacy.load('en_core_web_md')
nlp.add_pipe('spacytextblob')

# Score every review and keep the results. Previously the three values were
# evaluated inside the loop and thrown away, so the cell produced nothing.
sentiment_records = []

for text in reviews['text']:
    doc = nlp(text)
    sentiment_records.append({
        'polarity': doc._.blob.polarity,          # -1.0 (negative) ... 1.0 (positive)
        'subjectivity': doc._.blob.subjectivity,  # 0.0 (objective) ... 1.0 (subjective)
        'assessments': doc._.blob.sentiment_assessments.assessments,
    })
    # doc._.blob.ngrams()

# One row per review, aligned with `reviews` through the shared index.
reviews_sentiment = pd.DataFrame(sentiment_records, index=reviews.index)
reviews_sentiment.head()

https://spacy.io/universe/project/spacy-pytextrank

https://pypi.org/project/rake-spacy/

Remove stop-words with Spacy: https://machinelearningknowledge.ai/tutorial-for-stopwords-in-spacy/

### Keyword analysis with rake spacy on reviews/results sentiment Sining

In [2]:
import pandas as pd

reviews = pd.read_csv('data/review_pa_sentiment.csv')

In [3]:
# Split the raw `result` string on ', ' into a label part and a score part.
label_and_score = reviews['result'].str.split(', ', expand=True)
reviews[['sentiment_label', 'sentiment_score']] = label_and_score

# Split the label part once more on ' '; the first piece lands in a throwaway
# `delete` column and the second piece becomes the new `sentiment_label`.
prefix_and_label = reviews['sentiment_label'].str.split(' ', expand=True)
reviews[['delete', 'sentiment_label']] = prefix_and_label

In [4]:
# Replacements applied in order: model labels become readable names, then the
# remaining single quotes are stripped.
label_replacements = {
    'LABEL_2': 'Positive',
    'LABEL_1': 'Neutral',
    'LABEL_0': 'Negative',
    "'": "",
}

for raw_value, readable_value in label_replacements.items():
    reviews['sentiment_label'] = reviews['sentiment_label'].str.replace(raw_value, readable_value)

In [5]:
reviews.drop(columns=['delete', 'result', 'sentiment_score'], inplace=True, errors='ignore')

In [6]:
neg_reviews = reviews[reviews['sentiment_label'] == 'Negative']

In [7]:
neg_reviews

Unnamed: 0,business_id,review_id,text,sentiment_label
10,WbxPdq_PgVvCI462l0vPEQ,qfPqEn8hDJ5RpMay9jeDJQ,"Poor service, the drive through is open 24 hou...",Negative
16,8va8lpVU8aiQtnSbyjVScA,SsxNks2Kkhh6G1ulSU1fig,Does not really provide the real philly cheese...,Negative
21,o2i4AvqF6Q9lWK-wksxzuw,oXD4_mSslmhSzFBzzHhY3Q,Never going back. After visiting multiple time...,Negative
22,tdWrPT2XT2NpPtfQQhf08w,FqBl4Y9I9rA2QbFa1bQGbA,My sister and I went here for the all you can ...,Negative
33,Iw8uqNPxviwcgxtruAx_LA,RE9pOuEb0kYAzbgGPcMZmA,I'm only giving three stars because the food I...,Negative
...,...,...,...,...
1184867,mQO7LKZ1LHq0LLBO_Kg_rw,CXhYsqWe6CxYstBXPTgU3A,Ordered a Taco pizza tonight and it came with ...,Negative
1184868,lRpOWh8A7PlpDIkVEKpsQg,VHMHunKnQNjPgAizjUyclA,Just got a nyc style thin crust with pepperoni...,Negative
1184876,9_B5sCqKBOKDAmYpByiFFg,gGfFShzJ8PMfo3F-8PzUPA,"When we arrived at the restaurant, the hostess...",Negative
1184878,pqujXRfvvMSjcVLfzzKghQ,_9j5m3vmRLup_jRwdCDhYQ,Menu price for 12 wings is $7.95 which is not ...,Negative


In [9]:
# Concatenate the first 1000 negative reviews into one string. Each review is
# prefixed with a single space (so the text starts with a space), which gives
# exactly the same string as appending ' ' + review in a loop, but builds it
# in one pass instead of re-copying the growing string on every iteration.
reviews_text = ''.join(' ' + item for item in neg_reviews['text'][:1000])

In [None]:
reviews_text = reviews_text.lower()

In [None]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_lg')

# Pattern: an optional adverb, one or more adjectives, then one or more nouns.
# greedy='LONGEST' keeps the longest match where candidate matches overlap.
matcher = Matcher(nlp.vocab)
pattern = [{'POS': 'ADV', 'OP': '?'},{'POS': 'ADJ', 'OP': '+'}, {'POS': 'NOUN', 'OP': '+'}]
matcher.add('ADJECTIVE_NOUN', [pattern], greedy='LONGEST')
doc=nlp(reviews_text)
matches = matcher(doc)

# TODO: What about fuzzy matching?

In [None]:
print(len(matches))
for match in matches:
    print(match, doc[match[1]:match[2]])

In [None]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_lg')

# Pattern: a noun, a form of 'be', an adjective, and optionally a coordinating
# conjunction and/or a second adjective (e.g. "<noun> was <adj> and <adj>").
matcher = Matcher(nlp.vocab)
pattern = [
    {'POS': 'NOUN'},
    {'LEMMA': 'be'},  # Add 'taste' and 'seem' to the list? Yields 7 and 8 more results.
    # {'DEP': 'neg', 'OP': '?'},
    # {'POS': 'ADV', 'OP': '*'},
    {'POS': 'ADJ'},
    {'POS': 'CCONJ', 'OP': '?'},
    {'POS': 'ADJ', 'OP': '?'}

]
matcher.add('NOUN_IS_ADJECTIVE', [pattern], greedy='LONGEST')
doc=nlp(reviews_text)
matches2 = matcher(doc)

# TODO: Maybe add a negation?
# TODO: Use displaCy to visualize findings?

In [None]:
doc[4905: 4911]

# Open questions:
# How to include relative clauses?
# How to include sentences that start with a pronoun? 'it was burned'
# How to include sentences that don't start with a noun or a pronoun? 'the fish was salty AND COLD', additions with and

# Print each token of the slice with its dependency label.
for token in doc[4905: 4911]:
    print(token, token.dep_)


In [None]:
print(len(matches))
for match in matches:
    print(doc[match[1]:match[2]])

In [None]:
service_labels = ['service',
'waiter',
'barista',
'lady',
'staff',
'manager',
'dude',
'server',
'waitress',
'bartender',
'workers']

In [None]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_lg')

matcher = Matcher(nlp.vocab)
pattern = [
    {'TEXT': {'IN': service_labels}},
    {'LEMMA': 'be'},
    {'POS': 'ADV', 'OP': '*'},
    {'POS': 'ADJ'},
]
matcher.add('COMPARISON', [pattern], greedy='LONGEST')
doc=nlp(reviews_text)
matches = matcher(doc)

In [None]:
# Text of every matched span, in match order. Each match is a
# (match_id, start, end) tuple indexing into `doc`.
list_service = [doc[start:end].text for _match_id, start, end in matches]

In [None]:
doc[94522:94535]

### Create a word cloud to find the most important word fields

In [None]:
import numpy as np
import pandas as pd
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Join all matched spans into one string for the word cloud; every span is
# preceded by a single space.
text = ''.join(' ' + doc[start:end].text for _match_id, start, end in matches)

In [None]:
wordcloud = WordCloud(width=1000, height=500)

wordcloud.generate(text)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

### Use KMeans to identify themes (did not work)

In [None]:
# Text of every span found by the second matcher run, in match order.
reviews_list = [doc[start:end].text for _match_id, start, end in matches2]

In [None]:
text = 'This is an apple'

doc = nlp(text)

# doc.ents holds the recognised entity spans; print each one with its label.
for ent in doc.ents:
    print(ent, ent.label_)

In [None]:
from sklearn.preprocessing import normalize

span1 = nlp('service was terrible')
span2 = nlp('service was atrocious')
span3 = nlp('service was good')

def vectorize(text):
    """Return the spaCy document vector of ``text``.

    The text is run through the complete ``nlp`` pipeline; no pipeline
    components are disabled here.
    """
    return nlp(text).vector

# Now we stack the vectors and normalize them
# Inputs are typically called "X"
X = normalize(np.stack([vectorize(review) for review in reviews_list]))
print("X (the document matrix) has shape: {}".format(X.shape))
print("That means it has {} rows and {} columns".format(X.shape[0], X.shape[1]))

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X2 = pca.fit_transform(X)
print("X2 shape is {}".format(X2.shape))

In [None]:
plt.scatter(X2[:, 0], X2[:, 1])

In [None]:
CLUSTERS = 4

# First we fit the model...
from sklearn.cluster import KMeans
k_means = KMeans(n_clusters=CLUSTERS, random_state=1)
k_means.fit(X)

In [None]:
# Show, for each cluster, the 25 phrases that lie closest to its centre.
#
# The previous version sorted `cluster_centers_` itself. Those are positions
# along the embedding dimensions, not phrase indices, so using them to index
# `reviews_list` printed unrelated phrases (or raised an IndexError).
# `transform` returns the distance of every phrase to every centre with shape
# (n_phrases, n_clusters); sorting each column ranks phrases per cluster.
common_words = k_means.transform(X).argsort(axis=0)[:25].T
for num, centroid in enumerate(common_words):
    print(str(num) + ' : ' + ', '.join(reviews_list[word] for word in centroid))
    print('\n')


### Look for similar reviews using Spacy similarity (did not work)

In [None]:
span1 = nlp('the hostess was nice')
span2 = nlp('the service was rude')
span3 = nlp('the service was horrible')

# for token in nlp(span1):
#     print(token.text, token.vector)

In [None]:
def print_comparison(a, b):
    """Print the Euclidean distance and the similarity of two texts.

    Args:
        a: Raw text, or an object that is already parsed (exposes ``.vector``
            and ``.similarity``), such as a spaCy ``Doc``.
        b: Same as ``a``.

    Only raw strings are passed through ``nlp``; inputs that are already
    parsed are used as they are, so they are not processed a second time.
    """
    # Create the doc objects (only where a raw string was given)
    if isinstance(a, str):
        a = nlp(a)
    if isinstance(b, str):
        b = nlp(b)
    # Euclidean "L2" distance
    distance = np.linalg.norm(a.vector - b.vector)
    # Cosine similarity
    similarity = a.similarity(b)
    print("-" * 80)
    print("A: {}\nB: {}\nDistance: {}\nSimilarity: {}".format(a, b, distance, similarity))

print_comparison(span1, span2)
print_comparison(span1, span3)
print_comparison(span2, span3)

## Continue with spaCy

### Looking for descriptions of food

In [10]:
# Training the entity rule to recognize food

food_df = pd.read_csv('data/food_labels.csv')

In [11]:
# Keep only descriptions that contain at least one non-letter character;
# missing descriptions count as "no match" and are dropped.
has_non_letter = food_df['description'].str.contains('[^a-zA-Z]', na=False)
food_labels = food_df.loc[has_non_letter, 'description']

In [12]:
food_labels = food_labels[food_labels.str.split().apply(len) <= 2].drop_duplicates()

In [13]:
food_labels = food_labels.str.lower()

In [14]:
food_labels = food_labels[food_labels.str.contains('.*,.*,.*', regex=True) == False]

In [15]:
food_labels

182            walnut butter
195            tita crackers
206           teriyaki sauce
214           dessert shells
236         italian dressing
                 ...        
1677708    pasta, tagliarini
1679322         pork riblets
1679410    dried cranberries
1692082    shawarma marinade
1694154       golden nuggets
Name: description, Length: 16773, dtype: object

In [16]:
# Fix: Some foods are comma separated with the order reversed:
# 'muffins, blueberry' becomes 'blueberry muffins'

# Compute the mask and the split once instead of three times.
comma_mask = food_labels.str.contains(', ', na=False)

# Guard: with no ', ' labels the split yields no columns and [1] would fail.
if comma_mask.any():
    comma_parts = food_labels[comma_mask].str.split(', ', expand=True)
    food_labels.loc[comma_mask] = comma_parts[1] + ' ' + comma_parts[0]

# food_labels[food_labels.str.contains(',') == True] = (
#     food_labels[food_labels.str.contains(',') == True].str.split(',', expand=True)[1] +
#     ' ' +
#     food_labels[food_labels.str.contains(',') == True].str.split(',', expand=True)[0]
# )

In [15]:
# For every one- or two-word label, create the counterpart number form of its
# last word: plural labels get a singular form, all others get a plural form.

import spacy
import lemminflect  # not referenced by name; presumably registers the `._.inflect` extension used below

nlp = spacy.load('en_core_web_lg')

inflected_labels = []

# nlp.pipe processes the labels in batches, which is much faster than calling
# nlp() once per label.
for doc in nlp.pipe(food_labels):

    if len(doc) == 1:
        prefix = ''
    elif len(doc) == 2:
        prefix = doc[0].text + ' '
    else:
        # Labels that tokenize into zero or more than two tokens are skipped.
        continue

    head = doc[-1]
    target_tag = 'NN' if head.tag_ == 'NNS' else 'NNS'
    inflected_labels.append(prefix + head._.inflect(target_tag))

In [16]:
food_labels_inflected = pd.Series(inflected_labels)

In [17]:
food_labels = pd.concat([food_labels, food_labels_inflected])

In [18]:
food_labels.drop_duplicates(inplace=True)

In [None]:
food_labels

In [19]:
# Build a fresh pipeline whose entity ruler tags every food label as FOOD.
# The ruler is placed before 'ner' in the pipeline.

# TODO: Remove 'bar' from food_labels, add 'product'

nlp = spacy.load('en_core_web_lg')
ruler = nlp.add_pipe('entity_ruler', before='ner')

patterns = [{'label': 'FOOD', 'pattern': label} for label in food_labels]
ruler.add_patterns(patterns)

In [20]:
from spacy.matcher import Matcher

# nlp = spacy.load('en_core_web_lg')  # !!!Do not load again, because will reset the entity ruler from above

matcher = Matcher(nlp.vocab)

pattern = [
    [
        {'ENT_TYPE': 'FOOD'},
        {'LEMMA': {'IN': ['be', 'taste', 'smell']}},
        {'DEP': 'neg', 'OP': '?'},
        {'POS': 'ADV', 'OP': '?'},
        {'POS': 'ADJ'}
    ]#,
    # [
    #     # {'POS': 'ADJ'},
    #     {'ENT_TYPE': 'FOOD'}
    # ]  
]
matcher.add('FOOD_IS', pattern, greedy='LONGEST')
doc=nlp(reviews_text)
matches2 = matcher(doc)

# Create list of top ten adjectives that were used to describe the food
# Show as a word cloud

In [22]:
reviews_text



In [21]:
print(len(matches2))
for match in matches2:
    print(match, doc[match[1]:match[2]])

83
(8386174512400911802, 1662, 1664) pie was
(8386174512400911802, 3622, 3624) sauce was
(8386174512400911802, 4370, 4372) peppers were
(8386174512400911802, 4390, 4392) skewers were
(8386174512400911802, 11943, 11945) rabe was
(8386174512400911802, 12069, 12071) slaw was
(8386174512400911802, 13808, 13810) tacos were
(8386174512400911802, 14256, 14258) cheese was
(8386174512400911802, 15515, 15517) seltzer was
(8386174512400911802, 18153, 18155) sandwiches were
(8386174512400911802, 18726, 18728) pie tasted
(8386174512400911802, 26380, 26382) chowder was
(8386174512400911802, 26438, 26440) cake tasted
(8386174512400911802, 26762, 26764) bread was
(8386174512400911802, 27037, 27039) sprouts were
(8386174512400911802, 33376, 33378) roll was
(8386174512400911802, 33382, 33384) sauce was
(8386174512400911802, 37255, 37257) dumplings were
(8386174512400911802, 37910, 37912) salad was
(8386174512400911802, 41976, 41978) oil are
(8386174512400911802, 45263, 45265) bar is
(8386174512400911802

### Looking for speed of the service

In [None]:
import spacy
from spacy.matcher import Matcher

# Do NOT reload the model here: `nlp = spacy.load(...)` would replace the
# pipeline that carries the FOOD entity ruler, and the later FOOD_IS cells
# reuse `nlp` and would then find no FOOD entities. The existing pipeline
# still contains 'ner', which supplies the TIME entities used below.

matcher = Matcher(nlp.vocab)
# Pattern: a form of wait/take/spend, any number of adpositions, then one or
# more tokens that belong to a TIME entity.
pattern = [
    {'LEMMA': {'IN': ['wait', 'take', 'spend']}},
    {'POS': 'ADP', 'OP': '*'},
    {'ENT_TYPE': 'TIME', 'OP': '+'}
]
matcher.add('SLOW_SERVICE', [pattern], greedy='LONGEST')
doc=nlp(reviews_text)
matches = matcher(doc)

In [None]:
print(len(matches))
for match in matches:
    print(match, doc[match[1]:match[2]])

## Playing around

In [None]:
import spacy
from spacy.matcher import Matcher

# nlp = spacy.load('en_core_web_lg')  # !!!Do not load again, because will reset the entity ruler from above

matcher = Matcher(nlp.vocab)

pattern = [
    [
        {'ENT_TYPE': 'FOOD'},
        {'LEMMA': {'IN': ['be', 'taste']}},
        {'DEP': 'neg', 'OP': '?'},
        {'POS': 'ADV', 'OP': '?'},
        {'POS': 'ADJ'}
    ]#,
    # [
    #     # {'POS': 'ADJ'},
    #     {'ENT_TYPE': 'FOOD'}
    # ]  
]
matcher.add('FOOD_IS', pattern, greedy='LONGEST')
doc=nlp(reviews_text)
matches2 = matcher(doc)

# Create list of top ten adjectives that were used to describe the food
# Show as a word cloud

In [None]:
len(matches2)

In [None]:
# Report how many spans were matched, then collect their text in match order.
print(len(matches2))
food_matches = [doc[start:end].text for _match_id, start, end in matches2]

In [None]:
import spacy
from spacy.matcher import Matcher

# nlp = spacy.load('en_core_web_lg')  # !!!Do not load again, because will reset the entity ruler from above

matcher = Matcher(nlp.vocab)

pattern = [
    [
        {'POS': 'NOUN'},
        {'LEMMA': {'IN': ['be', 'taste']}},
        {'DEP': 'neg', 'OP': '?'},
        {'POS': 'ADV', 'OP': '?'},
        {'POS': 'ADJ'}
    ]#,
    # [
    #     # {'POS': 'ADJ'},
    #     {'ENT_TYPE': 'FOOD'}
    # ]  
]
matcher.add('FOOD_IS', pattern, greedy='LONGEST')
doc=nlp(reviews_text)
matches2 = matcher(doc)

# Create list of top ten adjectives that were used to describe the food
# Show as a word cloud

In [None]:
len(matches2)

In [None]:
# Keep only the noun matches that were found by neither the FOOD matcher
# (`food_matches`) nor the service matcher (`list_service`).
# The previous condition `... not in food_matches and list_service` only
# checked that `list_service` was non-empty; it never compared against it.
already_covered = set(food_matches) | set(list_service)

without_food_matches = []

for match in matches2:
    span_text = doc[match[1]:match[2]].text

    if span_text not in already_covered:
        without_food_matches.append(span_text)

In [None]:
without_food_matches
