#### In this notebook we are going to use spaCy to find out what customers are complaining about

In [None]:
import pandas as pd

In [None]:
# Load the business reviews; the 'text' column is the part analysed below.
reviews = pd.read_csv('data/biz_review_3-5.csv')

In [None]:
# Work on lower-cased review text (the food labels are lower-cased as well).
reviews_text = reviews['text'].str.lower()

#### One focus will be on finding customer complaints about food.
#### To do so, we will use a table of food labels and add them to a spaCy entity ruler so that spaCy recognizes them.

In [None]:
import spacy

In [None]:
# Load the food table; its 'description' column supplies the food labels.
food_df = pd.read_csv('data/food_labels.csv')

In [None]:
# Keep only descriptions that contain at least one ASCII letter.
# Missing descriptions are treated as non-matching and dropped.
food_labels = food_df.loc[
    food_df['description'].str.contains('[a-zA-Z]', na=False),
    'description',
]

In [None]:
# Keep labels of at most two whitespace-separated words, then drop repeats.
food_labels = food_labels.loc[food_labels.str.split().map(len).le(2)].drop_duplicates()

In [None]:
# Lower-case the labels; the review text is lower-cased the same way.
food_labels = food_labels.str.lower()

In [None]:
# Discard labels that contain two or more commas.
food_labels = food_labels[~food_labels.str.contains('.*,.*,.*', regex=True, na=True)]

In [None]:
# Fix: some foods are comma-separated with the word order reversed:
# 'muffins, blueberry' becomes 'blueberry muffins'.
#
# A single regex pass swaps the text before the first comma with the text
# after it (any whitespace following the comma is dropped). Labels without a
# comma are left untouched, so this also works when no label needs fixing --
# the previous mask + split(expand=True)[1] approach raised a KeyError then.
food_labels = food_labels.str.replace(r'^([^,]*),\s*(.*)$', r'\2 \1', regex=True)

In [None]:
# For every label, build the opposite grammatical number of its last word:
# a label whose last token is tagged plural ('NNS') gets a singular ('NN')
# variant, every other label gets a plural ('NNS') variant.

# Imported for its side effect: it is not referenced by name, but the
# `._.inflect` token extension used below comes from lemminflect.
import lemminflect

nlp = spacy.load('en_core_web_lg')

inflected_labels = []

for label in food_labels:

    doc = nlp(label)

    # Only labels that tokenize into exactly one or two tokens are handled.
    if len(doc) not in (1, 2):
        continue

    head = doc[-1]
    target_tag = 'NN' if head.tag_ == 'NNS' else 'NNS'
    inflected = head._.inflect(target_tag)

    # Two-token labels keep their first token unchanged in front.
    if len(doc) == 2:
        inflected = doc[0].text + ' ' + inflected

    inflected_labels.append(inflected)

In [None]:
# Wrap the inflected variants in a Series so they can be concatenated with food_labels.
food_labels_inflected = pd.Series(inflected_labels)

In [None]:
# Append the inflected variants to the original labels.
# No ignore_index is passed, so each Series keeps its own index values
# and index labels may repeat in the result.
food_labels = pd.concat([food_labels, food_labels_inflected])

In [None]:
# Drop labels that now appear more than once (e.g. a generated variant that already existed).
food_labels = food_labels.drop_duplicates()

In [None]:
# Register every food label as a FOOD pattern in an entity ruler

# TODO: the original note here read "Remove 'bar' from foods_labels, add 'product'";
# no such adjustment is applied to food_labels yet.

# Load a fresh pipeline and insert the ruler before its 'ner' component.
nlp = spacy.load('en_core_web_lg')
ruler = nlp.add_pipe('entity_ruler', before='ner')

# One phrase pattern per label, all sharing the FOOD entity label.
patterns = [{'label': 'FOOD', 'pattern': label} for label in food_labels]

ruler.add_patterns(patterns)

In [None]:
# Save the ruler's patterns; this same path is later passed to match_reviews as ruler_path.
ruler.to_disk('data/food_patterns.jsonl')

#### We can now use the matcher to look for specific word patterns

In [None]:
from spacy.matcher import Matcher

In [116]:
def match_reviews(model, file, pattern, matcher_name, ruler_path=None):
    """Return the text of every span in `file` that matches `pattern`.

    Args:
        model: Name of the spaCy language model to load, as a string.
        file: pandas Series (or any iterable) of strings to analyse.
        pattern: List of token patterns to look for, passed to the Matcher.
        matcher_name: Name under which the patterns are registered in the
            Matcher (written in capital letters in this notebook).
        ruler_path: Optional path, as a string, to a jsonl file with
            additional entity-ruler patterns. When given, an entity ruler is
            added before the 'ner' component and loaded from this path.

    Returns:
        A list with the matched text of each match, in the order the texts
        are processed. Empty when nothing matches.
    """
    from spacy.matcher import Matcher

    nlp = spacy.load(model)

    if ruler_path:
        ruler = nlp.add_pipe("entity_ruler", before='ner')
        ruler.from_disk(ruler_path)

    matcher = Matcher(nlp.vocab)
    # greedy='LONGEST' keeps the longest span when matches overlap.
    matcher.add(matcher_name, pattern, greedy='LONGEST')

    matches = []

    # nlp.pipe streams the texts through the pipeline in batches instead of
    # making one nlp() call per text.
    for doc in nlp.pipe(file):
        # Each match is a (match_id, start, end) triple of token offsets.
        matches.extend(doc[start:end].text for _, start, end in matcher(doc))

    return matches

In [117]:
# Look for statements of the form "<food> is/tastes/smells (not) (adverb) <adjective>"
# in the first 1000 lower-cased reviews.
model = 'en_core_web_lg'
file = reviews_text[:1000]
pattern = [
    [
        # A token whose entity type is FOOD (the label used for the food patterns)
        {'ENT_TYPE': 'FOOD'},
        # followed by a token whose lemma is 'be', 'taste' or 'smell'
        {'LEMMA': {'IN': ['be', 'taste', 'smell']}},
        # an optional negation
        {'DEP': 'neg', 'OP': '?'},
        # an optional adverb
        {'POS': 'ADV', 'OP': '?'},
        # and a closing adjective
        {'POS': 'ADJ'}
    ]
]
matcher_name = 'FOOD_MATCHER'
# Entity-ruler patterns saved earlier with ruler.to_disk
ruler_path = 'data/food_patterns.jsonl'

food_matches = match_reviews(model, file, pattern, matcher_name, ruler_path)