### Import Libraries

In [1]:
from time import time

import numpy as np
import pandas as pd
from textblob import TextBlob


In [2]:
# Load the interim CSVs this notebook works from.
# NOTE(review): `menu` is not referenced by any later cell shown in this
# notebook -- TODO confirm it is still needed.
onion_soup_reviews = pd.read_csv('../data/interim/onion_soup_reviews.csv')
onion_soup_sentences = pd.read_csv('../data/interim/onion_soup_sentences.csv')
menu = pd.read_csv('../data/interim/menu.csv')

In [5]:
# Preview the first rows of the reviews table to check the columns that were loaded.
onion_soup_reviews.head()

Unnamed: 0,text,cool,funny,review_id,date,stars,business_id,useful,user_id,business_name
0,Other than being right across the Fountains of...,1.0,2.0,uczUlWIWuO-KzoUiLhICNw,2015-02-10,2.0,4JNXUYY8wbaaDmk3BPzlWw,3.0,9zuYkm3k4_9KjE1PC8EPfg,Mon Ami Gabi
1,French onion soup was watery with little taste...,0.0,0.0,185E0cpQpDRUO4JRGu3fXQ,2017-04-24,3.0,4JNXUYY8wbaaDmk3BPzlWw,0.0,EYiYLS0ZHDKGJSb1IKcpwg,Mon Ami Gabi
2,Where to begin! Now our dining experience her...,0.0,0.0,QoY3L_d_axTcMn68pI8zxQ,2014-12-03,5.0,4JNXUYY8wbaaDmk3BPzlWw,1.0,mp3Xy-w2isyLjEN91xOeGQ,Mon Ami Gabi
3,Charming resturant that looks like it would be...,0.0,0.0,nth_q-GqOy_Ly8sxsREIwA,2010-12-04,4.0,4JNXUYY8wbaaDmk3BPzlWw,0.0,M4g64KUEia1qgcn-qNlYsw,Mon Ami Gabi
4,This review is long overdue! I have been eat...,0.0,0.0,l0Lm7Dx69s6aH7a-5dwKDg,2010-07-11,5.0,4JNXUYY8wbaaDmk3BPzlWw,0.0,pQAUyBorkc1ZOxmV-uJ02w,Mon Ami Gabi


In [6]:
# Preview the first rows of the sentence table to check the columns that were loaded.
onion_soup_sentences.head()

Unnamed: 0,text,tags
0,Our table ordered Bordelaise Steak Frites (...,"scallops_gratinees, onion_soup_au_gratin, bord..."
1,The steak frites and onion soup were the be...,"onion_soup_au_gratin, prime_steak_frites, frites"
2,"Onion soup was also a nice, big portion, but ...",onion_soup_au_gratin
3,French onion soup was watery with little taste,onion_soup_au_gratin
4,We ate almost everything on the menu - altho...,"baked_goat_cheese, onion_soup_au_gratin"


In [10]:
def find_term(word_list, term):
    '''
    Arguments:
    word_list : List of words, or a string. A string is lower-cased and
                split on whitespace; a list is used as given (it is NOT
                lower-cased).
    term      : List of words, or a string, to search for. Handled the
                same way as word_list.

    Finds every occurrence of `term` as a contiguous run of words in
    `word_list`. Indices are WORD positions, not character offsets:
    `start` is the position of the first word of the match and
    `end` is the position of the last word of the match (inclusive).

    Words are compared with exact equality, so a word with attached
    punctuation (e.g. 'soup,') does not match 'soup'.

    Return:
    results : List of tuples (start, end), in order of appearance.
              Empty list if there is no match or if `term` is empty.
              None (after printing an error message) if word_list or
              term is neither a list nor a string.
    '''
    # Normalise word_list to a plain list of words.
    # isinstance() also accepts subclasses of str / list.
    if isinstance(word_list, str):
        word_list = word_list.lower().split()
    elif isinstance(word_list, list):
        word_list = list(word_list)
    else:
        print('Error: word_list must be a list or string.')
        return None

    # Normalise term to a plain list of words.
    if isinstance(term, str):
        term = term.lower().split()
    elif isinstance(term, list):
        term = list(term)
    else:
        print('Error: term must be a list or string.')
        return None

    term_length = len(term)

    # An empty term ('' or []) has no first word to look for; report no matches
    # instead of raising IndexError on term[0].
    if term_length == 0:
        return []

    first_word = term[0]

    # Only positions where the whole term still fits can match. Checking the
    # first word before slicing avoids building a slice at every position.
    return [(ind, ind + term_length - 1)
            for ind in range(len(word_list) - term_length + 1)
            if word_list[ind] == first_word
            and word_list[ind:ind + term_length] == term]

In [14]:
# The returned indices are word positions (after lower-casing and splitting on
# whitespace), not character offsets.
find_term('The onion soup is at index (1,2). The onion soup is also at index (8,9).', 'onion soup')

[(1, 2), (8, 9)]

In [15]:
def get_chunks(word_list, term, n_before = 5, n_after = 5):
    '''
    Arguments:
    word_list : List of words, or a string. A string is lower-cased and
                split on whitespace; a list is used as given.
    term      : List of words, or a string, to search for. Handled the
                same way as word_list.
    n_before  : Number of words to include before each occurrence of term
    n_after   : Number of words to include after each occurrence of term

    Gets a list of sentence fragments containing term in word_list.
    Each fragment spans n_before words to the left of the term
    or until the start of the word_list,
    and n_after words to the right of the term
    or until the end of the word_list.

    Occurrences are located with `find_term`, which compares words with
    exact equality, so the input should already have punctuation removed.

    Return:
    chunks : List of strings, one per occurrence of term, each being the
             words of the fragment joined by single spaces.
             Empty list if term does not occur or if term is empty.
             None (after printing an error message) if word_list or
             term is neither a list nor a string.
    '''
    # Normalise word_list to a plain list of words.
    # isinstance() also accepts subclasses of str / list; converting to a plain
    # list keeps the value acceptable to find_term either way.
    if isinstance(word_list, str):
        word_list = word_list.lower().split()
    elif isinstance(word_list, list):
        word_list = list(word_list)
    else:
        print('Error: word_list must be a list or string.')
        return None

    # Normalise term to a plain list of words.
    if isinstance(term, str):
        term = term.lower().split()
    elif isinstance(term, list):
        term = list(term)
    else:
        print('Error: term must be a list or string.')
        return None

    # Nothing to search for: avoid indexing into an empty term downstream.
    if not term:
        return []

    chunks = []
    for start, end in find_term(word_list, term):
        # Clamp the left edge: a negative slice start would wrap around to the
        # end of the list. The right edge needs no clamp because slicing past
        # the end of a list simply stops at the end.
        left = max(0, start - n_before)
        chunks.append(' '.join(word_list[left : end + n_after + 1]))

    return chunks



In [21]:
# Punctuation is left attached to the words here: splitting on whitespace gives
# 'soup,' for the first two occurrences, which is not equal to 'soup', so only
# the final occurrence is matched.
test = 'I got the the onion soup, which was great my wife also enjoyed the onion soup, my children do not like onion soup'
get_chunks(test, 'onion soup')

['my children do not like onion soup']

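The input string must be tokenized with its punctuation removed (for example with TextBlob's `.words`) before running `get_chunks()`; otherwise any occurrence of the term that is directly followed by punctuation, such as `soup,`, is not matched and its chunk is silently missing from the result.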

In [22]:
# Same sentence with the commas removed: every occurrence of the term now matches.
test = 'I got the the onion soup which was great my wife also enjoyed the onion soup my children do not like onion soup'
get_chunks(test, 'onion soup')

['i got the the onion soup which was great my wife',
 'my wife also enjoyed the onion soup my children do not like',
 'my children do not like onion soup']

In [415]:
def flatten(superlist):
    '''
    Concatenates the items of every sublist into one flat list,
    keeping the original order.

    Arguments:
    superlist : An iterable of lists of strings.

    Requirements:
    Each element in superlist must itself be iterable (normally a list).

    Return:
    A new flat list holding the items of each sublist in turn.

    ex:
    flatten([['a'], ['b', 'c'], ['d', 'e', 'f']])
    >> ['a', 'b', 'c', 'd', 'e', 'f']
    '''
    flat = []
    for sublist in superlist:
        flat.extend(sublist)
    return flat

In [410]:
# NOTE(review): the sentences CSV as loaded in this notebook shows only `text`
# and `tags` columns; `target` was presumably added in a cell that is no longer
# present -- confirm and restore it. Fail early with a clear message instead of
# an error from deep inside the extraction below.
if 'target' not in onion_soup_sentences.columns:
    raise KeyError("onion_soup_sentences has no 'target' column; "
                   "create it before extracting chunks.")

# One list of chunks per sentence. TextBlob's `.words` tokenizes the sentence
# and drops punctuation, which get_chunks needs in order to match the term.
chunks_per_sentence = [
    get_chunks(list(TextBlob(text.lower()).words), target)
    for text, target in zip(onion_soup_sentences['text'],
                            onion_soup_sentences['target'])
]

# Pair every chunk with the target of the sentence it came from. A sentence can
# yield zero or several chunks, so the flattened chunks must not be lined up
# against the per-sentence `target` column by position: that shifts targets
# onto the wrong chunks and pads the shorter side with NaN.
onion_soup_chunks = pd.DataFrame(
    [
        (target, chunk)
        for target, chunks in zip(onion_soup_sentences['target'], chunks_per_sentence)
        for chunk in chunks
    ],
    columns=['target', 'text'],
)


In [511]:
# Preview the extracted chunk text only; `[['text']]` keeps the result a DataFrame.
onion_soup_chunks[['text']].head()

Unnamed: 0,text
0,scallops shrimp peas and cream onion soup and eggs benedict with canadian
1,the steak frites and onion soup were the best things we
2,onion soup was also a nice big
3,french onion soup was watery with little taste
4,the menu although their french onion soup was n't spectacular their baked


In [414]:
# Persist the chunks for later steps; index=False keeps the row index out of the file.
onion_soup_chunks.to_csv('../data/interim/onion_soup_chunks.csv', index = False)