# Imports

In [1]:
import re
import abc
import json
import pprint
import pathlib
import operator
import collections

In [2]:
import nltk
import spacy
import pytest
import ipytest
import inflect
import humanize

In [3]:
import reader as rd
import preprocessor as pr
import transformer as tr
import recommender as rc

In [4]:
ipytest.autoconfig()

# Constants

In [5]:
PROJECT_DIR = pathlib.Path('~/project').expanduser()
DATA_DIR = PROJECT_DIR / 'data'
SPACY_DATA = DATA_DIR / 'spacy_data'

In [6]:
CONVERSION_PATH = DATA_DIR / 'conversions.json'

In [7]:
COOKING_CORPUS_ROOT = DATA_DIR / 'cooking_corpus'
COOKING_CORPUS_MINI_ROOT = DATA_DIR / 'cooking_corpus_mini'

In [8]:
COOKING_CORPUS_PICKLED_ROOT = DATA_DIR / 'cooking_corpus_pickled'
COOKING_CORPUS_MINI_PICKLED_ROOT = DATA_DIR / 'cooking_corpus_mini_pickled'

In [9]:
RECOMMENDER_PATH = pathlib.Path('recommender.pkl')

In [10]:
# CORPUS_ROOT = COOKING_CORPUS_MINI_ROOT
# CORPUS_PICKLED_ROOT = COOKING_CORPUS_MINI_PICKLED_ROOT

In [11]:
CORPUS_ROOT = COOKING_CORPUS_ROOT
CORPUS_PICKLED_ROOT = COOKING_CORPUS_PICKLED_ROOT

# Classes

## HTMLCorpusReader

In [12]:
corpus = rd.HTMLCorpusReader(CORPUS_ROOT.as_posix())
print(corpus.describes())

HTML corpus contains 3 files in 1 categories.
Structured as:
    63 paragraphs (21.000 mean paragraphs per file)
    180 sentences (2.857 mean sentences per paragraph).
Word count of 3,000 with a vocabulary of 926 (3.240 lexical diversity).
Corpus scan took 0.090 seconds.


## Preprocessor

In [13]:
%%time
corpus = rd.HTMLCorpusReader(CORPUS_ROOT.as_posix())
preprocessor = pr.Preprocessor(corpus, CORPUS_PICKLED_ROOT.as_posix())
docs = preprocessor.transform()

CPU times: user 110 ms, sys: 70.6 ms, total: 181 ms
Wall time: 282 ms


## HTMLPickledCorpusReader

In [14]:
corpus = rd.HTMLPickledCorpusReader(CORPUS_PICKLED_ROOT.as_posix())
print(corpus.describes())

HTML corpus contains 3 files in 1 categories.
Structured as:
    63 paragraphs (21.000 mean paragraphs per file)
    180 sentences (2.857 mean sentences per paragraph).
Word count of 3,000 with a vocabulary of 926 (3.240 lexical diversity).
Corpus scan took 0.006 seconds.


## TextNormalizer

In [15]:
corpus = rd.HTMLPickledCorpusReader(CORPUS_PICKLED_ROOT.as_posix())
normalizer = tr.TextNormalizer()

In [16]:
len(list(normalizer.fit_transform(corpus.docs())))

3

# Fundamentals of Conversation

## Dialog: A Brief Exchange

In [16]:
class Dialog(abc.ABC):
    """
    A dialog listens for utterances, parses and interprets them, then updates
    its internal state. It can then formulate a response on demand.
    """
    def listen(self, text, need_response=True, **kwargs):
        """
        A text utterance is passed in and parsed. It is then passed to the
        interpret method to determine how to respond. If a response is
        requested, the respond method is used to generate a text response
        based on the most recent input and the current Dialog state.
        """
        # Parse the input
        sents = self.parse(text)
        
        # Interpret the input
        sents, confidence, kwargs = self.interpret(sents, **kwargs)
        
        # Determine the response
        response = (self.respond(sents, confidence, **kwargs)
                    if need_response else None)
        
        # Return initiative
        return response, confidence
    
    @abc.abstractmethod
    def parse(self, text):
        """
        Every dialog may need its own parsing strategy, some dialogs may need
        dependency vs. constituency parses, others may simply require regular
        expressions or chunkers.
        """
        return []
    
    @abc.abstractmethod
    def interpret(self, sents, **kwargs):
        """
        Interprets the utterance passed in as a list of parsed sentences,
        updates the internal state of the dialog, computes a confidence of the
        interpretation. May also return arguments specific to the response
        mechanism.
        """
        return sents, 0.0, kwargs
    
    @abc.abstractmethod
    def respond(self, sents, confidence, **kwargs):
        """
        Creates a response given the input utterances and the current state of
        the dialog, along with any arguments passed in from the listen or the
        interpret methods.
        """
        return None

## Maintaining a Conversation

In [17]:
class SimpleConversation(Dialog, collections.abc.Sequence):
    """
    This is the most simple version of a conversation.
    """
    
    def __init__(self, dialogs):
        self._dialogs = dialogs
    
    def __getitem__(self, idx):
        return self._dialogs[idx]

    def __len__(self):
        return len(self._dialogs)
    
    def listen(self, text, need_response=True, **kwargs):
        """
        Simply return the best confidence response
        """        
        responses = [dialog.listen(text, need_response, **kwargs)
                     for dialog in self._dialogs]
        
        # Responses is a list of (response, confidence) pairs
        return max(responses, key=operator.itemgetter(1))
    
    def parse(self, text):
        """
        Returns parses for all internal dialogs for debugging
        """
        return [dialog.parse(text)
                for dialog in self._dialogs]
    
    def interpret(self, sents, **kwargs):
        """
        Returns interpretations for all internal dialogs for debugging
        """
        return [dialog.interpret(sents, **kwargs)
                for dialog in self._dialogs]

    def respond(self, sents, confidence, **kwargs):
        """
        Returns responses for all internal dialogs for debugging
        """
        return [dialog.respond(sents, confidence, **kwargs)
                for dialog in self._dialogs]

# Rules for Polite Conversation

## Greetings and Salutations

In [18]:
class Greeting(Dialog):
    """
    Keeps track of the participants entering or leaving the conversation and
    responds with appropriate salutations. This is an example of a rules based
    system that keeps track of state and uses regular expressions and logic to
    handle the dialog.
    """
    
    PATTERNS = {
        'greeting': r'hello|hi|hey|good morning|good evening',
        'introduction': r'my name is ([a-z\-\s]+)',
        'goodbye': r'goodbye|bye|ttyl',
        'rollcall': r'roll call|who\'s here?'
    }
    
    def __init__(self, participants=None):
        # Participants is a map of user name to real name
        self.participants = {}
        
        if participants is not None:
            for participant in participants:
                self.participants[participant] = None
        
        # Compile regular expressions
        self._patterns = {
            key: re.compile(pattern, re.I)
            for key, pattern in self.PATTERNS.items()
        }
    
    def parse(self, text):
        """
        Applies all regular expressions to the text to find matches.
        """
        return {
            key: match
            for key, pattern in self._patterns.items()
            if (match := pattern.search(text))
            and match is not None
        }
    
    def interpret(self, sents, **kwargs):
        """
        Takes in parsed matches and determines if the message is an enter,
        exit, or name change.
        """
        # Can't do anything with no matches
        if len(sents) == 0:
            return sents, 0.0, kwargs

        # Get username from the participants
        user = kwargs.get('user', None)
        
        # Determine if an introduction has been made
        if 'introduction' in sents:
            # Get the name from the utterance
            name = sents['introduction'].groups()[0]
            user = user or name.lower()

            # Determine if name has changed
            if (user not in self.participants
                or self.participants[user] != name):
                kwargs['name_changed'] = True
            
            # Update the participants
            self.participants[user] = name
            kwargs['user'] = user
            
        # Determine if a greeting has been made
        if 'greeting' in sents:
            # If we don't have a name for the user
            if user not in self.participants:
                kwargs['request_introduction'] = True

        # Determine if goodbye has been made
        if 'goodbye' in sents and user is not None:
            # Remove participant
            self.participants.pop(user)
            kwargs.pop('user', None)
        
        # If we've seen anything we're looking for, we're pretty confident
        return sents, 1.0, kwargs

    def respond(self, sents, confidence, **kwargs):
        """
        Gives a greeting or a goodbye depending on what's appropriate.
        """
        if confidence == 0:
            return None
        
        name = self.participants.get(kwargs.get('user', None), None)
        name_changed = kwargs.get('name_changed', False)
        request_introduction = kwargs.get('request_introduction', False)
        
        if 'greeting' in sents or 'introduction' in sents:
            if request_introduction:
                return "Hello, what is your name?"
            else:
                return "Hello, {}!".format(name)
            
        if 'goodbye' in sents:
            return "Talk to you later!"
        
        if 'rollcall' in sents:
            people = list(self.participants.values())
            
            if len(people) > 1:
                roster = ", ".join(people[:-1])
                roster += " and {}.".format(people[-1])
                return "Currently in the conversation are " + roster
            elif len(people) == 1:
                return "It's just you and me right now, {}.".format(name)
            else:
                return "So lonely in here by myself ... wait who is that?"

        raise Exception(
            "expected response to be returned, but could not find rule"
        )

In [19]:
dialog = Greeting()
# `listen` returns (response, confidence) tuples; just print the response
print(dialog.listen("Hello!", user="jakevp321")[0])
print(dialog.listen("my name is Jake", user="jakevp321")[0])
print(dialog.listen("Roll call!", user="jakevp321")[0])
print(dialog.listen("Have to go, goodbye!", user="jakevp321")[0])

Hello, what is your name?
Hello, Jake!
It's just you and me right now, Jake.
Talk to you later!


In [20]:
dialog = Greeting()
print(dialog.listen("hey", user="jillmonger")[0])
print(dialog.listen("my name is Jill.", user="jillmonger")[0])
print(dialog.listen("who's here?")[0])

Hello, what is your name?
Hello, Jill!
It's just you and me right now, None.


## Handling Miscommunication

In [21]:
%%ipytest -vv

class TestBaseClasses(object):
    """
    Tests for the Dialog class
    """
    
    @pytest.mark.parametrize("text", [
        "Gobbledeguk", "Gibberish", "Wingdings"
    ])
    def test_dialog_abc(self, text):
        """
        Test the Dialog ABC and the listen method
        """
        class SampleDialog(Dialog):
            
            def parse(self, text):
                return []

            def interpret(self, sents):
                return sents, 0.0, {}
            
            def respond(self, sents, confidence):
                return None

        sample = SampleDialog()
        reply, confidence = sample.listen(text)
        assert confidence == 0.0
        assert reply is None

platform linux -- Python 3.8.11, pytest-7.4.0, pluggy-1.3.0 -- /home/python/.local/share/virtualenvs/project-onCnT2CZ/bin/python
cachedir: .pytest_cache
rootdir: /home/python/project/source/chapter_10
plugins: anyio-4.0.0
[1mcollecting ... [0mcollected 3 items

t_28d1ad35a45c404aadba9f55daad3222.py::TestBaseClasses::test_dialog_abc[Gobbledeguk] [32mPASSED[0m[32m  [ 33%][0m
t_28d1ad35a45c404aadba9f55daad3222.py::TestBaseClasses::test_dialog_abc[Gibberish] [32mPASSED[0m[32m    [ 66%][0m
t_28d1ad35a45c404aadba9f55daad3222.py::TestBaseClasses::test_dialog_abc[Wingdings] [32mPASSED[0m[32m    [100%][0m



In [22]:
%%ipytest -vv

class TestGreetingDialog(object):
    """
    Test expected input and responses for the Greeting dialog
    """

    @pytest.mark.parametrize("text", ["Hello!", "hello", 'hey', 'hi'])
    @pytest.mark.parametrize("user", [ "jay", None], ids=["w/ user", "w/o user"])
    def test_greeting_intro(self, user, text):
        """
        Test that an initial greeting requests an introduction
        """
        g = Greeting()
        reply, confidence = g.listen(text, user=user)
        assert confidence == 1.0
        assert reply is not None
        assert reply == "Hello, what is your name?"

platform linux -- Python 3.8.11, pytest-7.4.0, pluggy-1.3.0 -- /home/python/.local/share/virtualenvs/project-onCnT2CZ/bin/python
cachedir: .pytest_cache
rootdir: /home/python/project/source/chapter_10
plugins: anyio-4.0.0
[1mcollecting ... [0mcollected 8 items

t_28d1ad35a45c404aadba9f55daad3222.py::TestGreetingDialog::test_greeting_intro[w/ user-Hello!] [32mPASSED[0m[32m [ 12%][0m
t_28d1ad35a45c404aadba9f55daad3222.py::TestGreetingDialog::test_greeting_intro[w/ user-hello] [32mPASSED[0m[32m [ 25%][0m
t_28d1ad35a45c404aadba9f55daad3222.py::TestGreetingDialog::test_greeting_intro[w/ user-hey] [32mPASSED[0m[32m [ 37%][0m
t_28d1ad35a45c404aadba9f55daad3222.py::TestGreetingDialog::test_greeting_intro[w/ user-hi] [32mPASSED[0m[32m [ 50%][0m
t_28d1ad35a45c404aadba9f55daad3222.py::TestGreetingDialog::test_greeting_intro[w/o user-Hello!] [32mPASSED[0m[32m [ 62%][0m
t_28d1ad35a45c404aadba9f55daad3222.py::TestGreetingDialog::test_greeting_intro[w/o user-hello] [32mPASSED[

In [23]:
%%ipytest -vv

class TestGreetingDialog(object):
    """
    Test expected input and responses for the Greeting dialog
    """

    @pytest.mark.xfail(reason="a case that must be handled")
    @pytest.mark.parametrize("text", ["My name is Jake", "Hello, I'm Jake."])
    @pytest.mark.parametrize("user", ["jkm", None], ids=["w/ user", "w/o user"])
    def test_initial_intro(self, user, text):
        """
        Test an initial introduction without greeting
        """
        g = Greeting()
        reply, confidence = g.listen(text, user=user)
        assert confidence == 1.0
        assert reply is not None
        assert reply == "Hello, Jake!"

        if user is None:
            user = 'jake'

        assert user in g.participants
        assert g.participants[user] == 'Jake'

platform linux -- Python 3.8.11, pytest-7.4.0, pluggy-1.3.0 -- /home/python/.local/share/virtualenvs/project-onCnT2CZ/bin/python
cachedir: .pytest_cache
rootdir: /home/python/project/source/chapter_10
plugins: anyio-4.0.0
[1mcollecting ... [0mcollected 4 items

t_28d1ad35a45c404aadba9f55daad3222.py::TestGreetingDialog::test_initial_intro[w/ user-My name is Jake] [33mXPASS[0m[33m [ 25%][0m
t_28d1ad35a45c404aadba9f55daad3222.py::TestGreetingDialog::test_initial_intro[w/ user-Hello, I'm Jake.] [33mXFAIL[0m[33m [ 50%][0m
t_28d1ad35a45c404aadba9f55daad3222.py::TestGreetingDialog::test_initial_intro[w/o user-My name is Jake] [33mXPASS[0m[33m [ 75%][0m
t_28d1ad35a45c404aadba9f55daad3222.py::TestGreetingDialog::test_initial_intro[w/o user-Hello, I'm Jake.] [33mXFAIL[0m[33m [100%][0m



# Entertaining Questions

## Dependency Parsing

In [24]:
spacy_nlp = spacy.load(SPACY_DATA / 'en_core_web_sm')

In [25]:
def plot_displacy_tree(sent):
    doc = spacy_nlp(sent)
    spacy.displacy.render(doc, style='dep')

In [26]:
plot_displacy_tree('How many teaspoons are in a tablespoon?')

## Constituency Parsing

In [27]:
def spacy_tree(sent):
    """
    Get the SpaCy dependency tree structure
    :param sent: string
    :return: None
    """
    doc = spacy_nlp(sent)
    pprint.pprint(doc.to_json())

In [28]:
spacy_tree('How many teaspoons are in a tablespoon?')

{'ents': [],
 'sents': [{'end': 39, 'start': 0}],
 'text': 'How many teaspoons are in a tablespoon?',
 'tokens': [{'dep': 'advmod',
             'end': 3,
             'head': 1,
             'id': 0,
             'lemma': 'how',
             'morph': '',
             'pos': 'SCONJ',
             'start': 0,
             'tag': 'WRB'},
            {'dep': 'amod',
             'end': 8,
             'head': 2,
             'id': 1,
             'lemma': 'many',
             'morph': 'Degree=Pos',
             'pos': 'ADJ',
             'start': 4,
             'tag': 'JJ'},
            {'dep': 'nsubj',
             'end': 18,
             'head': 3,
             'id': 2,
             'lemma': 'teaspoon',
             'morph': 'Number=Plur',
             'pos': 'NOUN',
             'start': 9,
             'tag': 'NNS'},
            {'dep': 'ROOT',
             'end': 22,
             'head': 3,
             'id': 3,
             'lemma': 'be',
             'morph': 'Mood=Ind|Tense=Pres|

In [29]:
def nltk_spacy_tree(sent):
    """
    Visually inspect the SpaCy dependency tree with nltk.tree
    :param sent: string
    :return: None
    """
    doc = spacy_nlp(sent)
    
    def token_format(token):
        return "_".join([token.orth_, token.tag_, token.dep_])

    def to_nltk_tree(node):
        if node.n_lefts + node.n_rights > 0:
            return nltk.Tree(token_format(node),
                             [to_nltk_tree(child)
                              for child in node.children])
        else:
            return token_format(node)

    return [to_nltk_tree(sent.root) for sent in doc.sents][0] 

In [30]:
tree = nltk_spacy_tree('How many teaspoons are in a tablespoon?')
tree.draw()

## Question Detection

In [31]:
def question_type(sent):
    """
    Try to identify whether the question is about measurements,
    recipes, or not a question.
    :param sent: string
    :return: str response type
    """
    doc = spacy_nlp(sent)
    
    noun_tags = {'NN', 'NNS', 'NNP', 'NNPS'}
    nouns = [token.orth_
             for sent in doc.sents
             for token in sent
             if token.tag_ in noun_tags]
    for sent in doc.sents:
        for token in sent:
            # Find wh-adjective and wh-adverb phrases
            if token.tag_ == 'WRB':
                if token.nbor().tag_ == 'JJ':
                    return ("quantity", nouns)
            # Find wh-noun phrases
            elif token.tag_ == 'WP':
                # Use pre-trained clusters to return recipes
                return ("recipe", nouns)
    # Todo: try to be conversational using our n-gram language generator?
    return ("default", nouns)

In [32]:
question_type('How many teaspoons are in a tablespoon?')

('quantity', ['teaspoons', 'tablespoon'])

## From Tablespoons to Grams

In [33]:
class Converter(Dialog):
    """
    Answers questions about converting units
    """
    
    def __init__(self, conversion_path=CONVERSION_PATH):
        with open(conversion_path, 'r') as f:
            self.metrics = json.load(f)
        self.inflect = inflect.engine()
        self.stemmer = nltk.SnowballStemmer('english')
        self.parser = spacy.load(SPACY_DATA / 'en_core_web_sm')
    
    def parse(self, text):
        parse = self.parser(text)
        return parse
    
    def interpret(self, sents, **kwargs):
        measures = []
        confidence = 0
        results = {}
        # Make sure there are wh-adverb phrases
        if 'WRB' in [token.tag_
                     for sent in sents.sents
                     for token in sent]:
            # If so, increment confidence & traverse sents
            confidence += .2
            for sent in sents.sents:
                for token in sent:
                    # Store nouns as target measures
                    if token.tag_ in ['NN', 'NNS']:
                        measures.append(token.orth_)
                    # Store numbers as target quantities
                    elif token.tag_ in ['CD']:
                        results['quantity'] = token.orth_
        
            # If both source and destination measures are provided...
            if len(measures) == 2:
                confidence += .4
                # Stem source and dest to remove pluralization
                results['dst'], results['src'] = (
                    tuple(map(self.stemmer.stem, measures))
                )
                
                # Check to see if they correspond to our lookup table
                if results['src'] in self.metrics:
                    confidence += .2
                    if results['dst'] in self.metrics[results['src']]:
                        confidence += .2
                        
        return results, confidence, kwargs
    
    def convert(self, src, dst, quantity=1.0):
        """
        Converts from the source unit to the dest unit for the given quantity
        of the source unit.
        """
        # Check that we can convert
        if dst not in self.metrics:
            raise KeyError(f"cannot convert to '{dst}' units")
        if src not in self.metrics[dst]:
            raise KeyError(f"cannot convert from '{src}' to '{dst}'")
        
        return self.metrics[dst][src] * float(quantity), src, dst
    
    def round(self, num):
        num = round(float(num), 4)
        return int(num) if num.is_integer() else num
    
    def pluralize(self, noun, num):
        return self.inflect.plural_noun(noun, num)
    
    def numericalize(self, amt):
        if 1e2 < amt < 1e6:
            return humanize.intcomma(int(amt))
        elif amt >= 1e6:
            return humanize.intword(int(amt))
        elif isinstance(amt, int) or amt.is_integer():
            return humanize.apnumber(int(amt))
        else:
            return humanize.fractional(amt)
    
    def respond(self, sents, confidence, **kwargs):
        """
        Response makes use of the humanize and inflect libraries to produce
        much more human understandable results.
        """
        if confidence < .5:
            return "I'm sorry, I don't know that one."
        
        try:
            quantity = sents.get('quantity', 1)
            amount, src, dst = self.convert(**sents)
            
            # Perform numeric rounding
            amount = self.round(amount)
            quantity = self.round(quantity)
            
            # Pluralize
            src = self.pluralize(src, quantity)
            dst = self.pluralize(dst, amount)
            verb = self.inflect.plural_verb('is', amount)
            
            # Numericalize
            quantity = self.numericalize(quantity)
            amount = self.numericalize(amount)
            
            return f'There {verb} {amount} {dst} in {quantity} {src}.'
        
        except KeyError as e:
            return "I'm sorry I {}".format(str(e))

In [34]:
dialog = Converter()
print(dialog.listen("How many cups are in a gallon?"))
print(dialog.listen("How many gallons are in 2 cups?"))
print(dialog.listen("How many tablespoons are in a cup?"))
print(dialog.listen("How many tablespoons are in 10 cups?"))
print(dialog.listen("How many tablespoons are in a teaspoon?"))

('There are 16 cups in one gallon.', 1.0)
('There are 1/8 gallons in two cups.', 1.0)
('There are 16 tablespoons in one cup.', 1.0)
('There are 160 tablespoons in 10 cups.', 1.0)
('There are 1/3 tablespoons in one teaspoon.', 1.0)


# Learning to Help

## Being Neighborly

In [12]:
corpus = rd.HTMLPickledCorpusReader(CORPUS_PICKLED_ROOT.as_posix())

In [13]:
import time

for algorithm in ['brute', 'kd_tree', 'ball_tree', 'auto']:
    recommender = rc.KNNRecommender(
        n_components=100,
        n_neighbors=3,
        algorithm='ball_tree'
    )
    # fit
    start = time.perf_counter()
    recommender.fit(corpus.docs())
    fit_time = time.perf_counter() - start
    # transform
    start = time.perf_counter()
    recommender.transform(corpus.docs())
    transform_time = time.perf_counter() - start
    # save
    recommender.save(RECOMMENDER_PATH)
    print(algorithm, fit_time, transform_time, RECOMMENDER_PATH.stat().st_size / 1024 / 1024)

brute 101.29095779600175 639.4074635020006 189.18437099456787
kd_tree 106.10647700799746 686.1334123970009 189.18437099456787
ball_tree 107.68528729800164 711.4392622820014 189.18437099456787
auto 105.72828338199906 664.0567092199999 189.18437099456787


In [14]:
# recommender.transform([{'content': [[[('egg', 'N')]]]}])

## Offering Recommendations