# Imports

In [1]:
import re
import abc
import json
import time
import pprint
import pathlib
import logging
import operator
import collections

In [2]:
import bs4
import nltk
import spacy
import pytest
import ipytest
import inflect
import humanize

In [3]:
ipytest.autoconfig()

# Constants

In [4]:
PROJECT_DIR = pathlib.Path('~/project').expanduser()
DATA_DIR = PROJECT_DIR / 'data'
SPACY_DATA = DATA_DIR / 'spacy_data'

In [5]:
CONVERSION_PATH = DATA_DIR / 'conversions.json'

In [6]:
COOKING_CORPUS_ROOT = DATA_DIR / 'cooking_corpus'
COOKING_CORPUS_MINI_ROOT = DATA_DIR / 'cooking_corpus_mini'

In [7]:
CORPUS_ROOT = COOKING_CORPUS_MINI_ROOT

# Classes

## HTMLCorpusReader

In [8]:
from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.api import CategorizedCorpusReader

In [9]:
from readability.readability import Unparseable
from readability.readability import Document as Paper

In [10]:
logger = logging.getLogger("readability.readability")
logger.disabled = True

In [11]:
DOC_PATTERN = r'(?!\.)[a-z_\s]+/[\w\s\d\-]+\.html'
CAT_PATTERN = r'([a-z_\s]+)/.*'

In [12]:
# Tags to extract as paragraphs from the HTML text
TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'li']

In [13]:
class HTMLCorpusReader(CategorizedCorpusReader, CorpusReader):
    """
    A corpus reader for raw HTML documents to enable preprocessing.
    """

    def __init__(self, root, fileids=DOC_PATTERN,
                 word_tokenizer=nltk.WordPunctTokenizer(),
                 sent_tokenizer=nltk.data.LazyLoader(
                     'tokenizers/punkt/english.pickle'
                 ),
                 pos_tagger=nltk.PerceptronTagger(),
                 tags=TAGS, encoding='latin-1', **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor.  The remaining
        arguments are passed to the ``CorpusReader`` constructor.
        """
        # Add the default category pattern if not passed into the class.
        if not any(key.startswith('cat_') for key in kwargs.keys()):
            kwargs['cat_pattern'] = CAT_PATTERN
        
        # Initialize the NLTK corpus reader objects
        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids, encoding)
        
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._pos_tagger = pos_tagger
        self._tags = tags
        
    def resolve(self, fileids, categories):
        """
        Returns a list of fileids or categories depending on what is passed
        to each internal corpus reader function. Implemented similarly to
        the NLTK ``CategorizedPlaintextCorpusReader``.
        """
        if fileids is not None and categories is not None:
            raise ValueError("Specify fileids or categories, not both")

        if categories is not None:
            return self.fileids(categories)
        return fileids

    def docs(self, fileids=None, categories=None):
        """
        Returns the complete text of an HTML document, closing the document
        after we are done reading it and yielding it in a memory safe fashion.
        """
        # Resolve the fileids and the categories
        fileids = self.resolve(fileids, categories)

        # Create a generator, loading one document into memory at a time.
        for path, encoding in self.abspaths(fileids, include_encoding=True):
            with open(path, 'r', encoding=encoding) as f:
                yield f.read()
    
    def html(self, fileids=None, categories=None):
        """
        Returns the HTML content of each document, cleaning it using
        the readability-lxml library.
        """
        for doc in self.docs(fileids, categories):
            try:
                yield Paper(doc).summary()
            except Unparseable as e:
                print("Could not parse HTML: {}".format(e))
                continue

    def paras(self, fileids=None, categories=None):
        """
        Uses BeautifulSoup to parse the paragraphs from the HTML.
        """
        for html in self.html(fileids, categories):
            soup = bs4.BeautifulSoup(html, 'lxml')
            for element in soup.find_all(self._tags):
                yield element.text
            soup.decompose()
    
    def titles(self, fileids=None, categories=None):
        """
        Uses BeautifulSoup to identify titles from the
        head tags within the HTML
        """
        for doc in self.docs(fileids, categories):
            soup = bs4.BeautifulSoup(doc, 'lxml')
            try:
                yield soup.title.text
                soup.decompose()
            except AttributeError as e:
                continue
    
    def sents(self, fileids=None, categories=None):
        """
        Uses the built in sentence tokenizer to extract sentences from the
        paragraphs. Note that this method uses BeautifulSoup to parse HTML.
        """
        for paragraph in self.paras(fileids, categories):
            for sentence in self._sent_tokenizer.tokenize(paragraph):
                yield sentence
    
    def words(self, fileids=None, categories=None):
        """
        Uses the built in word tokenizer to extract tokens from sentences.
        Note that this method uses BeautifulSoup to parse HTML content.
        """
        for sentence in self.sents(fileids, categories):
            for word in self._word_tokenizer.tokenize(sentence):
                yield word
    
    def tokenize(self, fileids=None, categories=None):
        """
        Segments, tokenizes, and tags a document in the corpus.
        """
        for paragraph in self.paras(fileids, categories):
            yield [
                self._pos_tagger.tag(self._word_tokenizer.tokenize(sentence))
                for sentence in self._sent_tokenizer.tokenize(paragraph)
            ]
    
    def sizes(self, fileids=None, categories=None):
        """
        Returns a list of tuples, the fileid and size on disk of the file.
        This function is used to detect oddly large files in the corpus.
        """
        # Resolve the fileids and the categories
        fileids = self.resolve(fileids, categories)

        # Create a generator, getting every path and computing filesize
        for path in self.abspaths(fileids):
            yield os.path.getsize(path)
    
    def describe(self, fileids=None, categories=None):
        """
        Performs a single pass of the corpus and returns a dictionary with a
        variety of metrics concerning the state of the corpus.
        """
        started = time.perf_counter()

        # Structures to perform counting.
        counts = nltk.FreqDist()
        tokens = nltk.FreqDist()

        # Perform single pass over paragraphs, tokenize and count
        for para in self.paras(fileids, categories):
            counts['paras'] += 1

            for sent in self._sent_tokenizer.tokenize(para):
                counts['sents'] += 1

                for word in self._word_tokenizer.tokenize(sent):
                    counts['words'] += 1
                    tokens[word] += 1

        # Compute the number of files and categories in the corpus
        n_fileids = len(self.resolve(fileids, categories) or self.fileids())
        n_categories = len(self.categories(self.resolve(fileids, categories)))

        # Return data structure with information
        return {
            'files': n_fileids,
            'categories': n_categories,
            'paras': counts['paras'],
            'sents': counts['sents'],
            'words': counts['words'],
            'vocab': len(tokens),
            'lexdiv': counts['words'] / len(tokens),
            'ppdoc': counts['paras'] / n_fileids,
            'sppar': counts['sents'] / counts['paras'],
            'secs': time.perf_counter() - started,
        }
    
    def describes(self, fileids=None, categories=None):
        """
        Returns a string representation of the describe command.
        """
        return (
            "HTML corpus contains {files:,} files in {categories:,} categories.\n"
            "Structured as:\n"
            "    {paras:,} paragraphs ({ppdoc:0,.3f} mean paragraphs per file)\n"
            "    {sents:,} sentences ({sppar:0,.3f} mean sentences per paragraph).\n"
            "Word count of {words:,} with a vocabulary of {vocab:,} "
            "({lexdiv:0,.3f} lexical diversity).\n"
            "Corpus scan took {secs:0,.3f} seconds."
        ).format(**self.describe(fileids, categories))

In [14]:
corpus = HTMLCorpusReader(CORPUS_ROOT.as_posix())
print(corpus.describes())

HTML corpus contains 3 files in 1 categories.
Structured as:
    63 paragraphs (21.000 mean paragraphs per file)
    180 sentences (2.857 mean sentences per paragraph).
Word count of 3,000 with a vocabulary of 926 (3.240 lexical diversity).
Corpus scan took 0.055 seconds.


## HTMLPickledCorpusReader

In [15]:
PKL_PATTERN = r'(?!\.)[a-z_\s]+/[a-f0-9]+\.pickle'
CAT_PATTERN = r'([a-z_\s]+)/.*'

In [16]:
class HTMLPickledCorpusReader(CategorizedCorpusReader, CorpusReader):
    
    def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor.  The remaining arguments
        are passed to the ``CorpusReader`` constructor.
        """
        # Add the default category pattern if not passed into the class.
        if not any(key.startswith('cat_') for key in kwargs.keys()):
            kwargs['cat_pattern'] = CAT_PATTERN

        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids)
    
    def resolve(self, fileids, categories):
        """
        Returns a list of fileids or categories depending on what is passed
        to each internal corpus reader function. This primarily bubbles up to
        the high level ``docs`` method, but is implemented here similar to
        the nltk ``CategorizedPlaintextCorpusReader``.
        """
        if fileids is not None and categories is not None:
            raise ValueError("Specify fileids or categories, not both")

        if categories is not None:
            return self.fileids(categories)
        return fileids
    
    def docs(self, fileids=None, categories=None):
        """
        Returns the document loaded from a pickled object for every file in
        the corpus. Similar to the BaleenCorpusReader, this uses a generator
        to achive memory safe iteration.
        """
        # Resolve the fileids and the categories
        fileids = self.resolve(fileids, categories)

        # Create a generator, loading one document into memory at a time.
        for path in self.abspaths(fileids):
            with open(path, 'rb') as f:
                yield pickle.load(f)
    
    def titles(self, fileids=None, categories=None):
        """
        Uses BeautifulSoup to identify titles from the
        head tags within the HTML
        """
        for doc in self.docs(fileids, categories):
            yield doc['title']
    
    def tagged_paras(self, fileids=None, categories=None):
        """
        Returns a generator of paragraphs where each paragraph is a list of
        sentences, which is in turn a list of (token, tag) tuples.
        """
        for doc in self.docs(fileids, categories):
            for tagged_para in doc['content']:
                yield tagged_para
    
    def paras(self, fileids=None, categories=None):
        """
        Returns a generator of paragraphs where each paragraph is a list of
        sentences, which is in turn a list of tokens.
        """
        for tagged_para in self.tagged_paras(fileids, categories):
            yield [[word for word, tag in tagged_sent]
                   for tagged_sent in tagged_para]
    
    def tagged_sents(self, fileids=None, categories=None):
        """
        Returns a generator of sentences where each sentence is a list of
        (token, tag) tuples.
        """
        for tagged_para in self.tagged_paras(fileids, categories):
            for tagged_sent in tagged_para:
                yield tagged_sent
                
    def sents(self, fileids=None, categories=None):
        """
        Returns a generator of sentences where each sentence is a list of
        tokens.
        """
        for tagged_sent in self.tagged_sents(fileids, categories):
            yield [word for word, tag in tagged_sent]
    
    def tagged_words(self, fileids=None, categories=None):
        """
        Returns a generator of (token, tag) tuples.
        """
        for sent in self.tagged_sents(fileids, categories):
            for token, tag in sent:
                yield token, tag

    def words(self, fileids=None, categories=None):
        """
        Returns a generator of tokens.
        """
        for word, tag in self.tagged_words(fileids, categories):
            yield word
    
    def describe(self, fileids=None, categories=None):
        """
        Performs a single pass of the corpus and returns a dictionary with a
        variety of metrics concerning the state of the corpus.
        """
        started = time.perf_counter()

        # Structures to perform counting.
        counts = nltk.FreqDist()
        tokens = nltk.FreqDist()
        
        # Perform single pass over paragraphs, tokenize and count
        for para in self.tagged_paras(fileids, categories):
            counts['paras'] += 1
            
            for sent in para:
                counts['sents'] += 1
                
                for word, tag in sent:
                    counts['words'] += 1
                    tokens[word] += 1
        
        # Compute the number of files and categories in the corpus
        n_fileids = len(self.resolve(fileids, categories) or self.fileids())
        n_topics  = len(self.categories(self.resolve(fileids, categories)))
        
        # Return data structure with information
        return {
            'files': n_fileids,
            'categories': n_categories,
            'paras': counts['paras'],
            'sents': counts['sents'],
            'words': counts['words'],
            'vocab': len(tokens),
            'lexdiv': counts['words'] / len(tokens),
            'ppdoc': counts['paras'] / n_fileids,
            'sppar': counts['sents'] / counts['paras'],
            'secs': time.perf_counter() - started,
        }
    
    def describes(self, fileids=None, categories=None):
        """
        Returns a string representation of the describe command.
        """
        return (
            "HTML corpus contains {files:,} files in {categories:,} categories.\n"
            "Structured as:\n"
            "    {paras:,} paragraphs ({ppdoc:0,.3f} mean paragraphs per file)\n"
            "    {sents:,} sentences ({sppar:0,.3f} mean sentences per paragraph).\n"
            "Word count of {words:,} with a vocabulary of {vocab:,} "
            "({lexdiv:0,.3f} lexical diversity).\n"
            "Corpus scan took {secs:0,.3f} seconds."
        ).format(**self.describe(fileids, categories))

# Fundamentals of Conversation

## Dialog: A Brief Exchange

In [17]:
class Dialog(abc.ABC):
    """
    A dialog listens for utterances, parses and interprets them, then updates
    its internal state. It can then formulate a response on demand.
    """
    def listen(self, text, need_response=True, **kwargs):
        """
        A text utterance is passed in and parsed. It is then passed to the
        interpret method to determine how to respond. If a response is
        requested, the respond method is used to generate a text response
        based on the most recent input and the current Dialog state.
        """
        # Parse the input
        sents = self.parse(text)
        
        # Interpret the input
        sents, confidence, kwargs = self.interpret(sents, **kwargs)
        
        # Determine the response
        response = (self.respond(sents, confidence, **kwargs)
                    if need_response else None)
        
        # Return initiative
        return response, confidence
    
    @abc.abstractmethod
    def parse(self, text):
        """
        Every dialog may need its own parsing strategy, some dialogs may need
        dependency vs. constituency parses, others may simply require regular
        expressions or chunkers.
        """
        return []
    
    @abc.abstractmethod
    def interpret(self, sents, **kwargs):
        """
        Interprets the utterance passed in as a list of parsed sentences,
        updates the internal state of the dialog, computes a confidence of the
        interpretation. May also return arguments specific to the response
        mechanism.
        """
        return sents, 0.0, kwargs
    
    @abc.abstractmethod
    def respond(self, sents, confidence, **kwargs):
        """
        Creates a response given the input utterances and the current state of
        the dialog, along with any arguments passed in from the listen or the
        interpret methods.
        """
        return None

## Maintaining a Conversation

In [18]:
class SimpleConversation(Dialog, collections.abc.Sequence):
    """
    This is the most simple version of a conversation.
    """
    
    def __init__(self, dialogs):
        self._dialogs = dialogs
    
    def __getitem__(self, idx):
        return self._dialogs[idx]

    def __len__(self):
        return len(self._dialogs)
    
    def listen(self, text, need_response=True, **kwargs):
        """
        Simply return the best confidence response
        """        
        responses = [dialog.listen(text, need_response, **kwargs)
                     for dialog in self._dialogs]
        
        # Responses is a list of (response, confidence) pairs
        return max(responses, key=operator.itemgetter(1))
    
    def parse(self, text):
        """
        Returns parses for all internal dialogs for debugging
        """
        return [dialog.parse(text)
                for dialog in self._dialogs]
    
    def interpret(self, sents, **kwargs):
        """
        Returns interpretations for all internal dialogs for debugging
        """
        return [dialog.interpret(sents, **kwargs)
                for dialog in self._dialogs]

    def respond(self, sents, confidence, **kwargs):
        """
        Returns responses for all internal dialogs for debugging
        """
        return [dialog.respond(sents, confidence, **kwargs)
                for dialog in self._dialogs]

# Rules for Polite Conversation

## Greetings and Salutations

In [19]:
class Greeting(Dialog):
    """
    Keeps track of the participants entering or leaving the conversation and
    responds with appropriate salutations. This is an example of a rules based
    system that keeps track of state and uses regular expressions and logic to
    handle the dialog.
    """
    
    PATTERNS = {
        'greeting': r'hello|hi|hey|good morning|good evening',
        'introduction': r'my name is ([a-z\-\s]+)',
        'goodbye': r'goodbye|bye|ttyl',
        'rollcall': r'roll call|who\'s here?'
    }
    
    def __init__(self, participants=None):
        # Participants is a map of user name to real name
        self.participants = {}
        
        if participants is not None:
            for participant in participants:
                self.participants[participant] = None
        
        # Compile regular expressions
        self._patterns = {
            key: re.compile(pattern, re.I)
            for key, pattern in self.PATTERNS.items()
        }
    
    def parse(self, text):
        """
        Applies all regular expressions to the text to find matches.
        """
        return {
            key: match
            for key, pattern in self._patterns.items()
            if (match := pattern.search(text))
            and match is not None
        }
    
    def interpret(self, sents, **kwargs):
        """
        Takes in parsed matches and determines if the message is an enter,
        exit, or name change.
        """
        # Can't do anything with no matches
        if len(sents) == 0:
            return sents, 0.0, kwargs

        # Get username from the participants
        user = kwargs.get('user', None)
        
        # Determine if an introduction has been made
        if 'introduction' in sents:
            # Get the name from the utterance
            name = sents['introduction'].groups()[0]
            user = user or name.lower()

            # Determine if name has changed
            if (user not in self.participants
                or self.participants[user] != name):
                kwargs['name_changed'] = True
            
            # Update the participants
            self.participants[user] = name
            kwargs['user'] = user
            
        # Determine if a greeting has been made
        if 'greeting' in sents:
            # If we don't have a name for the user
            if user not in self.participants:
                kwargs['request_introduction'] = True

        # Determine if goodbye has been made
        if 'goodbye' in sents and user is not None:
            # Remove participant
            self.participants.pop(user)
            kwargs.pop('user', None)
        
        # If we've seen anything we're looking for, we're pretty confident
        return sents, 1.0, kwargs

    def respond(self, sents, confidence, **kwargs):
        """
        Gives a greeting or a goodbye depending on what's appropriate.
        """
        if confidence == 0:
            return None
        
        name = self.participants.get(kwargs.get('user', None), None)
        name_changed = kwargs.get('name_changed', False)
        request_introduction = kwargs.get('request_introduction', False)
        
        if 'greeting' in sents or 'introduction' in sents:
            if request_introduction:
                return "Hello, what is your name?"
            else:
                return "Hello, {}!".format(name)
            
        if 'goodbye' in sents:
            return "Talk to you later!"
        
        if 'rollcall' in sents:
            people = list(self.participants.values())
            
            if len(people) > 1:
                roster = ", ".join(people[:-1])
                roster += " and {}.".format(people[-1])
                return "Currently in the conversation are " + roster
            elif len(people) == 1:
                return "It's just you and me right now, {}.".format(name)
            else:
                return "So lonely in here by myself ... wait who is that?"

        raise Exception(
            "expected response to be returned, but could not find rule"
        )

In [20]:
dialog = Greeting()
# `listen` returns (response, confidence) tuples; just print the response
print(dialog.listen("Hello!", user="jakevp321")[0])
print(dialog.listen("my name is Jake", user="jakevp321")[0])
print(dialog.listen("Roll call!", user="jakevp321")[0])
print(dialog.listen("Have to go, goodbye!", user="jakevp321")[0])

Hello, what is your name?
Hello, Jake!
It's just you and me right now, Jake.
Talk to you later!


In [21]:
dialog = Greeting()
print(dialog.listen("hey", user="jillmonger")[0])
print(dialog.listen("my name is Jill.", user="jillmonger")[0])
print(dialog.listen("who's here?")[0])

Hello, what is your name?
Hello, Jill!
It's just you and me right now, None.


## Handling Miscommunication

In [22]:
%%ipytest -vv

class TestBaseClasses(object):
    """
    Tests for the Dialog class
    """
    
    @pytest.mark.parametrize("text", [
        "Gobbledeguk", "Gibberish", "Wingdings"
    ])
    def test_dialog_abc(self, text):
        """
        Test the Dialog ABC and the listen method
        """
        class SampleDialog(Dialog):
            
            def parse(self, text):
                return []

            def interpret(self, sents):
                return sents, 0.0, {}
            
            def respond(self, sents, confidence):
                return None

        sample = SampleDialog()
        reply, confidence = sample.listen(text)
        assert confidence == 0.0
        assert reply is None

platform linux -- Python 3.8.11, pytest-7.4.0, pluggy-1.2.0 -- /home/python/.local/share/virtualenvs/project-onCnT2CZ/bin/python
cachedir: .pytest_cache
rootdir: /home/python/project/source
plugins: anyio-3.7.1
[1mcollecting ... [0mcollected 3 items

t_ec61538c48d54aa1bbeb5f53a73d723c.py::TestBaseClasses::test_dialog_abc[Gobbledeguk] [32mPASSED[0m[32m  [ 33%][0m
t_ec61538c48d54aa1bbeb5f53a73d723c.py::TestBaseClasses::test_dialog_abc[Gibberish] [32mPASSED[0m[32m    [ 66%][0m
t_ec61538c48d54aa1bbeb5f53a73d723c.py::TestBaseClasses::test_dialog_abc[Wingdings] [32mPASSED[0m[32m    [100%][0m



In [23]:
%%ipytest -vv

class TestGreetingDialog(object):
    """
    Test expected input and responses for the Greeting dialog
    """

    @pytest.mark.parametrize("text", ["Hello!", "hello", 'hey', 'hi'])
    @pytest.mark.parametrize("user", [ "jay", None], ids=["w/ user", "w/o user"])
    def test_greeting_intro(self, user, text):
        """
        Test that an initial greeting requests an introduction
        """
        g = Greeting()
        reply, confidence = g.listen(text, user=user)
        assert confidence == 1.0
        assert reply is not None
        assert reply == "Hello, what is your name?"

platform linux -- Python 3.8.11, pytest-7.4.0, pluggy-1.2.0 -- /home/python/.local/share/virtualenvs/project-onCnT2CZ/bin/python
cachedir: .pytest_cache
rootdir: /home/python/project/source
plugins: anyio-3.7.1
[1mcollecting ... [0mcollected 8 items

t_ec61538c48d54aa1bbeb5f53a73d723c.py::TestGreetingDialog::test_greeting_intro[w/ user-Hello!] [32mPASSED[0m[32m [ 12%][0m
t_ec61538c48d54aa1bbeb5f53a73d723c.py::TestGreetingDialog::test_greeting_intro[w/ user-hello] [32mPASSED[0m[32m [ 25%][0m
t_ec61538c48d54aa1bbeb5f53a73d723c.py::TestGreetingDialog::test_greeting_intro[w/ user-hey] [32mPASSED[0m[32m [ 37%][0m
t_ec61538c48d54aa1bbeb5f53a73d723c.py::TestGreetingDialog::test_greeting_intro[w/ user-hi] [32mPASSED[0m[32m [ 50%][0m
t_ec61538c48d54aa1bbeb5f53a73d723c.py::TestGreetingDialog::test_greeting_intro[w/o user-Hello!] [32mPASSED[0m[32m [ 62%][0m
t_ec61538c48d54aa1bbeb5f53a73d723c.py::TestGreetingDialog::test_greeting_intro[w/o user-hello] [32mPASSED[0m[32m [ 7

In [24]:
%%ipytest -vv

class TestGreetingDialog(object):
    """
    Test expected input and responses for the Greeting dialog
    """

    @pytest.mark.xfail(reason="a case that must be handled")
    @pytest.mark.parametrize("text", ["My name is Jake", "Hello, I'm Jake."])
    @pytest.mark.parametrize("user", ["jkm", None], ids=["w/ user", "w/o user"])
    def test_initial_intro(self, user, text):
        """
        Test an initial introduction without greeting
        """
        g = Greeting()
        reply, confidence = g.listen(text, user=user)
        assert confidence == 1.0
        assert reply is not None
        assert reply == "Hello, Jake!"

        if user is None:
            user = 'jake'

        assert user in g.participants
        assert g.participants[user] == 'Jake'

platform linux -- Python 3.8.11, pytest-7.4.0, pluggy-1.2.0 -- /home/python/.local/share/virtualenvs/project-onCnT2CZ/bin/python
cachedir: .pytest_cache
rootdir: /home/python/project/source
plugins: anyio-3.7.1
[1mcollecting ... [0mcollected 4 items

t_ec61538c48d54aa1bbeb5f53a73d723c.py::TestGreetingDialog::test_initial_intro[w/ user-My name is Jake] [33mXPASS[0m[33m [ 25%][0m
t_ec61538c48d54aa1bbeb5f53a73d723c.py::TestGreetingDialog::test_initial_intro[w/ user-Hello, I'm Jake.] [33mXFAIL[0m[33m [ 50%][0m
t_ec61538c48d54aa1bbeb5f53a73d723c.py::TestGreetingDialog::test_initial_intro[w/o user-My name is Jake] [33mXPASS[0m[33m [ 75%][0m
t_ec61538c48d54aa1bbeb5f53a73d723c.py::TestGreetingDialog::test_initial_intro[w/o user-Hello, I'm Jake.] [33mXFAIL[0m[33m [100%][0m



# Entertaining Questions

## Dependency Parsing

In [25]:
spacy_nlp = spacy.load(SPACY_DATA / 'en_core_web_sm')

In [26]:
def plot_displacy_tree(sent):
    doc = spacy_nlp(sent)
    spacy.displacy.render(doc, style='dep')

In [27]:
plot_displacy_tree('How many teaspoons are in a tablespoon?')

## Constituency Parsing

In [28]:
def spacy_tree(sent):
    """
    Get the SpaCy dependency tree structure
    :param sent: string
    :return: None
    """
    doc = spacy_nlp(sent)
    pprint.pprint(doc.to_json())

In [29]:
spacy_tree('How many teaspoons are in a tablespoon?')

{'ents': [],
 'sents': [{'end': 39, 'start': 0}],
 'text': 'How many teaspoons are in a tablespoon?',
 'tokens': [{'dep': 'advmod',
             'end': 3,
             'head': 1,
             'id': 0,
             'lemma': 'how',
             'morph': '',
             'pos': 'SCONJ',
             'start': 0,
             'tag': 'WRB'},
            {'dep': 'amod',
             'end': 8,
             'head': 2,
             'id': 1,
             'lemma': 'many',
             'morph': 'Degree=Pos',
             'pos': 'ADJ',
             'start': 4,
             'tag': 'JJ'},
            {'dep': 'nsubj',
             'end': 18,
             'head': 3,
             'id': 2,
             'lemma': 'teaspoon',
             'morph': 'Number=Plur',
             'pos': 'NOUN',
             'start': 9,
             'tag': 'NNS'},
            {'dep': 'ROOT',
             'end': 22,
             'head': 3,
             'id': 3,
             'lemma': 'be',
             'morph': 'Mood=Ind|Tense=Pres|

In [30]:
def nltk_spacy_tree(sent):
    """
    Visually inspect the SpaCy dependency tree with nltk.tree
    :param sent: string
    :return: None
    """
    doc = spacy_nlp(sent)
    
    def token_format(token):
        return "_".join([token.orth_, token.tag_, token.dep_])

    def to_nltk_tree(node):
        if node.n_lefts + node.n_rights > 0:
            return nltk.Tree(token_format(node),
                             [to_nltk_tree(child)
                              for child in node.children])
        else:
            return token_format(node)

    return [to_nltk_tree(sent.root) for sent in doc.sents][0] 

In [31]:
tree = nltk_spacy_tree('How many teaspoons are in a tablespoon?')
tree.draw()

## Question Detection

In [32]:
def question_type(sent):
    """
    Try to identify whether the question is about measurements,
    recipes, or not a question.
    :param sent: string
    :return: str response type
    """
    doc = spacy_nlp(sent)
    
    noun_tags = {'NN', 'NNS', 'NNP', 'NNPS'}
    nouns = [token.orth_
             for sent in doc.sents
             for token in sent
             if token.tag_ in noun_tags]
    for sent in doc.sents:
        for token in sent:
            # Find wh-adjective and wh-adverb phrases
            if token.tag_ == 'WRB':
                if token.nbor().tag_ == 'JJ':
                    return ("quantity", nouns)
            # Find wh-noun phrases
            elif token.tag_ == 'WP':
                # Use pre-trained clusters to return recipes
                return ("recipe", nouns)
    # Todo: try to be conversational using our n-gram language generator?
    return ("default", nouns)

In [33]:
question_type('How many teaspoons are in a tablespoon?')

('quantity', ['teaspoons', 'tablespoon'])

## From Tablespoons to Grams

In [34]:
class Converter(Dialog):
    """
    Answers questions about converting units
    """
    
    def __init__(self, conversion_path=CONVERSION_PATH):
        with open(conversion_path, 'r') as f:
            self.metrics = json.load(f)
        self.inflect = inflect.engine()
        self.stemmer = nltk.SnowballStemmer('english')
        self.parser = spacy.load(SPACY_DATA / 'en_core_web_sm')
    
    def parse(self, text):
        parse = self.parser(text)
        return parse
    
    def interpret(self, sents, **kwargs):
        measures = []
        confidence = 0
        results = {}
        # Make sure there are wh-adverb phrases
        if 'WRB' in [token.tag_
                     for sent in sents.sents
                     for token in sent]:
            # If so, increment confidence & traverse sents
            confidence += .2
            for sent in sents.sents:
                for token in sent:
                    # Store nouns as target measures
                    if token.tag_ in ['NN', 'NNS']:
                        measures.append(token.orth_)
                    # Store numbers as target quantities
                    elif token.tag_ in ['CD']:
                        results['quantity'] = token.orth_
        
            # If both source and destination measures are provided...
            if len(measures) == 2:
                confidence += .4
                # Stem source and dest to remove pluralization
                results['dst'], results['src'] = (
                    tuple(map(self.stemmer.stem, measures))
                )
                
                # Check to see if they correspond to our lookup table
                if results['src'] in self.metrics:
                    confidence += .2
                    if results['dst'] in self.metrics[results['src']]:
                        confidence += .2
                        
        return results, confidence, kwargs
    
    def convert(self, src, dst, quantity=1.0):
        """
        Converts from the source unit to the dest unit for the given quantity
        of the source unit.
        """
        # Check that we can convert
        if dst not in self.metrics:
            raise KeyError(f"cannot convert to '{dst}' units")
        if src not in self.metrics[dst]:
            raise KeyError(f"cannot convert from '{src}' to '{dst}'")
        
        return self.metrics[dst][src] * float(quantity), src, dst
    
    def round(self, num):
        num = round(float(num), 4)
        return int(num) if num.is_integer() else num
    
    def pluralize(self, noun, num):
        return self.inflect.plural_noun(noun, num)
    
    def numericalize(self, amt):
        if 1e2 < amt < 1e6:
            return humanize.intcomma(int(amt))
        elif amt >= 1e6:
            return humanize.intword(int(amt))
        elif isinstance(amt, int) or amt.is_integer():
            return humanize.apnumber(int(amt))
        else:
            return humanize.fractional(amt)
    
    def respond(self, sents, confidence, **kwargs):
        """
        Response makes use of the humanize and inflect libraries to produce
        much more human understandable results.
        """
        if confidence < .5:
            return "I'm sorry, I don't know that one."
        
        try:
            quantity = sents.get('quantity', 1)
            amount, src, dst = self.convert(**sents)
            
            # Perform numeric rounding
            amount = self.round(amount)
            quantity = self.round(quantity)
            
            # Pluralize
            src = self.pluralize(src, quantity)
            dst = self.pluralize(dst, amount)
            verb = self.inflect.plural_verb('is', amount)
            
            # Numericalize
            quantity = self.numericalize(quantity)
            amount = self.numericalize(amount)
            
            return f'There {verb} {amount} {dst} in {quantity} {src}.'
        
        except KeyError as e:
            return "I'm sorry I {}".format(str(e))

In [35]:
dialog = Converter()
print(dialog.listen("How many cups are in a gallon?"))
print(dialog.listen("How many gallons are in 2 cups?"))
print(dialog.listen("How many tablespoons are in a cup?"))
print(dialog.listen("How many tablespoons are in 10 cups?"))
print(dialog.listen("How many tablespoons are in a teaspoon?"))

('There are 16 cups in one gallon.', 1.0)
('There are 1/8 gallons in two cups.', 1.0)
('There are 16 tablespoons in one cup.', 1.0)
('There are 160 tablespoons in 10 cups.', 1.0)
('There are 1/3 tablespoons in one teaspoon.', 1.0)


# Learning to Help

## Being Neighborly

## Offering Recommendations