# Imports

In [1]:
import inspect
import pathlib

In [2]:
import nltk
import spacy

In [3]:
from dialog import Dialog, SimpleConversation, Greeting
from test_dialog import TestDialogClass, TestGreetingClass
from parser import plot_displacy_tree, spacy_tree, nltk_spacy_tree, question_type
from converter import Converter
from reader import HTMLCorpusReader, HTMLPickledCorpusReader
from preprocessor import Preprocessor
from transformer import TextNormalizer
from recommender import KNNRecommender, RecipeRecommender

# Constants

In [4]:
# Directory layout: the project root under the user's home directory, its
# data folder, and the folder holding the spaCy model data.
PROJECT_DIR = pathlib.Path('~/project').expanduser()
DATA_DIR = PROJECT_DIR.joinpath('data')
SPACY_DATA = DATA_DIR.joinpath('spacy_data')

In [5]:
# JSON file later passed to Converter as its conversion_path argument.
CONVERSION_PATH = DATA_DIR.joinpath('conversions.json')

In [6]:
# Corpus directories: the full cooking corpus and its "mini" variant.
COOKING_CORPUS_ROOT = DATA_DIR.joinpath('cooking_corpus')
COOKING_CORPUS_MINI_ROOT = DATA_DIR.joinpath('cooking_corpus_mini')

In [7]:
# Pickled counterparts of the two corpus directories.
COOKING_CORPUS_PICKLED_ROOT = DATA_DIR.joinpath('cooking_corpus_pickled')
COOKING_CORPUS_MINI_PICKLED_ROOT = DATA_DIR.joinpath('cooking_corpus_mini_pickled')

In [8]:
# Pickle file location used for the recommender (see RecipeRecommender below).
RECOMMENDER_PATH = DATA_DIR.joinpath('recommender.pkl')

In [9]:
# Active corpus selection: the rest of the notebook reads these two names,
# which currently point at the mini corpus and its pickled form.
CORPUS_ROOT, CORPUS_PICKLED_ROOT = (
    COOKING_CORPUS_MINI_ROOT,
    COOKING_CORPUS_MINI_PICKLED_ROOT,
)

In [10]:
# Alternative selection: uncomment both lines below to point CORPUS_ROOT and
# CORPUS_PICKLED_ROOT at the full cooking corpus instead of the mini one.
# CORPUS_ROOT = COOKING_CORPUS_ROOT
# CORPUS_PICKLED_ROOT = COOKING_CORPUS_PICKLED_ROOT

# Classes

## HTMLCorpusReader

In [11]:
# Open the corpus under CORPUS_ROOT with the HTML reader and print the
# summary string returned by its describes() method.
corpus = HTMLCorpusReader(CORPUS_ROOT.as_posix())
print(corpus.describes())

HTML corpus contains 3 files in 1 categories.
Structured as:
    63 paragraphs (21.000 mean paragraphs per file)
    180 sentences (2.857 mean sentences per paragraph).
Word count of 3,000 with a vocabulary of 926 (3.240 lexical diversity).
Corpus scan took 0.056 seconds.


## Preprocessor

In [12]:
%%time
# Timed cell: rebuild the HTML reader, wrap it in a Preprocessor and run
# transform(). CORPUS_PICKLED_ROOT is passed as the second argument --
# presumably the target directory for the pickled documents; confirm in
# preprocessor.py.
corpus = HTMLCorpusReader(CORPUS_ROOT.as_posix())
preprocessor = Preprocessor(corpus, CORPUS_PICKLED_ROOT.as_posix())
docs = preprocessor.transform()

CPU times: user 107 ms, sys: 39.3 ms, total: 147 ms
Wall time: 262 ms


## HTMLPickledCorpusReader

In [13]:
# Open the corpus under CORPUS_PICKLED_ROOT with the pickled reader and print
# the same describes() summary for comparison with the HTML reader above.
corpus = HTMLPickledCorpusReader(CORPUS_PICKLED_ROOT.as_posix())
print(corpus.describes())

HTML corpus contains 3 files in 1 categories.
Structured as:
    63 paragraphs (21.000 mean paragraphs per file)
    180 sentences (2.857 mean sentences per paragraph).
Word count of 3,000 with a vocabulary of 926 (3.240 lexical diversity).
Corpus scan took 0.003 seconds.


## TextNormalizer

In [14]:
# Fresh pickled-corpus reader plus a default-configured TextNormalizer; both
# are consumed by the next cell.
corpus = HTMLPickledCorpusReader(CORPUS_PICKLED_ROOT.as_posix())
normalizer = TextNormalizer()

In [15]:
# Count the items produced by normalizing every document in the corpus.
sum(1 for _ in normalizer.fit_transform(corpus.docs()))

3

# Fundamentals of Conversation

## Dialog: A Brief Exchange

In [16]:
# Show the source code of the Dialog class imported from dialog.py.
print(inspect.getsource(Dialog))

class Dialog(abc.ABC):
    """
    A dialog listens for utterances, parses and interprets them, then updates
    its internal state. It can then formulate a response on demand.
    """
    
    def listen(self, text, need_response=True, **kwargs):
        """
        A text utterance is passed in and parsed. It is then passed to the
        interpret method to determine how to respond. If a response is
        requested, the respond method is used to generate a text response
        based on the most recent input and the current Dialog state.
        """
        # Parse the input
        sents = self.parse(text)
        
        # Interpret the input
        sents, confidence, kwargs = self.interpret(sents, **kwargs)
        
        # Determine the response
        response = (self.respond(sents, confidence, **kwargs)
                    if need_response else None)
        
        # Return initiative
        return response, confidence
    
    @abc.abstractmethod
    def parse(self,

## Maintaining a Conversation

In [17]:
# Show the source code of the SimpleConversation class imported from dialog.py.
print(inspect.getsource(SimpleConversation))

class SimpleConversation(Dialog, collections.abc.Sequence):
    """
    This is the most simple version of a conversation.
    """
    
    def __init__(self, dialogs):
        self._dialogs = dialogs
    
    def __getitem__(self, idx):
        return self._dialogs[idx]

    def __len__(self):
        return len(self._dialogs)
    
    def listen(self, text, need_response=True, **kwargs):
        """
        Simply return the best confidence response
        """        
        responses = [dialog.listen(text, need_response, **kwargs)
                     for dialog in self._dialogs]
        
        # Responses is a list of (response, confidence) pairs
        return max(responses, key=operator.itemgetter(1))
    
    def parse(self, text):
        """
        Returns parses for all internal dialogs for debugging
        """
        return [dialog.parse(text)
                for dialog in self._dialogs]
    
    def interpret(self, sents, **kwargs):
        """
        Returns interp

# Rules for Polite Conversation

## Greetings and Salutations

In [18]:
# Feed a short scripted exchange from a single user to a fresh Greeting
# dialog, printing only the response part of each listen() result.
dialog = Greeting()
for utterance in (
    "Hello!",
    "my name is Jake",
    "Roll call!",
    "Have to go, goodbye!",
):
    print(dialog.listen(utterance, user="jakevp321")[0])

Hello, what is your name?
Hello, Jake!
It's just you and me right now, Jake.
Talk to you later!


In [19]:
# Second scripted exchange against a fresh Greeting dialog. Each entry pairs
# an utterance with the keyword arguments sent alongside it; note that the
# final utterance carries no `user` keyword.
dialog = Greeting()
for utterance, listen_kwargs in (
    ("hey", {"user": "jillmonger"}),
    ("my name is Jill.", {"user": "jillmonger"}),
    ("who's here?", {}),
):
    print(dialog.listen(utterance, **listen_kwargs)[0])

Hello, what is your name?
Hello, Jill!
It's just you and me right now, None.


## Handling Miscommunication

In [20]:
# Shell out to pytest: run only the tests in test_dialog.py whose names match
# 'TestDialogClass', with extra-verbose (-vv) reporting.
!pytest -vv -k 'TestDialogClass' test_dialog.py

platform linux -- Python 3.8.11, pytest-7.4.0, pluggy-1.3.0 -- /home/python/.local/share/virtualenvs/project-onCnT2CZ/bin/python
cachedir: .pytest_cache
rootdir: /home/python/project/source/chapter_10
plugins: anyio-4.2.0
collected 15 items / 12 deselected / 3 selected

test_dialog.py::TestDialogClass::test_dialog_abc[Gobbledeguk] PASSED     [ 33%]
test_dialog.py::TestDialogClass::test_dialog_abc[Gibberish] PASSED       [ 66%]
test_dialog.py::TestDialogClass::test_dialog_abc[Wingdings] PASSED       [100%]



In [21]:
# Shell out to pytest: run only the tests in test_dialog.py whose names match
# 'TestGreetingClass', with extra-verbose (-vv) reporting.
!pytest -vv -k 'TestGreetingClass' test_dialog.py

platform linux -- Python 3.8.11, pytest-7.4.0, pluggy-1.3.0 -- /home/python/.local/share/virtualenvs/project-onCnT2CZ/bin/python
cachedir: .pytest_cache
rootdir: /home/python/project/source/chapter_10
plugins: anyio-4.2.0
collected 15 items / 3 deselected / 12 selected

test_dialog.py::TestGreetingClass::test_greeting_intro[w/ user-Hello!] PASSED [  8%]
test_dialog.py::TestGreetingClass::test_greeting_intro[w/ user-hello] PASSED [ 16%]
test_dialog.py::TestGreetingClass::test_greeting_intro[w/ user-hey] PASSED [ 25%]
test_dialog.py::TestGreetingClass::test_greeting_intro[w/ user-hi] PASSED [ 33%]
test_dialog.py::TestGreetingClass::test_greeting_intro[w/o user-Hello!] PASSED [ 41%]
test_dialog.py::TestGreetingClass::test_greeting_intro[w/o user-hello] PASSED [ 50%]
test_dialog.py::TestGreetingClass::test_greeting_intro[w/o user-hey] [32mPASSED[

# Entertaining Questions

## Dependency Parsing

In [22]:
# Load the spaCy pipeline stored under SPACY_DATA; the parsing cells below
# all share this `model`.
model = spacy.load(SPACY_DATA.joinpath('en_core_web_sm'))

In [23]:
# Plot the parse of a sample question with the project's displaCy helper.
# Kept as the cell's last expression so any returned value is still displayed.
plot_displacy_tree(model, 'How many teaspoons are in a tablespoon?')

## Constituency Parsing

In [24]:
# Display the structure spacy_tree() returns for the sample question; per the
# recorded output it is a dict with 'ents', 'sents', 'text' and 'tokens' keys.
spacy_tree(model, 'How many teaspoons are in a tablespoon?')

{'ents': [],
 'sents': [{'end': 39, 'start': 0}],
 'text': 'How many teaspoons are in a tablespoon?',
 'tokens': [{'dep': 'advmod',
             'end': 3,
             'head': 1,
             'id': 0,
             'lemma': 'how',
             'morph': '',
             'pos': 'SCONJ',
             'start': 0,
             'tag': 'WRB'},
            {'dep': 'amod',
             'end': 8,
             'head': 2,
             'id': 1,
             'lemma': 'many',
             'morph': 'Degree=Pos',
             'pos': 'ADJ',
             'start': 4,
             'tag': 'JJ'},
            {'dep': 'nsubj',
             'end': 18,
             'head': 3,
             'id': 2,
             'lemma': 'teaspoon',
             'morph': 'Number=Plur',
             'pos': 'NOUN',
             'start': 9,
             'tag': 'NNS'},
            {'dep': 'ROOT',
             'end': 22,
             'head': 3,
             'id': 3,
             'lemma': 'be',
             'morph': 'Mood=Ind|Tense=Pres|

In [25]:
# Build a tree for the sample question with the project helper and draw it.
# NOTE(review): if `tree` is an nltk Tree, draw() opens a separate GUI window
# rather than rendering inline -- confirm this works where the notebook runs
# (e.g. a headless server).
tree = nltk_spacy_tree(model, 'How many teaspoons are in a tablespoon?')
tree.draw()

## Question Detection

In [26]:
# Classify the sample question; the recorded result is a
# (question type, list of extracted terms) tuple.
question_type(model, 'How many teaspoons are in a tablespoon?')

('quantity', ['teaspoons', 'tablespoon'])

## From Tablespoons to Grams

In [27]:
# Assemble the Converter dialog from its three collaborators: the conversions
# file, an English Snowball stemmer, and a spaCy pipeline loaded from disk.
snowball_stemmer = nltk.SnowballStemmer('english')
spacy_parser = spacy.load(SPACY_DATA / 'en_core_web_sm')
dialog = Converter(
    conversion_path=CONVERSION_PATH,
    stemmer=snowball_stemmer,
    parser=spacy_parser,
)

In [28]:
# Ask the Converter a series of unit-conversion questions and print each full
# listen() result, i.e. the (response, confidence) pair.
for question in (
    "How many cups are in a gallon?",
    "How many gallons are in 2 cups?",
    "How many tablespoons are in a cup?",
    "How many tablespoons are in 10 cups?",
    "How many tablespoons are in a teaspoon?",
):
    print(dialog.listen(question))

('There are 16 cups in one gallon.', 1.0)
('There are 1/8 gallons in two cups.', 1.0)
('There are 16 tablespoons in one cup.', 1.0)
('There are 160 tablespoons in 10 cups.', 1.0)
('There are 1/3 tablespoons in one teaspoon.', 1.0)


# Learning to Help

## Being Neighborly

In [29]:
# Fresh reader over the pickled corpus for the recommender cells that follow.
corpus = HTMLPickledCorpusReader(CORPUS_PICKLED_ROOT.as_posix())

In [30]:
%%time
# Timed cell: fit a default KNNRecommender on every document in the corpus.
recommender = KNNRecommender()
recommender.fit(corpus.docs())
# Persisting the fitted recommender is currently disabled.
# NOTE(review): RECOMMENDER_PATH is later handed to RecipeRecommender as
# recommender_path -- confirm that file already exists when this stays
# commented out.
# recommender.save(RECOMMENDER_PATH)

CPU times: user 25.4 ms, sys: 3.88 ms, total: 29.3 ms
Wall time: 18.1 ms


In [31]:
%%time
# Timed cell: transform the same corpus with the fitted recommender. The
# recorded output is an integer array with one row per document -- presumably
# neighbour indices; confirm in recommender.py.
recommender.transform(corpus.docs())

CPU times: user 262 ms, sys: 13 ms, total: 275 ms
Wall time: 52.3 ms


array([[0, 1, 2],
       [1, 0, 2],
       [2, 0, 1]])

## Offering Recommendations

In [32]:
# Fresh reader over the pickled corpus; its titles() feed RecipeRecommender.
corpus = HTMLPickledCorpusReader(CORPUS_PICKLED_ROOT.as_posix())

In [33]:
# Materialize the corpus titles, then build the recipe-recommending dialog
# from them together with the recommender file path.
recipe_titles = [*corpus.titles()]
dialog = RecipeRecommender(recipes=recipe_titles, recommender_path=RECOMMENDER_PATH)

In [34]:
# Example ingredient query against the RecipeRecommender dialog; currently
# disabled. Uncomment to print only the response part of the result.
# print(dialog.listen('What can I make with brie, tomatoes, capers, and pancetta?')[0])