In [1]:
import spacy
# Load the spaCy 'en_core_web_lg' pipeline once; later cells reuse it through `nlp`.
nlp = spacy.load('en_core_web_lg')


In [2]:
# Map each line ID (first field of a record) to its dialog text (fifth field).
movie_lines = {}
# Use a context manager so the file handle is closed as soon as the loop ends
# (or if an exception interrupts it) instead of lingering in the kernel.
with open("./cornell movie-dialogs corpus/movie_lines.txt",
          encoding="latin1") as movie_lines_file:
    for line in movie_lines_file:
        line = line.strip()
        parts = line.split(" +++$+++ ")
        # Any record that does not split into exactly five fields is stored
        # with empty text, so every line ID still has an entry.
        movie_lines[parts[0]] = parts[4] if len(parts) == 5 else ""

In [3]:
import json
# Map each line ID to the ID of the line that immediately follows it within
# the same conversation.
responses = {}
# Use a context manager so the file handle is closed as soon as the loop ends
# (or if an exception interrupts it) instead of lingering in the kernel.
with open("./cornell movie-dialogs corpus/movie_conversations.txt",
          encoding="latin1") as conversations_file:
    for line in conversations_file:
        line = line.strip()
        parts = line.split(" +++$+++ ")
        # The fourth field is a list of line IDs written with single quotes;
        # swapping in double quotes lets json.loads parse it.
        line_ids = json.loads(parts[3].replace("'", '"'))
        # Pair every line with its successor: (ids[0], ids[1]), (ids[1], ids[2]), ...
        for first, second in zip(line_ids[:-1], line_ids[1:]):
            responses[first] = second

In [4]:
import random
# Print five randomly chosen (turn, response) pairs as a sanity check.
# random.sample() requires a sequence: passing the dict view directly raises
# TypeError on Python 3.11+, so materialize it as a list first (the later
# sampling cells in this notebook already do the same).
for first_id, second_id in random.sample(list(responses.items()), 5):
    print("A:", movie_lines[first_id])
    print("B:", movie_lines[second_id])
    print()

A: Don't kid yourself -- she means it.
B: Are we going to have the same quarrel over again?

A: No, it's more like a leave of-  Ha, ha, ha. Yeah, they fired me.
B: You don't seem to upset?

A: Length, 22 feet, 8 inches.
B: Come on, let's check the bite radius.

A: It's too...
B: You get a bad month, all of a...

A: I have been trying to reach Wilfred with no success.  As one grows older, one yearns for family.
B: It's good to see you again, Uncle. I've missed you.



## Making a sentence vector
To make the sentence vector for each line of dialog, we're going to use spaCy. The function `sentence_mean` below takes the spaCy object that we loaded earlier (`nlp`) and uses it to tokenize the string that you pass into the function (i.e., break it up into words). It then uses numpy's `mean()` function to average the word vectors of those tokens, producing a single new vector. The shape of the resulting vector (i.e., the number of dimensions) should be the same as the shape of the individual word vectors.

(Note: I disabled the tagger and parser parts of spaCy's pipeline to improve performance. We're not using part of speech tags or dependency relations in this chatbot, so there's no reason to spend time calculating them.)

In [5]:
import numpy as np
def sentence_mean(nlp, s):
    """Return the element-wise mean of the token vectors of `s`.

    Parameters:
        nlp: callable that tokenizes a string and yields objects exposing a
            `.vector` attribute; it is invoked with
            `disable=['tagger', 'parser']`.
        s: the text to summarize. An empty string is replaced by a single
            space before processing (presumably so the pipeline still yields
            a token to average -- confirm against the spaCy version in use).

    Returns:
        The mean over axis 0 of the stacked token vectors.
    """
    text = " " if s == "" else s
    tokens = nlp(text, disable=['tagger', 'parser'])
    token_vectors = np.array([token.vector for token in tokens])
    return token_vectors.mean(axis=0)
# Sanity check: display the shape of the averaged vector for a sample sentence.
sentence_mean(nlp, "This... is a test.").shape

(300,)

In [6]:
from simpleneighbors import SimpleNeighbors

# Build a nearest-neighbour index over the sentence vectors of 10,000
# randomly sampled lines that have a recorded response.
nns = SimpleNeighbors(300)
sampled_line_ids = random.sample(list(responses.keys()), 10000)
for i, line_id in enumerate(sampled_line_ids):
    line_text = movie_lines[line_id]
    # show progress
    if i % 1000 == 0:
        print(i, line_id, line_text)
    summary_vector = sentence_mean(nlp, line_text)
    # Vectors for which np.any() is False (all zeros) are left out of the index.
    if not np.any(summary_vector):
        continue
    nns.add_one(line_id, summary_vector)
nns.build()

0 L378927 You'll get it. But, Joe, I want you to do something for me. Let me take a look around inside. Make sure everybody's okay.
1000 L403439 What can I do for you?
2000 L34539 Are you gonna do what I think you're gonna do?
3000 L154113 You were gonna run through the law enforcement databases for a match on identifying marks.
4000 L477431 I am not an American, Mr. Russ.  My daughter is not an American.
5000 L523674 But you'll be away from home for days -- what will you do when the dark comes, where will you sleep? I can't send you away from here like that!
6000 L635070 You told them cowboys you had it.
7000 L553184 Ah!
8000 L464216 He wants to see your map.
9000 L649283 Go back, MICHAEL... Go Back


In [7]:

sentence = "I like making bots."
# Look up the indexed lines closest to the input's sentence vector and keep
# the first one returned.
query_vector = sentence_mean(nlp, sentence)
nearest_ids = nns.nearest(query_vector, 5)
picked = nearest_ids[0]
response_line_id = responses[picked]

print("Your line:\n\t", sentence)
similar_turn = movie_lines[picked]
print("Most similar turn:\n\t", similar_turn)
reply = movie_lines[response_line_id]
print("Response to most similar turn:\n\t", reply)


Your line:
	 I like making bots.
Most similar turn:
	 I know. Stop it.
Response to most similar turn:
	 Pumpkin, you're dating a tumbling, tumbling dickweed.


In [9]:
from semanticsimilaritychatbot import SemanticSimilarityChatbot
# Create the chatbot around the already-loaded spaCy pipeline.
# NOTE(review): 300 is presumably the vector dimensionality (it matches the
# (300,) sentence-vector shape shown earlier) -- confirm against the library.
chatbot = SemanticSimilarityChatbot(nlp, 300)

In [10]:
# Feed a random sample of (turn, response) text pairs to the chatbot, then
# build it.
sample_n = 10000
sampled_pairs = random.sample(list(responses.items()), sample_n)
for first_id, second_id in sampled_pairs:
    turn_text = movie_lines[first_id]
    reply_text = movie_lines[second_id]
    chatbot.add_pair(turn_text, reply_text)
chatbot.build()

In [11]:
# Ask the chatbot for a reply to one sample input and print it.
print(chatbot.response_for("Hello computer!"))

We caught 'em.


In [12]:
my_turn = "The weather's nice today, don't you think?"
# Repeat the same query while passing 5, 10, ..., 50 as the second argument
# of response_for().
for pool_size in range(5, 51, 5):
    print("picking from", pool_size, "possible responses:")
    reply = chatbot.response_for(my_turn, pool_size)
    print(reply)
    print()

picking from 5 possible responses:
I've got her.

picking from 10 possible responses:
Sure, I think it's swell.

picking from 15 possible responses:
It's called Tron. It's a security program itself, actually. Monitors all the contacts between our system and other systems... If it finds anything going on that's not scheduled, it shuts it down. I sent you a memo on it.

picking from 20 possible responses:
...yes...

picking from 25 possible responses:
Right... Knickers... Cabbages... It doesn't have a beak. Alex laughs. Slide of woman speaking to boy.

picking from 30 possible responses:
It's late.

picking from 35 possible responses:
It's all a game, don't bother me.

picking from 40 possible responses:
"G.I. Jane."  And which one of you told me she wouldn't last a week? Huh?

picking from 45 possible responses:
Oh, oh, yes.  I-I-I-'d like to, uh, uh, very much.

picking from 50 possible responses:
You don't have to explain yourself to me, Deirdre.



In [13]:
# HTML/JS front end for the chatbot: a scrolling log <div>, a text input, and
# two transport helpers (colabGetResp via google.colab.kernel.invokeFunction,
# webGetResp via fetch on /response.json).
# The input's change handler calls a global `getResp`, which this string does
# NOT define: whoever serves the page must append a <script> that binds
# `getResp` to one of the two helpers.
chatbot_html = """
<style type="text/css">#log p { margin: 5px; font-family: sans-serif; }</style>
<div id="log"
     style="box-sizing: border-box;
            width: 600px;
            height: 32em;
            border: 1px grey solid;
            padding: 2px;
            overflow: scroll;">
</div>
<input type="text" id="typehere" placeholder="type here!"
       style="box-sizing: border-box;
              width: 600px;
              margin-top: 5px;">
<script>
function paraWithText(t) {
    let tn = document.createTextNode(t);
    let ptag = document.createElement('p');
    ptag.appendChild(tn);
    return ptag;
}
document.querySelector('#typehere').onchange = async function() {
    let inputField = document.querySelector('#typehere');
    let val = inputField.value;
    inputField.value = "";
    let resp = await getResp(val);
    let objDiv = document.getElementById("log");
    objDiv.appendChild(paraWithText('😀: ' + val));
    objDiv.appendChild(paraWithText('🤖: ' + resp));
    objDiv.scrollTop = objDiv.scrollHeight;
};
async function colabGetResp(val) {
    let resp = await google.colab.kernel.invokeFunction(
        'notebook.get_response', [val], {});
    return resp.data['application/json']['result'];
}
async function webGetResp(val) {
    let resp = await fetch("/response.json?sentence=" + 
        encodeURIComponent(val));
    let data = await resp.json();
    return data['result'];
}
</script>
"""

In [14]:

from flask import Flask, request, jsonify
app = Flask(__name__)


@app.route("/response.json")
def response():
    """Return the chatbot's reply to the `sentence` query parameter as JSON."""
    reply = chatbot.response_for(request.args['sentence'])
    return jsonify({'result': reply})


@app.route("/")
def home():
    """Serve the chat page with `getResp` bound to the fetch-based helper."""
    bind_script = "<script>let getResp = webGetResp;</script>"
    return chatbot_html + bind_script


# Start Flask's built-in server with its default settings.
app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [19/Dec/2018 12:41:47] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [19/Dec/2018 12:41:52] "[37mGET /response.json?sentence=hi HTTP/1.1[0m" 200 -
127.0.0.1 - - [19/Dec/2018 12:41:59] "[37mGET /response.json?sentence=oh%2C%20ok HTTP/1.1[0m" 200 -
127.0.0.1 - - [19/Dec/2018 12:42:06] "[37mGET /response.json?sentence=who%27s%20rob%3F HTTP/1.1[0m" 200 -
127.0.0.1 - - [19/Dec/2018 12:42:21] "[37mGET /response.json?sentence=cool HTTP/1.1[0m" 200 -
127.0.0.1 - - [19/Dec/2018 12:42:26] "[37mGET /response.json?sentence=hey HTTP/1.1[0m" 200 -
127.0.0.1 - - [19/Dec/2018 12:44:13] "[37mGET /response.json?sentence=how%20can%20i%20help%20you%3F HTTP/1.1[0m" 200 -
