# Semantic Similarity Chatbot
Credits + source code: https://colab.research.google.com/drive/1XlmtcyMdPRQC6bw2HQYb3UPtVGKqUJ0a#scrollTo=8R0T0ei52FXS

In [1]:
# Install
# Each line below is a shell escape that installs one dependency into the notebook environment.
# NOTE(review): none of the packages are version-pinned, so a re-run may resolve different versions
# than the ones that produced the recorded output.
!pip install spacy; # Nlp for tokenizing text and word vector database
!python -m spacy download en_core_web_lg; # For word vectors
!pip install simpleneighbors; # Find nearest neighbors in database
!pip install https://github.com/aparrish/semanticsimilaritychatbot/archive/master.zip; # Make chatbot based on semantic similarity
!pip install IPython; # Make code interactive
!pip install flask # Make code interactive alternative  

Collecting en_core_web_lg==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9MB)
[K     |████████████████████████████████| 827.9MB 1.2MB/s 
Building wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.2.5-cp36-none-any.whl size=829180944 sha256=f1ff20ded6c7e4afd887983cf6364f391b5fe73dde118414744a8ca1b8b363cf
  Stored in directory: /tmp/pip-ephem-wheel-cache-1ci7nm2y/wheels/2a/c1/a6/fc7a877b1efca9bc6a089d6f506f16d3868408f9ff89f8dbfc
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')
Collecting simpleneighbors
  Downloading https://files.pythonhosted.org/packages/f9/10/9092e15d9aa4a9e5a26341612

In [0]:
# Imports
import spacy; # Nlp for tokenizing text and word vector database
import en_core_web_lg; # For word vectors
from simpleneighbors import SimpleNeighbors; # Find nearest neighbors in database
from semanticsimilaritychatbot import SemanticSimilarityChatbot; # Make chatbot based on semantic similarity
from google.colab import files, output; # Import Colab files
import IPython # Make code interactive
from flask import Flask, request, jsonify # Make code interactive alternative
import random;
import json;
import numpy as np;

# Import model
# Load the en_core_web_lg spaCy pipeline once; the resulting object is passed to
# sentence_mean() and to SemanticSimilarityChatbot in the cells below.
nlp_model = en_core_web_lg.load();

# Test word vector
# (left commented out: uncomment to inspect the vector stored for a single vocabulary entry)
# nlp_model.vocab["cheese"].vector;

## Movie Dialog Database

In [3]:
# Download and unzip Cornell Movie Dialogs
# curl: -L follows redirects, -O saves the archive under its remote file name in the working directory.
# unzip then extracts it into "./cornell movie-dialogs corpus/", the folder the parsing cells read from.
!curl -L -O http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip;
!unzip cornell_movie_dialogs_corpus.zip;

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 9684k  100 9684k    0     0  2104k      0  0:00:04  0:00:04 --:--:-- 2104k
Archive:  cornell_movie_dialogs_corpus.zip
   creating: cornell movie-dialogs corpus/
  inflating: cornell movie-dialogs corpus/.DS_Store  
   creating: __MACOSX/
   creating: __MACOSX/cornell movie-dialogs corpus/
  inflating: __MACOSX/cornell movie-dialogs corpus/._.DS_Store  
  inflating: cornell movie-dialogs corpus/chameleons.pdf  
  inflating: __MACOSX/cornell movie-dialogs corpus/._chameleons.pdf  
  inflating: cornell movie-dialogs corpus/movie_characters_metadata.txt  
  inflating: cornell movie-dialogs corpus/movie_conversations.txt  
  inflating: cornell movie-dialogs corpus/movie_lines.txt  
  inflating: cornell movie-dialogs corpus/movie_titles_metadata.txt  
  inflating: cornell movie-dialogs corpus/raw_script_urls.txt  
  inflating: corne

In [0]:
# Get movie lines only
# Maps each line ID (first field of a record) to the spoken text (fifth field).
movie_lines = {}
# Open the corpus inside `with` so the file handle is closed as soon as parsing finishes;
# a bare open() in the for-header is never closed explicitly.
with open("./cornell movie-dialogs corpus/movie_lines.txt", encoding="latin1") as lines_file:
  for line in lines_file:
    line = line.strip()
    parts = line.split(" +++$+++ ")
    # A record that does not split into exactly five fields has no usable text:
    # keep its ID but store an empty string so later lookups by ID still succeed.
    movie_lines[parts[0]] = parts[4] if len(parts) == 5 else ""

In [0]:
# Get lines for convo/responses
# Maps a line ID to the ID of the line that directly follows it in the same conversation.
responses = {}
# Open the corpus inside `with` so the file handle is closed as soon as parsing finishes;
# a bare open() in the for-header is never closed explicitly.
with open("./cornell movie-dialogs corpus/movie_conversations.txt", encoding="latin1") as conversations_file:
  for line in conversations_file:
    line = line.strip()
    parts = line.split(" +++$+++ ")
    # The fourth field is a bracketed list of line IDs written with single quotes;
    # swapping them for double quotes lets json.loads parse it as a JSON array.
    line_ids = json.loads(parts[3].replace("'", '"'))
    # Pair every line with its successor: (1st, 2nd), (2nd, 3rd), ...
    for first, second in zip(line_ids[:-1], line_ids[1:]):
      responses[first] = second

In [6]:
# Test if the pairs work
# Print five random (prompt, reply) pairs as a spot check of the two lookup tables.
# random.sample needs a sequence: passing the dict view directly raises TypeError on
# Python 3.11+, so materialise it as a list first (as the later sampling cells already do).
for prompt_id, reply_id in random.sample(list(responses.items()), 5):
  print("A:", movie_lines[prompt_id])
  print("B:", movie_lines[reply_id])
  print()

A: You a narc?
B: Do I look like a narc?

A: I don't know.  I don't know who I am.
B: Yeah, well, welcome to the club.

A: So, what is it you do, Mister...?
B: Cooper. Bobby Cooper. Oh you know, whatever pays best. Little bartending, used to teach tennis, played a little competition ... .

A: Don't tell me what to do!
B: Shhh!

A: Where's the name sheet?
B: Nobody's named this one yet.



## Word/Sentence Vectors

In [0]:
# Make sentence vector (tokenize the sentence, then average the per-token vectors)
def sentence_mean(nlp_model, s):
  """Return the element-wise mean of the token vectors of sentence `s`.

  nlp_model -- callable invoked as nlp_model(text, disable=[...]); it must return an
               iterable of tokens that each expose a `.vector`.
  s         -- the sentence text; an empty string is replaced by a single space
               before it is handed to the model.
  """
  text = " " if s == "" else s
  # The tagger and parser components are switched off for this call.
  tokens = nlp_model(text, disable=["tagger", "parser"])
  token_vectors = np.array([token.vector for token in tokens])
  return np.mean(token_vectors, axis=0)

# Smoke test: embed a one-word sentence and read the vector's shape.
# The trailing `;` keeps the notebook from echoing the value.
sentence_mean(nlp_model, "Test").shape;

In [8]:
# Build a 300-dimensional SimpleNeighbors index over the sentence vectors of
# 100000 randomly sampled prompt lines (lines that have a recorded reply).
sno = SimpleNeighbors(300)
for idx, line_id in enumerate(random.sample(list(responses.keys()), 100000)):
  line_text = movie_lines[line_id]
  # Show progress once every 1000 lines
  if idx % 1000 == 0:
    print(idx, line_id, line_text)
  summary_vector = sentence_mean(nlp_model, line_text)
  # Only vectors with at least one non-zero component are indexed; all-zero ones are skipped.
  if not np.any(summary_vector):
    continue
  sno.add_one(line_id, summary_vector)

sno.build();

0 L445249 Hello, there, Meadows![13]
1000 L179151 One night he took us editors out to celebrate after a deadline. Eventually Dave and I were left alone and we got to talking - not like teacher and student, but like two adults.
2000 L423409 You must be joking.
3000 L126009 We've no proof, of course, but we rather think so, yes.
4000 L362833 No.  No.
5000 L348684 Come on, man, I'm starting to cramp up here.  We have the chance right here, right now, I say we go!
6000 L198599 You're an hour and a half late.
7000 L319830 What do you mean 'We...?'
8000 L364255 How's it feel?
9000 L51583 The stiff one eye?
10000 L652498 You called the cops on us?
11000 L220006 What the hell does this have to do with insurance?
12000 L456839 Desperation has driven me past etiquette, all the way to frenzy.
13000 L63428 Don't shoot me, I'm just the piano players.
14000 L404448 I was devastated.
15000 L170964 Oh, Mom, it's so ugly.
16000 L165449 -- you wouldn't let me kill him when I had the chance --
17000 L478

In [9]:
# Test SNO
me = "Can you make me a sandwich?"
# Ask the index for the 5 nearest stored lines and keep the first one returned,
# then look up the line that followed it in the original conversation.
picked = sno.nearest(sentence_mean(nlp_model, me), 5)[0]
response_line_id = responses[picked]

print("Me: ", me)
print("Closest line: ", movie_lines[picked])
# Label spelling corrected (was "AI Reponse: ").
print("AI Response: ", movie_lines[response_line_id])

Me:  Can you make me a sandwich?
Closest line:  Would you like a sandwich?
AI Reponse:  No, thanks!  I want to get right to sleep.


## Chatbot

In [0]:
# Build chatbot
# Constructed with the spaCy model and the same dimension count (300) used for the index above.
chatbot = SemanticSimilarityChatbot(nlp_model, 300)

# Build database
# Feed a random sample of (prompt text, reply text) pairs to the chatbot, then build it.
sample_n = 100000
for prompt_id, reply_id in random.sample(list(responses.items()), sample_n):
  prompt_text = movie_lines[prompt_id]
  reply_text = movie_lines[reply_id]
  chatbot.add_pair(prompt_text, reply_text)
chatbot.build();

In [11]:
# Test chatbot
# Ask the freshly built chatbot for a reply to one prompt and print it.
print(chatbot.response_for("Can you make me a sandwich?"));

That's what I'm going to do.


In [12]:
# Test chatbot with more possible responses
# Ask the same prompt repeatedly, passing 5, 10, ..., 50 as the second argument of response_for.
my_turn = "I'm doing well"

for n_candidates in range(5, 51, 5):
  print("Picking from", n_candidates, "possible responses")
  reply = chatbot.response_for(my_turn, n_candidates)
  # Reply followed by one blank separator line
  print(reply, end="\n\n")

Picking from 5 possible responses
Look Sean, I don't care if you have a rapport with the boy-- I don't care if you have a few laughs-- even at my expense! But don't you dare undermine what I'm trying to do here.

Picking from 10 possible responses
It'll be fine.

Picking from 15 possible responses
Stand up.

Picking from 20 possible responses
--These circumstances are mitigated. Right now. They're mitigated.

Picking from 25 possible responses
Stand up.

Picking from 30 possible responses
What about tennis?  Riding? fixing up old cars?  Bartending?

Picking from 35 possible responses
You should have enough to do in this house ... Come here and listen to this.

Picking from 40 possible responses
And you think I will permit this, my friend?

Picking from 45 possible responses
We must discover it.  The reason the murderer chose these persons.

Picking from 50 possible responses
It'll be fine.



In [13]:
# Save chatbot
# NOTE(review): the prefix says "10k", but the build cell samples sample_n = 100000 pairs;
# the name looks stale -- confirm before renaming, since load() below must use the same prefix.
chatbot.save("movielines-10k-sample");

# Load chatbot
# Round-trip check: rebuild the chatbot from the files just written, reusing the loaded spaCy model.
chatbot = SemanticSimilarityChatbot.load("movielines-10k-sample", nlp_model);

# Test again
# Confirms the reloaded chatbot can still answer a prompt.
print(chatbot.response_for("I'm going outside"));

Whatya mean?  We're going!  Tran's gonna do her right there unless--


## Make Chatbot Interactive


In [0]:
# Make area and UI for chatbot
# HTML + JavaScript for the chat widget, shared by both front ends below:
#   * a scrolling #log div that receives one paragraph per message,
#   * a #typehere text input whose onchange handler clears the field, awaits getResp(val),
#     then appends the user's line and the bot's reply to the log and scrolls to the bottom,
#   * two transport helpers: colabGetResp (invokes the 'notebook.get_response' kernel callback)
#     and webGetResp (fetches /response.json with the text in the `sentence` query parameter).
# getResp itself is NOT defined here: whoever renders this string must append a <script>
# that assigns getResp to one of the two helpers.
chatbot_html = """
<style type="text/css">#log p { margin: 5px; font-family: sans-serif; }</style>
<div id="log"
     style="box-sizing: border-box;
            width: 600px;
            height: 32em;
            border: 1px grey solid;
            padding: 2px;
            overflow: scroll;">
</div>
<input type="text" id="typehere" placeholder="type here!"
       style="box-sizing: border-box;
              width: 600px;
              margin-top: 5px;">
<script>
function paraWithText(t) {
    let tn = document.createTextNode(t);
    let ptag = document.createElement('p');
    ptag.appendChild(tn);
    return ptag;
}
document.querySelector('#typehere').onchange = async function() {
    let inputField = document.querySelector('#typehere');
    let val = inputField.value;
    inputField.value = "";
    let resp = await getResp(val);
    let objDiv = document.getElementById("log");
    objDiv.appendChild(paraWithText('😀: ' + val));
    objDiv.appendChild(paraWithText('🤖: ' + resp));
    objDiv.scrollTop = objDiv.scrollHeight;
};
async function colabGetResp(val) {
    let resp = await google.colab.kernel.invokeFunction(
        'notebook.get_response', [val], {});
    return resp.data['application/json']['result'];
}
async function webGetResp(val) {
    let resp = await fetch("/response.json?sentence=" + 
        encodeURIComponent(val));
    let data = await resp.json();
    return data['result'];
}
</script>
"""

In [23]:
# Make display
# Render the chat widget in the cell output and route its getResp calls through the Colab kernel bridge.
display(IPython.display.HTML(chatbot_html + "<script>let getResp = colabGetResp;</script>"))

# Chatbot response
def get_response(person):
  """Return the chatbot's reply to `person` wrapped as an IPython JSON display object.

  The reply is stored under the "result" key, which is the key the widget's
  colabGetResp helper reads.
  """
  reply = chatbot.response_for(person)
  payload = {"result": reply}
  return IPython.display.JSON(payload)

# Output
# Expose get_response under the name the widget's JavaScript invokes.
output.register_callback("notebook.get_response", get_response);

## Alternative UI

In [16]:
# Alternative UI
# Serves the same chat widget from a local Flask app instead of the Colab kernel bridge.
app = Flask(__name__)

# Chatbot response
@app.route("/response.json")
def response():
  """Return the chatbot's reply to the `sentence` query parameter as JSON: {"result": <reply>}."""
  # The widget's webGetResp() requests "/response.json?sentence=...". Query keys are
  # case-sensitive, so the key must be lowercase "sentence": reading "Sentence" made
  # every request fail with 400 Bad Request.
  sentence = request.args["sentence"]
  return jsonify({"result": chatbot.response_for(sentence)})

# UI
@app.route("/")
def home():
  """Return the chat widget page, with getResp bound to the HTTP helper webGetResp."""
  return chatbot_html + "<script>let getResp = webGetResp;</script>"

# Blocks the cell while the development server is running.
app.run();

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
