# Static Intent based Grouping for Dialogues

# Outline

<!-- MarkdownTOC autolink=true autoanchor=true bracket=round -->

- [Preprocess Data](#part-one---Preprocess Data)
    - [Entity Substitution](#Extract the Entities for processing)
- [Feature Extraction](#part-two---Feature Extraction)
    - [Word Embeddings](#Use word embedding for representing the question-utternaces)
- [Intent Clustering](#part-three---Intent Clustering)
- [Visualization](#part-four---Visualization)

<!-- /MarkdownTOC -->

First make sure you have the right version of python and the libraries. To execute or "run" the code use "shift+enter")!

In [1]:
# First check the Python version
import sys
if sys.version_info < (3,4):
    print('You are running an older version of Python!\n\n' \
          'You should consider updating to Python 3.4.0 or ' \
          'higher as the libraries built for this course ' \
          'have only been tested in Python 3.4 and higher.\n')
    print('Try installing the Python 3.5 version of anaconda '
          'and then restart `jupyter notebook`:\n' \
          'https://www.continuum.io/downloads\n\n')

# Now get necessary libraries
try:
    import os
    import numpy as np
    import matplotlib.pyplot as plt
    from skimage.transform import resize
    from skimage import data
    from scipy.misc import imresize
    import IPython.display as ipyd
except ImportError:
    print('You are missing some packages! ' \
          'We will try installing them before continuing!')
    !pip install "numpy>=1.11.0" "matplotlib>=1.5.1" "scikit-image>=0.11.3" "scikit-learn>=0.17" "scipy>=0.17.0"
    import os
    import numpy as np
    import matplotlib.pyplot as plt
    from skimage.transform import resize
    from skimage import data
    from scipy.misc import imresize
    import IPython.display as ipyd
    print('Done!')

# Import Tensorflow
try:
    import tensorflow as tf
except ImportError:
    print("You do not have tensorflow installed!")
    print("Follow the instructions on the following link")
    print("to install tensorflow before continuing:")
    print("")
    print("https://www.tensorflow.org/get_started/os_setup")


# We'll tell matplotlib to inline any drawn figures like so:
%matplotlib inline
plt.style.use('ggplot')

You are running an older version of Python!

You should consider updating to Python 3.4.0 or higher as the libraries built for this course have only been tested in Python 3.4 and higher.

Try installing the Python 3.5 version of anaconda and then restart `jupyter notebook`:
https://www.continuum.io/downloads


You do not have tensorflow installed!
Follow the instructions on the following link
to install tensorflow before continuing:

https://www.tensorflow.org/get_started/os_setup


In [2]:
# Formatting to change the default inline code style:
from IPython.core.display import HTML
HTML("""<style> .rendered_html code { 
    padding: 2px 4px;
    color: #c7254e;
    background-color: #f9f2f4;
    border-radius: 4px;
} </style>""")

<h3><font color='blue'>Functions to do entity extraction related tasks</font></h3> 

In [3]:
def entity_extraction(utterance):
    """Extract entity from an utterance.

    Parameters
    ----------
    utterance : str
        Utterance containing an entity.

    Returns
    -------
    entity : str
        Extracted entity from the utterance.
    """
    
    # Use NLP toolkits to extract entity out of the utterance.
    
    return entity

In [4]:
# A Knowledge Base to represent the generic high level semantic concept for each entity
knowledge_base = {'roboy':'person',
                  'Roboy':'person',
                  'radio':'appliance',
                 'remote':'appliance',
                 'ac':'appliance',
                 'cooler':'appliance',
                 'fans':'appliance',
                 'music player':'appliance',
                 'air conditioner':'appliance',
                 'mp3':'appliance',
                }

In [5]:
def entity_substitution(utterance, entity, operation):
    """Manipulate the extracted entity in an utterance.

    Parameters
    ----------
    utterance : str
        Utterance to be processed.
    entity : str
        Entity in the utterance to be operated on.
    operation : str
        Type of operation to be performed with the entity - remove, substitute or same.
        remove     - Remove the entity from the utterance and return it.
        substitute - Substitute the entity from the utterance by generic high level term and return it.
        same       - Return the utterance as it is.

    Returns
    -------
    ent_subs_utterance : str
        Processed utterance by performing the opeartion on the entity in it.
    """
    
    if operation == 'same':
        ent_subs_utterance = utterance
    elif operation == 'remove':
        ent_subs_utterance = utterance.replace(entity, '')
    elif operation == 'substitute':
        # Query a Knowledge Base to substitute with the generic entity
        generic_entity = knowledge_base[entity]
        ent_subs_utterance = utterance.replace(entity, generic_entity) 
        
    
    return ent_subs_utterance
    

In [6]:
# A function to preprocess the dataset
def preprocess(ds):
    processed_utterance = []
    
#     Define the type of operation to be performed with the entity - remove, substitute or same.
#     operation = 'same'
    operation = 'remove'
    
    for utterance in ds:
        entity = entity_extraction(utterance)
        ent_subs_utterance = entity_substitution(utterance, entity, operation)
        processed_utterance.append(ent_subs_utterance)
        
    return processed_ds


In [7]:
ent_subs_utterance = entity_substitution('How old are you, Roboy?', 'Roboy', 'same')
print(ent_subs_utterance)
ent_subs_utterance = entity_substitution('How old are you, Roboy?', 'Roboy', 'remove')
print(ent_subs_utterance)
ent_subs_utterance = entity_substitution('How old are you, Roboy?', 'Roboy', 'substitute')
print(ent_subs_utterance)

How old are you, Roboy?
How old are you, ?
How old are you, person?


<h3><font color='purple'>Functions to extract features</font></h3>

In [8]:
def load_embeddings(glove_path):
    """Compute an index mapping words to known embeddings, 
       by parsing the data dump of pre-trained embeddings of 100 dimensions for each word.

    Parameters
    ----------
    glove_path : str
        Path where the GLOVE pretrained dump is kept.

    Returns
    -------
    embeddings_index : dictionary
        Embedding vector for each word.
    """
    
    embeddings_index = {}
    f = open(os.path.join(glove_path, 'glove.6B.200d.txt'))
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    return embeddings_index

In [10]:
glove_path = r'C:\Users\Vishal\tensorflow\IntentGrouping\DialogSystem\IntentGrouping\dataset\glove.6B'
print(glove_path)

C:\Users\Vishal\tensorflow\IntentGrouping\DialogSystem\IntentGrouping\dataset\glove.6B


In [11]:
embeddings_index = load_embeddings(glove_path)
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [15]:

text_samples = ['Can you please tell me how old are you?',
                'What is your age?',
                'Please tell me your age.',
                'Please tell me how old are you?',
                'What is your age, please?',
                'Say me your age?',
                'How old are you?',
                'Roboy, how old are you?',
                'When were you born?',
                'What\'s the next birthday you celebrate?',
                
                'Who are you?',
                'Please tell me your name.',
                'What is your name?',
                'Tell me your name, please.',
                'How to call you?',
                'How can I call you?',
                'Do you have a name?',
                'What\'s the name you\'ve been given?',
                
                'Who are your creators?',
                'Where were you born?',
                'Who created you?',
                'Where were you assembled?',
                'Can you please let us know where were you assembled?',
                'Can you please let us know who created you?',
                'Can you let us know who created you?',
                'Do you know who developed you?',
                'Tell me how was you created?',
                'Whose your daddy?',
                'Where are you from?',
                
                'What can you do?',
                'What are you skilled at?',
                'How can you help me?',
                'Please tell me what can you do',
                'What did you learn?',
                'Can you really talk and understand or you just pretend?',
                'What are you capable of?',
                'What actions can you perform?',
                'Can you tell me about your functions?',
                'What are your abilities?',
                
                'How did you learn stuff?',
                'How have you come to this?',
                'From where did you learn things?',
                'Who teaches you all this?',
                'Who is your teacher?',
                'Who is your guru?',
                'Who is your mentor?',
                'How have you learnt all your skills?',
                'How do you know how to do stuff?',
                
                'How do you like this conference?',
                'Do you like this fair?',
                'What are your thoughts about the current exhibition?',
                'How is this evening for you?',
                'Do you feel comfortable here?',
                'Are you going to stay till the end?',
                'Why are you here?',
                
                'How is the weather today?',
                'What is best thing to do in the city today?',
                'Are you aware of any spectacular shows coming soon?',
                'Name me some historically important places?',
                'How is the road situation today?',
                'Any suggestions on how to spend the day?',
                'How far did you travel to this city?',
                
                'What is your goal?',
                'What do you live for?',
                'What were you created for?',
                'Why do you speak with people?',
                'What is your purpose?',
                'What is the purpose of your existence?',
                'What is the main reason for your creation?',
                'What is the sole purpose of your existence?',
                'Why are you so special?',
                                                
                
                # All above utterances will be used for clustering of intents.
                # Sample utterances below used for predicting after clustering
                'How old are you, Roboy?',
                'Tell me your name.',
                'Can you let me know who created you?',
                'What are your functions?',
                'Who taught you all the skills?',
                'What would you say about this meeting?',
                'Could you recommend some place to visit?',
                'Why were you created?'    
]

# text_samples = preprocess(text_samples)
# print(text_samples)


In [16]:
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
word_index = count_vect.fit_transform(text_samples)
# print(word_index)
# Perform operations on word_index to convert it into a dictionary.
embedding_dim = 200
print(word_index.shape[1])

print(embeddings_index.get('you'))

# from keras.preprocessing.text import Tokenizer
# from keras.preprocessing.sequence import pad_sequences

# tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
# tokenizer.fit_on_texts(texts)

132
[ 0.85395002  0.57146001 -0.023652   -0.11047    -0.1275      0.085129
 -0.74975997 -0.068121    0.043576    0.63928002 -0.072301    0.28775001
  0.66308999  0.23428001 -0.080206    0.27575001 -0.30429     0.36572999
  0.14379001 -0.107       0.33497     3.0789001   0.059451    0.039004
  0.45506999 -0.47595    -0.10247     0.28632    -0.31316999 -0.053495
 -0.17990001 -0.075404    0.11216    -0.098163   -0.10058    -0.33414
 -0.52158999 -0.17538001  0.008864    0.30078     0.083636    0.38332999
 -0.12608001  0.47973001 -0.33916     0.34158     1.02139997 -0.15933999
  0.09167     0.42668     0.30746999 -0.10632     0.051894    0.49204999
  0.48486     0.026916    0.091038   -0.30983999 -0.12899999  0.14038999
  0.093296   -0.057087   -0.058724   -0.27043    -0.36083999 -0.11826
 -0.013159    0.67659998  0.56496     0.10306     0.89587998 -0.083035
  0.20385    -0.31218001 -0.78539002 -0.17744    -0.95660001 -0.18685
 -0.65925997  0.16091    -0.12383     0.023029    0.08357     0.

In [17]:
# Remove all punctuation symbols, special characters and convert all words to lower case

word_index = text_samples
# Perform operations on word_index to convert it into a dictionary.
embedding_dim = 200
# No of words that each utterance should contain.
no_of_words = 10
print(len(word_index))


79


In [18]:
def get_embedding_matrix(word_index, embeddings_index):
    """Leverage the embedding_index dictionary and word_index to compute the embedding matrix

    Parameters
    ----------
    word_index : dictionary
        Words tokenized from a list of text samples
        
    embeddings_index : dictionary
        Index mapping words to known embeddings.

    Returns
    -------
    embedding_matrix : matrix
        Embedding matrix.
    """

    embedding_matrix = np.zeros((len(word_index), no_of_words, embedding_dim))
    for i, sentence in enumerate(word_index):
        for j, word in enumerate(sentence.split()):
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                # words not found in embedding index will be set to all-zeros.
                embedding_matrix[i,j] = embedding_vector
    return embedding_matrix


In [19]:
embedding_matrix = get_embedding_matrix(word_index, embeddings_index)
print(embedding_matrix.shape)

(79L, 10L, 200L)


Function to flatten out word embeddings (3D) into sentence embedding (2D)

In [20]:
def get_sentence_embedding(embedding_matrix):
    """Function to flatten out word embeddings (3D) into sentence embedding (2D) for clustering and t-SNE.

    Parameters
    ----------
    embedding_matrix : matrix
        Embedding matrix.

    Returns
    -------
    sentence_rep : matrix
        Embedding matrix for each each sentence combining all words in it.
    """
    sentence_rep = []
    for row in embedding_matrix:
        sentence_embedding = [item for sublist in row for item in sublist]
        sentence_embedding = np.nan_to_num(sentence_embedding)
        sentence_embedding[sentence_embedding == np.inf] = 0
        sentence_embedding[sentence_embedding == -np.inf] = 0
        sentence_embedding[sentence_embedding == np.nan] = 0
    #     print(sentence_embedding)
    #     sentence_rep = (tsne.fit_transform(sentence_embedding))
        sentence_rep.append(sentence_embedding)
    
    return sentence_rep

<h3><font color='teal'>Functions to cluster the data</font></h3>


K means clustering on both supervised and unsupervised data

In [21]:
import numpy as np
from sklearn.cluster import KMeans
from scipy.spatial import distance
# - cluster it into k clusters (e.g., k=100)

k = 8 #1200

In [22]:
sentence_rep = []
sentence_rep = get_sentence_embedding(embedding_matrix)

In [23]:
X = sentence_rep[0:-8]

In [24]:
# Repeat number of runs with different seeds and use different initialization technique.
# clusterKMeans = KMeans(n_clusters=k, n_init=1, init='random')
clusterKMeans = KMeans(n_clusters=k, n_init=10, init='k-means++') #, max_iter=300, tol=0.0001, precompute_distances='auto', verbose=0, random_state=None, copy_x=True, n_jobs=1)

In [25]:
clusterKMeans.fit(X)
clusterNewCentroids = clusterKMeans.cluster_centers_
clusterKMeansLabels = clusterKMeans.labels_

In [26]:
yPred1 = clusterKMeans.predict(sentence_rep[-1])
yPred2 = clusterKMeans.predict(sentence_rep[-2])
yPred3 = clusterKMeans.predict(sentence_rep[-3])
yPred4 = clusterKMeans.predict(sentence_rep[-4])
yPred5 = clusterKMeans.predict(sentence_rep[-5])
yPred6 = clusterKMeans.predict(sentence_rep[-6])
yPred7 = clusterKMeans.predict(sentence_rep[-7])
yPred8 = clusterKMeans.predict(sentence_rep[-8])



In [27]:
clusterNewCentroids, clusterKMeansLabels

(array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]]),
 array([5, 2, 6, 3, 2, 2, 0, 0, 1, 3, 0, 6, 2, 2, 0, 0, 6, 0, 1, 1, 0, 1, 5,
        5, 3, 3, 3, 0, 1, 1, 1, 6, 4, 1, 4, 1, 6, 3, 1, 6, 3, 6, 6, 2, 2, 2,
        3, 4, 3, 6, 1, 3, 6, 3, 1, 0, 4, 3, 3, 3, 4, 4, 2, 6, 1, 3, 2, 7, 7,
        7, 1]))

In [28]:
yPred1, yPred2, yPred3, yPred4, yPred5, yPred6, yPred7, yPred8

(array([1]),
 array([3]),
 array([3]),
 array([3]),
 array([1]),
 array([3]),
 array([2]),
 array([0]))

<h3><font color='orange'>Functions to visualize the data</font></h3>

t-SNE visualization

In [29]:
from sklearn.manifold import TSNE
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook

In [30]:
tsne = TSNE(n_components=2, random_state=0)

In [31]:
intents_tsne = []
sentence_rep = []
sentence_rep = get_sentence_embedding(embedding_matrix)
intents_tsne = (tsne.fit_transform(sentence_rep))
intents_tsne

array([[ -8.21051702e+00,  -1.37725659e+02],
       [  3.08547158e+01,  -2.98444632e+01],
       [ -9.84644823e+01,  -4.91917304e+00],
       [ -4.13665439e+01,   7.21082656e+01],
       [  1.14932044e+00,  -7.89672128e+00],
       [  3.45952766e+01,   6.51432817e+01],
       [ -9.58103188e+01,   4.30395507e+01],
       [  1.23579315e+02,  -2.81136309e+01],
       [  9.28406087e+01,   1.57504676e+01],
       [  3.40117812e+01,  -2.16240200e+02],
       [ -1.07370980e+02,   1.70555502e+02],
       [  4.37907675e+01,  -8.89103584e+01],
       [  2.37774406e+01,   2.07740055e+01],
       [ -2.85752770e+01,   1.23357033e+02],
       [ -2.02971948e+01,  -4.87911187e+00],
       [ -8.73905106e+01,  -6.18450946e+01],
       [ -6.17392715e+01,  -5.39488002e+01],
       [ -7.40685216e+01,   1.18813109e+02],
       [  7.41775704e+01,   1.11943378e+01],
       [  9.03241936e+01,  -8.07342496e+00],
       [  5.55610135e+01,   2.90255582e+01],
       [  2.31001965e+01,  -1.34309432e+01],
       [  

In [32]:
p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="Glove T-SNE for dialogue intents")

source = ColumnDataSource(data=dict(x1=intents_tsne[:,0],
                                    x2=intents_tsne[:,1],
                                    names=word_index))

p.scatter(x="x1", y="x2", size=6, source=source)

labels = LabelSet(x="x1", y="x2", text="names", y_offset=10,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(labels)

show(p)

To Do:
1. SVO Triplets
2. PCA 