## Load Packages

In [3]:
# Storing and manipulating data
import pandas as pd

# For text preprocessing
import re
from nltk.corpus import stopwords

# Train word embedding model
from gensim.models import Word2Vec

# Splitting dataset
from sklearn.model_selection import train_test_split

# Clustering
import scipy
import numpy as np

## Define Process Text Function

In [4]:
def preprocess(raw_text, stopword_set=None):
    """
    Clean a sentence and return its distinct, lowercased, non-stop-word tokens.

    Every character outside a-z / A-Z is replaced with a space, the text is
    lowercased and split on whitespace, stop words are removed, and repeated
    words are collapsed to their first occurrence.

    Parameters
    ----------
    raw_text : str
        The sentence to clean.
    stopword_set : collection of str, optional
        Lowercase words to discard. When omitted, the NLTK English stop-word
        list is built on the first call and reused on later calls.

    Returns
    -------
    list of str
        The unique cleaned words, in the order they first appear in the sentence.
    """
    if stopword_set is None:
        # Build the default stop-word set once instead of once per sentence
        stopword_set = getattr(preprocess, "_english_stopwords", None)
        if stopword_set is None:
            stopword_set = frozenset(stopwords.words("english"))
            preprocess._english_stopwords = stopword_set

    # Remove special characters
    letters_only_text = re.sub("[^a-zA-Z]", " ", raw_text)

    # Convert to lower case and split words into separate strings
    words = letters_only_text.lower().split()

    # De-duplicate with dict.fromkeys rather than set(): a set yields the words
    # in hash order, which differs between kernel runs and scrambles the word
    # order the embedding model is trained on.
    cleaned_words = list(dict.fromkeys(w for w in words if w not in stopword_set))

    return cleaned_words

## Loading Data

In [5]:
data = pd.read_csv('bitext_free_dataset.csv', header=0)

# Materialise the rows once; the first column of each row holds the utterance text
rows = data.values.tolist()

# Raw sentences exactly as they appear in the file (not pre-processed)
not_pp_sentences = [row[0] for row in rows]

# Tokenized sentences with stopwords removed, in the same order as the raw ones
sentences = [preprocess(raw_sentence) for raw_sentence in not_pp_sentences]

## Train Word Embedding

In [21]:
# Fit the Word2Vec embedding on the tokenized sentences.
# min_count=1 keeps words that occur only once in the vocabulary.
model = Word2Vec(sentences=sentences, min_count=1)

## Convert Sentences to Vectors with Word Embedding

In [22]:
# Look words up through `model.wv`: indexing the model directly (`model[word]`)
# is deprecated in gensim 3.x and no longer works in gensim 4.x.
word_vectors = model.wv

sentences_as_vectors = []
# `sentences` holds the pre-processed tokens, in the same order as `not_pp_sentences`
for tokens in sentences:
    if tokens:
        # Mean of word vectors from Word2Vec model become the sentence vector
        sentence_vector = np.mean([word_vectors[word] for word in tokens], axis=0)
    else:
        # A sentence with no remaining words would make np.mean return a scalar NaN
        # and break clustering; use a zero vector so every row has the same shape.
        sentence_vector = np.zeros(word_vectors.vector_size, dtype=np.float32)
    sentences_as_vectors.append(sentence_vector)


## Cluster Sentences

AgglomerativeClustering from SciKit-Learn recursively merges the pair of clusters that minimally increases a given linkage distance (1).  
 - **n_clusters:** Number of clusters to stop at. I chose 27 since the dataset I am comparing against has 27 intents that were identified.  
 - **affinity:**  The distance metric used to compare sentences. Each sentence vector is the mean of the word vectors in that sentence, so vector magnitude can be taken into account; I therefore use euclidean distance rather than cosine distance (2).  
 - **linkage:**  Which linkage criterion to use to merge pairs of clusters. Average means it will merge the two clusters with the smallest average distance between each sentence of the two clusters.  


References:  
 1. https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html
 2. https://cmry.github.io/notes/euclidean-v-cosine

In [47]:
from sklearn.cluster import AgglomerativeClustering

# Euclidean distance is the default metric, so it is not passed explicitly:
# the `affinity` keyword was renamed to `metric` in scikit-learn 1.2 and removed
# in 1.4, and relying on the default keeps this cell working on old and new versions.
cluster_model = AgglomerativeClustering(n_clusters=27, linkage='average')
cluster_model.fit(sentences_as_vectors)
cluster_model

AgglomerativeClustering(affinity='euclidean', compute_full_tree='auto',
                        connectivity=None, distance_threshold=None,
                        linkage='average', memory=None, n_clusters=27)

In [48]:
# Number of clusters reported by the fitted model
cluster_model.n_clusters_

27

## Split and Store Labeled Sentences

In [49]:
# Pair every raw utterance with the cluster label taken from the fitted model
labeled_columns = {
    'Utterances': not_pp_sentences,
    'Label': cluster_model.labels_,
}
data = pd.DataFrame(labeled_columns)

# Report how many utterances fall into each cluster
label_counts = data['Label'].value_counts()
print('Number of Utterances by Cluster:\n')
print(label_counts)

Number of Utterances by Cluster:

19    3725
0     2613
7     2068
12    1640
2     1602
20    1525
8     1272
4     1071
10    1025
9      989
6      750
3      648
23     554
17     503
5      330
13     287
1      252
14     222
15     202
26     146
21     120
11     103
18      15
22       5
16       3
24       1
25       1
Name: Label, dtype: int64


AWS Lex allows a maximum of 1500 utterances per intent. Since the idea behind splitting the data into training and test sets is to maximize the training data while still allowing for adequate testing, I use the largest training share that stays under that limit: 40% train, 60% test. This gives me just under the maximum amount of training data for my largest cluster (40% of 3725 is 1490). Clusters with 5 or fewer utterances in the training or test set will be withheld as "fallback" or unhandled utterances.

In [54]:
# 60% of the rows go to the test set, the remaining 40% to the training set.
# A fixed random_state makes the split, and the exported files, identical on every run.
train, test = train_test_split(data, test_size = 0.6, random_state = 42)
train.to_excel('train.xlsx')
test.to_excel('test.xlsx')

In [56]:
# Per-cluster utterance counts for each side of the split
train_label_counts = train['Label'].value_counts()
test_label_counts = test['Label'].value_counts()

print('Training Data Cluster Counts\n')
print(train_label_counts)
print('\n')
print('Test Data Cluster Counts\n')
print(test_label_counts)

Training Data Cluster Counts

19    1490
0      999
7      824
12     653
2      644
20     599
8      515
4      430
10     408
9      400
6      294
3      273
23     230
17     194
5      131
13     116
1      112
15      89
14      89
26      65
21      53
11      48
18       7
22       4
16       1
Name: Label, dtype: int64


Test Data Cluster Counts

19    2235
0     1614
7     1244
12     987
2      958
20     926
8      757
4      641
10     617
9      589
6      456
3      375
23     324
17     309
5      199
13     171
1      140
14     133
15     113
26      81
21      67
11      55
18       8
16       2
25       1
24       1
22       1
Name: Label, dtype: int64
