In [1]:
## download source data (run once: uncomment the wget line below to fetch the text into ./data/)
# !wget --show-progress --continue -O ./data/shakespeare.txt https://www.gutenberg.org/files/100/100-0.txt

In [2]:
import pandas
import numpy

import tensorflow_hub as hub
from sklearn.cluster import KMeans

from joblib import dump, load

In [3]:
# --- inputs -----------------------------------------------------------------
# TF Hub handle of the sentence-embedding model
sentence_encoder = 'https://tfhub.dev/google/universal-sentence-encoder/4'
# plain-text corpus, one line of text per row
input_data = './data/shakespeare.txt'

# --- outputs ----------------------------------------------------------------
# JSON file that receives the closest line of each cluster
mapping_file = 'topic-mapping.json'
# joblib file that receives the fitted k-means model
model_name = 'kmeans_shakespeare.joblib'

In [4]:
# load the sentence encoder from TF Hub; `embed` is called on the text lines below
embed = hub.load(handle=sentence_encoder)

In [5]:
# Load the corpus as one row per non-blank line of text.
# The file is read directly instead of via read_csv(delimiter='\n'): newer
# pandas releases raise ValueError when the line terminator is used as the
# separator, and CSV parsing would also apply quote handling and NA-token
# conversion to what is really plain text.
with open(input_data, encoding='utf-8') as text_file:
    # drop the first 82 physical lines, exactly as skiprows=82 did
    # NOTE(review): presumably the Project Gutenberg front matter - confirm
    body_lines = [raw.rstrip('\n') for raw in text_file.readlines()[82:]]

# keep only lines with visible content (the equivalent of skip_blank_lines=True)
df = pandas.DataFrame({'line_text': [text for text in body_lines if text.strip()]})

In [6]:
# preview the first five loaded lines
df.head(n=5)

Unnamed: 0,line_text
0,THE SONNETS
1,1
2,"From fairest creatures we desire increase,"
3,"That thereby beauty’s rose might never die,"
4,"But as the riper should by time decease,"


In [7]:
# encode every text line in a single call and convert the result to a numpy array
line_texts = df['line_text']
embeddings = embed(line_texts).numpy()

In [8]:
# sanity check: (number of lines, embedding width)
numpy.shape(embeddings)

(141146, 512)

In [9]:
# keep each line's vector next to its text: one Python list per row
df['embeddings'] = pandas.Series(embeddings.tolist(), index=df.index)

In [10]:
# preview the frame again, now with the embeddings column
df.head(n=5)

Unnamed: 0,line_text,embeddings
0,THE SONNETS,"[0.05156312510371208, -0.05281050503253937, 0...."
1,1,"[-0.010349195450544357, -0.0831797868013382, -..."
2,"From fairest creatures we desire increase,","[0.006797707639634609, -0.07115708291530609, 0..."
3,"That thereby beauty’s rose might never die,","[0.053660713136196136, -0.04407283291220665, 0..."
4,"But as the riper should by time decease,","[0.058182988315820694, -0.006165965460240841, ..."


In [11]:
# stack the per-row embedding lists into one 2-D matrix, the input layout
# the k-means model is fitted on
embedding_rows = df['embeddings'].tolist()
X = numpy.vstack(embedding_rows)
X.shape

(141146, 512)

In [12]:
# create the kmeans classifier
num_topics = 10
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' (a single run for k-means++), so relying on the default would give
# different clusters depending on the installed version.
kmeans = KMeans(
    n_clusters=num_topics,
    init='k-means++',
    n_init=10,
    max_iter=100,
    random_state=0,  # fixed seed so the cluster assignment is reproducible
)
# fit on the embedding matrix and get one cluster label per row of X
y_kmeans = kmeans.fit_predict(X)

In [13]:
# attach each line's predicted cluster label to the frame
df['cluster'] = pandas.Series(y_kmeans, index=df.index)

In [14]:
# add distance to nearest cluster
# transform() only measures distances against the centres fitted above;
# fit_transform() would re-run the whole k-means fit a second time and
# replace the fitted state that produced the labels in df['cluster'].
df['distance'] = numpy.min(kmeans.transform(X), axis=1)

In [15]:
# show the five lines of cluster 0 with the smallest distance
pandas.set_option('display.max_colwidth', None)  # do not truncate long lines
(
    df.loc[df['cluster'] == 0, ['line_text', 'cluster', 'distance']]
    .sort_values(by='distance')
    .head(5)
)

Unnamed: 0,line_text,cluster,distance
86193,BENEDICK.,0,0.699447
86490,BENEDICK.,0,0.699447
86483,BENEDICK.,0,0.699447
83233,BENEDICK.,0,0.699447
86468,BENEDICK.,0,0.699447


In [16]:
# show the five lines of cluster 1 with the smallest distance
pandas.set_option('display.max_colwidth', None)  # do not truncate long lines
(
    df.loc[df['cluster'] == 1, ['line_text', 'cluster', 'distance']]
    .sort_values(by='distance')
    .head(5)
)

Unnamed: 0,line_text,cluster,distance
73996,"I should be guiltier than my guiltiness,",1,0.794074
15251,"Thither I must, although against my will,",1,0.79725
101239,"For she that scorn'd at me, now scorn'd of me;",1,0.800874
22379,"I must be ripp’d. To pieces with me! O,",1,0.801346
128524,I'll get me such a colour'd periwig.,1,0.806056


In [17]:
# show the five lines of cluster 2 with the smallest distance
pandas.set_option('display.max_colwidth', None)  # do not truncate long lines
(
    df.loc[df['cluster'] == 2, ['line_text', 'cluster', 'distance']]
    .sort_values(by='distance')
    .head(5)
)

Unnamed: 0,line_text,cluster,distance
105532,To you and your behests; and am enjoin’d,2,0.801851
112467,"Come hither from the furrow, and be merry:",2,0.803606
24525,’Twixt amorous and villainous. Being thus quench’d,2,0.808543
111711,"Come on, then; down, and swear.",2,0.809055
122926,"That noseless, handless, hack’d and chipp’d, come to him,",2,0.811197


In [18]:
# how many lines were assigned to each cluster
# (topic_df is the grouped 'line_text' column, not a DataFrame)
topic_df = df.groupby(by='cluster')['line_text']
topic_count = topic_df.count()
topic_count

cluster
0    12888
1    18690
2    12583
3    20368
4    23752
5     8796
6     6266
7    12774
8    10388
9    14641
Name: line_text, dtype: int64

In [19]:
# for every cluster, keep the single row whose distance is the smallest;
# idxmin() yields one row label per cluster, which .loc then looks up
cluster_closest = df.loc[
    df.groupby('cluster')['distance'].idxmin(),
    ['line_text', 'cluster', 'distance'],
]
cluster_closest.head(10)

Unnamed: 0,line_text,cluster,distance
83225,BENEDICK.,0,0.699447
73996,"I should be guiltier than my guiltiness,",1,0.794074
105532,To you and your behests; and am enjoin’d,2,0.801851
47838,Thou wouldst have left thy dearest heart-blood there,3,0.776139
69814,Authoris’d by her grandam. Shame itself!,4,0.846987
57140,And keep us all in servile fearfulness.,5,0.762489
118811,ANDARUS.,6,0.656141
27196,"Will bring him to his wonted way again,",7,0.779803
63033,Come forth.,8,0.786633
54656,"ELINOR. Out, insolent! Thy bastard shall be king,",9,0.769182


In [20]:
# write the per-cluster closest lines to the mapping file as table-oriented JSON
cluster_closest.to_json(path_or_buf=mapping_file, orient='table')

In [21]:
# persist the fitted k-means model with joblib
# to restore it later: load('filename.joblib')
dump(value=kmeans, filename=model_name)

['kmeans_shakespeare.joblib']