In [30]:
from collections import Counter
from collections import defaultdict
import glob, os
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt 
from gensim import corpora, models

'''
Chunk all the Shakespeare plays into 5000 word chunks. (text processing lib)
Each chunk should be labeled in the format play_chunkno (i.e. 'merchant_of_venice_006' etc).

Topic model the chunks using 5, 10, 25, 50 topics. (gensim)

Taking each topic distribution as your set of vectors, cluster the chunks using K-Means clustering where K=3. 
Draw the results of each clustering as a color-coded scatterplot.
'''


##### TEXT PROCESSING LIB ####
def tokenize(s):
    """
    Input: 
        string s
    Output: 
        list of strings
    """
    return s.split()

def preprocess(s, lowercase=True, strip_punctuation=True):
    """
    Input:
        string s
        boolean lowercase
        boolean strip_punctuation
    Return:
        list of strings
    """
    punctuation = '.,?<>:;"\'!%'
    if isinstance(s, str):
        s = tokenize(s)
    if lowercase:
        s = [t.lower() for t in s]
    if strip_punctuation:
        s = [t.strip(punctuation) for t in s]
        
    return s

def token_frequency(tokens=None, tf={}, relative=False):
    """
    Input:
        tokens = list of strings or None
        tf = dict or None
        relative = boolean
    Return:
        dictionary of token frequencies
    """
    for t in tokens:
        if t in tf:
            tf[t]+=1
        else:
            tf[t]=1
    if relative:
        total = sum([c for t, c in tf.items()])
        tf = {t:tf[t]/total for t in tf}
    return tf


'''
    Input: list of words, chunk size
    Output: list of chunks
'''
def chunk(text, chunk_size=1000):
    chunks = []
    for start in range(0, len(text)-chunk_size+1, chunk_size):
        chunks.append(text[start:start+chunk_size])
    return chunks


#### PROCESSING ####
# 1. Get files from folder
filepath = './shakespeare/*.txt'
files = glob.glob(filepath)
print("found", str(len(files)), "files starting with", files[0])

# labels = [os.path.split(f)[1][:-4].replace('_', ' ').title() for f in files]

# 2. chunk all shakespeare and make labels
def make_topic_model(chunks, num):
    dictionary = corpora.Dictionary(chunks) 
    corpus = [dictionary.doc2bow(text) for text in chunks]
    # lda model
    lda = models.LdaModel(corpus, id2word=dictionary, num_topics=num)
    corpus_lda = lda[corpus]
    return corpus_lda

'''Kmeans the topics, k = 3'''
def kmean_topics(topics, labels, num):
    # Put labels, features, vectors into a single dataframe   
    vectors_df = pd.DataFrame(topics, index=labels, columns=range(num)).fillna(0)
    # 4. Use K-means clustering from Scikit Learn to find two clusters. 
    n_clusters=3
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(vectors_df)
    plot_clusters(kmeans, vextors_df, num) # plot topic clustering
    return 

'''Plot clustering for each topic'''
def plot_clusters(kmeans, df, topic_num):
    pca = PCA(n_components=2)
    transformed = pca.fit_transform(df) # transform topic_num features to 2D
    x = transformed[:,0]
    y = transformed[:,1]
    col_dict = {0:'red', 1:'blue', 2:'green'}
    cols = [col_dict[l] for l in kmeans.labels_]
    plt.figure(figsize=(15,10))
    plt.scatter(x,y, c=cols, s=100, alpha=.5)
    for i, l in enumerate(labels):
        plt.text(x[i]+.0003,y[i]-.0001, l)
    for i, c in enumerate(pca.components_.transpose()):
        plt.arrow(0,0, c[0]/50, c[1]/50, alpha=.3, width=.0001)
        plt.text(c[0]/50, c[1]/50, features[i])
    plt.xlabel('PCA1')
    plt.ylabel('PCA2')
    plt.title('Shakespeare works for Topic {}'.format(topic_num))
    plt.show()
    plt.savefig("shakespeare-kmeans-{}.png".format(topic_num))
    return

found 36 files starting with ./shakespeare/1_king_henry_iv.txt


In [31]:
f = './shakespeare/1_king_henry_iv.txt'

In [32]:
punctuation = '.,?<>:;"\'!%'
text = open(f, "r").read()
if isinstance(text, str): text = tokenize(text)
text = [t.lower() for t in text]
text = [t.strip(punctuation) for t in text]
print('play contains', len(text), 'words words words')
chunk_size = 5000
chunks = []
for start in range(0, len(text)-chunk_size+1, chunk_size): chunks.append(text[start:start+chunk_size])

play contains 23910 words words words


In [34]:
print(len(chunks), len(chunks[0]), len(chunks[3]))

4 5000 5000


In [35]:
import pickle
with open('chunks.pkl', 'wb') as picklefile: pickle.dump(chunks, picklefile)

In [36]:
!ls

 README.md			     democratic_nominees_quotes.csv
 Text_Analytics_NLP_Workshop.ipynb  'jupyterize shakespeare.ipynb'
 austen-kmeans.py		     shakespeare
 austen_alcott			     shakespeare-kmeans.py
 chunks.pkl


In [39]:
dir()

['Counter',
 'In',
 'KMeans',
 'Out',
 'PCA',
 '_',
 '_18',
 '_21',
 '_23',
 '_24',
 '_25',
 '_27',
 '_28',
 '_3',
 '_37',
 '_38',
 '__',
 '___',
 '__builtin__',
 '__builtins__',
 '__doc__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 '_dh',
 '_exit_code',
 '_i',
 '_i1',
 '_i10',
 '_i11',
 '_i12',
 '_i13',
 '_i14',
 '_i15',
 '_i16',
 '_i17',
 '_i18',
 '_i19',
 '_i2',
 '_i20',
 '_i21',
 '_i22',
 '_i23',
 '_i24',
 '_i25',
 '_i26',
 '_i27',
 '_i28',
 '_i29',
 '_i3',
 '_i30',
 '_i31',
 '_i32',
 '_i33',
 '_i34',
 '_i35',
 '_i36',
 '_i37',
 '_i38',
 '_i39',
 '_i4',
 '_i5',
 '_i6',
 '_i7',
 '_i8',
 '_i9',
 '_ih',
 '_ii',
 '_iii',
 '_oh',
 'chunk',
 'chunk_labels',
 'chunk_size',
 'chunks',
 'corpora',
 'corpus',
 'defaultdict',
 'dictionary',
 'exit',
 'f',
 'filepath',
 'files',
 'get_ipython',
 'glob',
 'kmean_topics',
 'make_topic_model',
 'models',
 'os',
 'pd',
 'pickle',
 'picklefile',
 'plot_clusters',
 'plt',
 'preprocess',
 'punctuation',
 'quit',
 'start',
 'text',
 't

In [37]:
globals()

{'__name__': '__main__',
 '__doc__': 'Plot clustering for each topic',
 '__package__': None,
 '__loader__': None,
 '__spec__': None,
 '__builtin__': <module 'builtins' (built-in)>,
 '__builtins__': <module 'builtins' (built-in)>,
 '_ih': ['',
  'from collections import Counter\nfrom collections import defaultdict\nimport glob, os\nimport pandas as pd\nfrom sklearn.cluster import KMeans\nfrom sklearn.decomposition import PCA\nimport matplotlib.pyplot as plt \nfrom gensim import corpora, models\n\n\'\'\'\nChunk all the Shakespeare plays into 5000 word chunks. (text processing lib)\nEach chunk should be labeled in the format play_chunkno (i.e. \'merchant_of_venice_006\' etc).\n\nTopic model the chunks using 5, 10, 25, 50 topics. (gensim)\n\nTaking each topic distribution as your set of vectors, cluster the chunks using K-Means clustering where K=3. \nDraw the results of each clustering as a color-coded scatterplot.\n\'\'\'\n\n\n##### TEXT PROCESSING LIB ####\ndef tokenize(s):\n    """\n  

In [38]:
locals()

{'__name__': '__main__',
 '__doc__': 'Plot clustering for each topic',
 '__package__': None,
 '__loader__': None,
 '__spec__': None,
 '__builtin__': <module 'builtins' (built-in)>,
 '__builtins__': <module 'builtins' (built-in)>,
 '_ih': ['',
  'from collections import Counter\nfrom collections import defaultdict\nimport glob, os\nimport pandas as pd\nfrom sklearn.cluster import KMeans\nfrom sklearn.decomposition import PCA\nimport matplotlib.pyplot as plt \nfrom gensim import corpora, models\n\n\'\'\'\nChunk all the Shakespeare plays into 5000 word chunks. (text processing lib)\nEach chunk should be labeled in the format play_chunkno (i.e. \'merchant_of_venice_006\' etc).\n\nTopic model the chunks using 5, 10, 25, 50 topics. (gensim)\n\nTaking each topic distribution as your set of vectors, cluster the chunks using K-Means clustering where K=3. \nDraw the results of each clustering as a color-coded scatterplot.\n\'\'\'\n\n\n##### TEXT PROCESSING LIB ####\ndef tokenize(s):\n    """\n  

In [17]:
chunk_labels = ['{}_{:03}'.format(os.path.split(f)[1][:-4], i) for i, j in enumerate(chunks)] # get chunk labels

In [21]:
chunk_labels

['1_king_henry_iv_000',
 '1_king_henry_iv_001',
 '1_king_henry_iv_002',
 '1_king_henry_iv_003']

In [22]:
dictionary = corpora.Dictionary(chunks) 

In [23]:
len(dictionary)

3522

In [24]:
dictionary[0], dictionary[1], dictionary[2], dictionary[3], dictionary[4], dictionary[5], dictionary[6]

('--whose', 'a', 'able', 'about', 'abuses', 'accents', 'accidents')

In [26]:
# dir(dictionary)

In [27]:
type(dictionary)

gensim.corpora.dictionary.Dictionary

In [28]:
dictionary[42]

'and'

In [29]:
corpus = [dictionary.doc2bow(this_chunk) for this_chunk in chunks]       # word count across the dictionary for each chunk

In [50]:
corpus[0][42]

(42, 194)

In [51]:
# corpus[] is a list (per chunk; so 4) of bows

In [61]:
original_corpus = corpus.copy()

In [62]:
# lda model: Latent Dirichlet Allocation
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=5)

In [63]:
type(lda)

gensim.models.ldamodel.LdaModel

In [64]:
if corpus == original_corpus: print('same')

same


In [59]:
original_corpus

<function list.copy()>

In [20]:
topic_5 = lda[corpus]

In [21]:
type(topic_5)

gensim.interfaces.TransformedCorpus

In [22]:
topic_5[0]

[(0, 0.92023885), (2, 0.07963859)]

In [23]:
topic_5[1]

[(0, 0.2237235), (2, 0.77615404)]

In [24]:
topic_5[3][0]

(0, 0.97283816)

In [25]:
len(topic_5[0]), type(topic_5[0])

(2, list)

In [46]:
topic_5[0][0]

(0, 0.8905626)

In [47]:
type(topic_5[0][0])

tuple

In [30]:
topic_5[0][0][1]

0.9016533

In [31]:
# first index runs 0 1 2 3 by chunks. Second is a list of length 1: a tuple. Third index is 1st / 2nd element of the tuple.

In [34]:
len(topic_5), len(topic_5[0]), len(topic_5[1]), len(topic_5[2]), len(topic_5[3])

(4, 2, 2, 2, 2)

In [35]:
chunk_labels

['1_king_henry_iv_000',
 '1_king_henry_iv_001',
 '1_king_henry_iv_002',
 '1_king_henry_iv_003']

In [66]:
# Put labels, features, vectors into a single dataframe
vectors_df = pd.DataFrame(topic_5, index=chunk_labels, columns=range(5))     # .fillna(0)

ValueError: 5 columns passed, passed data had 2 columns

In [None]:
   
# 4. Use K-means clustering from Scikit Learn to find two clusters. 
# n_clusters=3
# kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(vectors_df)
# plot_clusters(kmeans, vextors_df, num) # plot topic clustering

In [None]:
# This loop fails so cell above is diagnostics

filecount = 0
filelimit = 1

for f in files:
    print("working on file/play", f)
    filecount += 1
    if filecount > filelimit: break
        
    chunks = chunk(preprocess(open(f, "r").read()), 5000) # get chunks 
    chunk_labels = ['{}_{:03}'.format(os.path.split(f)[1][:-4], i) for i, j in enumerate(chunks)] # get chunk labels
    
    # starting topic modeling for chunks using 5, 10, 25, 50 topics
    topic_5 = make_topic_model(chunks, 5)
    
    kmean_topics(topic_5, chunk_labels, 5) # plot n=5       (last arg was 5... error)

    
    # topic_10 = make_topic_model(chunks, 10)
    # kmean_topics(topic_5, chunk_labels, 10) # plot n=10
    # topic_25 = make_topic_model(chunks, 25)
    # kmean_topics(topic_5, chunk_labels, 25) # plot n=25
    # topic_50 = make_topic_model(chunks, 50)
    # kmean_topics(topic_5, chunk_labels, 50) # plot n =50