In [18]:
# Sammon Projection

import pandas as pd
import matplotlib.pyplot as plt
import nltk
import numpy as np
from sklearn import manifold
from sklearn.metrics import euclidean_distances
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Load the per-scene corpus. The context manager guarantees the file handle is
# closed even if reading raises.
# NOTE(review): eval() executes whatever expression the file contains, so only
# run this on trusted files. If the file holds a plain Python literal,
# ast.literal_eval would be a safer replacement - confirm the file format first.
with open('shakespeare_scenes.txt') as f:
    shk = f.read()
shakespeare_scenes = eval(shk)

#Load Shakespeare acts (same format and same eval() caveat as above)
with open('shakespeare_acts.txt') as f:
    shk = f.read()
shakespeare_acts = eval(shk)

# Plays whose scenes are included in the projection
selected_titles = (
    "All's Well That Ends Well",
    "As You Like It",
    "The Comedy of Errors",
    "Cymbeline",
    "Love's Labours Lost",
)

col_list = ['Title','Act','Scene']
text_col = ['Scene','Text']

# Keep only the scenes that belong to one of the selected plays
selected_scenes = [el for el in shakespeare_scenes if el['title'] in selected_titles]

# One row of metadata and one row of text per selected scene, in corpus order
title = [[el['title'], el['act'], el['scene']] for el in selected_scenes]
text_corpus = [[el['scene'], el['text']] for el in selected_scenes]

# convert to dataframes
df_text = pd.DataFrame(text_corpus, columns=text_col)
df = pd.DataFrame(title, columns=col_list)

# convert text data into feature vectors
# Series.iteritems() was removed in pandas 2.0; Series.items() yields the same
# (label, value) pairs and is available in every pandas version.
# NOTE(review): DictVectorizer one-hot encodes string values, so each distinct
# 'Text' string becomes a single indicator column rather than a bag of words -
# confirm this is the intended representation.
convert_features = [dict(r.items()) for _, r in df_text.iterrows()]
vectorizer = DictVectorizer()
vectorized_sparse = vectorizer.fit_transform(convert_features)
vectorized_array = vectorized_sparse.toarray()

#convert array to dataframe
vec_df = pd.DataFrame(vectorized_array)

mds = manifold.MDS(n_components=2,max_iter=3000,eps=1e-9,dissimilarity="precomputed")
# DataFrame.ix was removed in pandas 1.0. vec_df has default integer column
# labels, for which .ix sliced by label (end-inclusive); .loc does the same.
similarities = euclidean_distances(vec_df.loc[:, 0:81])
points = mds.fit_transform(similarities)

# Label colour per play title (an earlier per-act palette is kept for reference)
#colDict = {"act1":"red", "act2":"orange", "act3":"green", "act4":"violet", "act5":"blue"}
colDict = {
    "All's Well That Ends Well": "red",
    "As You Like It": "orange",
    "The Comedy of Errors": "green",
    "Cymbeline": "violet",
    "Love's Labours Lost": "blue",
}

def textScatter(inX,inY,t,colDict,c=[]):
    """Scatter-plot the points and write a text label next to each one.

    Parameters
    ----------
    inX, inY : sequences of coordinates, paired element by element.
    t : labels, indexed by point position (t[i] is drawn at point i).
    colDict : mapping from a category value to a matplotlib colour.
    c : optional per-point categories, indexed by point position. When it
        contains at least one truthy entry, label i is coloured
        colDict[c[i]]; otherwise every label is drawn in blue.

    The figure is titled "Sammon Projection" and shown immediately.
    """
    ax = plt.axes()
    ax.scatter(inX, inY)

    # The categories do not change while plotting, so test them once
    use_categories = any(c)

    for pos, (x, y) in enumerate(zip(inX, inY)):
        label_colour = colDict[c[pos]] if use_categories else "blue"
        ax.text(x, y, t[pos], color=label_colour)

    plt.title("Sammon Projection")
    plt.show()

# DataFrame.ix was removed in pandas 1.0; df has string column labels, so the
# integer lookups were positional and .iloc is the direct replacement.
# Columns by position: 0 = Title, 1 = Act, 2 = Scene.
textScatter(points[:, 0], points[:, 1], "act" + df.iloc[:, 1] + "scene" + df.iloc[:, 2], colDict, df.iloc[:, 0])

# Create Sammon Projection using word frequency

#Create baseline of all text in all plays
all_acts_text = [act['text'] for act in shakespeare_acts]
all_acts_lines = []  # nothing in this cell appends to it, so all_lines below is ''
all_text = ' '.join(all_acts_text)
all_lines = ' '.join(all_acts_lines)

# Token frequencies over the whole corpus
all_tokens = nltk.word_tokenize(all_text)
all_freq = nltk.FreqDist(all_tokens)

#Get top 20 most frequent terms across all plays
ranked_terms = sorted(all_freq.items(), key=lambda pair: pair[1], reverse=True)
vocabulary = [term for term, _count in ranked_terms[:20]]

def word_freq(data):
    """Count the `vocabulary` terms in the text of each selected-play element.

    Parameters
    ----------
    data : iterable of dicts with at least 'title', 'scene' and 'text' keys.

    Returns
    -------
    numpy float64 array with one row per element whose title is one of the
    five selected plays (in input order) and one column per term of the
    module-level `vocabulary`, holding raw term counts.
    """
    wanted_titles = (
        "All's Well That Ends Well",
        "As You Like It",
        "The Comedy of Errors",
        "Cymbeline",
        "Love's Labours Lost",
    )

    rows = [
        [entry['scene'], entry['text']]
        for entry in data
        if entry['title'] in wanted_titles
    ]
    scenes = pd.DataFrame(rows, columns=['Scene', 'Text'])

    # The vocabulary is fixed up front, so only those terms are counted
    counter = CountVectorizer(vocabulary=vocabulary, tokenizer=nltk.word_tokenize)
    counts = counter.fit_transform(scenes['Text'])
    freq_vec = counts.toarray().astype(np.float64)
    # Row normalisation is currently disabled:
    #freq_vec /= np.c_[np.apply_along_axis(np.linalg.norm, 1, freq_vec)]

    return freq_vec

scene_freq_vec = word_freq(shakespeare_scenes)

scene_freq_vec_df = pd.DataFrame(scene_freq_vec)

mds = manifold.MDS(n_components=2,max_iter=3000,eps=1e-9,dissimilarity="precomputed")
# The distances must come from the word-frequency vectors. Computing them from
# vec_df (the DictVectorizer features) would leave scene_freq_vec_df unused and
# project the same features as the first plot.
similarities = euclidean_distances(scene_freq_vec_df)
points = mds.fit_transform(similarities)

# word_freq() applies the same title filter, in the same order, that built df,
# so row i of the projection corresponds to row i of df.
# DataFrame.ix was removed in pandas 1.0; .iloc is the positional replacement
# (0 = Title, 1 = Act, 2 = Scene).
textScatter(points[:, 0], points[:, 1], "act" + df.iloc[:, 1] + "scene" + df.iloc[:, 2], colDict, df.iloc[:, 0])

  if self._edgecolors == str('face'):


In [1]:
from collections import OrderedDict
d = {'h':8, 'j':3, 'u':1, 'm':9}
# Order the (key, value) pairs from the largest value to the smallest, then
# freeze that order in an OrderedDict; lookups by key still work as usual.
pairs_by_value_desc = sorted(d.items(), key=lambda kv: kv[1], reverse=True)
ds = OrderedDict(pairs_by_value_desc)
print(ds['j'])

3


In [6]:
# Hierarchical Clustering

from matplotlib import pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster, fclusterdata

import scipy.cluster.hierarchy as hc
import numpy as np
import pandas as pd

# Read the corpus inside a context manager so the file handle is always closed.
# NOTE(review): eval() executes whatever expression the file contains, so only
# run this on trusted files. If the file holds a plain Python literal,
# ast.literal_eval would be a safer replacement - confirm the file format first.
with open('shakespeare.txt') as f:
    shk = f.read()
shakespeare = eval(shk)

col_list = ['Title','Act','Scene']
text_col = ['Scene','Text']

# Keep only the entries of "All's Well That Ends Well", in corpus order
play_entries = [el for el in shakespeare if el['title'] == "All's Well That Ends Well"]

# One metadata row and one text row per kept entry
title = [[el['title'], el['act'], el['scene']] for el in play_entries]
text_corpus = [[el['scene'], el['text']] for el in play_entries]

# convert to dataframes
df_text = pd.DataFrame(text_corpus, columns=text_col)
df = pd.DataFrame(title, columns=col_list)

# print data frame
# Iterate over every row index 0..counter-1; range(0, counter-1) would stop one
# short and silently drop the last scene from the listing.
counter = df.shape[0]
for j in range(counter):
    print(j, " Title: ", df['Title'][j], " Act: ", df['Act'][j], " Scene: ", df['Scene'][j])

# convert text data into feature vectors
# Series.iteritems() was removed in pandas 2.0; Series.items() yields the same
# (label, value) pairs and is available in every pandas version.
# NOTE(review): DictVectorizer one-hot encodes string values, so each distinct
# 'Text' string becomes a single indicator column rather than a bag of words -
# confirm this is the intended representation.
convert_features = [dict(r.items()) for _, r in df_text.iterrows()]
vectorizer = DictVectorizer()
vectorized_sparse = vectorizer.fit_transform(convert_features)
vectorized_array = vectorized_sparse.toarray()

# create hierarchical clustering using the converted feature vectors
# (Ward linkage; one observation per row of vectorized_array)
Z = linkage(vectorized_array,'ward')

# draw dendrogram
plt.figure(figsize=(25, 10))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Index of Scenes')
plt.ylabel('Distance')

# NOTE(review): scipy documents `p` as the parameter for `truncate_mode`; no
# truncate_mode is passed here, so p=6 appears to have no effect - confirm.
dendrogram_options = dict(
    p=6,
    leaf_rotation=90.,  # rotates the x axis labels
    leaf_font_size=8.,  # font size for the x axis labels
)
dendrogram(Z, **dendrogram_options)
plt.show()

#Elbow method to find the number of clusters
# Column 2 of the linkage matrix Z holds the distance of each merge, so the
# last ten rows are the ten final (largest) merges. Reading that column from
# vectorized_array instead would plot one raw feature, not merge distances.
last = Z[-10:, 2]
last_rev = last[::-1]
idxs = np.arange(1, len(last) + 1)
plt.plot(idxs, last_rev)
acceleration = np.diff(last, 2)  # 2nd derivative of the distances
acceleration_rev = acceleration[::-1]
# The second difference is two entries shorter than `last`; shift the x values
# by one so each acceleration value lines up with a cluster count starting at 2
plt.plot(idxs[:-2] + 1, acceleration_rev)
plt.title("Elbow Chart")
plt.show()
k = acceleration_rev.argmax() + 2  # if idx 0 is the max of this we want 2 clusters

def cluster_indices(cluster_assignments):
    """Group observation positions by their flat-cluster label.

    Parameters
    ----------
    cluster_assignments : numpy integer array holding one cluster label per
        observation; labels are looked up from 1 up to the array's maximum.

    Returns
    -------
    list of numpy index arrays; entry i holds the positions whose label is
    i + 1 (an empty array when no observation carries that label).
    """
    highest_label = cluster_assignments.max()
    return [
        np.where(cluster_assignments == label)[0]
        for label in range(1, highest_label + 1)
    ]

# fclusterdata() expects the raw observation matrix and builds its own linkage,
# so handing it the linkage matrix Z would cluster Z's rows instead of the
# scenes. Z is already the hierarchy, so cut it directly with fcluster(), using
# the same threshold (1.0) and fclusterdata's default 'inconsistent' criterion.
# Alternative: cut at the elbow-derived cluster count instead:
#clusters = fcluster(Z,k,criterion='maxclust')
clusters = fcluster(Z, 1.0, criterion='inconsistent')
num_clusters = clusters.max()

# print results
print("\n")
print("There are %d clusters" % num_clusters)
indices = cluster_indices(clusters)
# A dedicated loop name keeps the elbow-derived `k` from being overwritten
for cluster_pos, ind in enumerate(indices):
    print ("cluster", cluster_pos + 1, "is", ind)


def second_dendrogram(*args, **kwargs):
    """Draw a scipy dendrogram and annotate its merge points.

    Accepts everything scipy's `dendrogram` accepts, plus two extra keyword
    arguments that are removed before the call is forwarded:

    max_d : optional cut height. When truthy it is used as `color_threshold`
        (unless the caller passed one) and drawn as a black horizontal line.
    annotate_above : merges whose height is greater than this value (default
        0) get a marker and a label with their height.

    Returns the dictionary produced by `dendrogram`. When `no_plot` is truthy,
    no titles, markers or lines are drawn.
    """
    max_d = kwargs.pop('max_d', None)
    annotate_above = kwargs.pop('annotate_above', 0)
    if max_d:
        # Only fills in the threshold when the caller did not supply one
        kwargs.setdefault('color_threshold', max_d)

    ddata = dendrogram(*args, **kwargs)

    if kwargs.get('no_plot', False):
        return ddata

    plt.title('Hierarchical Clustering Dendrogram')
    plt.xlabel('Index of Scenes')
    plt.ylabel('Distance')

    links = zip(ddata['icoord'], ddata['dcoord'], ddata['color_list'])
    for xs, ys, colour in links:
        # Midpoint of the link's two middle x-coordinates, at the link's height
        mid_x = 0.5 * sum(xs[1:3])
        height = ys[1]
        if height <= annotate_above:
            continue
        plt.plot(mid_x, height, 'o', c=colour)
        plt.annotate("%.3g" % height, (mid_x, height), xytext=(0, -5),
                     textcoords='offset points',
                     va='top', ha='center')

    if max_d:
        plt.axhline(y=max_d, c='k')
    return ddata

# Annotated dendrogram of Z, truncated with truncate_mode='lastp' and p=12
truncated_view_options = dict(
    truncate_mode='lastp',
    p=12,
    leaf_rotation=90.,
    leaf_font_size=12.,
    show_contracted=True,
    annotate_above=10,  # useful in small plots so annotations don't overlap
)
second_dendrogram(Z, **truncated_view_options)
plt.show()

0  Title:  All's Well That Ends Well  Act:  1  Scene:  1
1  Title:  All's Well That Ends Well  Act:  1  Scene:  2
2  Title:  All's Well That Ends Well  Act:  1  Scene:  3
3  Title:  All's Well That Ends Well  Act:  2  Scene:  1
4  Title:  All's Well That Ends Well  Act:  2  Scene:  2
5  Title:  All's Well That Ends Well  Act:  2  Scene:  3
6  Title:  All's Well That Ends Well  Act:  2  Scene:  4
7  Title:  All's Well That Ends Well  Act:  2  Scene:  5
8  Title:  All's Well That Ends Well  Act:  3  Scene:  1
9  Title:  All's Well That Ends Well  Act:  3  Scene:  2
10  Title:  All's Well That Ends Well  Act:  3  Scene:  3
11  Title:  All's Well That Ends Well  Act:  3  Scene:  4
12  Title:  All's Well That Ends Well  Act:  3  Scene:  5
13  Title:  All's Well That Ends Well  Act:  3  Scene:  6
14  Title:  All's Well That Ends Well  Act:  3  Scene:  7
15  Title:  All's Well That Ends Well  Act:  4  Scene:  1
16  Title:  All's Well That Ends Well  Act:  4  Scene:  2
17  Title:  All's Well T

In [None]:
# Commented-out scratch code kept for reference (every line in this cell is a comment)

# fig, axes23 = plt.subplots(2,3)

# for method, axes in zip(['single','complete'],axes23):
#     z = hc.linkage(vectorized_array, method=method)
    
    # Plotting
#     axes[0].plot(range(1, len(z)+1), z[::-1, 2])
#     knee = np.diff(z[::-1, 2], 2)
#     axes[0].plot(range(2, len(z)), knee)
    
#     num_clust1 = knee.argmax() + 2
#     knee[knee.argmax()] = 0
#     num_clust2 = knee.argmax() + 2
    
#     axes[0].text(num_clust1, z[::-1, 2][num_clust1-1], 'possible\n<- knee point')
    
#     part1 = hc.fcluster(z, num_clust1, 'maxclust')
#     part2 = hc.fcluster(z, num_clust2, 'maxclust')
    
#     clr = ['#2200CC' ,'#D9007E' ,'#FF6600' ,'#FFCC00' ,'#ACE600' ,'#0099CC' ,
#     '#8900CC' ,'#FF0000' ,'#FF9900' ,'#FFFF00' ,'#00CC01' ,'#0055CC']
    
#     for part, ax in zip([part1, part2], axes[1:]):
#         for cluster in set(part):
#             ax.scatter(vectorized_array[part == cluster, 0], vectorized_array[part == cluster, 1], 
#                        color=clr[cluster])
            
#     m = '\n(method: {})'.format(method)
#     plt.setp(axes[0], title='Screeplot{}'.format(m), xlabel='partition',
#              ylabel='{}\ncluster distance'.format(m))
#     plt.setp(axes[1], title='{} Clusters'.format(num_clust1))
#     plt.setp(axes[2], title='{} Clusters'.format(num_clust2))

# plt.tight_layout()
# plt.show() 

# print results
# print('Title','Act','Scene','Cluster')
# print('\n')

# for j in range(1,len(clusters)):
#    print("'",df['Title'][j-1], "'", df['Act'][j-1], df['Scene'][j-1], clusters[j-1])
#    print('\n')

#def word_freq(data):
#    if([element['title'] == "All's Well That Ends Well" for element in data]):
#        elements_text = [element['text'] for element in data]
#        elements_lines = [element['lines'] for element in data]
#        elements_count = len(data)
#        elements_count = len(data)
#        elements_count = len(data)
#        elements_count = len(data)

#        vectorizer = CountVectorizer(vocabulary=vocabulary, tokenizer=nltk.word_tokenize)
#        freq_vec = vectorizer.fit_transform(elements_text).toarray().astype(np.float64)
#        freq_vec /= np.c_[np.apply_along_axis(np.linalg.norm, 1, freq_vec)]

#    return freq_vec

#Create baseline of all text in all plays
#all_acts_text = []
#all_acts_lines = []
#for act in shakespeare:
#    all_acts_text.append(act['text'])
#all_text = ' '.join(all_acts_text)
#all_lines = ' '.join(all_acts_lines)
#all_tokens = nltk.word_tokenize(all_text)
#all_freq = nltk.FreqDist(all_tokens)
#Get top 20 most frequent terms across all plays
#vocabulary = [item[0] for item in sorted(all_freq.items(), key=lambda x: x[1], reverse=True)][:20]
