In [12]:
# Sammon Projection

import pandas as pd
import matplotlib.pyplot as plt
import nltk
import numpy as np
from sklearn import manifold
from sklearn.metrics import euclidean_distances
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# NOTE(review): eval() executes whatever Python expression the files contain.
# If the files hold only Python literals, ast.literal_eval would be a safer
# replacement -- confirm the file format before switching.

#Load Shakespeare scenes
# `with` closes the handle even if read() raises.
with open('shakespeare_scenes.txt') as f:
    shk = f.read()
shakespeare_scenes = eval(shk)

#Load Shakespeare acts
with open('shakespeare_acts.txt') as f:
    shk = f.read()
shakespeare_acts = eval(shk)

# Used only in plot captions below; the per-play filters are commented out.
play = "All Plays"

# Rows of [title, act, scene, text], one per scene.
title = []
col_list = ['Title','Act','Scene', 'Text']

# Rows of [scene, text] and [act, text] fed to the DictVectorizer.
scene_text_corpus = []
scene_text_col = ['Scene','Text']
act_text_corpus = []
act_text_col = ['Act','Text']

def textScatter(caption,inX,inY,t,colDict,c=()):
    """Scatter-plot 2-D points and write a text label at each one.

    Parameters
    ----------
    caption : str
        Appended to "Sammon Projection: " to form the plot title.
    inX, inY : iterables of numbers
        X and Y coordinates; they are paired with zip().
    t : indexable
        t[i] is the label drawn at the i-th point.
    colDict : mapping
        Maps the colour key c[i] to a Matplotlib colour.
    c : indexable, optional
        Per-point colour keys. When it is empty, or every entry is falsy,
        all labels are drawn in "blue".

    Raises
    ------
    KeyError
        If a colour key is not present in colDict.
    """
    ax = plt.axes()
    ax.scatter(inX, inY)
    # c is not modified while plotting, so the test is done once instead of
    # once per point.
    use_colour_keys = any(c)
    for i, (x, y) in enumerate(zip(inX, inY)):
        colour = colDict[c[i]] if use_colour_keys else "blue"
        ax.text(x, y, t[i], color=colour)
    plt.title("Sammon Projection: " + caption)
    plt.show()

for element in shakespeare_scenes:
    #if(element['title'] == play):
    title.append([element['title'], element['act'], element['scene'], element['text']])
    scene_text_corpus.append([element['scene'],element['text']])

for element in shakespeare_acts:
    #if(element['title'] == play):
    act_text_corpus.append([element['act'],element['text']])

# convert to dataframes
df = pd.DataFrame(title,columns=col_list)
scene_df_text = pd.DataFrame(scene_text_corpus,columns=scene_text_col)
act_df_text = pd.DataFrame(act_text_corpus,columns=act_text_col)

# One row per act, used to label the act-level plot. The original labelled the
# act points with the first rows of the scene-level frame `df` (and used its
# Scene column as the act number), so labels and colours did not belong to the
# plotted points.
# NOTE(review): assumes act records carry a 'title' key like scene records do;
# a missing title falls back to an empty string.
act_df = pd.DataFrame(
    [[act.get('title', ''), act['act']] for act in shakespeare_acts],
    columns=['Title', 'Act'])

# convert text data into feature vectors
# Series.items() replaces iteritems(), which newer pandas no longer provides.
scene_convert_features = [dict(r.items()) for _, r in scene_df_text.iterrows()]
act_convert_features = [dict(r.items()) for _, r in act_df_text.iterrows()]

vectorizer = DictVectorizer()
scene_sparse = vectorizer.fit_transform(scene_convert_features)
act_sparse = vectorizer.fit_transform(act_convert_features)

scene_vector_array = scene_sparse.toarray()
act_vector_array = act_sparse.toarray()

#convert array to dataframe
scene_vec_df = pd.DataFrame(scene_vector_array)
act_vec_df = pd.DataFrame(act_vector_array)
scene_len = scene_vec_df.shape[0]-1
act_len = act_vec_df.shape[0]-1

# random_state pins the MDS initialisation so re-running gives the same layout.
mds = manifold.MDS(n_components=2,max_iter=3000,eps=1e-9,dissimilarity="precomputed",random_state=0)
# .loc replaces the removed .ix; on these integer-labelled columns both slice
# by label, end inclusive.
# NOTE(review): the column slice is bounded by the ROW count (shape[0]-1), so
# trailing feature columns are dropped whenever there are more columns than
# rows -- confirm whether the whole matrix was intended.
scene_similarities = euclidean_distances(scene_vec_df.loc[:, 0:scene_len])
act_similarities = euclidean_distances(act_vec_df.loc[:, 0:act_len])

scene_points = mds.fit_transform(scene_similarities)
act_points = mds.fit_transform(act_similarities)

colDict = {"act0": "black", "act1":"red", "act2":"orange", "act3":"green", "act4":"violet", "act5":"blue", "act6":"indigo", "act7":"lime", "act8":"brown", "act9":"gray", "act10":"purple"}

# act text projection
act_colour_keys = "act" + act_df['Act'].astype(str)
textScatter("'"+str(play)+ "' - By Acts (using DictVectorizer)",
            act_points[:, 0], act_points[:, 1],
            act_df['Title'] + act_colour_keys, colDict, act_colour_keys)

# scene text projection
scene_colour_keys = "act" + df['Act']
textScatter("'"+str(play)+ "' By Scenes (using DictVectorizer)",
            scene_points[:, 0], scene_points[:, 1],
            df['Title'] + scene_colour_keys + "scene" + df['Scene'], colDict, scene_colour_keys)

# Sammon Projection based on word frequency

# Baseline: the concatenated text of every act in every play.
all_acts_lines = []
all_acts_text = []
for act in shakespeare_acts:
    all_acts_text += [act['text']]
all_lines = ' '.join(all_acts_lines)   # all_acts_lines is never filled, so this is ''
all_text = ' '.join(all_acts_text)
all_tokens = nltk.word_tokenize(all_text)
all_freq = nltk.FreqDist(all_tokens)
# The 20 most frequent tokens across all plays become the vocabulary.
ranked_terms = sorted(all_freq.items(), key=lambda pair: pair[1], reverse=True)
vocabulary = [pair[0] for pair in ranked_terms][:20]

def word_freq(data,actscene):
    """Count how often each `vocabulary` term occurs in every element's text.

    Parameters
    ----------
    data : iterable of mappings
        Each element must provide a 'text' entry.
    actscene : str
        Either "act" or "scene". It no longer affects the result (only the
        'text' entries are vectorised) and is kept so existing calls work.

    Returns
    -------
    numpy.ndarray of float64
        One row per element, one column per term of the module-level
        `vocabulary`, holding raw term counts.

    Raises
    ------
    ValueError
        If actscene is neither "act" nor "scene" (previously this surfaced as
        an unbound-variable error further down).
    """
    if actscene not in ("act", "scene"):
        raise ValueError("actscene must be 'act' or 'scene', got %r" % (actscene,))

    texts = [element['text'] for element in data]
    # lowercase=False: `vocabulary` is built from case-sensitive tokens, while
    # CountVectorizer lowercases documents by default, so capitalised
    # vocabulary terms could never be matched and their columns stayed zero.
    vectorizer = CountVectorizer(vocabulary=vocabulary, tokenizer=nltk.word_tokenize,
                                 lowercase=False)
    freq_vec = vectorizer.fit_transform(texts).toarray().astype(np.float64)

    return freq_vec

scene_freq_vec = word_freq(shakespeare_scenes,"scene")
act_freq_vec = word_freq(shakespeare_acts,"act")

scene_freq_df = pd.DataFrame(scene_freq_vec)
act_freq_df = pd.DataFrame(act_freq_vec)
scene_freq_len = scene_freq_df.shape[0]-1
act_freq_len = act_freq_df.shape[0]-1

# random_state pins the MDS initialisation so re-running gives the same layout.
mds = manifold.MDS(n_components=2,max_iter=3000,eps=1e-9,dissimilarity="precomputed",random_state=0)
# .loc replaces the removed .ix; on these integer-labelled columns both slice
# by label, end inclusive.
# NOTE(review): the column slice is bounded by the ROW count, so it only keeps
# every feature when there are at least as many rows as columns -- confirm.
scene_freq_sim = euclidean_distances(scene_freq_df.loc[:, 0:scene_freq_len])
act_freq_sim = euclidean_distances(act_freq_df.loc[:, 0:act_freq_len])

scene_freq_points = mds.fit_transform(scene_freq_sim)
act_freq_points = mds.fit_transform(act_freq_sim)

colDict = {"act0": "black", "act1":"red", "act2":"orange", "act3":"green", "act4":"violet", "act5":"blue", "act6":"indigo", "act7":"lime", "act8":"brown", "act9":"gray", "act10":"purple"}

# Labels for the act-level plot come from the act records themselves. The
# original took them from the first rows of the scene-level frame `df` and
# used its Scene column as the act number.
# NOTE(review): assumes act records carry a 'title' key; a missing title falls
# back to an empty string.
act_label_df = pd.DataFrame(
    [[act.get('title', ''), act['act']] for act in shakespeare_acts],
    columns=['Title', 'Act'])

# act text projection
act_colour_keys = "act" + act_label_df['Act'].astype(str)
textScatter("'"+str(play)+ "' - By Acts (using Word Frequency)",
            act_freq_points[:, 0], act_freq_points[:, 1],
            act_label_df['Title'] + act_colour_keys, colDict, act_colour_keys)

# scene text projection
scene_colour_keys = "act" + df['Act']
textScatter("'"+str(play)+ "' - By Scenes (using Word Frequency)",
            scene_freq_points[:, 0], scene_freq_points[:, 1],
            df['Title'] + scene_colour_keys + "scene" + df['Scene'], colDict, scene_colour_keys)

  if self._edgecolors == str('face'):


In [14]:
# Create Sammon Project using Structural features

import pandas as pd
import matplotlib.pyplot as plt
import nltk
import numpy as np
from sklearn import manifold
from sklearn.metrics import euclidean_distances
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# NOTE(review): eval() executes whatever Python expression the files contain.
# If the files hold only Python literals, ast.literal_eval would be a safer
# replacement -- confirm the file format before switching.

#Load Shakespeare scenes
# `with` closes the handle even if read() raises.
with open('shakespeare_scenes.txt') as f:
    shk = f.read()
shakespeare_scenes = eval(shk)

#Load Shakespeare acts
with open('shakespeare_acts.txt') as f:
    shk = f.read()
shakespeare_acts = eval(shk)

def textScatter(caption,inX,inY,t,colDict,c=()):
    """Scatter-plot 2-D points and write a text label at each one.

    Parameters
    ----------
    caption : str
        Appended to "Sammon Projection: " to form the plot title.
    inX, inY : iterables of numbers
        X and Y coordinates; they are paired with zip().
    t : indexable
        t[i] is the label drawn at the i-th point.
    colDict : mapping
        Maps the colour key c[i] to a Matplotlib colour.
    c : indexable, optional
        Per-point colour keys. When it is empty, or every entry is falsy,
        all labels are drawn in "blue".

    Raises
    ------
    KeyError
        If a colour key is not present in colDict.
    """
    ax = plt.axes()
    ax.scatter(inX, inY)
    # c is not modified while plotting, so the test is done once instead of
    # once per point.
    use_colour_keys = any(c)
    for i, (x, y) in enumerate(zip(inX, inY)):
        colour = colDict[c[i]] if use_colour_keys else "blue"
        ax.text(x, y, t[i], color=colour)
    plt.title("Sammon Projection: " + caption)
    plt.show()
    
def structural(data):
    """Compute 11 structural (punctuation / length) features per element.

    Parameters
    ----------
    data : iterable of mappings
        Each element must provide 'lines', a list of strings.

    Returns
    -------
    numpy.ndarray of float64, shape (number of elements, 11)
        Columns, in order:
        0  words per distinct word
        1  mean words per line        2  std of words per line
        3  mean words per sentence    4  std of words per sentence
        5  commas per line            6  commas per sentence
        7  colons per line            8  colons per sentence
        9  semicolons per line        10 semicolons per sentence
        Text is lower-cased before tokenising. A ratio whose denominator is
        zero (no lines, sentences or words) is reported as 0.0 instead of
        raising ZeroDivisionError.
    """
    elements_lines = [element['lines'] for element in data]
    features = np.zeros((len(elements_lines), 11), np.float64)
    if not elements_lines:
        return features

    # Built once: the original reloaded the punkt model for every element and
    # constructed a new regexp tokenizer for every line and sentence.
    word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    def ratio(count, total):
        # Guarded division so empty elements do not abort the whole run.
        return float(count) / float(total) if total else 0.0

    def mean_and_std(counts):
        # np.mean / np.std of an empty list give nan plus a warning.
        if not counts:
            return 0.0, 0.0
        return np.mean(counts), np.std(counts)

    for i, lines in enumerate(elements_lines):
        text = ' '.join(lines).lower()
        lines_count = len(lines)
        tokens = nltk.word_tokenize(text)
        sentences = sentence_tokenizer.tokenize(text)
        sentences_count = len(sentences)
        words = word_tokenizer.tokenize(text)
        words_line_counts = [len(word_tokenizer.tokenize(line.lower())) for line in lines]
        words_sent_counts = [len(word_tokenizer.tokenize(sentence)) for sentence in sentences]

        commas = tokens.count(",")
        colons = tokens.count(":")
        scolons = tokens.count(";")
        mean_words_line, std_words_line = mean_and_std(words_line_counts)
        mean_words_sent, std_words_sent = mean_and_std(words_sent_counts)

        # Row layout matches the column list in the docstring.
        features[i] = (
            ratio(len(words), len(set(words))),
            mean_words_line,
            std_words_line,
            mean_words_sent,
            std_words_sent,
            ratio(commas, lines_count),
            ratio(commas, sentences_count),
            ratio(colons, lines_count),
            ratio(colons, sentences_count),
            ratio(scolons, lines_count),
            ratio(scolons, sentences_count),
        )

    return features

scene_struc_vec = structural(shakespeare_scenes)
act_struc_vec = structural(shakespeare_acts)

scene_struc_df = pd.DataFrame(scene_struc_vec)
act_struc_df = pd.DataFrame(act_struc_vec)
scene_struc_len = scene_struc_df.shape[0]-1
act_struc_len = act_struc_df.shape[0]-1

# random_state pins the MDS initialisation so re-running gives the same layout.
mds = manifold.MDS(n_components=2,max_iter=3000,eps=1e-9,dissimilarity="precomputed",random_state=0)
# .loc replaces the removed .ix; on these integer-labelled columns both slice
# by label, end inclusive.
# NOTE(review): the column slice is bounded by the ROW count, so it only keeps
# every feature when there are at least as many rows as columns -- confirm.
scene_struc_sim = euclidean_distances(scene_struc_df.loc[:, 0:scene_struc_len])
# Fixed: the act distances were computed from act_freq_df (the word-frequency
# features), so the "Structural Features" act plot showed the wrong data.
act_struc_sim = euclidean_distances(act_struc_df.loc[:, 0:act_struc_len])

scene_struc_points = mds.fit_transform(scene_struc_sim)
act_struc_points = mds.fit_transform(act_struc_sim)

colDict = {"act0": "black", "act1":"red", "act2":"orange", "act3":"green", "act4":"violet", "act5":"blue", "act6":"indigo", "act7":"lime", "act8":"brown", "act9":"gray", "act10":"purple"}

# Labels for the act-level plot come from the act records themselves. The
# original took them from the first rows of the scene-level frame `df` and
# used its Scene column as the act number.
# NOTE(review): assumes act records carry a 'title' key; a missing title falls
# back to an empty string.
act_label_df = pd.DataFrame(
    [[act.get('title', ''), act['act']] for act in shakespeare_acts],
    columns=['Title', 'Act'])

# NOTE(review): `play` and `df` are not defined in this cell; they must already
# exist in the kernel from the projection cell that builds them.

# act text projection
act_colour_keys = "act" + act_label_df['Act'].astype(str)
textScatter("'"+str(play)+ "' - By Acts (using Structural Features)",
            act_struc_points[:, 0], act_struc_points[:, 1],
            act_label_df['Title'] + act_colour_keys, colDict, act_colour_keys)

# scene text projection
scene_colour_keys = "act" + df['Act']
textScatter("'"+str(play)+ "' - By Scenes (using Structural Features)",
            scene_struc_points[:, 0], scene_struc_points[:, 1],
            df['Title'] + scene_colour_keys + "scene" + df['Scene'], colDict, scene_colour_keys)

  if self._edgecolors == str('face'):


In [6]:
# Hierarchical Clustering

from matplotlib import pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster, fclusterdata

import scipy.cluster.hierarchy as hc
import numpy as np
import pandas as pd

# NOTE(review): eval() executes whatever Python expression the file contains.
# If it holds only Python literals, ast.literal_eval would be a safer
# replacement -- confirm the file format before switching.
# `with` closes the handle; the original never closed it.
with open('shakespeare.txt') as f:
    shk = f.read()
shakespeare = eval(shk)

title = []
text_corpus = []
col_list = ['Title','Act','Scene']
text_col = ['Scene','Text']

for element in shakespeare:
    if(element['title'] == "All's Well That Ends Well"):
        title.append([element['title'], element['act'], element['scene']])
        text_corpus.append([element['scene'],element['text']])

# convert to dataframes
df_text = pd.DataFrame(text_corpus,columns=text_col)
df = pd.DataFrame(title,columns=col_list)

# print data frame
counter = df.shape[0]
# range(counter): the original range(0, counter-1) never printed the last row.
for j in range(counter):
    print(j, " Title: ", df['Title'][j], " Act: ", df['Act'][j], " Scene: ", df['Scene'][j])

# convert text data into feature vectors
# Series.items() replaces iteritems(), which newer pandas no longer provides.
convert_features = [dict(r.items()) for _, r in df_text.iterrows()]
vectorizer = DictVectorizer()
vectorized_sparse = vectorizer.fit_transform(convert_features)
vectorized_array = vectorized_sparse.toarray()

# create hierarchical clustering using the converted feature vectors
Z = linkage(vectorized_array,'ward')

# draw dendrogram
plt.figure(figsize=(25, 10))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Index of Scenes')
plt.ylabel('Distance')
dendrogram(
    Z,
    p=6,  # NOTE(review): no truncate_mode is passed alongside p -- confirm p has any effect here
    leaf_rotation=90.,  # rotates the x axis labels
    leaf_font_size=8.,  # font size for the x axis labels
)
plt.show()

#Elbow method to find the number of clusters
# Column 2 of the linkage matrix holds the merge distances. The original read
# column 2 of the feature matrix, which is unrelated to the merges.
last = Z[-10:, 2]
last_rev = last[::-1]
idxs = np.arange(1, len(last) + 1)
plt.plot(idxs, last_rev)
acceleration = np.diff(last, 2)  # 2nd derivative of the distances
acceleration_rev = acceleration[::-1]
plt.plot(idxs[:-2] + 1, acceleration_rev)
plt.title("Elbow Chart")
plt.show()
k = acceleration_rev.argmax() + 2  # if idx 0 is the max of this we want 2 clusters

def cluster_indices(cluster_assignments):
    """Group observation positions by flat-cluster label.

    cluster_assignments is an array of integer labels. The result is a list
    whose entry at position j holds the positions carrying label j + 1, for
    every label from 1 up to the largest label present.
    """
    highest_label = cluster_assignments.max()
    return [
        np.where(cluster_assignments == label)[0]
        for label in range(1, highest_label + 1)
    ]

# Cut the existing linkage into k flat clusters. The original called
# fclusterdata(Z, 1.0), which expects raw observations and therefore clustered
# the rows of the linkage matrix (the merges) instead of the scenes.
clusters = fcluster(Z, k, criterion='maxclust')
num_clusters = clusters.max()

# print results
print("\n")
print("There are %d clusters" % num_clusters)
indices = cluster_indices(clusters)
# A separate loop name keeps k (the chosen cluster count) intact.
for cluster_pos, ind in enumerate(indices):
    print ("cluster", cluster_pos + 1, "is", ind)


def second_dendrogram(*args, **kwargs):
    """Draw a dendrogram and annotate its merge points with their heights.

    All arguments are forwarded to dendrogram() except two extra keywords:
      max_d          -- when truthy, a horizontal line is drawn at this height
                        and it is used as color_threshold unless the caller
                        passed one explicitly.
      annotate_above -- only merges higher than this value are annotated
                        (default 0).
    Returns whatever dendrogram() returned.
    """
    annotate_above = kwargs.pop('annotate_above', 0)
    max_d = kwargs.pop('max_d', None)
    if max_d and 'color_threshold' not in kwargs:
        kwargs['color_threshold'] = max_d

    ddata = dendrogram(*args, **kwargs)

    # Nothing to decorate when plotting was disabled.
    if kwargs.get('no_plot', False):
        return ddata

    plt.title('Hierarchical Clustering Dendrogram')
    plt.xlabel('Index of Scenes')
    plt.ylabel('Distance')
    links = zip(ddata['icoord'], ddata['dcoord'], ddata['color_list'])
    for xs, heights, colour in links:
        merge_height = heights[1]
        if not (merge_height > annotate_above):
            continue
        # Centre of the horizontal bar of this link.
        merge_x = 0.5 * sum(xs[1:3])
        plt.plot(merge_x, merge_height, 'o', c=colour)
        plt.annotate("%.3g" % merge_height, (merge_x, merge_height), xytext=(0, -5),
                     textcoords='offset points',
                     va='top', ha='center')
    if max_d:
        plt.axhline(y=max_d, c='k')
    return ddata

# Truncated, annotated dendrogram of the same linkage.
annotated_dendrogram_options = dict(
    truncate_mode='lastp',
    p=12,
    leaf_rotation=90.,
    leaf_font_size=12.,
    show_contracted=True,
    annotate_above=10,  # keeps annotations from overlapping in small plots
)
second_dendrogram(Z, **annotated_dendrogram_options)
plt.show()

0  Title:  All's Well That Ends Well  Act:  1  Scene:  1
1  Title:  All's Well That Ends Well  Act:  1  Scene:  2
2  Title:  All's Well That Ends Well  Act:  1  Scene:  3
3  Title:  All's Well That Ends Well  Act:  2  Scene:  1
4  Title:  All's Well That Ends Well  Act:  2  Scene:  2
5  Title:  All's Well That Ends Well  Act:  2  Scene:  3
6  Title:  All's Well That Ends Well  Act:  2  Scene:  4
7  Title:  All's Well That Ends Well  Act:  2  Scene:  5
8  Title:  All's Well That Ends Well  Act:  3  Scene:  1
9  Title:  All's Well That Ends Well  Act:  3  Scene:  2
10  Title:  All's Well That Ends Well  Act:  3  Scene:  3
11  Title:  All's Well That Ends Well  Act:  3  Scene:  4
12  Title:  All's Well That Ends Well  Act:  3  Scene:  5
13  Title:  All's Well That Ends Well  Act:  3  Scene:  6
14  Title:  All's Well That Ends Well  Act:  3  Scene:  7
15  Title:  All's Well That Ends Well  Act:  4  Scene:  1
16  Title:  All's Well That Ends Well  Act:  4  Scene:  2
17  Title:  All's Well T

In [28]:
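# Commented-out experiments (down to the "# Hierarchical Clustering" heading below) are kept for reference and do not execute.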

# fig, axes23 = plt.subplots(2,3)

# for method, axes in zip(['single','complete'],axes23):
#     z = hc.linkage(vectorized_array, method=method)
    
    # Plotting
#     axes[0].plot(range(1, len(z)+1), z[::-1, 2])
#     knee = np.diff(z[::-1, 2], 2)
#     axes[0].plot(range(2, len(z)), knee)
    
#     num_clust1 = knee.argmax() + 2
#     knee[knee.argmax()] = 0
#     num_clust2 = knee.argmax() + 2
    
#     axes[0].text(num_clust1, z[::-1, 2][num_clust1-1], 'possible\n<- knee point')
    
#     part1 = hc.fcluster(z, num_clust1, 'maxclust')
#     part2 = hc.fcluster(z, num_clust2, 'maxclust')
    
#     clr = ['#2200CC' ,'#D9007E' ,'#FF6600' ,'#FFCC00' ,'#ACE600' ,'#0099CC' ,
#     '#8900CC' ,'#FF0000' ,'#FF9900' ,'#FFFF00' ,'#00CC01' ,'#0055CC']
    
#     for part, ax in zip([part1, part2], axes[1:]):
#         for cluster in set(part):
#             ax.scatter(vectorized_array[part == cluster, 0], vectorized_array[part == cluster, 1], 
#                        color=clr[cluster])
            
#     m = '\n(method: {})'.format(method)
#     plt.setp(axes[0], title='Screeplot{}'.format(m), xlabel='partition',
#              ylabel='{}\ncluster distance'.format(m))
#     plt.setp(axes[1], title='{} Clusters'.format(num_clust1))
#     plt.setp(axes[2], title='{} Clusters'.format(num_clust2))

# plt.tight_layout()
# plt.show() 

# print results
# print('Title','Act','Scene','Cluster')
# print('\n')

# for j in range(1,len(clusters)):
#    print("'",df['Title'][j-1], "'", df['Act'][j-1], df['Scene'][j-1], clusters[j-1])
#    print('\n')

#def word_freq(data):
#    if([element['title'] == "All's Well That Ends Well" for element in data]):
#        elements_text = [element['text'] for element in data]
#        elements_lines = [element['lines'] for element in data]
#        elements_count = len(data)
#        elements_count = len(data)
#        elements_count = len(data)
#        elements_count = len(data)

#        vectorizer = CountVectorizer(vocabulary=vocabulary, tokenizer=nltk.word_tokenize)
#        freq_vec = vectorizer.fit_transform(elements_text).toarray().astype(np.float64)
#        freq_vec /= np.c_[np.apply_along_axis(np.linalg.norm, 1, freq_vec)]

#    return freq_vec

#Create baseline of all text in all plays
#all_acts_text = []
#all_acts_lines = []
#for act in shakespeare:
#    all_acts_text.append(act['text'])
#all_text = ' '.join(all_acts_text)
#all_lines = ' '.join(all_acts_lines)
#all_tokens = nltk.word_tokenize(all_text)
#all_freq = nltk.FreqDist(all_tokens)
#Get top 20 most frequent terms across all plays
#vocabulary = [item[0] for item in sorted(all_freq.items(), key=lambda x: x[1], reverse=True)][:20]

#colDict = {"All's Well That Ends Well":"red", "As You Like It":"orange", \
#           "The Comedy of Errors":"green", "Cymbeline":"violet", \
#           "Love's Labours Lost":"blue"}

# textScatter(points[:, 0] , points[:, 1], str("act")+df.ix[:,1]+str("scene")+df.ix[:,2], colDict, df.ix[:,0])
# textScatter(points[:, 0] , points[:, 1], str("act")+df.ix[:,1]+str("scene")+df.ix[:,2], colDict, df.ix[:,0])

# Hierarchical Clustering

import nltk
import numpy as np
import pandas as pd
import scipy.cluster.hierarchy as hc
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster, fclusterdata
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer

#### FUNCTIONS ####
def word_freq(data,actscene):
    """Count how often each `vocabulary` term occurs in every element's text.

    Parameters
    ----------
    data : iterable of mappings
        Each element must provide a 'text' entry.
    actscene : str
        Either "act" or "scene". It no longer affects the result (only the
        'text' entries are vectorised) and is kept so existing calls work.

    Returns
    -------
    numpy.ndarray of float64
        One row per element, one column per term of the module-level
        `vocabulary`, holding raw term counts.

    Raises
    ------
    ValueError
        If actscene is neither "act" nor "scene" (previously this surfaced as
        an unbound-variable error further down).
    """
    if actscene not in ("act", "scene"):
        raise ValueError("actscene must be 'act' or 'scene', got %r" % (actscene,))

    texts = [element['text'] for element in data]
    # lowercase=False: `vocabulary` is built from case-sensitive tokens, while
    # CountVectorizer lowercases documents by default, so capitalised
    # vocabulary terms could never be matched and their columns stayed zero.
    vectorizer = CountVectorizer(vocabulary=vocabulary, tokenizer=nltk.word_tokenize,
                                 lowercase=False)
    freq_vec = vectorizer.fit_transform(texts).toarray().astype(np.float64)

    return freq_vec

def structural(data):
    """Compute 11 structural (punctuation / length) features per element.

    Parameters
    ----------
    data : iterable of mappings
        Each element must provide 'lines', a list of strings.

    Returns
    -------
    numpy.ndarray of float64, shape (number of elements, 11)
        Columns, in order:
        0  words per distinct word
        1  mean words per line        2  std of words per line
        3  mean words per sentence    4  std of words per sentence
        5  commas per line            6  commas per sentence
        7  colons per line            8  colons per sentence
        9  semicolons per line        10 semicolons per sentence
        Text is lower-cased before tokenising. A ratio whose denominator is
        zero (no lines, sentences or words) is reported as 0.0 instead of
        raising ZeroDivisionError.
    """
    elements_lines = [element['lines'] for element in data]
    features = np.zeros((len(elements_lines), 11), np.float64)
    if not elements_lines:
        return features

    # Built once: the original reloaded the punkt model for every element and
    # constructed a new regexp tokenizer for every line and sentence.
    word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    def ratio(count, total):
        # Guarded division so empty elements do not abort the whole run.
        return float(count) / float(total) if total else 0.0

    def mean_and_std(counts):
        # np.mean / np.std of an empty list give nan plus a warning.
        if not counts:
            return 0.0, 0.0
        return np.mean(counts), np.std(counts)

    for i, lines in enumerate(elements_lines):
        text = ' '.join(lines).lower()
        lines_count = len(lines)
        tokens = nltk.word_tokenize(text)
        sentences = sentence_tokenizer.tokenize(text)
        sentences_count = len(sentences)
        words = word_tokenizer.tokenize(text)
        words_line_counts = [len(word_tokenizer.tokenize(line.lower())) for line in lines]
        words_sent_counts = [len(word_tokenizer.tokenize(sentence)) for sentence in sentences]

        commas = tokens.count(",")
        colons = tokens.count(":")
        scolons = tokens.count(";")
        mean_words_line, std_words_line = mean_and_std(words_line_counts)
        mean_words_sent, std_words_sent = mean_and_std(words_sent_counts)

        # Row layout matches the column list in the docstring.
        features[i] = (
            ratio(len(words), len(set(words))),
            mean_words_line,
            std_words_line,
            mean_words_sent,
            std_words_sent,
            ratio(commas, lines_count),
            ratio(commas, sentences_count),
            ratio(colons, lines_count),
            ratio(colons, sentences_count),
            ratio(scolons, lines_count),
            ratio(scolons, sentences_count),
        )

    return features

# draw dendogram
def drawdendogram(Z,vectorarr,caption,xlabel,ylabel):
    """Show a dendrogram for a linkage matrix, followed by its elbow chart.

    Parameters
    ----------
    Z : numpy.ndarray
        Linkage matrix as returned by linkage(); its column 2 is read as the
        merge distances.
    vectorarr : array
        No longer used. The elbow chart used to be read from column 2 of this
        feature matrix, which is unrelated to the merge distances. The
        parameter is kept so existing calls keep working.
    caption : str
        Title of the dendrogram; the elbow chart is titled
        caption + " Elbow Chart".
    xlabel, ylabel : str
        Axis labels of the dendrogram.
    """
    plt.figure(figsize=(25, 10))
    plt.title(caption)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    dendrogram(
        Z,
        p=6,  # NOTE(review): no truncate_mode is passed alongside p -- confirm p has any effect here
        leaf_rotation=90.,  # rotates the x axis labels
        leaf_font_size=8.,  # font size for the x axis labels
    )
    plt.show()

    #Elbow method to find the number of clusters
    # Distances of the last (up to) ten merges, taken from the linkage matrix.
    last = Z[-10:, 2]
    last_rev = last[::-1]
    idxs = np.arange(1, len(last) + 1)
    plt.plot(idxs, last_rev)
    acceleration = np.diff(last, 2)  # 2nd derivative of the distances
    acceleration_rev = acceleration[::-1]
    plt.plot(idxs[:-2] + 1, acceleration_rev)
    plt.title(caption + " Elbow Chart")
    plt.show()

#### END OF FUNCTIONS ####

# NOTE(review): eval() executes whatever Python expression the files contain.
# If the files hold only Python literals, ast.literal_eval would be a safer
# replacement -- confirm the file format before switching.

#Load Shakespeare scenes
# `with` closes the handle even if read() raises.
with open('shakespeare_scenes.txt') as f:
    shk = f.read()
shakespeare_scenes = eval(shk)

#Load Shakespeare acts
with open('shakespeare_acts.txt') as f:
    shk = f.read()
shakespeare_acts = eval(shk)

#play = "Loves Labours Lost"

title = []
col_list = ['Title','Act','Scene', 'Text']

scene_text_corpus = []
scene_text_col = ['Scene','Text']
act_text_corpus = []
act_text_col = ['Act','Text']

for element in shakespeare_scenes:
    #if(element['title'] == play):
    title.append([element['title'], element['act'], element['scene'], element['text']])
    scene_text_corpus.append([element['scene'],element['text']])

for element in shakespeare_acts:
    #if(element['title'] == play):
    act_text_corpus.append([element['act'],element['text']])

# convert to dataframes
df = pd.DataFrame(title,columns=col_list)
scene_df_text = pd.DataFrame(scene_text_corpus,columns=scene_text_col)
act_df_text = pd.DataFrame(act_text_corpus,columns=act_text_col)

# convert text data into feature vectors using Dict Vectorizer
# Series.items() replaces iteritems(), which newer pandas no longer provides.
scene_convert_features = [dict(r.items()) for _, r in scene_df_text.iterrows()]
act_convert_features = [dict(r.items()) for _, r in act_df_text.iterrows()]

vectorizer = DictVectorizer()
scene_sparse = vectorizer.fit_transform(scene_convert_features)
act_sparse = vectorizer.fit_transform(act_convert_features)

scene_vector_array = scene_sparse.toarray()
act_vector_array = act_sparse.toarray()
#convert array to dataframe
scene_vec_df = pd.DataFrame(scene_vector_array)
act_vec_df = pd.DataFrame(act_vector_array)

# convert text data into feature vectors using Word Frequency
#Create baseline of all text in all plays
all_acts_text = []
all_acts_lines = []
for act in shakespeare_acts:
    all_acts_text.append(act['text'])
all_text = ' '.join(all_acts_text)
all_lines = ' '.join(all_acts_lines)  # all_acts_lines is never filled, so this is ''
all_tokens = nltk.word_tokenize(all_text)
all_freq = nltk.FreqDist(all_tokens)
#Get top 20 most frequent terms across all plays
vocabulary = [item[0] for item in sorted(all_freq.items(), key=lambda x: x[1], reverse=True)][:20]
scene_freq_vec = word_freq(shakespeare_scenes,"scene")
act_freq_vec = word_freq(shakespeare_acts,"act")
scene_freq_df = pd.DataFrame(scene_freq_vec)
act_freq_df = pd.DataFrame(act_freq_vec)

# convert text data into feature vectors using Structural features
scene_struc_vec = structural(shakespeare_scenes)
act_struc_vec = structural(shakespeare_acts)
scene_struc_df = pd.DataFrame(scene_struc_vec)
act_struc_df = pd.DataFrame(act_struc_vec)

# create hierarchical clustering using Dict Vectorizer feature vectors
Z1 = linkage(scene_vec_df,'ward')
Z2 = linkage(act_vec_df,'ward')

# create hierarchical clustering using Word frequency feature vectors
Z3 = linkage(scene_freq_df,'ward')
Z4 = linkage(act_freq_df,'ward')

# create hierarchical clustering using Structural feature vectors
Z5 = linkage(scene_struc_df,'ward')
Z6 = linkage(act_struc_df,'ward')

# draw dendrograms
drawdendogram(Z1,scene_vector_array,"Hierarchical Clustering-By Scenes-DictVectorizer","Index Of Scenes","Distance")
drawdendogram(Z2,act_vector_array,"Hierarchical Clustering-By Acts-DictVectorizer","Index Of Acts","Distance")

drawdendogram(Z3,scene_freq_vec,"Hierarchical Clustering-By Scenes-Word frequency","Index Of Scenes","Distance")
drawdendogram(Z4,act_freq_vec,"Hierarchical Clustering-By Acts-Word frequency","Index Of Acts","Distance")

drawdendogram(Z5,scene_struc_vec,"Hierarchical Clustering-By Scenes-Structural Features","Index Of Scenes","Distance")
drawdendogram(Z6,act_struc_vec,"Hierarchical Clustering-By Acts-Structural Features","Index Of Acts","Distance")




  if self._edgecolors == str('face'):
