In [None]:
'''
Grouping into plays

Data source: https://www.kaggle.com/kingburrito666/shakespeare-plays

Note: S's plays contain English, Latin, French, and nonsense.'''

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
import pandas as pd
import re
import numpy as np

In [3]:
df = pd.read_csv('Shakespeare_data.csv')

In [4]:
df.head()

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,1,Henry IV,,,,ACT I
1,2,Henry IV,,,,SCENE I. London. The palace.
2,3,Henry IV,,,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"


In [5]:
df.drop(['Dataline'], axis=1, inplace=True)

In [6]:
# Extracting the Act and Scene for all plays
df['ActScene']=df['ActSceneLine'].str[:4]

In [7]:
df.shape

(111396, 6)

In [26]:
# df = df[~(df.PlayerLine.str.contains('ACT'))]
# df = df[~(df.PlayerLine.str.contains('SCENE'))]
df = df[~(df.PlayerLine.str.contains(r"[A-Z]{2,}"))]
df = df[~df.PlayerLine.str.contains('Enter')]
df = df[~df.PlayerLine.str.contains('Exit')]
df = df[~df.PlayerLine.str.contains('Exeunt')]
# TRY THIS!
#indices = df.index[df['column'].contains('ACT') or condition ...etc]
#df.drop(indices, axis = 0, inplace=True)

In [27]:
df.head()

Unnamed: 0,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine,ActScene
3,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,",1.1.
4,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,",1.1.
5,Henry IV,1.0,1.1.3,KING HENRY IV,And breathe short-winded accents of new broils,1.1.
6,Henry IV,1.0,1.1.4,KING HENRY IV,To be commenced in strands afar remote.,1.1.
7,Henry IV,1.0,1.1.5,KING HENRY IV,No more the thirsty entrance of this soil,1.1.


In [28]:
df.shape

(105307, 6)

In [30]:
#group by Player then join lines into speeches
df_speeches = df.groupby(['Play','PlayerLinenumber','Player','ActScene'])['PlayerLine'].apply(lambda x: ','.join(x)).reset_index()

In [31]:
df_speeches

Unnamed: 0,Play,PlayerLinenumber,Player,ActScene,PlayerLine
0,A Comedy of Errors,1.0,ADRIANA,2.1.,"Neither my husband nor the slave return'd,,Tha..."
1,A Comedy of Errors,1.0,ADRIANA,4.2.,"Ah, Luciana, did he tempt thee so?,Mightst tho..."
2,A Comedy of Errors,1.0,AEGEON,1.1.,"Proceed, Solinus, to procure my fall,And by th..."
3,A Comedy of Errors,1.0,ANGELO,5.1.,"I am sorry, sir, that I have hinder'd you,,But..."
4,A Comedy of Errors,1.0,First Merchant,1.2.,"Therefore give out you are of Epidamnum,,Lest ..."
...,...,...,...,...,...
30034,macbeth,61.0,MALCOLM,4.3.,Dispute it like a man.
30035,macbeth,62.0,MACDUFF,4.3.,"I shall do so,,But I must also feel it as a ma..."
30036,macbeth,63.0,MALCOLM,4.3.,Be this the whetstone of your sword: let grief...
30037,macbeth,64.0,MACDUFF,4.3.,"O, I could play the woman with mine eyes,And b..."


In [32]:
#check that I got the full speeches
df_speeches.iloc[0,4]

"Neither my husband nor the slave return'd,,That in such haste I sent to seek his master!,Sure, Luciana, it is two o'clock."

In [33]:
#now the play is back in order

df_speeches2 = (df_speeches.sort_values(['Play','ActScene', 'PlayerLinenumber'], ascending=[True, True, True]))

In [34]:
df_speeches2

Unnamed: 0,Play,PlayerLinenumber,Player,ActScene,PlayerLine
2,A Comedy of Errors,1.0,AEGEON,1.1.,"Proceed, Solinus, to procure my fall,And by th..."
16,A Comedy of Errors,2.0,DUKE SOLINUS,1.1.,"Merchant of Syracuse, plead no more,,I am not ..."
24,A Comedy of Errors,3.0,AEGEON,1.1.,"Yet this my comfort: when your words are done,..."
37,A Comedy of Errors,4.0,DUKE SOLINUS,1.1.,"Well, Syracusian, say in brief the cause,Why t..."
46,A Comedy of Errors,5.0,AEGEON,1.1.,"A heavier task could not have been imposed,Tha..."
...,...,...,...,...,...
29791,macbeth,19.0,MALCOLM,5.8.,"He's worth more sorrow,,And that I'll spend fo..."
29804,macbeth,20.0,SIWARD,5.8.,"He's worth no more,They say he parted well, an..."
29813,macbeth,21.0,MACDUFF,5.8.,"Hail, king! for so thou art: behold, where sta..."
29818,macbeth,22.0,ALL,5.8.,"Hail, King of Scotland!"


In [35]:
#group by Play then join speeches into play documents
df_plays = df_speeches2.groupby(['Play'])['PlayerLine'].apply(lambda x: ','.join(x)).reset_index()

In [36]:
df_plays

Unnamed: 0,Play,PlayerLine
0,A Comedy of Errors,"Proceed, Solinus, to procure my fall,And by th..."
1,A Midsummer nights dream,"Now, fair Hippolyta, our nuptial hour,Draws on..."
2,A Winters Tale,"If you shall chance, Camillo, to visit Bohemia..."
3,Alls well that ends well,"In delivering my son from me, I bury a second ..."
4,Antony and Cleopatra,"Nay, but this dotage of our general's,O'erflow..."
5,As you like it,"As I remember, Adam, it was upon this fashion,..."
6,Coriolanus,"Before we proceed any further, hear me speak.,..."
7,Cymbeline,"You do not meet a man but frowns: our bloods,N..."
8,Hamlet,"Who's there?,Nay, answer me: stand, and unfold..."
9,Henry IV,"So shaken as we are, so wan with care,,Find we..."


In [48]:
# Save a df of plays into a pickle file.
import pickle

pickle.dump( df_plays, open( "shake_plays.p", "wb" ) )


In [40]:
#List unique values in the df['name'] column
players = list(df.Player.unique())

In [41]:
players

['KING HENRY IV',
 'WESTMORELAND',
 'FALSTAFF',
 'PRINCE HENRY',
 'POINS',
 'EARL OF WORCESTER',
 'NORTHUMBERLAND',
 'HOTSPUR',
 'SIR WALTER BLUNT',
 'First Carrier',
 'Ostler',
 'Second Carrier',
 'GADSHILL',
 'Chamberlain',
 'BARDOLPH',
 'PETO',
 'First Traveller',
 'Thieves',
 'Travellers',
 'LADY PERCY',
 'Servant',
 'FRANCIS',
 'Vintner',
 'Hostess',
 'Sheriff',
 'Carrier',
 'MORTIMER',
 'GLENDOWER',
 nan,
 'EARL OF DOUGLAS',
 'Messenger',
 'VERNON',
 'WORCESTER',
 'ARCHBISHOP OF YORK',
 'SIR MICHAEL',
 'LANCASTER',
 'BEDFORD',
 'GLOUCESTER',
 'EXETER',
 'OF WINCHESTER',
 'CHARLES',
 'ALENCON',
 'REIGNIER',
 'BASTARD OF ORLEANS',
 'JOAN LA PUCELLE',
 'First Warder',
 'Second Warder',
 'WOODVILE',
 'Mayor',
 'Officer',
 'Boy',
 'SALISBURY',
 'TALBOT',
 'GARGRAVE',
 'GLANSDALE',
 'Sergeant',
 'First Sentinel',
 'BURGUNDY',
 'Sentinels',
 'Soldier',
 'Captain',
 'OF AUVERGNE',
 'Porter',
 'PLANTAGENET',
 'SUFFOLK',
 'SOMERSET',
 'WARWICK',
 'Lawyer',
 'First Gaoler',
 'KING HENRY VI'

In [43]:
players_low = []
for player in players:
    players_low.append(str(player).lower())

In [44]:
players_low

['king henry iv',
 'westmoreland',
 'falstaff',
 'prince henry',
 'poins',
 'earl of worcester',
 'northumberland',
 'hotspur',
 'sir walter blunt',
 'first carrier',
 'ostler',
 'second carrier',
 'gadshill',
 'chamberlain',
 'bardolph',
 'peto',
 'first traveller',
 'thieves',
 'travellers',
 'lady percy',
 'servant',
 'francis',
 'vintner',
 'hostess',
 'sheriff',
 'carrier',
 'mortimer',
 'glendower',
 'nan',
 'earl of douglas',
 'messenger',
 'vernon',
 'worcester',
 'archbishop of york',
 'sir michael',
 'lancaster',
 'bedford',
 'gloucester',
 'exeter',
 'of winchester',
 'charles',
 'alencon',
 'reignier',
 'bastard of orleans',
 'joan la pucelle',
 'first warder',
 'second warder',
 'woodvile',
 'mayor',
 'officer',
 'boy',
 'salisbury',
 'talbot',
 'gargrave',
 'glansdale',
 'sergeant',
 'first sentinel',
 'burgundy',
 'sentinels',
 'soldier',
 'captain',
 'of auvergne',
 'porter',
 'plantagenet',
 'suffolk',
 'somerset',
 'warwick',
 'lawyer',
 'first gaoler',
 'king henry v

In [45]:
split_players = []
for name in players_low:
    for part in name.split(' '):
        split_players.append(part)

In [46]:
split_players = list(set(split_players))

In [47]:
split_players

['',
 'abbot',
 'tranio',
 'peto',
 'ford',
 'timandra',
 'pyramus',
 'montano',
 'launce',
 'sixth',
 'brakenbury',
 'murder',
 'banditti',
 'belch',
 'pisanio',
 'trebonius',
 'helicanus',
 'mercutio',
 'bassianus',
 'french',
 'deiphobus',
 'scales',
 'westmoreland',
 'merchant',
 'juliet',
 'aeneas',
 'petruchio',
 'bardolph',
 'caliban',
 'proteus',
 'troilus',
 'clerk',
 'polonius',
 'menenius',
 'macmorris',
 'man',
 'spirit',
 'gardener',
 'talbot',
 'say',
 'menelaus',
 'conrade',
 'caphis',
 'pistol',
 'perdita',
 'ceres',
 'philostrate',
 'tutor',
 'vi',
 'forester',
 'strato',
 'musician',
 'macbeth',
 'bandit',
 'son',
 'luciana',
 'goneril',
 'others',
 'lartius',
 'lion',
 'cymbeline',
 'fool',
 'luce',
 'arviragus',
 'percy',
 'philo',
 'edward',
 'aumerle',
 'hecate',
 'pandar',
 'keeper',
 'cleopatra',
 'cardinal',
 'cassius',
 'publius',
 'menas',
 'duncan',
 'orlando',
 'mayor',
 'snout',
 'oberon',
 'malcolm',
 'cleon',
 'richmond',
 'viola',
 'outlaws',
 'seyton',

In [50]:
# Save a list of players into a pickle file.
import pickle

pickle.dump( split_players, open( "shake_players.p", "wb" ) )


In [None]:
#Plays a tone to bring attention to a completed notebook

import os
duration = 1  # second
freq = 440  # Hz
os.system('play --no-show-progress --null --channels 1 synth %s sine %f' % (duration, freq))