# To Be, Or Not To Be

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## Load dataframe

In [2]:
df=pd.read_csv("../data/shakespeare_data.csv")
df.head(10)

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,1,Henry IV,,,,ACT I
1,2,Henry IV,,,,SCENE I. London. The palace.
2,3,Henry IV,,,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"
5,6,Henry IV,1.0,1.1.3,KING HENRY IV,And breathe short-winded accents of new broils
6,7,Henry IV,1.0,1.1.4,KING HENRY IV,To be commenced in strands afar remote.
7,8,Henry IV,1.0,1.1.5,KING HENRY IV,No more the thirsty entrance of this soil
8,9,Henry IV,1.0,1.1.6,KING HENRY IV,Shall daub her lips with her own children's bl...
9,10,Henry IV,1.0,1.1.7,KING HENRY IV,"Nor more shall trenching war channel her fields,"


## Clean the data:

In [3]:
df['PlayerLinenumber'] = df['PlayerLinenumber'].replace(np.nan, 0)
df['ActSceneLine'] = df['ActSceneLine'].replace(np.nan, "0.0.0")
df['Player'] = df['Player'].replace(np.nan, "NO ONE")
df.head(10)

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,1,Henry IV,0.0,0.0.0,NO ONE,ACT I
1,2,Henry IV,0.0,0.0.0,NO ONE,SCENE I. London. The palace.
2,3,Henry IV,0.0,0.0.0,NO ONE,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"
5,6,Henry IV,1.0,1.1.3,KING HENRY IV,And breathe short-winded accents of new broils
6,7,Henry IV,1.0,1.1.4,KING HENRY IV,To be commenced in strands afar remote.
7,8,Henry IV,1.0,1.1.5,KING HENRY IV,No more the thirsty entrance of this soil
8,9,Henry IV,1.0,1.1.6,KING HENRY IV,Shall daub her lips with her own children's bl...
9,10,Henry IV,1.0,1.1.7,KING HENRY IV,"Nor more shall trenching war channel her fields,"


In [4]:
df.shape
df.describe()

Unnamed: 0,Dataline,PlayerLinenumber
count,111396.0,111396.0
mean,55698.5,36.884341
std,32157.399631,39.98576
min,1.0,0.0
25%,27849.75,10.0
50%,55698.5,25.0
75%,83547.25,50.0
max,111396.0,405.0


# Feature Engineering

Create Art, Scene, and Line features:

In [5]:
temp = df["ActSceneLine"].str.split(".", n = 2, expand = True)
temp.head(10)

Unnamed: 0,0,1,2
0,0,0,0
1,0,0,0
2,0,0,0
3,1,1,1
4,1,1,2
5,1,1,3
6,1,1,4
7,1,1,5
8,1,1,6
9,1,1,7


In [6]:
df["Act"]= temp[0] 
df["Scene"]= temp[1] 
df["Line"]= temp[1] 

# Dropping old column
df.drop(columns =["ActSceneLine"], inplace = True) 
df.head(10)

Unnamed: 0,Dataline,Play,PlayerLinenumber,Player,PlayerLine,Act,Scene,Line
0,1,Henry IV,0.0,NO ONE,ACT I,0,0,0
1,2,Henry IV,0.0,NO ONE,SCENE I. London. The palace.,0,0,0
2,3,Henry IV,0.0,NO ONE,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ...",0,0,0
3,4,Henry IV,1.0,KING HENRY IV,"So shaken as we are, so wan with care,",1,1,1
4,5,Henry IV,1.0,KING HENRY IV,"Find we a time for frighted peace to pant,",1,1,1
5,6,Henry IV,1.0,KING HENRY IV,And breathe short-winded accents of new broils,1,1,1
6,7,Henry IV,1.0,KING HENRY IV,To be commenced in strands afar remote.,1,1,1
7,8,Henry IV,1.0,KING HENRY IV,No more the thirsty entrance of this soil,1,1,1
8,9,Henry IV,1.0,KING HENRY IV,Shall daub her lips with her own children's bl...,1,1,1
9,10,Henry IV,1.0,KING HENRY IV,"Nor more shall trenching war channel her fields,",1,1,1


Process data so each play, player, player line, act, scene, and line can interpreted as a distinguishable number:

In [7]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [8]:
df['PlayLE'] = le.fit_transform(df["Play"].astype(str))
df['PlayerLE'] = le.fit_transform(df["Player"].astype(str))
df['LineTextLE'] = le.fit_transform(df["PlayerLine"].astype(str))
df['ActLE'] = le.fit_transform(df["Act"].astype(str))
df['SceneLE'] = le.fit_transform(df["Scene"].astype(str))
df['LineLE'] = le.fit_transform(df["Line"].astype(str))
df.head(15)

Unnamed: 0,Dataline,Play,PlayerLinenumber,Player,PlayerLine,Act,Scene,Line,PlayLE,PlayerLE,LineTextLE,ActLE,SceneLE,LineLE
0,1,Henry IV,0.0,NO ONE,ACT I,0,0,0,9,595,2574,0,0,0
1,2,Henry IV,0.0,NO ONE,SCENE I. London. The palace.,0,0,0,9,595,60108,0,0,0
2,3,Henry IV,0.0,NO ONE,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ...",0,0,0,9,595,23218,0,0,0
3,4,Henry IV,1.0,KING HENRY IV,"So shaken as we are, so wan with care,",1,1,1,9,457,63734,1,1,1
4,5,Henry IV,1.0,KING HENRY IV,"Find we a time for frighted peace to pant,",1,1,1,9,457,25781,1,1,1
5,6,Henry IV,1.0,KING HENRY IV,And breathe short-winded accents of new broils,1,1,1,9,457,5120,1,1,1
6,7,Henry IV,1.0,KING HENRY IV,To be commenced in strands afar remote.,1,1,1,9,457,77594,1,1,1
7,8,Henry IV,1.0,KING HENRY IV,No more the thirsty entrance of this soil,1,1,1,9,457,51386,1,1,1
8,9,Henry IV,1.0,KING HENRY IV,Shall daub her lips with her own children's bl...,1,1,1,9,457,61525,1,1,1
9,10,Henry IV,1.0,KING HENRY IV,"Nor more shall trenching war channel her fields,",1,1,1,9,457,52258,1,1,1


# Classifaction based on Act, Scene, and Play

In [9]:
labels = np.array(df['PlayerLE'])

feature_list = np.array(df[['ActLE','SceneLE','PlayLE']])


In [10]:
from sklearn.model_selection import train_test_split

train_features, test_features, train_labels, test_labels = train_test_split(feature_list, labels, test_size=0.25, random_state=42)

In [11]:
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (83547, 3)
Training Labels Shape: (83547,)
Testing Features Shape: (27849, 3)
Testing Labels Shape: (27849,)


# Random Forest

In [12]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
rf.fit(train_features, train_labels)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=1000,
                      n_jobs=None, oob_score=False, random_state=42, verbose=0,
                      warm_start=False)

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn import tree

forest = RandomForestClassifier(n_estimators=100) 
forest.fit(train_features,train_labels)
prediction_labels = forest.predict(test_features)
print("Accuracy: ", metrics.accuracy_score(test_labels,prediction_labels))

Accuracy:  0.41933283062228444


# Decision Tree

In [14]:
from sklearn import tree
model = tree.DecisionTreeClassifier()
model.fit(train_features,train_labels)
prediction_labels = model.predict(test_features)
print("Accuracy: ", metrics.accuracy_score(test_labels,prediction_labels))

Accuracy:  0.4195123702825954


# Classification based on Act, Scene, Play, and Line

In [15]:
labels = df['PlayerLE']
features = df[['ActLE','SceneLE','LineLE', 'PlayLE']]
train_features, test_features, train_labels, test_labels = train_test_split(features,
                                                                            labels, 
                                                                            test_size=0.20)

In [16]:
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (89116, 4)
Training Labels Shape: (89116,)
Testing Features Shape: (22280, 4)
Testing Labels Shape: (22280,)


# Random Forest 

In [17]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
rf.fit(train_features, train_labels)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=1000,
                      n_jobs=None, oob_score=False, random_state=42, verbose=0,
                      warm_start=False)

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn import tree

forest = RandomForestClassifier(n_estimators=100) 
forest.fit(train_features,train_labels)
prediction_labels = forest.predict(test_features)
print("Accuracy: ", metrics.accuracy_score(test_labels,prediction_labels))

Accuracy:  0.417818671454219


# Decision Tree

In [19]:
from sklearn import tree
model = tree.DecisionTreeClassifier()
model.fit(train_features,train_labels)
prediction_labels = model.predict(test_features)
print("Accuracy: ", metrics.accuracy_score(test_labels,prediction_labels))

Accuracy:  0.4186714542190305


# Conclusions:
  I used random forest and decision tree classification to predict a Shakespeare character based on a given play, scene, and act. Both classifcation methods calculated about a 41.9% accuracy. Since this accuracy isn't great, I decided to add in another feature to improve the accuracy. I used random forest and decision tree classification to predict the character again but based on a given play, scene, act,and line this time. This increased the accuracy to 42% which is about the same as the accuracy based on the fewer features. Hence, different features will need to be used to give a better accuracy.